# Group 52 Project Aufgabe 3-3

Plan:

0. **Drop** all existing tables (from previous tries).
1. **Create** tables: `Airline`, `Flight`, `Passenger`, `Carries`.
2. **Insert** data with hundreds/thousands of rows, respecting foreign keys.
3. **Write** three SQL queries, each using at least two advanced constructs (GROUP BY, HAVING, ALL, or subselect).
4. **Create** and **call** two stored procedures (one UPDATE, one DELETE).
5. **Demonstrate** indexing and measure query performance before/after.


## 0) IMPORTS AND DB CONNECTION

In [25]:
#Connect to VPN beforehand
import oracledb
import random
from datetime import datetime, timedelta
import pandas as pd

import os

# Load the Oracle Instant Client libraries; the directory is given relative
# to the notebook's working directory.
oracledb.init_oracle_client(lib_dir=r"instantclient_19_11")

# Credentials and DSN can be supplied through the environment so that they
# do not have to live in the notebook. The literals are kept only as
# fallbacks so the notebook keeps running exactly as before.
# NOTE(review): remove the fallback literals (and rotate the password)
# before sharing or committing this notebook.
username = os.environ.get("ORACLE_USER", "a12236167")
password = os.environ.get("ORACLE_PASSWORD", "dbs24")
dsn = os.environ.get("ORACLE_DSN", "oracle19.cs.univie.ac.at:1521/orclcdb")

# One connection and one cursor are shared by every cell below.
connection = oracledb.connect(
    user=username,
    password=password,
    dsn=dsn,
)
cursor = connection.cursor()

print("Connected to Oracle DB successfully!")

Connected to Oracle DB successfully!


## 1) DROP ALL RELEVANT TABLES (FROM PREVIOUS TRIES)

In [19]:
# Tables left over from earlier runs. CASCADE CONSTRAINTS removes the
# foreign keys that reference a table together with the table itself.
tables_to_drop = ["Carries", "Flight", "Airline", "Passenger"]
for table_name in tables_to_drop:
    try:
        cursor.execute(f"DROP TABLE {table_name} CASCADE CONSTRAINTS")
        connection.commit()
        print(f"Dropped table {table_name}")
    except oracledb.Error as exc:
        # Best effort: report the error and continue with the next table.
        print(f"Skipping drop for {table_name}, possibly doesn't exist: {exc}")

Dropped table Carries
Dropped table Flight
Dropped table Airline
Dropped table Passenger


## 2) CREATE TABLES

In [21]:
# DDL for the four tables of the schema. Flight references Airline, and
# Carries references both Flight and Passenger; all foreign keys cascade
# deletes from the parent row.
create_airline = """
CREATE TABLE Airline (
    airlineID      NUMBER PRIMARY KEY,
    Name           VARCHAR2(100),
    Headquarters   VARCHAR2(200)
)
"""

create_flight = """
CREATE TABLE Flight (
    flightNumber   VARCHAR2(10) PRIMARY KEY,
    DepartureTime  TIMESTAMP,
    ArrivalTime    TIMESTAMP,
    airlineID      NUMBER,
    CONSTRAINT fk_flight_airline
        FOREIGN KEY (airlineID)
        REFERENCES Airline(airlineID)
        ON DELETE CASCADE
)
"""

create_passenger = """
CREATE TABLE Passenger (
    passengerID    NUMBER PRIMARY KEY,
    FirstName      VARCHAR2(100),
    Surname        VARCHAR2(100),
    PassportNumber VARCHAR2(50)
)
"""

create_carries = """
CREATE TABLE Carries (
    flightNumber   VARCHAR2(10),
    passengerID    NUMBER,
    CONSTRAINT pk_carries
        PRIMARY KEY (flightNumber, passengerID),
    CONSTRAINT fk_carries_flight
        FOREIGN KEY (flightNumber) 
        REFERENCES Flight(flightNumber)
        ON DELETE CASCADE,
    CONSTRAINT fk_carries_passenger
        FOREIGN KEY (passengerID)
        REFERENCES Passenger(passengerID)
        ON DELETE CASCADE
)
"""

# Referenced (parent) tables are created before the tables that point at them.
for ddl_statement in (
    create_airline,
    create_flight,
    create_passenger,
    create_carries,
):
    cursor.execute(ddl_statement)

connection.commit()
print("All tables created successfully!")


All tables created successfully!


## 3) INSERT LARGE VOLUMES OF DATA

We'll insert hundreds of Airlines, thousands of Passengers,
thousands of Flights, and link them via Carries.

In [23]:
# Sizes of the generated data set. Flight numbers are "FN1".."FN<NUM_FLIGHTS>".
NUM_AIRLINES = 500
NUM_PASSENGERS = 3000
NUM_FLIGHTS = 2000
MAX_FLIGHTS_PER_PASSENGER = 4

# (3.1) Insert the airlines. Rows are built in Python first and sent with a
#       single executemany() call instead of one execute() per row.
airline_rows = [
    (i, f"Airline_{i}", f"Headquarters_{i}")
    for i in range(1, NUM_AIRLINES + 1)
]
cursor.executemany(
    """
        INSERT INTO Airline (airlineID, Name, Headquarters)
        VALUES (:1, :2, :3)
    """,
    airline_rows,
)

# (3.2) Insert the passengers; passport numbers are zero-padded to 5 digits.
passenger_rows = [
    (i, f"FName{i}", f"LName{i}", f"P{i:05d}")
    for i in range(1, NUM_PASSENGERS + 1)
]
cursor.executemany(
    """
        INSERT INTO Passenger (passengerID, FirstName, Surname, PassportNumber)
        VALUES (:1, :2, :3, :4)
    """,
    passenger_rows,
)

# (3.3) Insert the flights. Every flight lasts four hours; the departure is
#       shifted by (i mod 24) hours from the base time and the airline is
#       picked at random.
base_dep = datetime(2025, 1, 1, 8, 0, 0)
base_arr = datetime(2025, 1, 1, 12, 0, 0)
flight_rows = []
for i in range(1, NUM_FLIGHTS + 1):
    hour_offset = timedelta(hours=i % 24)
    chosen_airline = random.randint(1, NUM_AIRLINES)
    flight_rows.append(
        (f"FN{i}", base_dep + hour_offset, base_arr + hour_offset, chosen_airline)
    )
cursor.executemany(
    """
    INSERT INTO Flight (
        flightNumber, DepartureTime, ArrivalTime, airlineID
    )
    VALUES (:1, :2, :3, :4)
    """,
    flight_rows,
)

# (3.4) Insert into Carries: each passenger takes between 1 and
#       MAX_FLIGHTS_PER_PASSENGER (4) distinct random flights, so Carries
#       ends up with 3000 - 12000 rows. random.sample() never repeats a
#       flight for the same passenger, which keeps the composite primary
#       key (flightNumber, passengerID) unique.
carries_rows = []
for passenger_id in range(1, NUM_PASSENGERS + 1):
    flight_count = random.randint(1, MAX_FLIGHTS_PER_PASSENGER)
    for fid in random.sample(range(1, NUM_FLIGHTS + 1), flight_count):
        carries_rows.append((f"FN{fid}", passenger_id))
cursor.executemany(
    "INSERT INTO Carries (flightNumber, passengerID) VALUES (:1, :2)",
    carries_rows,
)

connection.commit()
print(
    f"Inserted {NUM_AIRLINES} Airlines, {NUM_PASSENGERS} Passengers, "
    f"{NUM_FLIGHTS} Flights. Carries data inserted. Large dataset created!"
)

Inserted 500 Airlines, 3000 Passengers, 2000 Flights. Carries data inserted. Large dataset created!


Now verify row counts

In [109]:
# Print the current number of rows of each schema table.
tables = ["Airline", "Passenger", "Flight", "Carries"]
for table_name in tables:
    cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
    (row_count,) = cursor.fetchone()
    print(f"{table_name} has {row_count} rows.")

Airline has 500 rows.
Passenger has 3000 rows.
Flight has 2000 rows.
Carries has 7576 rows.


## 4) ADVANCED QUERIES

We will perform three queries, each using two of the following:
- GROUP BY
- HAVING
- ALL
- Subselect

In [111]:
def run_query_and_show_dataframe(sql_query, description):
    """Run a query on the shared cursor and render the result as a DataFrame.

    Prints the description and the SQL text, executes the statement on the
    module-level ``cursor``, loads all rows into a pandas DataFrame whose
    column names come from ``cursor.description``, shows it with
    ``display`` and finally prints a separator.

    Args:
        sql_query: SQL text to execute.
        description: Heading printed above the SQL text.
    """
    print(f"### {description}")
    print(f"SQL:\n{sql_query}\n")

    cursor.execute(sql_query)

    # The first item of every description entry is the column name.
    header = [column_info[0] for column_info in cursor.description]
    frame = pd.DataFrame(cursor.fetchall(), columns=header)

    display(frame)

    print("\n---\n")

### 4.1) QUERY #1: GROUP BY + HAVING
Show how many flights each airline operates, but only those airlines that operate more than 8 flights

In [154]:
# Query 4.1: number of flights per airline (GROUP BY), restricted to airlines
# with more than 8 flights (HAVING). The inner join means airlines without
# any flight do not appear in the result.
query_4_1 = """
SELECT
    A.airlineID,
    A.Name AS AirlineName,
    COUNT(F.flightNumber) AS FlightCount
FROM Airline A
JOIN Flight F ON A.airlineID = F.airlineID
GROUP BY A.airlineID, A.Name
HAVING COUNT(F.flightNumber) > 8
"""

# Execute the query and render the result as a DataFrame.
run_query_and_show_dataframe(
    sql_query=query_4_1,
    description="QUERY 4.1 (GROUP BY + HAVING, now showing airlineID)"
)

### QUERY 4.1 (GROUP BY + HAVING, now showing airlineID)
SQL:

SELECT
    A.airlineID,
    A.Name AS AirlineName,
    COUNT(F.flightNumber) AS FlightCount
FROM Airline A
JOIN Flight F ON A.airlineID = F.airlineID
GROUP BY A.airlineID, A.Name
HAVING COUNT(F.flightNumber) > 8




Unnamed: 0,AIRLINEID,AIRLINENAME,FLIGHTCOUNT
0,71,Airline_71,10
1,212,Airline_212,9
2,36,Airline_36,10
3,500,Airline_500,9
4,494,Airline_494,9
5,436,Airline_436,9
6,290,Airline_290,9
7,23,Airline_23,9
8,209,Airline_209,10
9,237,Airline_237,9



---



### 4.2) QUERY #2: Subselect + ALL
Find passengers whose ID is greater than the IDs of ALL passengers who fly on flight 'FN100'.

In [116]:
# Query 4.2: passengers whose ID is greater than every passenger ID booked on
# flight 'FN100' (> ALL over a subselect). The scalar subquery in the SELECT
# list repeats the largest passenger ID on FN100 on every row so the
# comparison can be checked by eye.
# NOTE: in SQL, "> ALL" over an empty subquery is true for every row, so if
# FN100 has no passengers this returns all passengers with a NULL maximum.
query_4_2 = """
SELECT
    p.passengerID,
    p.FirstName,
    p.Surname,
    (SELECT MAX(c2.passengerID) 
       FROM Carries c2 
       WHERE c2.flightNumber = 'FN100'
    ) AS max_passengerID_on_FN100
FROM Passenger p
WHERE p.passengerID > ALL (
    SELECT c.passengerID
    FROM Carries c
    WHERE c.flightNumber = 'FN100'
)
"""

# Execute the query and render the result as a DataFrame.
run_query_and_show_dataframe(
    sql_query=query_4_2,
    description="QUERY 4.2 (Subselect + ALL) with visible max passengerID from FN100"
)

### QUERY 4.2 (Subselect + ALL) with visible max passengerID from FN100
SQL:

SELECT
    p.passengerID,
    p.FirstName,
    p.Surname,
    (SELECT MAX(c2.passengerID) 
       FROM Carries c2 
       WHERE c2.flightNumber = 'FN100'
    ) AS max_passengerID_on_FN100
FROM Passenger p
WHERE p.passengerID > ALL (
    SELECT c.passengerID
    FROM Carries c
    WHERE c.flightNumber = 'FN100'
)




Unnamed: 0,PASSENGERID,FIRSTNAME,SURNAME,MAX_PASSENGERID_ON_FN100
0,2974,FName2974,LName2974,2973
1,2975,FName2975,LName2975,2973
2,2976,FName2976,LName2976,2973
3,2977,FName2977,LName2977,2973
4,2978,FName2978,LName2978,2973
5,2979,FName2979,LName2979,2973
6,2980,FName2980,LName2980,2973
7,2981,FName2981,LName2981,2973
8,2982,FName2982,LName2982,2973
9,2983,FName2983,LName2983,2973



---



### 4.3) QUERY #3: GROUP BY + Subselect
Show passengers who have more flights than the overall average

In [118]:
# Query 4.3: passengers with more flights than the average passenger.
# The CTE "overall" computes the average of the per-passenger flight counts
# (a single row); it is cross-joined to Carries so the average is available
# in every group, and HAVING keeps the passengers whose own count exceeds it.
# Only passengers with at least one Carries row are considered.
query_4_3 = """
WITH overall AS (
    SELECT AVG(cnt) AS avg_cnt
    FROM (
        SELECT passengerID, COUNT(*) AS cnt
        FROM Carries
        GROUP BY passengerID
    )
)
SELECT 
    c.passengerID,
    COUNT(c.flightNumber) AS total_flights,
    o.avg_cnt AS overall_avg
    
FROM Carries c
CROSS JOIN overall o
GROUP BY c.passengerID, o.avg_cnt
HAVING COUNT(c.flightNumber) > o.avg_cnt
"""

# Execute the query and render the result as a DataFrame.
run_query_and_show_dataframe(
    sql_query=query_4_3,
    description="QUERY 4.3 (GROUP BY + Subselect) showing each passenger's total flights vs. overall average"
)


### QUERY 4.3 (GROUP BY + Subselect) showing each passenger's total flights vs. overall average
SQL:

WITH overall AS (
    SELECT AVG(cnt) AS avg_cnt
    FROM (
        SELECT passengerID, COUNT(*) AS cnt
        FROM Carries
        GROUP BY passengerID
    )
)
SELECT 
    c.passengerID,
    COUNT(c.flightNumber) AS total_flights,
    o.avg_cnt AS overall_avg
FROM Carries c
CROSS JOIN overall o
GROUP BY c.passengerID, o.avg_cnt
HAVING COUNT(c.flightNumber) > o.avg_cnt




Unnamed: 0,PASSENGERID,TOTAL_FLIGHTS,OVERALL_AVG
0,416,3,2.525333
1,418,3,2.525333
2,425,4,2.525333
3,440,4,2.525333
4,445,4,2.525333
...,...,...,...
1523,2960,3,2.525333
1524,2967,3,2.525333
1525,2990,3,2.525333
1526,2991,4,2.525333



---



## 5) STORED PROCEDURES (Update & Delete)

We create two stored procedures:
- `update_passenger_surname` (updates a passenger's surname).
- `delete_flights_with_prefix` (removes flights for a given airline when the flightNumber starts with a certain prefix).

Then we call them.

In [172]:
# PL/SQL source of the UPDATE procedure: sets Surname of the passenger with
# the given passengerID to the new value. The procedure issues its own COMMIT,
# so the change is committed as soon as the procedure returns.
sp_update = """
CREATE OR REPLACE PROCEDURE update_passenger_surname(
    p_passengerID IN NUMBER,
    p_newSurname  IN VARCHAR2
) AS
BEGIN
    UPDATE Passenger
    SET Surname = p_newSurname
    WHERE passengerID = p_passengerID;
    COMMIT;
END;
"""

# Create (or replace) the procedure in the database.
# NOTE(review): presumably a PL/SQL compilation error would not raise here;
# verify via USER_ERRORS if the later call fails.
cursor.execute(sp_update)
print("Procedure update_passenger_surname created.")

Procedure update_passenger_surname created.


In [170]:
# PL/SQL source of the DELETE procedure: removes every flight of the given
# airline whose flightNumber starts with the given prefix (LIKE prefix || '%').
# Carries rows of the deleted flights go with them because fk_carries_flight
# is declared ON DELETE CASCADE. The procedure issues its own COMMIT.
# NOTE: '%' or '_' inside the prefix act as LIKE wildcards.
sp_delete = """
CREATE OR REPLACE PROCEDURE delete_flights_with_prefix(
    p_airlineID     IN NUMBER,
    p_flightPrefix  IN VARCHAR2
) AS
BEGIN
    -- Delete all flights for a given airline that match the prefix.
    DELETE FROM Flight
    WHERE airlineID = p_airlineID
      AND flightNumber LIKE p_flightPrefix || '%';
    COMMIT;
END;
"""

# Create (or replace) the procedure in the database.
cursor.execute(sp_delete)
print("Procedure delete_flights_with_prefix created.")

Procedure delete_flights_with_prefix created.


In [174]:
# Example 1: rename passenger #10 to "NEWLNAME" through the UPDATE procedure.
# The two values are bound by position to :1 and :2.
update_call = """
BEGIN
    update_passenger_surname(:1, :2);
END;
"""
update_binds = (10, "NEWLNAME")
cursor.execute(update_call, update_binds)
connection.commit()
print("Called update_passenger_surname for passengerID=10.")

# Example 2: remove the flights of airline 20 whose flightNumber starts with
# 'FN1' through the DELETE procedure. The values are bound by name.
delete_call = """
BEGIN
    delete_flights_with_prefix(:airlineID, :prefix);
END;
"""
delete_binds = {"airlineID": 20, "prefix": "FN1"}
cursor.execute(delete_call, delete_binds)
connection.commit()
print("Called delete_flights_with_prefix for airlineID=20, prefix='FN1'.")


Called update_passenger_surname for passengerID=10.
Called delete_flights_with_prefix for airlineID=20, prefix='FN1'.


In [158]:
# Read passenger #10 back to confirm that update_passenger_surname changed
# the Surname column.
check_updated_sql = """
SELECT passengerID, FirstName, Surname
FROM Passenger
WHERE passengerID = 10
"""

cursor.execute(check_updated_sql)
# passengerID is the primary key, so at most one row is returned.
row = cursor.fetchone()
print("Verification of update_passenger_surname:", row)

Verification of update_passenger_surname: (10, 'FName10', 'NEWLNAME')


In [176]:
# Row counts after the stored procedures ran; deleting flights also removes
# their Carries rows through the cascading foreign key.
final_tables = ["Airline", "Passenger", "Flight", "Carries"]
for table_name in final_tables:
    cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
    (row_total,) = cursor.fetchone()
    print(f"Table {table_name} final count: {row_total}")

Table Airline final count: 500
Table Passenger final count: 3000
Table Flight final count: 1996
Table Carries final count: 7560


## 6) INDEXING & QUERY OPTIMIZATION

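The following steps create a separate large table, `kunden`, with 100,000 rows.  
Then we compare query performance (or plans) **before** and **after** adding an index on the `stadt` column.  
Note: `stadt` holds only two values ('Wien' and 'Graz', split roughly 50/50), so the predicate `stadt = 'Wien'` is not selective; the optimizer may therefore keep the full table scan even after the index exists.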

In [190]:
# First, let's drop it if it exists.
# ORA-00942 ("table or view does not exist") is the only error that really
# means "nothing to drop"; any other failure is reported as such instead of
# being mislabelled. Either way the cell continues with the CREATE below.
ORA_TABLE_DOES_NOT_EXIST = 942
try:
    cursor.execute("DROP TABLE kunden CASCADE CONSTRAINTS")
    connection.commit()
    print("Dropped existing table kunden.")
except oracledb.Error as e:
    # python-oracledb puts an error object carrying the ORA code in args[0].
    error_code = getattr(e.args[0], "code", None) if e.args else None
    if error_code == ORA_TABLE_DOES_NOT_EXIST:
        print("No existing table kunden. Proceeding with creation.")
    else:
        print(f"Could not drop table kunden ({e}). Proceeding with creation.")

# The id column is an identity column, so inserts may omit it and Oracle
# generates the value.
create_kunden_table = """
CREATE TABLE kunden (
    id NUMBER GENERATED BY DEFAULT ON NULL AS IDENTITY PRIMARY KEY,
    vorname    VARCHAR2(50),
    nachname   VARCHAR2(50),
    email      VARCHAR2(100),
    geburtsdatum DATE,
    stadt      VARCHAR2(50)
)
"""

cursor.execute(create_kunden_table)
connection.commit()
print("Created table kunden.")

No existing table kunden. Proceeding with creation.
Created table kunden.


In [194]:
# We'll insert 100,000 customers.
# Rows are collected in Python and sent to the database one batch at a time
# with executemany(), followed by a commit per batch.

batch_size = 10000  # rows per executemany() call / commit
total_rows = 100000

insert_sql = """
INSERT INTO kunden (vorname, nachname, email, geburtsdatum, stadt)
VALUES (:vorname, :nachname, :email, :geburtsdatum, :stadt)
"""

# Same birth date for every customer (or use random dates). The notebook
# imports the *class* `datetime`, so `datetime.date(1990, 1, 1)` would raise
# TypeError; build a datetime and take its date part instead.
geburtsdatum = datetime(1990, 1, 1).date()

pending_rows = []
for i in range(1, total_rows + 1):
    # We'll randomly assign ~50% 'Wien' and 50% 'Graz'
    stadt = "Wien" if random.random() < 0.5 else "Graz"
    # Values are bound by position, in the order the placeholders appear.
    pending_rows.append(
        (f"Vorname{i}", f"Nachname{i}", f"email{i}@example.com", geburtsdatum, stadt)
    )

    # Flush and commit every full batch
    if i % batch_size == 0:
        cursor.executemany(insert_sql, pending_rows)
        connection.commit()
        pending_rows.clear()
        print(f"{i} rows inserted...")

# Flush the remainder when total_rows is not a multiple of batch_size.
if pending_rows:
    cursor.executemany(insert_sql, pending_rows)
connection.commit()
print(f"Finished inserting {total_rows} rows into kunden.")

10000 rows inserted...
20000 rows inserted...
30000 rows inserted...
40000 rows inserted...
50000 rows inserted...
60000 rows inserted...
70000 rows inserted...
80000 rows inserted...
90000 rows inserted...
100000 rows inserted...
Finished inserting 100000 rows into kunden.


Check count

In [196]:
# Confirm how many rows the load produced.
cursor.execute("SELECT COUNT(*) FROM kunden")
(count_kunden,) = cursor.fetchone()
print(f"'kunden' now has {count_kunden} rows.")

'kunden' now has 100000 rows.


In [213]:
# Ask the optimizer for the plan of the test query while no index on
# kunden(stadt) exists yet. EXPLAIN PLAN only stores the plan; the query
# itself is not executed.
explain_before_sql = """
EXPLAIN PLAN FOR
SELECT * 
FROM kunden
WHERE stadt = 'Wien'
"""
cursor.execute(explain_before_sql)

# DBMS_XPLAN.DISPLAY() formats the most recently explained plan, one text
# line per result row.
cursor.execute("SELECT PLAN_TABLE_OUTPUT FROM TABLE(DBMS_XPLAN.DISPLAY())")
plan_output = cursor.fetchall()

print("=== EXPLAIN PLAN (BEFORE INDEX) ===")
for plan_row in plan_output:
    plan_text = plan_row[0]
    print(plan_text)

=== EXPLAIN PLAN (BEFORE INDEX) ===
Plan hash value: 2881942826
 
----------------------------------------------------------------------------
| Id  | Operation         | Name   | Rows  | Bytes | Cost (%CPU)| Time     |
----------------------------------------------------------------------------
|   0 | SELECT STATEMENT  |        | 10000 |   625K|   102   (0)| 00:00:01 |
|*  1 |  TABLE ACCESS FULL| KUNDEN | 10000 |   625K|   102   (0)| 00:00:01 |
----------------------------------------------------------------------------
 
Predicate Information (identified by operation id):
---------------------------------------------------
 
   1 - filter("STADT"='Wien')


# Create the index on the filter column of the test query; failures are
# reported and do not stop the notebook.
index_ddl = "CREATE INDEX idx_stadt ON kunden(stadt)"
try:
    cursor.execute(index_ddl)
    connection.commit()
    print("Index idx_stadt created on kunden(stadt).")
except oracledb.Error as exc:
    print(f"Could not create index: {exc}")

In [204]:
# Explain the same test query again, now that idx_stadt may exist, so the
# plan can be compared with the one obtained before the index was created.
explain_after_sql = """
EXPLAIN PLAN FOR
SELECT * 
FROM kunden
WHERE stadt = 'Wien'
"""
cursor.execute(explain_after_sql)

# Fetch the formatted plan, one text line per result row.
cursor.execute("SELECT PLAN_TABLE_OUTPUT FROM TABLE(DBMS_XPLAN.DISPLAY())")
plan_output = cursor.fetchall()

print("=== EXPLAIN PLAN (AFTER INDEX) ===")
for plan_row in plan_output:
    plan_text = plan_row[0]
    print(plan_text)


=== EXPLAIN PLAN (AFTER INDEX) ===
Plan hash value: 2881942826
 
----------------------------------------------------------------------------
| Id  | Operation         | Name   | Rows  | Bytes | Cost (%CPU)| Time     |
----------------------------------------------------------------------------
|   0 | SELECT STATEMENT  |        | 10000 |   625K|   102   (0)| 00:00:01 |
|*  1 |  TABLE ACCESS FULL| KUNDEN | 10000 |   625K|   102   (0)| 00:00:01 |
----------------------------------------------------------------------------
 
Predicate Information (identified by operation id):
---------------------------------------------------
 
   1 - filter("STADT"='Wien')


## 7) CLEANUP

In [27]:
# Release the database resources: the cursor first, then the connection it
# was created from. After this cell no other cell can use `cursor` or
# `connection` without reconnecting.
cursor.close()
connection.close()
print("Closed Oracle connection. Done!")

Closed Oracle connection. Done!
