# Data preparation
## Read and show Excel data for one day from Flightradar

In [122]:
import pandas as pd

In [123]:
# XLSX-Datei laden
flightradar = pd.read_excel("250505_flightradar.xlsx")

# Erste 5 Zeilen anzeigen
print(flightradar.head(5))  

  web-scraper-order                              web-scraper-start-url  page  \
0      1746512368-1  https://www.flightradar24.com/data/airports/zr...   NaN   
1      1746512368-2  https://www.flightradar24.com/data/airports/zr...   NaN   
2      1746512368-3  https://www.flightradar24.com/data/airports/zr...   NaN   
3      1746512368-4  https://www.flightradar24.com/data/airports/zr...   NaN   
4      1746512368-5  https://www.flightradar24.com/data/airports/zr...   NaN   

   page2  page3  page4  page5  page6  page7      TIME  FLIGHT  \
0    NaN    NaN    NaN    NaN    NaN    NaN  05:45:00   WK370   
1    NaN    NaN    NaN    NaN    NaN    NaN  05:45:00  W22160   
2    NaN    NaN    NaN    NaN    NaN    NaN  05:45:00   CS600   
3    NaN    NaN    NaN    NaN    NaN    NaN  06:00:00   WK134   
4    NaN    NaN    NaN    NaN    NaN    NaN  06:00:00   WK348   

        DESTINATION           AIRLINE       AIRCRAFT          STATUS  
0    Larnaca (LCA)-   Edelweiss Air -  A320 (HB-IJV)  Dep

## Show data types and shape of the DataFrame

In [124]:
print(flightradar.dtypes)
print(flightradar.shape)  

web-scraper-order         object
web-scraper-start-url     object
page                     float64
page2                    float64
page3                    float64
page4                    float64
page5                    float64
page6                    float64
page7                    float64
TIME                      object
FLIGHT                    object
DESTINATION               object
AIRLINE                   object
AIRCRAFT                  object
STATUS                    object
dtype: object
(390, 15)


## The Date is missing in the DataFrame. Create a new field DATETIME
### DATETIME is the planned departure time for that day

In [125]:
# Datum als Variable definieren
date_str = "2025-05-05"

In [126]:
# TIME als String sicherstellen (falls es z. B. float ist)
flightradar["TIME"] = flightradar["TIME"].astype(str)

# DATETIME-Feld erzeugen: 'date_str' + Zeit aus TIME-Feld
flightradar["DATETIME"] = date_str + " " + flightradar["TIME"]

# Kontrolle
print(flightradar[["TIME", "DATETIME"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

        TIME             DATETIME
0   05:45:00  2025-05-05 05:45:00
1   05:45:00  2025-05-05 05:45:00
2   05:45:00  2025-05-05 05:45:00
3   06:00:00  2025-05-05 06:00:00
4   06:00:00  2025-05-05 06:00:00
5   06:10:00  2025-05-05 06:10:00
6   06:10:00  2025-05-05 06:10:00
7   06:20:00  2025-05-05 06:20:00
8   06:20:00  2025-05-05 06:20:00
9   06:25:00  2025-05-05 06:25:00
10  06:25:00  2025-05-05 06:25:00
11  06:30:00  2025-05-05 06:30:00
12  06:40:00  2025-05-05 06:40:00
13  06:45:00  2025-05-05 06:45:00
14  06:45:00  2025-05-05 06:45:00
15  06:45:00  2025-05-05 06:45:00
16  06:45:00  2025-05-05 06:45:00
17  06:45:00  2025-05-05 06:45:00
18  06:50:00  2025-05-05 06:50:00
19  06:50:00  2025-05-05 06:50:00
20  06:55:00  2025-05-05 06:55:00
21  06:55:00  2025-05-05 06:55:00
22  06:55:00  2025-05-05 06:55:00
23  06:55:00  2025-05-05 06:55:00
24  07:00:00  2025-05-05 07:00:00
25  07:00:00  2025-05-05 07:00:00
26  07:05:00  2025-05-05 07:05:00
27  07:05:00  2025-05-05 07:05:00
28  07:05:00  

## Create two new fields: DESTINATION_CLEAN and IATA_CODE using regex

In [127]:
# DESTINATION_CLEAN: Alles vor der Klammer
flightradar["DESTINATION_CLEAN"] = flightradar["DESTINATION"].str.extract(r"^(.*?)\s*\(")

# IATA-CODE: Inhalt in der Klammer
flightradar["IATA_CODE"] = flightradar["DESTINATION"].str.extract(r"\((\w{3})\)")

# Kontrolle
print(flightradar[["DESTINATION", "DESTINATION_CLEAN", "IATA_CODE"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

                 DESTINATION  DESTINATION_CLEAN IATA_CODE
0             Larnaca (LCA)-            Larnaca       LCA
1            Pristina (PRN)-           Pristina       PRN
2            Pristina (PRN)-           Pristina       PRN
3            Hurghada (HRG)-           Hurghada       HRG
4           Heraklion (HER)-          Heraklion       HER
5             Antalya (AYT)-            Antalya       AYT
6        Gran Canaria (LPA)-       Gran Canaria       LPA
7              Lisbon (LIS)-             Lisbon       LIS
8            Tenerife (TFS)-           Tenerife       TFS
9           Marrakesh (RAK)-          Marrakesh       RAK
10              Olbia (OLB)-              Olbia       OLB
11           Pristina (PRN)-           Pristina       PRN
12              Ibiza (IBZ)-              Ibiza       IBZ
13             Bilbao (BIO)-             Bilbao       BIO
14            Funchal (FNC)-            Funchal       FNC
15           Hurghada (HRG)-           Hurghada       HRG
16           H

## Create three new fields: CANCELED, DEPART_TIME and DEPART_DATETIME using regex

In [128]:
# 1. CANCELED-Spalte (Boolean)
flightradar["CANCELED"] = flightradar["STATUS"].str.contains("Canceled", na=False)

# 2. DEPART_TIME-Spalte (nur wenn "Departed HH:MM" enthalten ist)
flightradar["DEPART_TIME"] = flightradar["STATUS"].str.extract(r"Departed\s*(\d{2}:\d{2})")[0]

# 3. DEPART_DATETIME-Spalte: date_str + DEPART_TIME (fallback "00:00") + ":00"
flightradar["DEPART_DATETIME"] = (
    date_str
    + " "
    + flightradar["DEPART_TIME"].fillna("00:00") # falls keine Zeit da ist, setze "00:00"
    + ":00" # Sekunden anhängen
)

# Kontrolle
print(flightradar[["STATUS","CANCELED","DEPART_TIME","DEPART_DATETIME"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

                   STATUS  CANCELED DEPART_TIME      DEPART_DATETIME
0          Departed 06:09     False       06:09  2025-05-05 06:09:00
1                 Unknown     False         NaN  2025-05-05 00:00:00
2          Departed 06:00     False       06:00  2025-05-05 06:00:00
3          Departed 06:07     False       06:07  2025-05-05 06:07:00
4          Departed 06:12     False       06:12  2025-05-05 06:12:00
5          Departed 06:21     False       06:21  2025-05-05 06:21:00
6          Departed 06:18     False       06:18  2025-05-05 06:18:00
7          Departed 06:48     False       06:48  2025-05-05 06:48:00
8          Departed 06:30     False       06:30  2025-05-05 06:30:00
9          Departed 06:33     False       06:33  2025-05-05 06:33:00
10         Departed 06:36     False       06:36  2025-05-05 06:36:00
11         Departed 06:42     False       06:42  2025-05-05 06:42:00
12         Departed 07:00     False       07:00  2025-05-05 07:00:00
13         Departed 06:57     Fals

## Creating the AIRLINE_CLEAN field and removing unimportant parts

In [129]:
flightradar["AIRLINE_CLEAN"] = (
    flightradar["AIRLINE"]
      .str.extract(r'^(.+?)(?=\s*(?:\(|-))')[0]  # Gruppe 1: alles bis vor "(" oder "-"
      .str.strip()                               # führende/trailing Leerzeichen entfernen
)

# Kontrolle
print(flightradar[["AIRLINE","AIRLINE_CLEAN"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

                                      AIRLINE     AIRLINE_CLEAN
0                             Edelweiss Air -     Edelweiss Air
1                                Flexflight -        Flexflight
2                            Chair Airlines -    Chair Airlines
3                             Edelweiss Air -     Edelweiss Air
4                             Edelweiss Air -     Edelweiss Air
5      Edelweiss Air (Help Alliance Livery) -     Edelweiss Air
6                             Edelweiss Air -     Edelweiss Air
7                          TAP Air Portugal -  TAP Air Portugal
8                             Edelweiss Air -     Edelweiss Air
9                             Edelweiss Air -     Edelweiss Air
10                            Edelweiss Air -     Edelweiss Air
11                            Edelweiss Air -     Edelweiss Air
12                                 Aeroways -          Aeroways
13                            Edelweiss Air -     Edelweiss Air
14                            Edelweiss 

## Creating the AIRCRAFT_SHORT field

In [130]:
# Create the AIRCRAFT_SHORT field by removing text inside parentheses
flightradar["AIRCRAFT_SHORT"] = (
    flightradar["AIRCRAFT"]
      .str.replace(r"\s*\(.*?\)", "", regex=True)  # non‑greedy, regex=True
      .str.strip()
)

# Check the result
print(flightradar[["AIRCRAFT", "AIRCRAFT_SHORT"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

         AIRCRAFT AIRCRAFT_SHORT
0   A320 (HB-IJV)           A320
1          320 ()            320
2   A320 (HB-JOS)           A320
3   A320 (HB-JJM)           A320
4   A320 (HB-JLR)           A320
5   A320 (HB-JLT)           A320
6   A320 (HB-IHX)           A320
7   A321 (CS-TJF)           A321
8   A320 (HB-JLP)           A320
9   A320 (HB-JJK)           A320
10  A320 (HB-IHY)           A320
11  A359 (HB-IHF)           A359
12  C25A (D-IBRN)           C25A
13  A320 (HB-JLS)           A320
14  A320 (HB-JJN)           A320
15         320 ()            320
16  A320 (HB-JOK)           A320
17  C56X (CS-DXO)           C56X
18  A320 (HB-IJJ)           A320
19  A320 (HB-IJM)           A320
20  E295 (HB-AZK)           E295
21  BCS3 (HB-JCU)           BCS3
22  BCS1 (HB-JBH)           BCS1
23  BCS3 (HB-JCT)           BCS3
24  B738 (PH-BCK)           B738
25  E290 (HB-AZC)           E290
26  A20N (HB-JDH)           A20N
27  E290 (HB-AZA)           E290
28  BCS1 (HB-JBE)           BCS1
29  A320 (

## Create a new field DELAY and DELAY_MINUTES and calculate it from DATETIME and DEPART_DATETIME

In [131]:
# 1) Stelle sicher, dass beide Spalten als datetime64 vorliegen
flightradar["DATETIME"] = pd.to_datetime(flightradar["DATETIME"], format="%Y-%m-%d %H:%M:%S", errors="coerce")
flightradar["DEPART_DATETIME"] = pd.to_datetime(flightradar["DEPART_DATETIME"], format="%Y-%m-%d %H:%M:%S", errors="coerce")

# 2) DELAY als Differenz berechnen
flightradar["DELAY"] = flightradar["DEPART_DATETIME"] - flightradar["DATETIME"]

# 3a) Variante A: Floor‑Division auf total_seconds, dann in int
flightradar["DELAY_MINUTES"] = (
    flightradar["DELAY"].dt.total_seconds() // 60
).astype(int)

# Kontrolle
print(flightradar[["DATETIME","DEPART_DATETIME","DELAY","DELAY_MINUTES"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

              DATETIME     DEPART_DATETIME             DELAY  DELAY_MINUTES
0  2025-05-05 05:45:00 2025-05-05 06:09:00   0 days 00:24:00             24
1  2025-05-05 05:45:00 2025-05-05 00:00:00 -1 days +18:15:00           -345
2  2025-05-05 05:45:00 2025-05-05 06:00:00   0 days 00:15:00             15
3  2025-05-05 06:00:00 2025-05-05 06:07:00   0 days 00:07:00              7
4  2025-05-05 06:00:00 2025-05-05 06:12:00   0 days 00:12:00             12
5  2025-05-05 06:10:00 2025-05-05 06:21:00   0 days 00:11:00             11
6  2025-05-05 06:10:00 2025-05-05 06:18:00   0 days 00:08:00              8
7  2025-05-05 06:20:00 2025-05-05 06:48:00   0 days 00:28:00             28
8  2025-05-05 06:20:00 2025-05-05 06:30:00   0 days 00:10:00             10
9  2025-05-05 06:25:00 2025-05-05 06:33:00   0 days 00:08:00              8
10 2025-05-05 06:25:00 2025-05-05 06:36:00   0 days 00:11:00             11
11 2025-05-05 06:30:00 2025-05-05 06:42:00   0 days 00:12:00             12
12 2025-05-0

## Showing full DataFrame

In [132]:
flightradar.head(3)

Unnamed: 0,web-scraper-order,web-scraper-start-url,page,page2,page3,page4,page5,page6,page7,TIME,...,DATETIME,DESTINATION_CLEAN,IATA_CODE,CANCELED,DEPART_TIME,DEPART_DATETIME,AIRLINE_CLEAN,AIRCRAFT_SHORT,DELAY,DELAY_MINUTES
0,1746512368-1,https://www.flightradar24.com/data/airports/zr...,,,,,,,,05:45:00,...,2025-05-05 05:45:00,Larnaca,LCA,False,06:09,2025-05-05 06:09:00,Edelweiss Air,A320,0 days 00:24:00,24
1,1746512368-2,https://www.flightradar24.com/data/airports/zr...,,,,,,,,05:45:00,...,2025-05-05 05:45:00,Pristina,PRN,False,,2025-05-05 00:00:00,Flexflight,320,-1 days +18:15:00,-345
2,1746512368-3,https://www.flightradar24.com/data/airports/zr...,,,,,,,,05:45:00,...,2025-05-05 05:45:00,Pristina,PRN,False,06:00,2025-05-05 06:00:00,Chair Airlines,A320,0 days 00:15:00,15


## Creating a New DataFrame with Selected Columns

In [133]:
# Create a new DataFrame with selected columns
flightradar_clean = flightradar[[
    "DESTINATION_CLEAN", 
    "IATA_CODE", 
    "AIRLINE_CLEAN", 
    "AIRCRAFT_SHORT", 
    "CANCELED", 
    "DATETIME",
    "DEPART_TIME", 
    "DEPART_DATETIME", 
    "DELAY_MINUTES"
]].copy()

# Check the result
print(flightradar_clean.head(50))
print("\n")
print(flightradar_clean.dtypes)
print(flightradar_clean.shape)

    DESTINATION_CLEAN IATA_CODE     AIRLINE_CLEAN AIRCRAFT_SHORT  CANCELED  \
0             Larnaca       LCA     Edelweiss Air           A320     False   
1            Pristina       PRN        Flexflight            320     False   
2            Pristina       PRN    Chair Airlines           A320     False   
3            Hurghada       HRG     Edelweiss Air           A320     False   
4           Heraklion       HER     Edelweiss Air           A320     False   
5             Antalya       AYT     Edelweiss Air           A320     False   
6        Gran Canaria       LPA     Edelweiss Air           A320     False   
7              Lisbon       LIS  TAP Air Portugal           A321     False   
8            Tenerife       TFS     Edelweiss Air           A320     False   
9           Marrakesh       RAK     Edelweiss Air           A320     False   
10              Olbia       OLB     Edelweiss Air           A320     False   
11           Pristina       PRN     Edelweiss Air           A359

## Count missing values in columns

In [134]:
# Count missing values per column
print(flightradar_clean.isnull().sum())

# Total number of missing values in the DataFrame
print("Total missing values:", flightradar_clean.isnull().sum().sum())

DESTINATION_CLEAN    0
IATA_CODE            0
AIRLINE_CLEAN        7
AIRCRAFT_SHORT       0
CANCELED             0
DATETIME             0
DEPART_TIME          7
DEPART_DATETIME      0
DELAY_MINUTES        0
dtype: int64
Total missing values: 14


## Show rows with missing values in the DataFrame

In [135]:
# Filter rows with any missing values
missing_rows = flightradar_clean[flightradar_clean.isnull().any(axis=1)]

# Display rows with missing values
print(missing_rows)


    DESTINATION_CLEAN IATA_CODE AIRLINE_CLEAN AIRCRAFT_SHORT  CANCELED  \
1            Pristina       PRN    Flexflight            320     False   
15           Hurghada       HRG    Flexflight            320     False   
30             London       LHR         Swiss           A20N     False   
55             Madrid       MAD           NaN           CL60     False   
108            Venice       TSF           NaN           BE20     False   
145          Tel Aviv       TLV         Swiss           A333      True   
210             Paris       LBG           NaN           E55P     False   
238             Olbia       OLB           NaN           PC12     False   
250              Riga       RIX           NaN           C680     False   
251            Rimini       RMI           NaN           P180     False   
314         Innsbruck       INN           NaN           CL60     False   
329          Helsinki       HEL       Finnair           A321      True   
345           Cologne       CGN     Eu

## Show the DataFrame without missing values

In [136]:
# Drop rows with any missing values (NaN)
flightradar_clean = flightradar_clean.dropna(axis=0)

# Display the DataFrame after dropping rows with missing values
print(flightradar_clean)


    DESTINATION_CLEAN IATA_CODE     AIRLINE_CLEAN AIRCRAFT_SHORT  CANCELED  \
0             Larnaca       LCA     Edelweiss Air           A320     False   
2            Pristina       PRN    Chair Airlines           A320     False   
3            Hurghada       HRG     Edelweiss Air           A320     False   
4           Heraklion       HER     Edelweiss Air           A320     False   
5             Antalya       AYT     Edelweiss Air           A320     False   
..                ...       ...               ...            ...       ...   
385      Johannesburg       JNB             Swiss           A343     False   
386         Sao Paulo       GRU             Swiss           B77W     False   
387         Hong Kong       HKG             Swiss           B77W     False   
388             Milan       MXP  Helvetic Airways           E190     False   
389            Geneva       GVA             Swiss           BCS1     False   

               DATETIME DEPART_TIME     DEPART_DATETIME  DELAY_

## Check for duplicate rows in the DataFrame and display them if any exist

In [137]:
# Duplikate prüfen
duplicates = flightradar_clean[flightradar_clean.duplicated()]

# Duplikate anzeigen (falls vorhanden)
print(duplicates)

# Anzahl der Duplikate
print(f"Number of duplicate rows: {duplicates.shape[0]}")

Empty DataFrame
Columns: [DESTINATION_CLEAN, IATA_CODE, AIRLINE_CLEAN, AIRCRAFT_SHORT, CANCELED, DATETIME, DEPART_TIME, DEPART_DATETIME, DELAY_MINUTES]
Index: []
Number of duplicate rows: 0


## Save the cleaned DataFrame to a CSV file for further use or analysis

In [138]:
# Speichern des DataFrames als CSV-Datei
flightradar_clean.to_csv("250505_flightradar_prepared.csv", index=False)
