# Data preparation
## Read and show Excel data for one day from Flightradar

In [123]:
import pandas as pd

In [124]:
# XLSX-Datei laden
flightradar = pd.read_excel("250502_flightradar.xlsx")

# Erste 5 Zeilen anzeigen
print(flightradar.head(5))  

  web-scraper-order                              web-scraper-start-url  page  \
0      1746256881-1  https://www.flightradar24.com/data/airports/zr...   NaN   
1      1746256881-2  https://www.flightradar24.com/data/airports/zr...   NaN   
2      1746256881-3  https://www.flightradar24.com/data/airports/zr...   NaN   
3      1746256881-4  https://www.flightradar24.com/data/airports/zr...   NaN   
4      1746256881-5  https://www.flightradar24.com/data/airports/zr...   NaN   

   page2  page3  page4  page5  page6  page7      TIME  FLIGHT  \
0    NaN    NaN    NaN    NaN    NaN    NaN  05:45:00   WK186   
1    NaN    NaN    NaN    NaN    NaN    NaN  05:45:00  W22160   
2    NaN    NaN    NaN    NaN    NaN    NaN  05:45:00   CS600   
3    NaN    NaN    NaN    NaN    NaN    NaN  06:00:00   WK130   
4    NaN    NaN    NaN    NaN    NaN    NaN  06:00:00   WK226   

       DESTINATION           AIRLINE       AIRCRAFT          STATUS  
0   Antalya (AYT)-   Edelweiss Air -  A359 (HB-IHF)  Depar

## Show data types and shape of the DataFrame

In [125]:
print(flightradar.dtypes)
print(flightradar.shape)  

web-scraper-order         object
web-scraper-start-url     object
page                     float64
page2                    float64
page3                    float64
page4                    float64
page5                    float64
page6                    float64
page7                    float64
TIME                      object
FLIGHT                    object
DESTINATION               object
AIRLINE                   object
AIRCRAFT                  object
STATUS                    object
dtype: object
(383, 15)


## The Date is missing in the DataFrame. Create a new field DATETIME
### DATETIME is the planned departure time for that day

In [126]:
# Datum als Variable definieren
date_str = "2025-05-02"

In [127]:
# TIME als String sicherstellen (falls es z. B. float ist)
flightradar["TIME"] = flightradar["TIME"].astype(str)

# DATETIME-Feld erzeugen: 'date_str' + Zeit aus TIME-Feld
flightradar["DATETIME"] = date_str + " " + flightradar["TIME"]

# Kontrolle
print(flightradar[["TIME", "DATETIME"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

        TIME             DATETIME
0   05:45:00  2025-05-02 05:45:00
1   05:45:00  2025-05-02 05:45:00
2   05:45:00  2025-05-02 05:45:00
3   06:00:00  2025-05-02 06:00:00
4   06:00:00  2025-05-02 06:00:00
5   06:10:00  2025-05-02 06:10:00
6   06:10:00  2025-05-02 06:10:00
7   06:20:00  2025-05-02 06:20:00
8   06:20:00  2025-05-02 06:20:00
9   06:20:00  2025-05-02 06:20:00
10  06:25:00  2025-05-02 06:25:00
11  06:25:00  2025-05-02 06:25:00
12  06:25:00  2025-05-02 06:25:00
13  06:30:00  2025-05-02 06:30:00
14  06:30:00  2025-05-02 06:30:00
15  06:30:00  2025-05-02 06:30:00
16  06:30:00  2025-05-02 06:30:00
17  06:35:00  2025-05-02 06:35:00
18  06:45:00  2025-05-02 06:45:00
19  06:45:00  2025-05-02 06:45:00
20  06:50:00  2025-05-02 06:50:00
21  06:50:00  2025-05-02 06:50:00
22  06:55:00  2025-05-02 06:55:00
23  06:55:00  2025-05-02 06:55:00
24  06:55:00  2025-05-02 06:55:00
25  06:55:00  2025-05-02 06:55:00
26  06:55:00  2025-05-02 06:55:00
27  07:00:00  2025-05-02 07:00:00
28  07:00:00  

## Create two new fields: DESTINATION_CLEAN and IATA_CODE using regex

In [128]:
# DESTINATION_CLEAN: Alles vor der Klammer
flightradar["DESTINATION_CLEAN"] = flightradar["DESTINATION"].str.extract(r"^(.*?)\s*\(")

# IATA-CODE: Inhalt in der Klammer
flightradar["IATA_CODE"] = flightradar["DESTINATION"].str.extract(r"\((\w{3})\)")

# Kontrolle
print(flightradar[["DESTINATION", "DESTINATION_CLEAN", "IATA_CODE"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

                 DESTINATION  DESTINATION_CLEAN IATA_CODE
0             Antalya (AYT)-            Antalya       AYT
1            Pristina (PRN)-           Pristina       PRN
2            Pristina (PRN)-           Pristina       PRN
3            Hurghada (HRG)-           Hurghada       HRG
4              Bilbao (BIO)-             Bilbao       BIO
5              Lisbon (LIS)-             Lisbon       LIS
6            Pristina (PRN)-           Pristina       PRN
7               Porto (OPO)-              Porto       OPO
8     Sharm el-Sheikh (SSH)-    Sharm el-Sheikh       SSH
9             Seville (SVQ)-            Seville       SVQ
10       Gran Canaria (LPA)-       Gran Canaria       LPA
11            Funchal (FNC)-            Funchal       FNC
12            Larnaca (LCA)-            Larnaca       LCA
13             Dublin (DUB)-             Dublin       DUB
14           Tenerife (TFS)-           Tenerife       TFS
15           Hurghada (HRG)-           Hurghada       HRG
16           H

## Create three new fields: CANCELED, DEPART_TIME and DEPART_DATETIME using regex

In [129]:
# 1. CANCELED-Spalte (Boolean)
flightradar["CANCELED"] = flightradar["STATUS"].str.contains("Canceled", na=False)

# 2. DEPART_TIME-Spalte (nur wenn "Departed HH:MM" enthalten ist)
flightradar["DEPART_TIME"] = flightradar["STATUS"].str.extract(r"Departed\s*(\d{2}:\d{2})")[0]

# 3. DEPART_DATETIME-Spalte: date_str + DEPART_TIME (fallback "00:00") + ":00"
flightradar["DEPART_DATETIME"] = (
    date_str
    + " "
    + flightradar["DEPART_TIME"].fillna("00:00") # falls keine Zeit da ist, setze "00:00"
    + ":00" # Sekunden anhängen
)

# Kontrolle
print(flightradar[["STATUS","CANCELED","DEPART_TIME","DEPART_DATETIME"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

                   STATUS  CANCELED DEPART_TIME      DEPART_DATETIME
0          Departed 06:07     False       06:07  2025-05-02 06:07:00
1                 Unknown     False         NaN  2025-05-02 00:00:00
2          Departed 06:11     False       06:11  2025-05-02 06:11:00
3          Departed 06:13     False       06:13  2025-05-02 06:13:00
4          Departed 06:28     False       06:28  2025-05-02 06:28:00
5          Departed 06:41     False       06:41  2025-05-02 06:41:00
6          Departed 06:50     False       06:50  2025-05-02 06:50:00
7          Departed 06:52     False       06:52  2025-05-02 06:52:00
8          Departed 06:30     False       06:30  2025-05-02 06:30:00
9          Departed 06:35     False       06:35  2025-05-02 06:35:00
10         Departed 07:01     False       07:01  2025-05-02 07:01:00
11         Departed 06:56     False       06:56  2025-05-02 06:56:00
12         Departed 06:55     False       06:55  2025-05-02 06:55:00
13         Departed 06:38     Fals

## Creating the AIRLINE_CLEAN field and removing unimportant parts

In [130]:
flightradar["AIRLINE_CLEAN"] = (
    flightradar["AIRLINE"]
      .str.extract(r'^(.+?)(?=\s*(?:\(|-))')[0]  # Gruppe 1: alles bis vor "(" oder "-"
      .str.strip()                               # führende/trailing Leerzeichen entfernen
)

# Kontrolle
print(flightradar[["AIRLINE","AIRLINE_CLEAN"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

                                   AIRLINE     AIRLINE_CLEAN
0                          Edelweiss Air -     Edelweiss Air
1                             Flexflight -        Flexflight
2                         Chair Airlines -    Chair Airlines
3                          Edelweiss Air -     Edelweiss Air
4                          Edelweiss Air -     Edelweiss Air
5                       TAP Air Portugal -  TAP Air Portugal
6                          Edelweiss Air -     Edelweiss Air
7                                  Swiss -             Swiss
8                          Edelweiss Air -     Edelweiss Air
9                          Edelweiss Air -     Edelweiss Air
10                         Edelweiss Air -     Edelweiss Air
11                         Edelweiss Air -     Edelweiss Air
12                         Edelweiss Air -     Edelweiss Air
13                      Helvetic Airways -  Helvetic Airways
14                         Edelweiss Air -     Edelweiss Air
15                      

## Creating the AIRCRAFT_SHORT field

In [131]:
# Create the AIRCRAFT_SHORT field by removing text inside parentheses
flightradar["AIRCRAFT_SHORT"] = (
    flightradar["AIRCRAFT"]
      .str.replace(r"\s*\(.*?\)", "", regex=True)  # non‑greedy, regex=True
      .str.strip()
)

# Check the result
print(flightradar[["AIRCRAFT", "AIRCRAFT_SHORT"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

         AIRCRAFT AIRCRAFT_SHORT
0   A359 (HB-IHF)           A359
1          320 ()            320
2   A320 (HB-JOS)           A320
3   A320 (HB-JJN)           A320
4   A320 (HB-IHY)           A320
5   A20N (CS-TVO)           A20N
6   A320 (HB-IJW)           A320
7   BCS3 (HB-JCU)           BCS3
8   A320 (HB-JLR)           A320
9   A320 (HB-IHZ)           A320
10  A320 (HB-JJM)           A320
11  A320 (HB-JLS)           A320
12  A320 (HB-IHX)           A320
13  E290 (HB-AZG)           E290
14  A320 (HB-IJU)           A320
15         320 ()            320
16  A320 (HB-JOK)           A320
17  A320 (HB-IJV)           A320
18  A321 (HB-ION)           A321
19  A320 (HB-JLP)           A320
20  BCS3 (HB-JCB)           BCS3
21  E190 (HB-JVM)           E190
22  E190 (HB-JVX)           E190
23  BCS3 (HB-JCF)           BCS3
24  BCS3 (YL-AAW)           BCS3
25  E295 (HB-AZI)           E295
26  BCS3 (HB-JCF)           BCS3
27  B738 (PH-BXY)           B738
28  E290 (HB-AZC)           E290
29  A320 (

## Create a new field DELAY and DELAY_MINUTES and calculate it from DATETIME and DEPART_DATETIME

In [132]:
# 1) Stelle sicher, dass beide Spalten als datetime64 vorliegen
flightradar["DATETIME"] = pd.to_datetime(flightradar["DATETIME"], format="%Y-%m-%d %H:%M:%S", errors="coerce")
flightradar["DEPART_DATETIME"] = pd.to_datetime(flightradar["DEPART_DATETIME"], format="%Y-%m-%d %H:%M:%S", errors="coerce")

# 2) DELAY als Differenz berechnen
flightradar["DELAY"] = flightradar["DEPART_DATETIME"] - flightradar["DATETIME"]

# 3a) Variante A: Floor‑Division auf total_seconds, dann in int
flightradar["DELAY_MINUTES"] = (
    flightradar["DELAY"].dt.total_seconds() // 60
).astype(int)

# Kontrolle
print(flightradar[["DATETIME","DEPART_DATETIME","DELAY","DELAY_MINUTES"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

              DATETIME     DEPART_DATETIME             DELAY  DELAY_MINUTES
0  2025-05-02 05:45:00 2025-05-02 06:07:00   0 days 00:22:00             22
1  2025-05-02 05:45:00 2025-05-02 00:00:00 -1 days +18:15:00           -345
2  2025-05-02 05:45:00 2025-05-02 06:11:00   0 days 00:26:00             26
3  2025-05-02 06:00:00 2025-05-02 06:13:00   0 days 00:13:00             13
4  2025-05-02 06:00:00 2025-05-02 06:28:00   0 days 00:28:00             28
5  2025-05-02 06:10:00 2025-05-02 06:41:00   0 days 00:31:00             31
6  2025-05-02 06:10:00 2025-05-02 06:50:00   0 days 00:40:00             40
7  2025-05-02 06:20:00 2025-05-02 06:52:00   0 days 00:32:00             32
8  2025-05-02 06:20:00 2025-05-02 06:30:00   0 days 00:10:00             10
9  2025-05-02 06:20:00 2025-05-02 06:35:00   0 days 00:15:00             15
10 2025-05-02 06:25:00 2025-05-02 07:01:00   0 days 00:36:00             36
11 2025-05-02 06:25:00 2025-05-02 06:56:00   0 days 00:31:00             31
12 2025-05-0

## Showing full DataFrame

In [133]:
flightradar.head(3)

Unnamed: 0,web-scraper-order,web-scraper-start-url,page,page2,page3,page4,page5,page6,page7,TIME,...,DATETIME,DESTINATION_CLEAN,IATA_CODE,CANCELED,DEPART_TIME,DEPART_DATETIME,AIRLINE_CLEAN,AIRCRAFT_SHORT,DELAY,DELAY_MINUTES
0,1746256881-1,https://www.flightradar24.com/data/airports/zr...,,,,,,,,05:45:00,...,2025-05-02 05:45:00,Antalya,AYT,False,06:07,2025-05-02 06:07:00,Edelweiss Air,A359,0 days 00:22:00,22
1,1746256881-2,https://www.flightradar24.com/data/airports/zr...,,,,,,,,05:45:00,...,2025-05-02 05:45:00,Pristina,PRN,False,,2025-05-02 00:00:00,Flexflight,320,-1 days +18:15:00,-345
2,1746256881-3,https://www.flightradar24.com/data/airports/zr...,,,,,,,,05:45:00,...,2025-05-02 05:45:00,Pristina,PRN,False,06:11,2025-05-02 06:11:00,Chair Airlines,A320,0 days 00:26:00,26


## Creating a New DataFrame with Selected Columns

In [134]:
# Create a new DataFrame with selected columns
flightradar_clean = flightradar[[
    "DESTINATION_CLEAN", 
    "IATA_CODE", 
    "AIRLINE_CLEAN", 
    "AIRCRAFT_SHORT", 
    "CANCELED", 
    "DATETIME",
    "DEPART_TIME", 
    "DEPART_DATETIME", 
    "DELAY_MINUTES"
]].copy()

# Check the result
print(flightradar_clean.head(50))
print("\n")
print(flightradar_clean.dtypes)
print(flightradar_clean.shape)

    DESTINATION_CLEAN IATA_CODE     AIRLINE_CLEAN AIRCRAFT_SHORT  CANCELED  \
0             Antalya       AYT     Edelweiss Air           A359     False   
1            Pristina       PRN        Flexflight            320     False   
2            Pristina       PRN    Chair Airlines           A320     False   
3            Hurghada       HRG     Edelweiss Air           A320     False   
4              Bilbao       BIO     Edelweiss Air           A320     False   
5              Lisbon       LIS  TAP Air Portugal           A20N     False   
6            Pristina       PRN     Edelweiss Air           A320     False   
7               Porto       OPO             Swiss           BCS3     False   
8     Sharm el-Sheikh       SSH     Edelweiss Air           A320     False   
9             Seville       SVQ     Edelweiss Air           A320     False   
10       Gran Canaria       LPA     Edelweiss Air           A320     False   
11            Funchal       FNC     Edelweiss Air           A320

## Count missing values in columns

In [135]:
# Count missing values per column
print(flightradar_clean.isnull().sum())

# Total number of missing values in the DataFrame
print("Total missing values:", flightradar_clean.isnull().sum().sum())

DESTINATION_CLEAN     0
IATA_CODE             0
AIRLINE_CLEAN         3
AIRCRAFT_SHORT        0
CANCELED              0
DATETIME              0
DEPART_TIME          10
DEPART_DATETIME       0
DELAY_MINUTES         0
dtype: int64
Total missing values: 13


## Show rows with missing values in the DataFrame

In [136]:
# Filter rows with any missing values
missing_rows = flightradar_clean[flightradar_clean.isnull().any(axis=1)]

# Display rows with missing values
print(missing_rows)


     DESTINATION_CLEAN IATA_CODE     AIRLINE_CLEAN AIRCRAFT_SHORT  CANCELED  \
1             Pristina       PRN        Flexflight            320     False   
15            Hurghada       HRG        Flexflight            320     False   
23            Belgrade       BEG             Swiss           BCS3     False   
53           Heraklion       HER        Flexflight            319     False   
63             Hamburg       HAM             Swiss           BCS3      True   
83            Istanbul       IST  Turkish Airlines            310      True   
113           Portoroz       POW               NaN           M600     False   
119             Geneva       GVA               NaN           SR22     False   
125           Helsinki       HEL           Finnair            32B      True   
129             Dublin       DUB             Swiss           A20N     False   
142  Palma de Mallorca       PMI    Chair Airlines           A320     False   
232             Odense       ODE               NaN  

## Show the DataFrame without missing values

In [137]:
# Drop rows with any missing values (NaN)
flightradar_clean = flightradar_clean.dropna(axis=0)

# Display the DataFrame after dropping rows with missing values
print(flightradar_clean)


    DESTINATION_CLEAN IATA_CODE     AIRLINE_CLEAN AIRCRAFT_SHORT  CANCELED  \
0             Antalya       AYT     Edelweiss Air           A359     False   
2            Pristina       PRN    Chair Airlines           A320     False   
3            Hurghada       HRG     Edelweiss Air           A320     False   
4              Bilbao       BIO     Edelweiss Air           A320     False   
5              Lisbon       LIS  TAP Air Portugal           A20N     False   
..                ...       ...               ...            ...       ...   
378         Sao Paulo       GRU             Swiss           B77W     False   
379         Hong Kong       HKG             Swiss           B77W     False   
380         Singapore       SIN             Swiss           B77W     False   
381      Johannesburg       JNB             Swiss           A343     False   
382             Milan       MXP  Helvetic Airways           E190     False   

               DATETIME DEPART_TIME     DEPART_DATETIME  DELAY_

## Check for duplicate rows in the DataFrame and display them if any exist

In [138]:
# Duplikate prüfen
duplicates = flightradar_clean[flightradar_clean.duplicated()]

# Duplikate anzeigen (falls vorhanden)
print(duplicates)

# Anzahl der Duplikate
print(f"Number of duplicate rows: {duplicates.shape[0]}")

Empty DataFrame
Columns: [DESTINATION_CLEAN, IATA_CODE, AIRLINE_CLEAN, AIRCRAFT_SHORT, CANCELED, DATETIME, DEPART_TIME, DEPART_DATETIME, DELAY_MINUTES]
Index: []
Number of duplicate rows: 0


## Save the cleaned DataFrame to a CSV file for further use or analysis

In [139]:
# Speichern des DataFrames als CSV-Datei
flightradar_clean.to_csv("250502_flightradar_prepared.csv", index=False)
