In [68]:
import pandas as pd

# Data preparation
## Read and show Excel data for one day from Flightradar

In [69]:
# XLSX-Datei laden
flightradar = pd.read_excel("20250428_flightradar.xlsx")

# Erste 5 Zeilen anzeigen
print(flightradar.head(5))  

  web-scraper-order                              web-scraper-start-url  page  \
0      1745937263-1  https://www.flightradar24.com/data/airports/zr...   NaN   
1      1745937263-2  https://www.flightradar24.com/data/airports/zr...   NaN   
2      1745937263-3  https://www.flightradar24.com/data/airports/zr...   NaN   
3      1745937263-4  https://www.flightradar24.com/data/airports/zr...   NaN   
4      1745937263-5  https://www.flightradar24.com/data/airports/zr...   NaN   

   page2  page3  page4  page5  page6  page7      TIME  FLIGHT  \
0    NaN    NaN    NaN    NaN    NaN    NaN  05:45:00   WK370   
1    NaN    NaN    NaN    NaN    NaN    NaN  05:45:00  W22160   
2    NaN    NaN    NaN    NaN    NaN    NaN  05:45:00   GM600   
3    NaN    NaN    NaN    NaN    NaN    NaN  06:00:00   WK134   
4    NaN    NaN    NaN    NaN    NaN    NaN  06:00:00   WK348   

        DESTINATION           AIRLINE       AIRCRAFT          STATUS  
0    Larnaca (LCA)-   Edelweiss Air -  A320 (HB-JJM)  Dep

## Show data types and shape of the DataFrame.

In [70]:
print(flightradar.dtypes)
print(flightradar.shape)  

web-scraper-order         object
web-scraper-start-url     object
page                     float64
page2                    float64
page3                    float64
page4                    float64
page5                    float64
page6                    float64
page7                    float64
TIME                      object
FLIGHT                    object
DESTINATION               object
AIRLINE                   object
AIRCRAFT                  object
STATUS                    object
dtype: object
(400, 15)


## The Date is missing in the DataFrame. Create a new field DATETIME.
### DATETIME is the planned departure time for that day.

In [71]:
# Datum als Variable definieren
date_str = "2025-04-28"

In [72]:
# TIME als String sicherstellen (falls es z. B. float ist)
flightradar["TIME"] = flightradar["TIME"].astype(str)

# DATETIME-Feld erzeugen: 'date_str' + Zeit aus TIME-Feld
flightradar["DATETIME"] = date_str + " " + flightradar["TIME"]

# Kontrolle
print(flightradar[["TIME", "DATETIME"]].head())
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

       TIME             DATETIME
0  05:45:00  2025-04-28 05:45:00
1  05:45:00  2025-04-28 05:45:00
2  05:45:00  2025-04-28 05:45:00
3  06:00:00  2025-04-28 06:00:00
4  06:00:00  2025-04-28 06:00:00


web-scraper-order         object
web-scraper-start-url     object
page                     float64
page2                    float64
page3                    float64
page4                    float64
page5                    float64
page6                    float64
page7                    float64
TIME                      object
FLIGHT                    object
DESTINATION               object
AIRLINE                   object
AIRCRAFT                  object
STATUS                    object
DATETIME                  object
dtype: object
(400, 16)


## Create two new fields: DESTINATION_CLEAN and IATA_CODE using regex.

In [73]:
# DESTINATION_CLEAN: Alles vor der Klammer
flightradar["DESTINATION_CLEAN"] = flightradar["DESTINATION"].str.extract(r"^(.*?)\s*\(")

# IATA-CODE: Inhalt in der Klammer
flightradar["IATA_CODE"] = flightradar["DESTINATION"].str.extract(r"\((\w{3})\)")

# Kontrolle
print(flightradar[["DESTINATION", "DESTINATION_CLEAN", "IATA_CODE"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

                 DESTINATION  DESTINATION_CLEAN IATA_CODE
0             Larnaca (LCA)-            Larnaca       LCA
1            Pristina (PRN)-           Pristina       PRN
2            Pristina (PRN)-           Pristina       PRN
3            Hurghada (HRG)-           Hurghada       HRG
4           Heraklion (HER)-          Heraklion       HER
5        Gran Canaria (LPA)-       Gran Canaria       LPA
6             Antalya (AYT)-            Antalya       AYT
7              Lisbon (LIS)-             Lisbon       LIS
8            Tenerife (TFS)-           Tenerife       TFS
9           Marrakesh (RAK)-          Marrakesh       RAK
10              Olbia (OLB)-              Olbia       OLB
11           Pristina (PRN)-           Pristina       PRN
12               Oslo (OSL)-               Oslo       OSL
13   Oberpfaffenhofen (OBF)-   Oberpfaffenhofen       OBF
14             Bilbao (BIO)-             Bilbao       BIO
15            Funchal (FNC)-            Funchal       FNC
16            

## Create three new fields: CANCELED, DEPART_TIME and DEPART_DATETIME using regex.

In [74]:
# 1. CANCELED-Spalte (Boolean)
flightradar["CANCELED"] = flightradar["STATUS"].str.contains("Canceled", na=False)

# 2. DEPART_TIME-Spalte (nur wenn "Departed HH:MM" enthalten ist)
flightradar["DEPART_TIME"] = flightradar["STATUS"].str.extract(r"Departed\s*(\d{2}:\d{2})")[0]

# 3. DEPART_DATETIME-Spalte: date_str + DEPART_TIME (fallback "00:00") + ":00"
flightradar["DEPART_DATETIME"] = (
    date_str
    + " "
    + flightradar["DEPART_TIME"].fillna("00:00") # falls keine Zeit da ist, setze "00:00"
    + ":00" # Sekunden anhängen
)

# Kontrolle
print(flightradar[["STATUS","CANCELED","DEPART_TIME","DEPART_DATETIME"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

                   STATUS  CANCELED DEPART_TIME      DEPART_DATETIME
0          Departed 06:02     False       06:02  2025-04-28 06:02:00
1                 Unknown     False         NaN  2025-04-28 00:00:00
2          Departed 06:00     False       06:00  2025-04-28 06:00:00
3          Departed 06:11     False       06:11  2025-04-28 06:11:00
4          Departed 06:09     False       06:09  2025-04-28 06:09:00
5          Departed 06:40     False       06:40  2025-04-28 06:40:00
6          Departed 06:20     False       06:20  2025-04-28 06:20:00
7          Departed 06:47     False       06:47  2025-04-28 06:47:00
8          Departed 06:43     False       06:43  2025-04-28 06:43:00
9          Departed 06:35     False       06:35  2025-04-28 06:35:00
10         Departed 06:45     False       06:45  2025-04-28 06:45:00
11         Departed 06:38     False       06:38  2025-04-28 06:38:00
12         Departed 06:42     False       06:42  2025-04-28 06:42:00
13         Departed 06:49     Fals

## Creating the AIRLINE_CLEAN field and removing unimportant parts

In [75]:
flightradar["AIRLINE_CLEAN"] = (
    flightradar["AIRLINE"]
      .str.extract(r'^(.+?)(?=\s*(?:\(|-))')[0]  # Gruppe 1: alles bis vor "(" oder "-"
      .str.strip()                               # führende/trailing Leerzeichen entfernen
)

# Kontrolle
print(flightradar[["AIRLINE","AIRLINE_CLEAN"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

                                      AIRLINE     AIRLINE_CLEAN
0                             Edelweiss Air -     Edelweiss Air
1                                Flexflight -        Flexflight
2                            Chair Airlines -    Chair Airlines
3                             Edelweiss Air -     Edelweiss Air
4                             Edelweiss Air -     Edelweiss Air
5                             Edelweiss Air -     Edelweiss Air
6                             Edelweiss Air -     Edelweiss Air
7                          TAP Air Portugal -  TAP Air Portugal
8                             Edelweiss Air -     Edelweiss Air
9                             Edelweiss Air -     Edelweiss Air
10                            Edelweiss Air -     Edelweiss Air
11                            Edelweiss Air -     Edelweiss Air
12                           NetJets Europe -    NetJets Europe
13                           NetJets Europe -    NetJets Europe
14                            Edelweiss 

## Creating the AIRCRAFT_SHORT field

In [76]:
# Create the AIRCRAFT_SHORT field by removing text inside parentheses
flightradar["AIRCRAFT_SHORT"] = (
    flightradar["AIRCRAFT"]
      .str.replace(r"\s*\(.*?\)", "", regex=True)  # non‑greedy, regex=True
      .str.strip()
)

# Check the result
print(flightradar[["AIRCRAFT", "AIRCRAFT_SHORT"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

         AIRCRAFT AIRCRAFT_SHORT
0   A320 (HB-JJM)           A320
1          320 ()            320
2   A319 (HB-JOJ)           A319
3   A320 (HB-IHZ)           A320
4   A320 (HB-JLR)           A320
5   A320 (HB-JJL)           A320
6   A320 (HB-JLS)           A320
7   A21N (CS-TJM)           A21N
8   A320 (HB-IJV)           A320
9   A320 (HB-IHX)           A320
10  A320 (HB-JJK)           A320
11  A359 (HB-IHF)           A359
12  E55P (CS-PJD)           E55P
13  C68A (CS-LTV)           C68A
14  A320 (HB-JLP)           A320
15  A320 (HB-IJW)           A320
16  A320 (HB-JJM)           A320
17         320 ()            320
18  A319 (HB-JOJ)           A319
19  A320 (HB-IHZ)           A320
20  A320 (HB-JLR)           A320
21  A320 (HB-JJL)           A320
22  A320 (HB-JLS)           A320
23  A21N (CS-TJM)           A21N
24  A320 (HB-IJV)           A320
25  A320 (HB-IHX)           A320
26  A320 (HB-JJK)           A320
27  A359 (HB-IHF)           A359
28  E55P (CS-PJD)           E55P
29  C68A (

## Create a new field DELAY and DELAY_MINUTES and calculate it from DATETIME and DEPART_DATETIME

In [77]:
# 1) Stelle sicher, dass beide Spalten als datetime64 vorliegen
flightradar["DATETIME"] = pd.to_datetime(flightradar["DATETIME"], format="%Y-%m-%d %H:%M:%S", errors="coerce")
flightradar["DEPART_DATETIME"] = pd.to_datetime(flightradar["DEPART_DATETIME"], format="%Y-%m-%d %H:%M:%S", errors="coerce")

# 2) DELAY als Differenz berechnen
flightradar["DELAY"] = flightradar["DEPART_DATETIME"] - flightradar["DATETIME"]

# 3a) Variante A: Floor‑Division auf total_seconds, dann in int
flightradar["DELAY_MINUTES"] = (
    flightradar["DELAY"].dt.total_seconds() // 60
).astype(int)

# Kontrolle
print(flightradar[["DATETIME","DEPART_DATETIME","DELAY","DELAY_MINUTES"]].head(50))
print("\n")
print(flightradar.dtypes)
print(flightradar.shape)

              DATETIME     DEPART_DATETIME             DELAY  DELAY_MINUTES
0  2025-04-28 05:45:00 2025-04-28 06:02:00   0 days 00:17:00             17
1  2025-04-28 05:45:00 2025-04-28 00:00:00 -1 days +18:15:00           -345
2  2025-04-28 05:45:00 2025-04-28 06:00:00   0 days 00:15:00             15
3  2025-04-28 06:00:00 2025-04-28 06:11:00   0 days 00:11:00             11
4  2025-04-28 06:00:00 2025-04-28 06:09:00   0 days 00:09:00              9
5  2025-04-28 06:10:00 2025-04-28 06:40:00   0 days 00:30:00             30
6  2025-04-28 06:10:00 2025-04-28 06:20:00   0 days 00:10:00             10
7  2025-04-28 06:20:00 2025-04-28 06:47:00   0 days 00:27:00             27
8  2025-04-28 06:20:00 2025-04-28 06:43:00   0 days 00:23:00             23
9  2025-04-28 06:25:00 2025-04-28 06:35:00   0 days 00:10:00             10
10 2025-04-28 06:25:00 2025-04-28 06:45:00   0 days 00:20:00             20
11 2025-04-28 06:30:00 2025-04-28 06:38:00   0 days 00:08:00              8
12 2025-04-2

## Showing full DataFrame

In [78]:
flightradar.head(3)

Unnamed: 0,web-scraper-order,web-scraper-start-url,page,page2,page3,page4,page5,page6,page7,TIME,...,DATETIME,DESTINATION_CLEAN,IATA_CODE,CANCELED,DEPART_TIME,DEPART_DATETIME,AIRLINE_CLEAN,AIRCRAFT_SHORT,DELAY,DELAY_MINUTES
0,1745937263-1,https://www.flightradar24.com/data/airports/zr...,,,,,,,,05:45:00,...,2025-04-28 05:45:00,Larnaca,LCA,False,06:02,2025-04-28 06:02:00,Edelweiss Air,A320,0 days 00:17:00,17
1,1745937263-2,https://www.flightradar24.com/data/airports/zr...,,,,,,,,05:45:00,...,2025-04-28 05:45:00,Pristina,PRN,False,,2025-04-28 00:00:00,Flexflight,320,-1 days +18:15:00,-345
2,1745937263-3,https://www.flightradar24.com/data/airports/zr...,,,,,,,,05:45:00,...,2025-04-28 05:45:00,Pristina,PRN,False,06:00,2025-04-28 06:00:00,Chair Airlines,A319,0 days 00:15:00,15


## Creating a New DataFrame with Selected Columns

In [79]:
# Create a new DataFrame with selected columns
flightradar_clean = flightradar[[
    "DESTINATION_CLEAN", 
    "IATA_CODE", 
    "AIRLINE_CLEAN", 
    "AIRCRAFT_SHORT", 
    "CANCELED", 
    "DATETIME",
    "DEPART_TIME", 
    "DEPART_DATETIME", 
    "DELAY_MINUTES"
]].copy()

# Check the result
print(flightradar_clean.head(50))
print("\n")
print(flightradar_clean.dtypes)
print(flightradar_clean.shape)

    DESTINATION_CLEAN IATA_CODE     AIRLINE_CLEAN AIRCRAFT_SHORT  CANCELED  \
0             Larnaca       LCA     Edelweiss Air           A320     False   
1            Pristina       PRN        Flexflight            320     False   
2            Pristina       PRN    Chair Airlines           A319     False   
3            Hurghada       HRG     Edelweiss Air           A320     False   
4           Heraklion       HER     Edelweiss Air           A320     False   
5        Gran Canaria       LPA     Edelweiss Air           A320     False   
6             Antalya       AYT     Edelweiss Air           A320     False   
7              Lisbon       LIS  TAP Air Portugal           A21N     False   
8            Tenerife       TFS     Edelweiss Air           A320     False   
9           Marrakesh       RAK     Edelweiss Air           A320     False   
10              Olbia       OLB     Edelweiss Air           A320     False   
11           Pristina       PRN     Edelweiss Air           A359

In [80]:
# Count missing values per column
print(flightradar_clean.isnull().sum())

# Total number of missing values in the DataFrame
print("Total missing values:", flightradar_clean.isnull().sum().sum())

DESTINATION_CLEAN     0
IATA_CODE             0
AIRLINE_CLEAN         5
AIRCRAFT_SHORT        0
CANCELED              0
DATETIME              0
DEPART_TIME          10
DEPART_DATETIME       0
DELAY_MINUTES         0
dtype: int64
Total missing values: 15


In [81]:
# Display columns with missing values
missing_values = flightradar_clean.isnull().sum()
missing_values = missing_values[missing_values > 0]

# Show the missing values
print("Columns with missing values:\n", missing_values)


Columns with missing values:
 AIRLINE_CLEAN     5
DEPART_TIME      10
dtype: int64


In [82]:
# Filter rows with any missing values
missing_rows = flightradar_clean[flightradar_clean.isnull().any(axis=1)]

# Display rows with missing values
print(missing_rows)


    DESTINATION_CLEAN IATA_CODE     AIRLINE_CLEAN AIRCRAFT_SHORT  CANCELED  \
1            Pristina       PRN        Flexflight            320     False   
17           Pristina       PRN        Flexflight            320     False   
32           Hurghada       HRG        Flexflight            320     False   
47             London       LHR             Swiss           A21N     False   
82             Speyer       QCS               NaN           PA34     False   
144             Genoa       GOA               NaN           CL60     False   
209            Lisbon       LIS             Swiss           A21N     False   
224            Lisbon       LIS  TAP Air Portugal           A20N      True   
232            London       BQH               NaN           C25C     False   
238            Munich       MUC               NaN           C525     False   
265            Cannes       CEQ               NaN           SF50     False   
324            Lisbon       LIS  TAP Air Portugal            32N

In [83]:
# Drop rows with any missing values (NaN)
flightradar_clean = flightradar_clean.dropna(axis=0)

# Display the DataFrame after dropping rows with missing values
print(flightradar_clean)


    DESTINATION_CLEAN IATA_CODE     AIRLINE_CLEAN AIRCRAFT_SHORT  CANCELED  \
0             Larnaca       LCA     Edelweiss Air           A320     False   
2            Pristina       PRN    Chair Airlines           A319     False   
3            Hurghada       HRG     Edelweiss Air           A320     False   
4           Heraklion       HER     Edelweiss Air           A320     False   
5        Gran Canaria       LPA     Edelweiss Air           A320     False   
..                ...       ...               ...            ...       ...   
395         Hong Kong       HKG             Swiss           B77W     False   
396         Singapore       SIN             Swiss           B77W     False   
397      Johannesburg       JNB             Swiss           A343     False   
398             Milan       MXP  Helvetic Airways           E190     False   
399            Geneva       GVA             Swiss           BCS1     False   

               DATETIME DEPART_TIME     DEPART_DATETIME  DELAY_

# WETTER DATEN

In [43]:
# CSV‑Datei laden (im gleichen Verzeichnis wie dein Notebook)
weather = pd.read_csv("weather.csv")

# Erstmal Überblick verschaffen
print(weather.head())        # erste 5 Zeilen
print(weather.dtypes)        # Datentypen der Spalten
print(weather.shape)         # (Anzahl Zeilen, Anzahl Spalten)

                  time  temp  dwpt  rhum  prcp  snow  wdir  wspd  wpgt  \
0  2025-04-28 00:00:00   9.9   8.3    90     0   NaN   293     2  15.0   
1  2025-04-28 01:00:00   8.4   8.0    97     0   NaN   276     5   9.0   
2  2025-04-28 02:00:00   8.2   7.8    97     0   NaN   283     7  13.0   
3  2025-04-28 03:00:00   7.8   7.0    95     0   NaN   304     3   7.0   
4  2025-04-28 04:00:00   5.8   5.5    98     0   NaN   306     4   6.0   

     pres  tsun  coco  
0  1024.8   NaN     2  
1  1025.0   NaN     2  
2  1024.8   NaN     2  
3  1024.7   NaN     5  
4  1025.1   NaN     5  
time     object
temp    float64
dwpt    float64
rhum      int64
prcp      int64
snow    float64
wdir      int64
wspd      int64
wpgt    float64
pres    float64
tsun    float64
coco      int64
dtype: object
(96, 12)
