In [9]:
import pandas as pd
import numpy as np

In [31]:
# Load the raw airline delay-cause extract.
# Forward slashes are used so the relative path resolves on Windows and POSIX alike
# (a backslash-separated string is treated as a single file name outside Windows).
df = pd.read_csv("../data/raw/airline_ontime_2023/Airline_Delay_Cause.csv")

In [32]:
# Print the frame summary (index range, column names, non-null counts, dtypes)
# to see which columns contain missing values before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24518 entries, 0 to 24517
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   year                 24518 non-null  int64  
 1   month                24518 non-null  int64  
 2   carrier              24518 non-null  object 
 3   carrier_name         24518 non-null  object 
 4   airport              24518 non-null  object 
 5   airport_name         24518 non-null  object 
 6   arr_flights          24468 non-null  float64
 7   arr_del15            24459 non-null  float64
 8   carrier_ct           24468 non-null  float64
 9   weather_ct           24468 non-null  float64
 10  nas_ct               24468 non-null  float64
 11  security_ct          24468 non-null  float64
 12  late_aircraft_ct     24468 non-null  float64
 13  arr_cancelled        24468 non-null  float64
 14  arr_diverted         24468 non-null  float64
 15  arr_delay            24468 non-null 

In [None]:
# Drop exact duplicate rows, then keep only rows that report at least one arriving flight.
# The `> 0` comparison evaluates to False for NaN, so rows with a missing arr_flights
# value are removed by the same filter.
# .copy() turns the filtered result into an independent frame, so the column
# assignments made in later cells do not trigger SettingWithCopyWarning.
df = df.drop_duplicates()
df = df[df["arr_flights"] > 0].copy()

In [None]:
# Normalise numeric dtypes: the columns in int_cols are cast to integers,
# the columns in float_cols are kept as floats.
int_cols = [
    "year", "month",
    "arr_flights", "arr_del15", "arr_cancelled", "arr_diverted",
]
float_cols = [
    "carrier_ct", "weather_ct", "nas_ct", "security_ct", "late_aircraft_ct",
    "arr_delay",
    "carrier_delay", "weather_delay", "nas_delay", "security_delay", "late_aircraft_delay",
]


def _coerce_numeric(series, fill_value):
    """Parse `series` as numbers; unparseable or missing entries become `fill_value`."""
    return pd.to_numeric(series, errors="coerce").fillna(fill_value)


# Missing or unparseable entries are replaced by 0 before the integer cast.
for col in int_cols:
    df[col] = _coerce_numeric(df[col], 0).astype(int)

# Float columns only need parsing and gap filling.
for col in float_cols:
    df[col] = _coerce_numeric(df[col], 0.0)

In [14]:
# Derived features: a month-start date, total delay minutes and per-flight ratios.

# First day of the reporting month, built from the year and month columns.
df["date"] = pd.to_datetime(df["year"].astype(str) + "-" + df["month"].astype(str) + "-01")

# Negative delay minutes are floored at zero *before* summing. The clipping cell
# further down runs after this one, so without this step any negative component
# would still be baked into total_delay (the only delay column that is exported).
cause_delay_cols = ["carrier_delay", "weather_delay", "nas_delay",
                    "security_delay", "late_aircraft_delay"]
nonneg_delays = df[cause_delay_cols].clip(lower=0)

df["total_delay"] = (
    nonneg_delays["carrier_delay"] + nonneg_delays["weather_delay"] +
    nonneg_delays["nas_delay"] + nonneg_delays["security_delay"] +
    nonneg_delays["late_aircraft_delay"]
)

# Share of arriving flights counted in arr_del15.
df["delay_ratio"] = df["arr_del15"] / df["arr_flights"]

# Cancelled flights relative to arriving flights.
df["cancel_ratio"] = df["arr_cancelled"] / df["arr_flights"]

In [15]:
# Floor every delay-minutes column at zero, one column at a time.
# total_delay is not recomputed here, so its stored values are unaffected by this step.
delay_cols = [
    "arr_delay",
    "carrier_delay",
    "weather_delay",
    "nas_delay",
    "security_delay",
    "late_aircraft_delay",
]
for delay_col in delay_cols:
    df[delay_col] = df[delay_col].clip(lower=0)

In [16]:
# Columns written to the cleaned file, in output order.
keep_cols = [
    # time keys
    "date", "year", "month",
    # carrier / airport identifiers
    "carrier", "carrier_name", "airport", "airport_name",
    # arrival counts
    "arr_flights", "arr_del15", "arr_cancelled", "arr_diverted",
    # derived measures
    "total_delay", "delay_ratio", "cancel_ratio",
]

# Select all rows and only the listed columns.
df_clean = df.loc[:, keep_cols]

In [18]:
# Write the cleaned frame to disk without the index column, then report the
# row count and show the first rows as a sanity check.
clean_csv_path = "../data/processed/airline_ontime_2023_clean.csv"
df_clean.to_csv(clean_csv_path, index=False)

print(f"Preprocessing complete! Cleaned file saved with {df_clean.shape[0]} rows.")
print(df_clean.head())

Preprocessing complete! Cleaned file saved with 24468 rows.
        date  year  month carrier       carrier_name airport  \
0 2024-01-01  2024      1      9E  Endeavor Air Inc.     ABE   
1 2024-01-01  2024      1      9E  Endeavor Air Inc.     ABY   
2 2024-01-01  2024      1      9E  Endeavor Air Inc.     AEX   
3 2024-01-01  2024      1      9E  Endeavor Air Inc.     AGS   
4 2024-01-01  2024      1      9E  Endeavor Air Inc.     ALB   

                                        airport_name  arr_flights  arr_del15  \
0  Allentown/Bethlehem/Easton, PA: Lehigh Valley ...           80         16   
1             Albany, GA: Southwest Georgia Regional            3          0   
2           Alexandria, LA: Alexandria International           62          9   
3        Augusta, GA: Augusta Regional at Bush Field           74         15   
4                   Albany, NY: Albany International          101         23   

   arr_cancelled  arr_diverted  total_delay  delay_ratio  cancel_ratio  
0

PREPROCESSING FOR LAX-JFK ROUTE (JFK AIRPORT SUBSET OF THE DELAY-CAUSE DATA)

In [19]:
import pandas as pd
import numpy as np
from pathlib import Path

In [35]:
# Load the raw delay-cause extract for the JFK subset.
# Forward slashes keep the relative path valid on both Windows and POSIX;
# Path renders it with the native separator on each platform.
raw_path = Path("../data/raw/airline_ontime_2023/Airline_Delay_Cause.csv")
ds = pd.read_csv(raw_path)

In [36]:
# Keep only the rows whose airport code is JFK.
ds = ds.loc[ds["airport"].eq("JFK")]

In [37]:
# Restrict the frame to identifiers, arrival counts and per-cause delay counts.
cols_to_keep = [
    # identifiers
    "airport", "month", "carrier",
    # arrival counts
    "arr_flights", "arr_del15", "arr_cancelled", "arr_diverted",
    # delayed-flight counts attributed to each cause
    "carrier_ct", "weather_ct", "nas_ct", "security_ct", "late_aircraft_ct",
]
ds = ds.loc[:, cols_to_keep]

In [38]:
# Remove rows with any missing value, then rows that report no arriving flights
# (arr_flights is the denominator of every ratio computed afterwards).
# .copy() makes the filtered result an independent frame, so adding the ratio
# columns in the following cells does not trigger SettingWithCopyWarning.
ds = ds.dropna()
ds = ds[ds["arr_flights"] > 0].copy()

In [39]:
# Express delayed, cancelled and diverted counts as a fraction of arriving flights.
arrivals = ds["arr_flights"]
for ratio_col, count_col in (
    ("delay_ratio", "arr_del15"),
    ("cancel_ratio", "arr_cancelled"),
    ("divert_ratio", "arr_diverted"),
):
    ds[ratio_col] = ds[count_col] / arrivals

In [40]:
# For each delay-cause count column, add a "<column>_ratio" column holding the
# count divided by the number of arriving flights.
delay_causes = [
    "carrier_ct",
    "weather_ct",
    "nas_ct",
    "security_ct",
    "late_aircraft_ct",
]
flights_per_row = ds["arr_flights"]
for cause in delay_causes:
    ds[f"{cause}_ratio"] = ds[cause].div(flights_per_row)

In [41]:
# Renumber rows from 0 after the filtering steps; drop=True discards the old
# index instead of keeping it as a column.
ds = ds.reset_index(drop=True)

In [42]:
# Write the JFK subset to the processed-data folder without the index column.
processed_path = Path("../data/processed/jfk_processed.csv")
ds.to_csv(processed_path, index=False)

# Report where the file went and show the first rows as a sanity check.
print(f"Processed data saved to {processed_path}")
print(ds.head())

Processed data saved to ..\data\processed\jfk_processed.csv
  airport  month carrier  arr_flights  arr_del15  arr_cancelled  arr_diverted  \
0     JFK      1      9E       1428.0      276.0           69.0           1.0   
1     JFK      1      AA       1295.0      293.0           11.0           4.0   
2     JFK      1      YX       1195.0      179.0           44.0           0.0   
3     JFK      1      AS        239.0       41.0           85.0           1.0   
4     JFK      1      B6       2690.0      747.0           48.0           9.0   

   carrier_ct  weather_ct  nas_ct  security_ct  late_aircraft_ct  delay_ratio  \
0       64.68       15.91   90.86         0.00            104.55     0.193277   
1      103.67       15.31   81.22         1.29             91.51     0.226255   
2       45.35       12.74   71.83         0.69             48.39     0.149791   
3        9.64        3.59   15.76         0.00             12.00     0.171548   
4      250.25       11.78  187.99         0.00  

In [43]:
# Print the processed frame's summary (row count, columns, non-null counts, dtypes)
# to confirm no missing values remain.
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   airport                 104 non-null    object 
 1   month                   104 non-null    int64  
 2   carrier                 104 non-null    object 
 3   arr_flights             104 non-null    float64
 4   arr_del15               104 non-null    float64
 5   arr_cancelled           104 non-null    float64
 6   arr_diverted            104 non-null    float64
 7   carrier_ct              104 non-null    float64
 8   weather_ct              104 non-null    float64
 9   nas_ct                  104 non-null    float64
 10  security_ct             104 non-null    float64
 11  late_aircraft_ct        104 non-null    float64
 12  delay_ratio             104 non-null    float64
 13  cancel_ratio            104 non-null    float64
 14  divert_ratio            104 non-null    fl

In [45]:
# Reload the untouched raw file for reference.
# Forward slashes keep the relative path valid on both Windows and POSIX.
og = pd.read_csv("../data/raw/airline_ontime_2023/Airline_Delay_Cause.csv")

In [46]:
# Print the raw frame's summary (index range, columns, non-null counts, dtypes).
og.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24518 entries, 0 to 24517
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   year                 24518 non-null  int64  
 1   month                24518 non-null  int64  
 2   carrier              24518 non-null  object 
 3   carrier_name         24518 non-null  object 
 4   airport              24518 non-null  object 
 5   airport_name         24518 non-null  object 
 6   arr_flights          24468 non-null  float64
 7   arr_del15            24459 non-null  float64
 8   carrier_ct           24468 non-null  float64
 9   weather_ct           24468 non-null  float64
 10  nas_ct               24468 non-null  float64
 11  security_ct          24468 non-null  float64
 12  late_aircraft_ct     24468 non-null  float64
 13  arr_cancelled        24468 non-null  float64
 14  arr_diverted         24468 non-null  float64
 15  arr_delay            24468 non-null 