In [7]:
import pandas as pd
import numpy as np

# 1) Columns to load (no airTime or delayDueLateAircraft)
cols = [
    'flDate',
    'airlineCode',
    'originAirport',
    'destAirport',
    'crsDepTime',
    'crsArrTime',
    'cancelled',
    'cancellationCode',
    'crsElapsedTime',
    'distance',
    'delayDueWeather',
    'weatherTypeOrigin',
    'severityOrigin',
    'startTimeOrigin',
    'endTimeOrigin',
    'precipitationOrigin',
    'weatherTypeDest',
    'severityDest',
    'startTimeDest',
    'endTimeDest',
    'precipitationDest'
]

# 2) Load only those columns, parsing the date‐time fields
df = pd.read_csv(
    'cleaned.csv',
    usecols=cols,
    parse_dates=[
        'flDate',
        'startTimeOrigin',
        'endTimeOrigin',
        'startTimeDest',
        'endTimeDest'
    ]
)

# 2b) Drop the COVID period (March 2020 through December 2021)
covid_start = pd.Timestamp('2020-03-01')
covid_end   = pd.Timestamp('2021-12-31')
df = df[~df['flDate'].between(covid_start, covid_end)]

# 3) Fill NaNs in weather delay, then convert cancellations ‘B’ → 400 min
df['delayDueWeather'] = df['delayDueWeather'].fillna(0)
mask = (df['cancelled'] == 1) & (df['cancellationCode'] == 'B')
df.loc[mask, 'delayDueWeather'] = 400

print(f"Loaded DataFrame (excl. COVID) with shape: {df.shape}")
df.head()


Loaded DataFrame (excl. COVID) with shape: (7721858, 21)


Unnamed: 0,flDate,airlineCode,originAirport,destAirport,crsDepTime,crsArrTime,cancelled,cancellationCode,crsElapsedTime,distance,...,weatherTypeOrigin,severityOrigin,startTimeOrigin,endTimeOrigin,precipitationOrigin,weatherTypeDest,severityDest,startTimeDest,endTimeDest,precipitationDest
0,2019-01-09,UA,FLL,EWR,1155,1501,0.0,,186.0,1065.0,...,,,NaT,NaT,,Rain,Light,2019-01-09 03:37:00,2019-01-09 06:51:00,0.4
1,2022-11-19,DL,MSP,SEA,2120,2315,0.0,,235.0,1399.0,...,Snow,Light,2022-11-19 09:28:00,2022-11-19 10:53:00,0.0,,,NaT,NaT,
2,2022-11-19,DL,MSP,SEA,2120,2315,0.0,,235.0,1399.0,...,Snow,Light,2022-11-19 11:15:00,2022-11-19 12:53:00,0.0,,,NaT,NaT,
3,2022-11-19,DL,MSP,SEA,2120,2315,0.0,,235.0,1399.0,...,Snow,Light,2022-11-19 15:44:00,2022-11-19 16:35:00,0.0,,,NaT,NaT,
4,2022-11-19,DL,MSP,SEA,2120,2315,0.0,,235.0,1399.0,...,Snow,Light,2022-11-19 16:41:00,2022-11-19 17:04:00,0.0,,,NaT,NaT,


In [8]:
# 1) Filter to only flights with a weather‐based delay
df_weather = df[df['delayDueWeather'] > 0].copy()

# 2) Quick sanity check
print(f"Rows with non-zero weather delay: {df_weather.shape[0]}")

# 3) Peek at the first few
df_weather.head(10)


Rows with non-zero weather delay: 550716


Unnamed: 0,flDate,airlineCode,originAirport,destAirport,crsDepTime,crsArrTime,cancelled,cancellationCode,crsElapsedTime,distance,...,weatherTypeOrigin,severityOrigin,startTimeOrigin,endTimeOrigin,precipitationOrigin,weatherTypeDest,severityDest,startTimeDest,endTimeDest,precipitationDest
141,2022-01-02,YX,ORD,DCA,1935,2241,1.0,B,126.0,612.0,...,Snow,Light,2022-01-02 01:10:00,2022-01-02 01:51:00,0.03,Rain,Light,2022-01-02 03:17:00,2022-01-02 05:52:00,0.26
142,2022-01-02,YX,ORD,DCA,1935,2241,1.0,B,126.0,612.0,...,Snow,Light,2022-01-02 01:10:00,2022-01-02 01:51:00,0.03,Rain,Light,2022-01-02 06:52:00,2022-01-02 11:19:00,0.13
143,2022-01-02,YX,ORD,DCA,1935,2241,1.0,B,126.0,612.0,...,Snow,Moderate,2022-01-02 01:51:00,2022-01-02 02:19:00,0.03,Rain,Light,2022-01-02 03:17:00,2022-01-02 05:52:00,0.26
144,2022-01-02,YX,ORD,DCA,1935,2241,1.0,B,126.0,612.0,...,Snow,Moderate,2022-01-02 01:51:00,2022-01-02 02:19:00,0.03,Rain,Light,2022-01-02 06:52:00,2022-01-02 11:19:00,0.13
145,2022-01-02,YX,ORD,DCA,1935,2241,1.0,B,126.0,612.0,...,Snow,Light,2022-01-02 02:19:00,2022-01-02 05:51:00,0.04,Rain,Light,2022-01-02 03:17:00,2022-01-02 05:52:00,0.26
146,2022-01-02,YX,ORD,DCA,1935,2241,1.0,B,126.0,612.0,...,Snow,Light,2022-01-02 02:19:00,2022-01-02 05:51:00,0.04,Rain,Light,2022-01-02 06:52:00,2022-01-02 11:19:00,0.13
147,2022-01-02,YX,ORD,DCA,1935,2241,1.0,B,126.0,612.0,...,Snow,Moderate,2022-01-02 05:51:00,2022-01-02 06:15:00,0.03,Rain,Light,2022-01-02 03:17:00,2022-01-02 05:52:00,0.26
148,2022-01-02,YX,ORD,DCA,1935,2241,1.0,B,126.0,612.0,...,Snow,Moderate,2022-01-02 05:51:00,2022-01-02 06:15:00,0.03,Rain,Light,2022-01-02 06:52:00,2022-01-02 11:19:00,0.13
149,2022-01-02,YX,ORD,DCA,1935,2241,1.0,B,126.0,612.0,...,Snow,Light,2022-01-02 06:15:00,2022-01-02 09:36:00,0.05,Rain,Light,2022-01-02 03:17:00,2022-01-02 05:52:00,0.26
150,2022-01-02,YX,ORD,DCA,1935,2241,1.0,B,126.0,612.0,...,Snow,Light,2022-01-02 06:15:00,2022-01-02 09:36:00,0.05,Rain,Light,2022-01-02 06:52:00,2022-01-02 11:19:00,0.13


In [9]:
df_weather = df_weather.drop(columns=['cancelled', 'cancellationCode'])

In [10]:
print(df_weather.columns)
print(len(df_weather))
df_weather.head()


Index(['flDate', 'airlineCode', 'originAirport', 'destAirport', 'crsDepTime',
       'crsArrTime', 'crsElapsedTime', 'distance', 'delayDueWeather',
       'weatherTypeOrigin', 'severityOrigin', 'startTimeOrigin',
       'endTimeOrigin', 'precipitationOrigin', 'weatherTypeDest',
       'severityDest', 'startTimeDest', 'endTimeDest', 'precipitationDest'],
      dtype='object')
550716


Unnamed: 0,flDate,airlineCode,originAirport,destAirport,crsDepTime,crsArrTime,crsElapsedTime,distance,delayDueWeather,weatherTypeOrigin,severityOrigin,startTimeOrigin,endTimeOrigin,precipitationOrigin,weatherTypeDest,severityDest,startTimeDest,endTimeDest,precipitationDest
141,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,Snow,Light,2022-01-02 01:10:00,2022-01-02 01:51:00,0.03,Rain,Light,2022-01-02 03:17:00,2022-01-02 05:52:00,0.26
142,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,Snow,Light,2022-01-02 01:10:00,2022-01-02 01:51:00,0.03,Rain,Light,2022-01-02 06:52:00,2022-01-02 11:19:00,0.13
143,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,Snow,Moderate,2022-01-02 01:51:00,2022-01-02 02:19:00,0.03,Rain,Light,2022-01-02 03:17:00,2022-01-02 05:52:00,0.26
144,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,Snow,Moderate,2022-01-02 01:51:00,2022-01-02 02:19:00,0.03,Rain,Light,2022-01-02 06:52:00,2022-01-02 11:19:00,0.13
145,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,Snow,Light,2022-01-02 02:19:00,2022-01-02 05:51:00,0.04,Rain,Light,2022-01-02 03:17:00,2022-01-02 05:52:00,0.26


In [11]:
import pandas as pd

# Use the original datetime columns (before you convert → _min)
origin_cols = [
    'weatherTypeOrigin',
    'severityOrigin',
    'startTimeOrigin',
    'endTimeOrigin',
    'precipitationOrigin'
]
dest_cols = [
    'weatherTypeDest',
    'severityDest',
    'startTimeDest',
    'endTimeDest',
    'precipitationDest'
]

# Masks for fully‐missing vs partially‐missing on the ORIGINAL columns
origin_all_missing = df_weather[origin_cols].isnull().all(axis=1)
origin_any_missing = df_weather[origin_cols].isnull().any(axis=1)
dest_all_missing   = df_weather[dest_cols].isnull().all(axis=1)
dest_any_missing   = df_weather[dest_cols].isnull().any(axis=1)

# Print counts
print("=== Origin Weather Data Missingness ===")
print("Completely missing:", origin_all_missing.sum())
print("Partially missing :", ((origin_any_missing) & (~origin_all_missing)).sum())
print("Fully present     :", (~origin_any_missing).sum(), "\n")

print("=== Destination Weather Data Missingness ===")
print("Completely missing:", dest_all_missing.sum())
print("Partially missing :", ((dest_any_missing) & (~dest_all_missing)).sum())
print("Fully present     :", (~dest_any_missing).sum(), "\n")

# Crosstab of intersection
print("=== Rows by (origin_all_missing, dest_all_missing) ===")
print(pd.crosstab(
    origin_all_missing,
    dest_all_missing,
    rownames=['origin_all_missing'],
    colnames=['dest_all_missing']
))


=== Origin Weather Data Missingness ===
Completely missing: 45557
Partially missing : 0
Fully present     : 505159 

=== Destination Weather Data Missingness ===
Completely missing: 65964
Partially missing : 0
Fully present     : 484752 

=== Rows by (origin_all_missing, dest_all_missing) ===
dest_all_missing     False  True 
origin_all_missing               
False               453330  51829
True                 31422  14135


In [12]:
# 1) Record the original size
before_count = len(df_weather)

# 2) If you still have raw “UNK” strings, drop them:
for col in ['weatherTypeOrigin','weatherTypeDest','severityOrigin','severityDest']:
    if df_weather[col].dtype == object:
        df_weather = df_weather[df_weather[col] != 'UNK']

# 3) If you’ve already mapped “UNK” → -1, drop those too:
for col in ['weatherTypeOrigin','weatherTypeDest','severityOrigin','severityDest']:
    if pd.api.types.is_integer_dtype(df_weather[col]):
        df_weather = df_weather[df_weather[col] != -1]

# 4) Record the new size
after_count = len(df_weather)

# 5) Print before & after
print(f"Rows before dropping UNK/-1 entries: {before_count}")
print(f"Rows after  dropping UNK/-1 entries: {after_count}")
print(f"Total dropped: {before_count - after_count}")


Rows before dropping UNK/-1 entries: 550716
Rows after  dropping UNK/-1 entries: 506598
Total dropped: 44118


In [13]:
# Recompute the “all-missing” masks on df_weather
origin_cols = [
    'weatherTypeOrigin','severityOrigin',
    'startTimeOrigin','endTimeOrigin',
    'precipitationOrigin'
]
dest_cols = [
    'weatherTypeDest','severityDest',
    'startTimeDest','endTimeDest',
    'precipitationDest'
]

origin_all_missing = df_weather[origin_cols].isna().all(axis=1)
dest_all_missing   = df_weather[dest_cols].isna().all(axis=1)

print("Before drop, rows:", df_weather.shape[0])
# Drop rows where both blocks are missing
df_weather = df_weather.loc[~(origin_all_missing & dest_all_missing)].copy()
print("After  drop, rows:", df_weather.shape[0])


Before drop, rows: 506598
After  drop, rows: 492463


In [14]:
# Boolean masks (re-compute or reuse from above)
origin_all_missing = df_weather[['weatherTypeOrigin','severityOrigin',
                                 'startTimeOrigin','endTimeOrigin',
                                 'precipitationOrigin']].isna().all(axis=1)
dest_all_missing   = df_weather[['weatherTypeDest','severityDest',
                                 'startTimeDest','endTimeDest',
                                 'precipitationDest']].isna().all(axis=1)

# Fill origin-missing rows
df_weather.loc[origin_all_missing, [
    'startTimeOrigin', 'endTimeOrigin',
    'weatherTypeOrigin','severityOrigin',
    'precipitationOrigin'
]] = [-1, -1, -1, -1, 0]

# Fill dest-missing rows
df_weather.loc[dest_all_missing, [
    'startTimeDest', 'endTimeDest',
    'weatherTypeDest','severityDest',
    'precipitationDest'
]] = [-1, -1, -1, -1, 0]

# (Optional) Verify
print("After block-fill, any origin NaNs left?", 
      df_weather[['weatherTypeOrigin','severityOrigin',
                  'startTimeOrigin','endTimeOrigin',
                  'precipitationOrigin']].isna().any().any())
print("After block-fill, any dest NaNs left?", 
      df_weather[['weatherTypeDest','severityDest',
                  'startTimeDest','endTimeDest',
                  'precipitationDest']].isna().any().any())


  df_weather.loc[origin_all_missing, [
  df_weather.loc[origin_all_missing, [
  df_weather.loc[dest_all_missing, [
  df_weather.loc[dest_all_missing, [


After block-fill, any origin NaNs left? False
After block-fill, any dest NaNs left? False


In [15]:
df_weather.head(10)

Unnamed: 0,flDate,airlineCode,originAirport,destAirport,crsDepTime,crsArrTime,crsElapsedTime,distance,delayDueWeather,weatherTypeOrigin,severityOrigin,startTimeOrigin,endTimeOrigin,precipitationOrigin,weatherTypeDest,severityDest,startTimeDest,endTimeDest,precipitationDest
141,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,Snow,Light,2022-01-02 01:10:00,2022-01-02 01:51:00,0.03,Rain,Light,2022-01-02 03:17:00,2022-01-02 05:52:00,0.26
142,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,Snow,Light,2022-01-02 01:10:00,2022-01-02 01:51:00,0.03,Rain,Light,2022-01-02 06:52:00,2022-01-02 11:19:00,0.13
143,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,Snow,Moderate,2022-01-02 01:51:00,2022-01-02 02:19:00,0.03,Rain,Light,2022-01-02 03:17:00,2022-01-02 05:52:00,0.26
144,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,Snow,Moderate,2022-01-02 01:51:00,2022-01-02 02:19:00,0.03,Rain,Light,2022-01-02 06:52:00,2022-01-02 11:19:00,0.13
145,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,Snow,Light,2022-01-02 02:19:00,2022-01-02 05:51:00,0.04,Rain,Light,2022-01-02 03:17:00,2022-01-02 05:52:00,0.26
146,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,Snow,Light,2022-01-02 02:19:00,2022-01-02 05:51:00,0.04,Rain,Light,2022-01-02 06:52:00,2022-01-02 11:19:00,0.13
147,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,Snow,Moderate,2022-01-02 05:51:00,2022-01-02 06:15:00,0.03,Rain,Light,2022-01-02 03:17:00,2022-01-02 05:52:00,0.26
148,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,Snow,Moderate,2022-01-02 05:51:00,2022-01-02 06:15:00,0.03,Rain,Light,2022-01-02 06:52:00,2022-01-02 11:19:00,0.13
149,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,Snow,Light,2022-01-02 06:15:00,2022-01-02 09:36:00,0.05,Rain,Light,2022-01-02 03:17:00,2022-01-02 05:52:00,0.26
150,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,Snow,Light,2022-01-02 06:15:00,2022-01-02 09:36:00,0.05,Rain,Light,2022-01-02 06:52:00,2022-01-02 11:19:00,0.13


In [16]:
# … earlier in the same cell …

# 1) Ensure those four columns are true datetimes
for col in ['startTimeOrigin','endTimeOrigin','startTimeDest','endTimeDest']:
    df_weather[col] = pd.to_datetime(df_weather[col], errors='coerce')

# 2) Now convert datetime → minutes‐of‐day, missing→‐1
for col in ['startTimeOrigin','endTimeOrigin','startTimeDest','endTimeDest']:
    df_weather[col + '_min'] = (
        df_weather[col]
          .dt.hour.mul(60)
          .add(df_weather[col].dt.minute)
          .fillna(-1)
          .astype(int)
    )

# 3) Drop the originals
df_weather.drop(columns=[
    'startTimeOrigin','endTimeOrigin','startTimeDest','endTimeDest'
], inplace=True)

# … rest of your mapping steps …


In [17]:
import pandas as pd

# 0) (Optional) Inspect what’s in df_weather right now
print("Columns before:", df_weather.columns.tolist())

# 1) Fill precipitation
df_weather['precipitationOrigin'] = df_weather['precipitationOrigin'].fillna(0)
df_weather['precipitationDest']   = df_weather['precipitationDest'].fillna(0)

# 2) Convert datetime columns → minutes-from-midnight, missing→-1
for col in ['startTimeOrigin','endTimeOrigin','startTimeDest','endTimeDest']:
    if col in df_weather.columns:
        df_weather[col + '_min'] = (
            df_weather[col]
            .dt.hour.mul(60)
            .add(df_weather[col].dt.minute)
            .fillna(-1)
            .astype(int)
        )
    else:
        print(f"Warning: {col} not in df_weather, skipping conversion")

# 3) Now drop the original datetime cols
to_drop = [c for c in ['startTimeOrigin','endTimeOrigin','startTimeDest','endTimeDest'] 
           if c in df_weather.columns]
df_weather.drop(columns=to_drop, inplace=True)

# 4) Map severities → integers (–1 for missing)
severity_map = {'Light': 0, 'Moderate': 1, 'Heavy': 2}
for side in ['Origin','Dest']:
    col = f'severity{side}'
    df_weather[col] = df_weather[col].map(severity_map).fillna(-1).astype(int)

# 5) Map weatherType → integers (–1 for missing)
all_types = pd.concat([
    df_weather['weatherTypeOrigin'].dropna(),
    df_weather['weatherTypeDest'].dropna()
]).unique()
weather_map = {wt: i for i, wt in enumerate(all_types)}
for side in ['Origin','Dest']:
    col = f'weatherType{side}'
    df_weather[col] = df_weather[col].map(weather_map).fillna(-1).astype(int)

# 6) Quick check
print("Columns after:", df_weather.columns.tolist())
df_weather.head()


Columns before: ['flDate', 'airlineCode', 'originAirport', 'destAirport', 'crsDepTime', 'crsArrTime', 'crsElapsedTime', 'distance', 'delayDueWeather', 'weatherTypeOrigin', 'severityOrigin', 'precipitationOrigin', 'weatherTypeDest', 'severityDest', 'precipitationDest', 'startTimeOrigin_min', 'endTimeOrigin_min', 'startTimeDest_min', 'endTimeDest_min']
Columns after: ['flDate', 'airlineCode', 'originAirport', 'destAirport', 'crsDepTime', 'crsArrTime', 'crsElapsedTime', 'distance', 'delayDueWeather', 'weatherTypeOrigin', 'severityOrigin', 'precipitationOrigin', 'weatherTypeDest', 'severityDest', 'precipitationDest', 'startTimeOrigin_min', 'endTimeOrigin_min', 'startTimeDest_min', 'endTimeDest_min']


Unnamed: 0,flDate,airlineCode,originAirport,destAirport,crsDepTime,crsArrTime,crsElapsedTime,distance,delayDueWeather,weatherTypeOrigin,severityOrigin,precipitationOrigin,weatherTypeDest,severityDest,precipitationDest,startTimeOrigin_min,endTimeOrigin_min,startTimeDest_min,endTimeDest_min
141,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,0,0,0.03,1,0,0.26,70,111,197,352
142,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,0,0,0.03,1,0,0.13,70,111,412,679
143,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,0,1,0.03,1,0,0.26,111,139,197,352
144,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,0,1,0.03,1,0,0.13,111,139,412,679
145,2022-01-02,YX,ORD,DCA,1935,2241,126.0,612.0,400.0,0,0,0.04,1,0,0.26,139,351,197,352


In [19]:
import numpy as np
import joblib
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# 0) Exclude cancellations and compute severity score & impact
df_nc = df_weather[df_weather['delayDueWeather'] != 400].copy()

# Map weather types to numeric severity
severity_map = {'Clear':1, 'Fog':1, 'Precipitation':1,
                'Rain':2, 'Snow':3, 'Storm':3,
                'Cold':3, 'Hail':3, np.nan:1}
df_nc['SeverityOriginScore'] = df_nc['weatherTypeOrigin'].map(severity_map).fillna(1)
df_nc['SeverityDestScore']   = df_nc['weatherTypeDest'].map(severity_map).fillna(1)

# Weather impact score = precipitation * severity
df_nc['WeatherImpactOrigin'] = df_nc['precipitationOrigin'] * df_nc['SeverityOriginScore']
df_nc['WeatherImpactDest']   = df_nc['precipitationDest']   * df_nc['SeverityDestScore']

# 1) Build feature matrix and target
X = df_nc[['crsDepTime','crsArrTime','crsElapsedTime','distance',
           'startTimeOrigin_min','endTimeOrigin_min',
           'startTimeDest_min','endTimeDest_min',
           'precipitationOrigin','precipitationDest',
           'SeverityOriginScore','SeverityDestScore',
           'WeatherImpactOrigin','WeatherImpactDest',
           'weatherTypeOrigin','weatherTypeDest',
           'airlineCode','originAirport','destAirport']]
y = df_nc['delayDueWeather']

# 2) Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3) Define numeric & categorical features
numeric_features = ['crsDepTime','crsArrTime','crsElapsedTime','distance',
                    'startTimeOrigin_min','endTimeOrigin_min',
                    'startTimeDest_min','endTimeDest_min',
                    'precipitationOrigin','precipitationDest',
                    'SeverityOriginScore','SeverityDestScore',
                    'WeatherImpactOrigin','WeatherImpactDest']
categorical_features = ['weatherTypeOrigin','weatherTypeDest',
                        'airlineCode','originAirport','destAirport']

# 4) Preprocessor & pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), categorical_features)
])

xgb_pipe = Pipeline([
    ('pre', preprocessor),
    ('xgb', XGBRegressor(
        tree_method='hist',
        device='cuda',
        random_state=42,
        learning_rate=0.1,
        max_depth=6,
        n_estimators=200,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=1,
        verbosity=0
    ))
])

# 5) Fit & evaluate
xgb_pipe.fit(X_train, y_train)
preds = xgb_pipe.predict(X_test)
print("XGBoost w/ Severity Score → RMSE: {:.2f}".format(np.sqrt(mean_squared_error(y_test, preds))))
print("                          R²:   {:.3f}".format(r2_score(y_test, preds)))

# 6) Save the model
joblib.dump(xgb_pipe, 'cuda_xgb_severity_model.joblib')
print("Model saved to 'cuda_xgb_severity_model.joblib'")


XGBoost w/ Severity Score → RMSE: 76.28
                          R²:   0.682
Model saved to 'cuda_xgb_severity_model.joblib'


In [31]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split

# 0) Load your DataFrame if not already in memory
# df_weather = pd.read_csv('/mnt/data/cleaned_weather_data.csv')

# 1) Merge airport → cluster
clusters = pd.read_csv('airport_cluster_assignments_iata.csv')
clusters.rename(columns={'AirportCode':'originAirport','Cluster':'cluster'}, inplace=True)
df = (
    df_weather.drop(columns=['cluster'], errors='ignore')
              .merge(clusters[['originAirport','cluster']], on='originAirport', how='left')
)

# 2) Exclude cancellations
df_nc = df[df['delayDueWeather'] != 400].copy()

# 3) Severity scores & impact
severity_map = {
    'Clear':1, 'Fog':1, 'Precipitation':1,
    'Rain':2, 'Snow':3, 'Storm':3, 'Cold':3, 'Hail':3
}
df_nc['SeverityOriginScore'] = df_nc['weatherTypeOrigin'].map(severity_map).fillna(1)
df_nc['SeverityDestScore']   = df_nc['weatherTypeDest'].map(severity_map).fillna(1)
df_nc['WeatherImpactOrigin'] = df_nc['precipitationOrigin'] * df_nc['SeverityOriginScore']
df_nc['WeatherImpactDest']   = df_nc['precipitationDest']   * df_nc['SeverityDestScore']

# 4) Add/derive year column if possible
if 'year' not in df_nc.columns:
    if 'fl_date' in df_nc.columns:
        df_nc['year'] = pd.to_datetime(df_nc['fl_date']).dt.year

# 5) Split off test set
features = [
    'crsDepTime','crsArrTime','crsElapsedTime','distance',
    'startTimeOrigin_min','endTimeOrigin_min',
    'startTimeDest_min','endTimeDest_min',
    'precipitationOrigin','precipitationDest',
    'SeverityOriginScore','SeverityDestScore',
    'WeatherImpactOrigin','WeatherImpactDest',
    'weatherTypeOrigin','weatherTypeDest',
    'airlineCode','originAirport','destAirport',
    'cluster'
]
if 'year' in df_nc.columns:
    features.append('year')

X = df_nc[features]
y = df_nc['delayDueWeather']
_, X_test, _, _ = train_test_split(X, y, test_size=0.2, random_state=42)

# 6) Load trained pipeline
xgb_sev = joblib.load('cuda_xgb_severity_model.joblib')

# 7) Current predictions
delay_base = xgb_sev.predict(X_test.drop(columns=['cluster']))

# 8) Build future scenario
years = 10
cluster_pct = {
    0: {'precip':0.303,'fog_sev':0.011,'rain_sev':-0.001,'snow_sev':0.005,'storm_sev':0.010},
    1: {'precip':0.000,'fog_sev':0.017,'rain_sev':-0.002,'snow_sev':0.005,'storm_sev':0.010},
    2: {'precip':0.000,'fog_sev':0.008,'rain_sev':-0.002,'snow_sev':0.005,'storm_sev':0.010},
    3: {'precip':0.000,'fog_sev':0.009,'rain_sev':0.000,'snow_sev':0.005,'storm_sev':0.010}
}
X_future = X_test.copy()

for cl, pct in cluster_pct.items():
    mask = X_future['cluster'] == cl
    # scale precipitation
    factor = 1 + pct['precip'] * years
    for col in ['precipitationOrigin','precipitationDest']:
        X_future.loc[mask, col] = X_future.loc[mask, col].fillna(0) * factor
    # bump severity scores by years * slope
    for wtype, key in [('Fog','fog_sev'),('Snow','snow_sev'),
                       ('Rain','rain_sev'),('Storm','storm_sev')]:
        bump = pct[key] * years
        mask_o = mask & (X_future['weatherTypeOrigin'] == wtype)
        mask_d = mask & (X_future['weatherTypeDest']   == wtype)
        X_future.loc[mask_o, 'SeverityOriginScore'] += bump
        X_future.loc[mask_d, 'SeverityDestScore']   += bump

# 9) Shift calendar year forward (if year exists)
if 'year' in X_future.columns:
    X_future['year'] = X_future['year'] + years

# 10) Recompute impact
X_future['WeatherImpactOrigin'] = X_future['precipitationOrigin'] * X_future['SeverityOriginScore']
X_future['WeatherImpactDest']   = X_future['precipitationDest']   * X_future['SeverityDestScore']

# 11) Future predictions
delay_future = xgb_sev.predict(X_future.drop(columns=['cluster']))

# 12) Compare
print(f"Avg current delay: {delay_base.mean():.1f} min")
print(f"Avg future  delay: {delay_future.mean():.1f} min")


Avg current delay: 78.1 min
Avg future  delay: 79.1 min


In [32]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split

# ─────────────────────────────────────────────────────────────
# 0)  Load + merge + engineer exactly as before
# ─────────────────────────────────────────────────────────────
clusters = pd.read_csv('airport_cluster_assignments_iata.csv')
clusters.rename(columns={'AirportCode':'originAirport','Cluster':'cluster'}, inplace=True)
df = (
    df_weather.drop(columns=['cluster'], errors='ignore')
              .merge(clusters[['originAirport','cluster']], on='originAirport', how='left')
)

df_nc = df[df['delayDueWeather'] != 400].copy()

severity_map = {
    'Clear':1,'Fog':1,'Precipitation':1,
    'Rain':2,'Snow':3,'Storm':3,'Cold':3,'Hail':3
}
df_nc['SeverityOriginScore'] = df_nc['weatherTypeOrigin'].map(severity_map).fillna(1)
df_nc['SeverityDestScore']   = df_nc['weatherTypeDest'].map(severity_map).fillna(1)

# If logistic-reg model doesn’t use WeatherImpact, we can skip these columns
df_nc['WeatherImpactOrigin'] = df_nc['precipitationOrigin'] * df_nc['SeverityOriginScore']
df_nc['WeatherImpactDest']   = df_nc['precipitationDest']   * df_nc['SeverityDestScore']

if 'year' not in df_nc.columns and 'fl_date' in df_nc.columns:
    df_nc['year'] = pd.to_datetime(df_nc['fl_date']).dt.year

# ─────────────────────────────────────────────────────────────
# 1)  Build feature matrix matching the logistic model
#     ( cluster is NOT part of its feature list )
# ─────────────────────────────────────────────────────────────
logreg_feats = [
    'airlineCode','originAirport','destAirport',
    'crsDepTime','crsArrTime','crsElapsedTime','distance',
    'weatherTypeOrigin','severityOrigin','precipitationOrigin',
    'weatherTypeDest','severityDest','precipitationDest',
    'startTimeOrigin_min','endTimeOrigin_min',
    'startTimeDest_min','endTimeDest_min'
]
if 'year' in df_nc.columns:
    logreg_feats.append('year')

X = df_nc[logreg_feats]
y = (df_nc['delayDueWeather'] == 400).astype(int)   # canceled label

_, X_test, _, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ─────────────────────────────────────────────────────────────
# 2)  Load the logistic-regression pipeline
# ─────────────────────────────────────────────────────────────
logreg = joblib.load('logreg_C0_01.liblinear.joblib')

# Current cancellation predictions
pred_curr = logreg.predict(X_test)
curr_cancel_rate = pred_curr.mean() * 100

# ─────────────────────────────────────────────────────────────
# 3)  Build 20-year future scenario (same % changes as before)
# ─────────────────────────────────────────────────────────────
years = 10
cluster_pct = {
    0: {'precip':0.303,'fog_sev':0.011,'rain_sev':-0.001,'snow_sev':0.005,'storm_sev':0.010},
    1: {'precip':0.000,'fog_sev':0.017,'rain_sev':-0.002,'snow_sev':0.005,'storm_sev':0.010},
    2: {'precip':0.000,'fog_sev':0.008,'rain_sev':-0.002,'snow_sev':0.005,'storm_sev':0.010},
    3: {'precip':0.000,'fog_sev':0.009,'rain_sev':0.000,'snow_sev':0.005,'storm_sev':0.010}
}

X_future = X_test.copy()
X_future = X_future.join(df_nc.loc[X_test.index, 'cluster'])

for cl, pct in cluster_pct.items():
    m = X_future['cluster'] == cl
    # scale precipitation
    scale = 1 + pct['precip'] * years
    for col in ['precipitationOrigin','precipitationDest']:
        if col in X_future.columns:
            X_future.loc[m, col] = X_future.loc[m, col].fillna(0) * scale
    # bump severities
    for w, k in [('Fog','fog_sev'),('Snow','snow_sev'),
                 ('Rain','rain_sev'),('Storm','storm_sev')]:
        bump = pct[k] * years
        m_o = m & (X_future['weatherTypeOrigin']==w)
        m_d = m & (X_future['weatherTypeDest']==w)
        X_future.loc[m_o, 'severityOrigin'] += bump
        X_future.loc[m_d, 'severityDest']   += bump

if 'year' in X_future.columns:
    X_future['year'] += years

# Drop helper column so feature set matches
X_future = X_future[logreg_feats]

# ─────────────────────────────────────────────────────────────
# 4)  Future cancellation predictions
# ─────────────────────────────────────────────────────────────
pred_future = logreg.predict(X_future)
future_cancel_rate = pred_future.mean() * 100

# ─────────────────────────────────────────────────────────────
# 5)  Report
# ─────────────────────────────────────────────────────────────
print(f"Current cancellation rate : {curr_cancel_rate:.1f}%")
print(f"Future  cancellation rate : {future_cancel_rate:.1f}%")

Current cancellation rate : 52.9%
Future  cancellation rate : 54.8%
