In [None]:
#imports
import pandas as pd
import numpy as np
import warnings
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score,mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.decomposition import PCA
import pickle as pkl
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression, ElasticNet


#cohen kappa for scoring
def cohen_kappa_scorer(estimator, X, y):
    """Scorer-style callable: Cohen's kappa of `estimator`'s predictions on X against y.

    Takes (estimator, X, y) and returns a single score, so it can be handed to
    anything that accepts a callable with that signature.
    """
    return cohen_kappa_score(y, estimator.predict(X))


#convert numeric response to categorical
def categorize_delays(delays):
    """Map delay values to the labels 'ontime', 'minordelay' and 'majordelay'.

    delays : array-like of numbers (a pandas Series or NumPy array).
    Returns a NumPy array with one label per element:
        below 30        -> 'ontime'
        30 through 120  -> 'minordelay'
        above 120       -> 'majordelay'
    An element that satisfies none of the comparisons (e.g. NaN) falls through
    to its original value, which NumPy coerces into the string result array.
    """
    is_ontime = delays < 30
    is_minor = (delays >= 30) & (delays <= 120)
    is_major = delays > 120

    # Build the labels from the innermost fallback outwards.
    major_or_raw = np.where(is_major, 'majordelay', delays)
    minor_or_rest = np.where(is_minor, 'minordelay', major_or_raw)
    return np.where(is_ontime, 'ontime', minor_or_rest)


#match columns of original dataset
def match_cols(data, cols):
    """Return a copy of `data` with exactly the columns in `cols`, in that order.

    data : DataFrame to align (left unmodified).
    cols : list of column labels expected by the model.

    Columns of `cols` that `data` lacks are added and filled with 0; columns of
    `data` that are not in `cols` are dropped.
    """
    # reindex builds the aligned frame in one step instead of inserting the
    # missing columns into the caller's frame one at a time.
    return data.reindex(columns=cols, fill_value=0)


#trailing-window delay flag shared by the carrier/origin/dest features
def _recent_delay_flag(flights, key, window=pd.Timedelta(hours=48)):
    """Flag rows whose `key` group recently averaged a departure delay of 30+.

    For every row, averages 'dep_delay' over all rows of `flights` that share
    the row's `key` value and whose 'date' lies in (row date - window, row date].
    The window is closed on the right, so the row itself (and any row with the
    same timestamp) contributes to its own average.
    NOTE(review): that means the flight's own dep_delay feeds a predictor of
    dep_delay - confirm this is intended before relying on the scores.

    Returns an int array: 0 where categorize_delays labels the mean 'ontime',
    1 otherwise (a NaN mean is never labelled 'ontime', so it is flagged 1).
    Each row scans the whole frame, so the cost is quadratic in the row count.
    """
    def window_mean(row):
        in_window = ((flights[key] == row[key]) &
                     (flights['date'] <= row['date']) &
                     (flights['date'] > row['date'] - window))
        return flights.loc[in_window, 'dep_delay'].mean()

    mean_delay = flights.apply(window_mean, axis=1)
    return np.where(categorize_delays(mean_delay) == 'ontime', 0, 1)


#returns x,y
def get_data(path, planes_path='planes.csv', weather_path='weather.csv',
             cols_path='model_cols.pkl'):
    """Build model-ready predictors and responses from a flights CSV.

    path         : flights CSV to score.
    planes_path  : planes CSV, joined on 'tailnum'.
    weather_path : weather CSV, joined on month/day/hour/origin.
    cols_path    : pickle holding the column list the predictors are aligned to.

    Returns (x, ys):
        x  - one-hot encoded predictors restricted/padded to the pickled columns
        ys - DataFrame with 'dep_delay', 'delay_severity' and 'is_delayed'
    Both merges are inner joins, so flights without a weather or plane match
    are dropped; x and ys are taken after the merges and stay row-aligned.
    """
    flights = pd.read_csv(path)
    planes = pd.read_csv(planes_path)
    weather = pd.read_csv(weather_path)

    #Get columns from model fit from pkl
    #NOTE: unpickling can execute arbitrary code - only load files this project wrote
    with open(cols_path,'rb') as f:
        model_cols = list(pkl.load(f))

    #impute weather ('origin' is set aside and re-attached after imputation)
    weather_orig = weather['origin']
    weather = weather.drop(['wind_gust','origin','time_hour','year'],axis=1)
    #sample_posterior draws random values, so seed it to make reruns repeatable
    imputer = IterativeImputer(sample_posterior=True, random_state=764)
    weather = pd.DataFrame(imputer.fit_transform(weather), columns=weather.columns)
    weather.insert(0, 'origin', weather_orig)

    #impute airplanes: year from another plane of the same model, else the median
    planes = planes.drop('speed',axis=1)
    year_by_model = planes.groupby('model')['year'].first()
    planes['year'] = planes['year'].fillna(planes['model'].map(year_by_model))
    planes['year'] = planes['year'].fillna(planes['year'].median())


    ##### New variables #####
    #delay severity
    flights['delay_severity'] = categorize_delays(flights['dep_delay'])

    #existence of a delay (anything not labelled 'ontime' counts as delayed)
    flights['is_delayed'] = np.where(flights['delay_severity'] == 'ontime', 0, 1)

    #snowing category: combine both conditions first, then convert to 0/1
    weather['snowing'] = ((weather['precip'] > 0) & (weather['temp'] <= 32)).astype(int)

    #day of week + weekend category (F-M)
    flights['date'] = pd.to_datetime(flights[['year', 'month', 'day']])
    flights['day_of_week'] = flights['date'].dt.day_name()
    flights['is_weekend'] = flights['day_of_week'].isin(['Friday', 'Saturday', 'Sunday', 'Monday']).astype(int)

    #peak dates (Thanksgiving (11/28), Christmas, Memorial Day (5/27), July Fourth, and Labor Day(9/2)) pm 5 days
    #NOTE: the holiday dates are fixed to 2013
    peak_dates = pd.to_datetime(['2013-11-28', '2013-12-25', '2013-07-04', '2013-05-27', '2013-09-02'])
    peak_days = [day
                 for holiday in peak_dates
                 for day in pd.date_range(start=holiday - pd.Timedelta(days=5),
                                          end=holiday + pd.Timedelta(days=5))]
    flights['peak_week'] = flights['date'].isin(peak_days).astype(int)

    #peak times (6PM-9PM)
    flights['peak_time'] = flights['hour'].between(18, 21).astype(int)

    #prior airline, origin, and destination delays (takes 2 min to run)
    #'date' is rebuilt at minute resolution for the trailing windows below
    print('Getting new variables (1/3)')
    flights['date'] = pd.to_datetime(flights[['year', 'month', 'day', 'hour', 'minute']])
    flights['carrier_delay'] = _recent_delay_flag(flights, 'carrier')

    print('Getting new variables (2/3)')
    flights['origin_delay'] = _recent_delay_flag(flights, 'origin')

    print('Getting new variables (3/3)')
    flights['dest_delay'] = _recent_delay_flag(flights, 'dest')

    #number of flights leaving airport same day (group size instead of a per-row scan)
    flights['flight_volume'] = (
        flights.groupby(['origin', 'year', 'month', 'day'])['origin'].transform('size'))

    #create final dataset
    flights = pd.merge(flights, weather, on=['month', 'day', 'hour', 'origin'])

    #rename the plane's 'year' so it does not collide with the flight's 'year'
    planes['year_manufactured'] = planes['year']
    planes = planes.drop('year',axis=1)
    flights = pd.merge(flights, planes, on='tailnum')

    #responses
    ys = flights[['dep_delay', 'delay_severity', 'is_delayed']]

    flights = flights.drop(['arr_time', 'arr_delay', 'flight','date','tailnum','air_time',
                            'year', 'month', 'day', 'dest', 'dep_time',
                            'dep_delay', 'delay_severity', 'is_delayed'],axis=1)

    #predictors
    x = pd.get_dummies(flights,dtype=int)

    #match columns to original data
    x = match_cols(x, model_cols)

    return x,ys


#Function to input delay data and return predictions
def predict_delays(path, verbose = True):
    """Score a flights CSV with the saved first-stage (delayed / not delayed) model.

    path    : flights CSV, preprocessed through get_data.
    verbose : when True, print progress and a classification report of the
              predictions against the true 'is_delayed' response.

    Returns the array of model-1 predictions, one per row produced by get_data.
    """
    x_input, y_input = get_data(path)

    #NOTE: unpickling can execute arbitrary code - only load files this project wrote
    with open('model1.pkl','rb') as f:
        mod1 = pkl.load(f)
        mod1_cols = pkl.load(f)

    #the second-stage model lives in its own file (this used to re-read model1.pkl)
    #TODO: mod2 / mod2_cols are loaded but the severity stage is not applied here yet
    with open('model2.pkl','rb') as f:
        mod2 = pkl.load(f)
        mod2_cols = pkl.load(f)

    if verbose: print('Fitting model...')

    #align to the exact column order the model was saved with
    x_input = x_input.reindex(columns=mod1_cols, fill_value=0)

    preds = mod1.predict(x_input)

    #y_input holds three response columns; model 1 predicts 'is_delayed'
    if verbose: print(classification_report(y_input['is_delayed'],preds))

    return preds

In [91]:
#mod1

##### Data #####
data = pd.read_csv('flight_data_full.csv')
data = data.drop(['Unnamed: 0','air_time','year', 'month', 'day', 'dest', 'dep_time'],axis=1)

#data for first model: one-hot encoded predictors, binary is_delayed target
x = data.drop(['dep_delay', 'delay_severity', 'is_delayed'],axis=1)
x = pd.get_dummies(x,dtype=int)
y = data['is_delayed']
x_train1, x_test1, y_train1, y_test1 = train_test_split(x,y,train_size=.7,random_state=764)

#random_state added so the forest (and every number derived from it) is
#repeatable; it reuses the seed of the split above
rf = RandomForestClassifier(class_weight='balanced',ccp_alpha=0,max_depth=None,min_samples_split=2,
                            max_features='sqrt',n_estimators=200,random_state=764)

rf.fit(x_train1, y_train1)
y_pred = rf.predict(x_test1)
print(classification_report(y_test1, y_pred))

#re-label using a custom cut-off on the predicted probability of class 1
y_pred_prob = rf.predict_proba(x_test1)[:, 1]
threshold = 0.15 #original note: ".79 both" - presumably recall of both classes at this cut-off
y_pred_adjusted = (y_pred_prob >= threshold).astype(int)

print("Adjusted Threshold Classification Report:\n", classification_report(y_test1, y_pred_adjusted))


              precision    recall  f1-score   support

           0       0.89      0.99      0.93      5595
           1       0.82      0.35      0.49      1083

    accuracy                           0.88      6678
   macro avg       0.85      0.67      0.71      6678
weighted avg       0.88      0.88      0.86      6678

Adjusted Threshold Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.78      0.86      5595
           1       0.41      0.79      0.54      1083

    accuracy                           0.78      6678
   macro avg       0.68      0.79      0.70      6678
weighted avg       0.86      0.78      0.80      6678



In [90]:
##### mod 2 #####

#second stage is trained only on flights that really were delayed
data2 = data.loc[data['is_delayed'] == True]

#NOTE: x and y are rebound here, replacing the first-model versions
#target: True = 'majordelay', False = any other severity label
x = pd.get_dummies(
    data2.drop(columns=['dep_delay', 'delay_severity', 'is_delayed']),
    dtype=int,
)
y = data2['delay_severity'].eq('majordelay')

x_train2, x_test2, y_train2, y_test2 = train_test_split(x, y, train_size=.7, random_state=764)

dt = DecisionTreeClassifier(class_weight='balanced', max_depth=5, random_state=42)
dt.fit(x_train2, y_train2)

y_pred = dt.predict(x_test2)
print(classification_report(y_test2, y_pred))

              precision    recall  f1-score   support

       False       0.92      0.54      0.68       833
        True       0.30      0.82      0.44       203

    accuracy                           0.59      1036
   macro avg       0.61      0.68      0.56      1036
weighted avg       0.80      0.59      0.63      1036



In [None]:
#chain the stages on the held-out split: rows model 1 flags as delayed go to model 2
#reindex builds a fresh, aligned frame instead of inserting columns into a slice of x_test1
testing = x_test1.loc[y_pred_adjusted == 1].reindex(columns=x_train2.columns, fill_value=0)
delays_classified = dt.predict(testing)



In [106]:
# Store the thresholded model-1 predictions on the held-out rows of `data`
# (rows outside the test split keep NaN in this column).
data.loc[x_test1.index, 'is_delayed_pred'] = y_pred_adjusted

# Rows predicted as delayed are the input of Model 2
delayed_indices = data.index[(data['is_delayed_pred'] == 1).to_numpy()]
delayed_data = data.loc[delayed_indices]

# Model 2 features: drop the responses, one-hot encode, align to the training columns
x_mod2 = match_cols(
    pd.get_dummies(
        delayed_data.drop(columns=['dep_delay', 'delay_severity', 'is_delayed']),
        dtype=int,
    ),
    list(x_train2.columns),
)

# Model 2 predictions, written back next to the model-1 predictions
delayed_predictions = dt.predict(x_mod2)
data.loc[delayed_indices, 'delay_severity_pred'] = delayed_predictions


In [None]:
#Save each fitted model followed by its training column list (two pickles per file)
for pkl_path, fitted_model, train_cols in (
    ('model1.pkl', rf, x_train1.columns),
    ('model2.pkl', dt, x_train2.columns),
):
    with open(pkl_path,'wb') as f:
        pkl.dump(fitted_model,f)
        pkl.dump(list(train_cols),f)

In [125]:
# Run the preprocessing pipeline on the new flights file.
# NOTE: this rebinds `y` to the three-column response frame returned by get_data.
new_data, y = get_data(path='flights_set1.csv')


Getting new variables (1/3)
Getting new variables (2/3)
Getting new variables (3/3)


In [None]:
# Align features to the columns model 1 was trained on.
# (`x` is rebound by the model-2 cell, so it is not a reliable source of model-1 columns.)
aligned_new_data = new_data.reindex(columns=x_train1.columns, fill_value=0)

# Model 1 predictions, using the same probability cut-off as on the test split
y_new_mod1_pred_prob = rf.predict_proba(aligned_new_data)[:, 1]
y_new_mod1_pred = (y_new_mod1_pred_prob >= threshold).astype(int)

# Add Model 1 predictions to the aligned data
aligned_new_data['is_delayed_pred'] = y_new_mod1_pred

# Filter rows predicted as delayed
delayed_indices = aligned_new_data[aligned_new_data['is_delayed_pred'] == 1].index
delayed_data = aligned_new_data.loc[delayed_indices]

# Align delayed data for Model 2 (this also drops the is_delayed_pred column)
delayed_data_mod2 = delayed_data.reindex(columns=x_train2.columns, fill_value=0)

# Predict delay severity for delayed flights using Model 2
if not delayed_data_mod2.empty:
    delays_classified = dt.predict(delayed_data_mod2)
    aligned_new_data.loc[delayed_indices, 'delay_severity_pred'] = delays_classified
else:
    # nothing was flagged as delayed: keep the column so later cells can select it
    aligned_new_data['delay_severity_pred'] = None

# Inspect the results
print(aligned_new_data.head())

   distance  hour  minute  is_weekend  peak_week  peak_time  carrier_delay  \
0      1400    17       0           1          0          0              0   
1       200    21       0           0          0          1              0   
2      1085    15       0           1          0          0              0   
3      1065    16       0           1          1          0              0   
4      1068     6       0           1          0          0              0   

   origin_delay  dest_delay  flight_volume  ...  model_VANS AIRCRAFT RV6  \
0             0           0             14  ...                        0   
1             0           0             24  ...                        0   
2             0           0             19  ...                        0   
3             0           0             13  ...                        0   
4             0           0             19  ...                        0   

   model_ZODIAC 601HDS  engine_4 Cycle  engine_Reciprocating  \
0         

  aligned_new_data['is_delayed_pred'] = y_new_mod1_pred
  aligned_new_data.loc[delayed_indices, 'delay_severity_pred'] = delays_classified


In [None]:
true_y = pd.read_csv('flights_set1.csv')

# The two prediction columns that together determine the final label
guesses = aligned_new_data[['is_delayed_pred','delay_severity_pred']]

# Stage 1 decides on-time vs. delayed; stage 2 only matters for delayed rows
predicted_ontime = guesses['is_delayed_pred'] == 0
severity_label = np.where(guesses['delay_severity_pred'], 'majordelay', 'minordelay')
result = np.where(predicted_ontime, 'ontime', severity_label)

print(result)



['minordelay' 'minordelay' 'minordelay' ... 'ontime' 'ontime' 'ontime']


In [129]:

# Three-class report: severity labels returned by get_data vs. the combined two-stage labels
severity_report = classification_report(y['delay_severity'], result)
print(severity_report)

              precision    recall  f1-score   support

  majordelay       0.20      0.90      0.32       401
  minordelay       0.22      0.35      0.27      1724
      ontime       0.97      0.77      0.86     11778

    accuracy                           0.72     13903
   macro avg       0.46      0.67      0.48     13903
weighted avg       0.86      0.72      0.77     13903



In [None]:
# End-to-end check of the packaged prediction function on the new flights file
results = predict_delays(path='flights_set1.csv')