<a href="https://colab.research.google.com/github/jrickey24/FlightDelayPrediction/blob/main/FlightDelayPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Import From GitHub Repo Raw Text
csv_url_1 = 'https://raw.githubusercontent.com/jrickey24/MLDatasets/main/Airlines.csv' # Original File With All Airlines
csv_url_2 = 'https://raw.githubusercontent.com/jrickey24/MLDatasets/main/AirlinesRevised.csv' # AA, AS, DL, UA, US Airlines Only
csv_url_3 = 'https://raw.githubusercontent.com/jrickey24/MLDatasets/main/Airlines_AA_DL_UA.csv' # AA, DL, UA Airlines Only
csv_url_4 = 'https://raw.githubusercontent.com/jrickey24/MLDatasets/main/Airlines_OG.csv' # OG Full Data Set
df = pd.read_csv(csv_url_4)
# Removing id & Flight(number)as these aren't useful classifiers
# Removing Time(duration of flight) & Length(distance of flight)
# The Relationship Between the AirportFrom & AirportTo Correspond to Time and Length
#df.drop(['id', 'Flight', 'Length', 'Time'], axis='columns', inplace=True)
df.drop(['id', 'Flight'], axis='columns', inplace=True)
df.drop_duplicates()
#df.head(10) # Read First 10 Rows From CSV File

Unnamed: 0,Airline,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,CO,SFO,IAH,3,15,205,1
1,US,PHX,CLT,3,15,222,1
2,AA,LAX,DFW,3,20,165,1
3,AA,SFO,DFW,3,20,195,1
4,AS,ANC,SEA,3,30,202,0
...,...,...,...,...,...,...,...
539363,DL,SLC,JFK,5,1425,264,1
539365,US,KOA,PHX,5,1425,349,1
539367,UA,HNL,LAX,5,1428,333,0
539374,DL,LAX,ATL,5,1435,255,0


In [None]:
details = df.apply(lambda x : True
            if x['Airline'] == 'AA' and x['AirportFrom'] == '' else False, axis = 1

In [None]:
column_one = 'Airline'
column_two = 'Delay'
column_sv_one = 'DL'
column_sv_two= 1
search_conditions = 2

if search_conditions == 1:
  details = df.apply(lambda x : True
            if x[column_one] == column_sv_one else False, axis = 1)
else:
  details = df.apply(lambda x : True
            if x[column_one] == column_sv_one and x[column_two] == column_sv_two else False, axis = 1)   

num_rows = len(details[details == True].index)

if search_conditions == 1: 
   print('Number of Rows in dataframe in which ' + column_one + ' is ' + str(column_sv_one) + ': ', num_rows )
else:
  print('Number of Rows in dataframe in which ' + column_one + ' is ' + str(column_sv_one) + ' and ' + column_two + ' is ' + str(column_sv_two) + ': ', num_rows )

Number of Rows in dataframe in which Airline is DL and Delay is 1:  27452


Number of Rows in dataframe in which Airline is AA:  45656

Number of Rows in dataframe in which Airline is DL:  60940

Number of Rows in dataframe in which Airline is UA:  27619

Number of Rows in dataframe in which DayOfWeek is 1:  17965

Number of Rows in dataframe in which DayOfWeek is 2:  17665

Number of Rows in dataframe in which DayOfWeek is 3:  22226

Number of Rows in dataframe in which DayOfWeek is 4:  22502

Number of Rows in dataframe in which DayOfWeek is 5:  21229

Number of Rows in dataframe in which DayOfWeek is 6:  15124

Number of Rows in dataframe in which DayOfWeek is 7:  17504

Number of Rows in dataframe in which DayOfWeek is 7 and Airline is AA:  5969

Number of Rows in dataframe in which DayOfWeek is 7 and Airline is DL:  7952

Number of Rows in dataframe in which Airline is AA and Delay is 0:  27920

Number of Rows in dataframe in which Airline is AA and Delay is 1:  17736

Number of Rows in dataframe in which Airline is DL and Delay is 0:  33488

Number of Rows in dataframe in which Airline is DL and Delay is 1:  27452




In [None]:
#print(df['Airline'].value_counts()['AA']) # 45,656 Records For American Airlines Flights
#print(df['Airline'].value_counts()['DL']) # 60,940 Records For Delta Airlines Flight
#print(df['Delay'].value_counts()[0])  # 80,081 Non-Delayed Flights
#print(df['Delay'].value_counts()[1])  # 54,134 Delayed Flights
#print(df['DayOfWeek'].value_counts().unique) # Unique Days of Week 1-7

45656


In [3]:
# Transform DayOfWeek Value To String Representation
df['DayOfWeek'] = df['DayOfWeek'].replace([1,2,3,4,5,6,7],['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])
#print(df['DayOfWeek'].value_counts().unique)
#print(df['AirportFrom'].value_counts().unique)

In [3]:
common_airports_list = ['ATL','DFW','DEN','ORD','LAX','CLT','LAS','PHX','MCO','SEA','MIA','IAH','JFK','FLL','EWR','SFO']
common_airports_df = df[df['AirportFrom'].isin(common_airports_list)]
common_airports_df.head(10)
common_airports_df.count #75916
count_delayed = common_airports_df[common_airports_df['Delay'].isin([1])]
count_delayed.count #31996

df_to_use = common_airports_df
#df_to_use = df


In [5]:
# Check if NaN values present in any input columns
df.columns[df.isna().any()]

Index([], dtype='object')

In [4]:
# Encode Non-numeric Classifiers for Calculations
airline_dummies = pd.get_dummies(df_to_use.Airline)
airport_from_dummies = pd.get_dummies(df_to_use.AirportFrom)
#airport_to_dummies = pd.get_dummies(df_to_use.AirportTo)
#day_of_week_dummies = pd.get_dummies(df_to_use.DayOfWeek)

# Concat Dummies
#model_input_x = pd.concat([df_to_use,airline_dummies], axis='columns')
#model_input_x = pd.concat([df_to_use,airline_dummies,airport_from_dummies], axis='columns')
model_input_x = pd.concat([df_to_use,airline_dummies,airport_from_dummies], axis='columns')
#model_input_x = pd.concat([df_to_use,airline_dummies,airport_from_dummies,day_of_week_dummies], axis='columns')
#model_input_x = pd.concat([df_to_use,airline_dummies,airport_from_dummies,airport_to_dummies,day_of_week_dummies], axis='columns')
# Drop the Plain Text Version of the Columns for Modeling
model_input_x.drop(['Airline', 'AirportFrom', 'AirportTo'], axis='columns', inplace=True)
#model_input_x.drop(['Airline', 'AirportFrom', 'AirportTo', 'DayOfWeek', 'Saturday', 'Sunday'], axis='columns', inplace=True)
#model_input_x.drop(['Airline', 'AirportFrom', 'AirportTo', 'DayOfWeek'], axis='columns', inplace=True)
target_y = df_to_use['Delay'] # Set Delay As Target Value Y
model_input_x.drop(['Delay'], axis='columns', inplace=True) # Drop Delay Column From Model Input X
model_input_x.head(10)

Unnamed: 0,DayOfWeek,Time,Length,9E,AA,AS,B6,CO,DL,EV,...,IAH,JFK,LAS,LAX,MCO,MIA,ORD,PHX,SEA,SFO
0,3,15,205,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,3,15,222,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,20,165,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,3,20,195,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,3,30,181,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
6,3,30,220,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
7,3,30,228,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
8,3,35,216,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
9,3,40,200,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
11,3,50,212,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
seed = 42
X_train, X_test, y_train, y_test = train_test_split(model_input_x, target_y, test_size=0.20, random_state=seed)

print(f'X_train : {X_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'X_test : {X_test.shape}')
print(f'y_test : {y_test.shape}')

X_train : (187572, 37)
y_train : (187572,)
X_test : (46894, 37)
y_test : (46894,)


In [13]:
ext_gradient_boost_model = XGBClassifier(eta=0.3, gama=0, scale_pos_weight=1.15) # defaul eta = 0.3, default gama = 0 (controls model overfitting) default scale_pos_weight = 1
ext_gradient_boost_model.fit(X_train, y_train)

XGBClassifier(eta=0.3, gama=0, scale_pos_weight=1.15)

In [15]:
y_pred = ext_gradient_boost_model.predict(X_test)

print(f'Train Accuracy -: {ext_gradient_boost_model.score(X_train, y_train):.3f}')
print(f'Test Accuracy -: {ext_gradient_boost_model.score(X_test, y_test):.3f}')
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))

Train Accuracy -: 0.630
Test Accuracy -: 0.635
ACCURACY OF THE MODEL:  0.6348146884462831


In [6]:
random_forest_model = RandomForestClassifier(n_estimators=100)
random_forest_model.fit(X_train, y_train)

RandomForestClassifier()

In [7]:
#random_forest_model.oob_score_
y_pred = random_forest_model.predict(X_test)
print(f'Train Accuracy -: {random_forest_model.score(X_train, y_train):.3f}')
print(f'Test Accuracy -: {random_forest_model.score(X_test, y_test):.3f}')
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))
#random_forest_model.feature_importances_

Train Accuracy -: 0.806
Test Accuracy -: 0.602
ACCURACY OF THE MODEL:  0.6016121465432678


In [8]:
with open('rf1_model_pkl', 'wb') as files:
    pickle.dump(random_forest_model, files)

# New Section

In [None]:
from imblearn.under_sampling import RandomUnderSampler

random_under_sampler = RandomUnderSampler(sampling_strategy=1)
x = np.array(model_input_x)
y = target_y
x_res, y_res = random_under_sampler.fit_resample(x,y)

In [None]:
# Create Logistic Regression Model
log_reg_model = LogisticRegression(solver='liblinear')
#log_reg_model.fit(model_input_x, target_y)
#log_reg_model.score(model_input_x, target_y)
log_reg_model.fit(x_res, y_res)
log_reg_model.score(x_res, y_res)

0.5381922740342543

In [None]:
#print(classification_report(target_y, log_reg_model.predict(model_input_x)))
print(classification_report(y_res, log_reg_model.predict(x_res)))

              precision    recall  f1-score   support

           0       0.58      0.41      0.48     31996
           1       0.54      0.70      0.61     31996

    accuracy                           0.55     63992
   macro avg       0.56      0.55      0.55     63992
weighted avg       0.56      0.55      0.55     63992



In [None]:
print(pd.DataFrame(x_res).head)

<bound method NDFrame.head of         0    1    2    3    4    5    6    7    8    9    ...  601  602  603  \
0         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
1         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
2         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
3         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
4         0    0    0    0    0    1    0    0    0    0  ...    0    0    0   
...     ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
480523    0    0    0    1    0    0    0    0    0    0  ...    0    0    0   
480524    0    0    0    1    0    0    0    0    0    0  ...    0    0    0   
480525    0    0    0    1    0    0    0    0    0    0  ...    0    0    0   
480526    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
480527    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   

        6