following:https://github.com/aakashgoel12/Flight-Delay-Prediction/blob/master/code/flightDelay_v1.ipynb

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import svm
import datetime
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
#ingest data
flights = pd.read_csv('flight_predictions.csv')

In [3]:
flights.head()

In [6]:
#convert to minutes
def convert_to_datetime(value):
    return datetime.datetime.strptime(str(value), '%H:%M:%S')
def convert_to_minute(value):
    return int(value.total_seconds() / 60)
    
#both datetime but want an int
# elapsed time = arrival_time - departure_time
#need to convert to datetime, subtract then convert to minutes 
flights['ELAPSED_TIME'] = flights.apply(
    lambda row: convert_to_minute(convert_to_datetime(row['ARRIVAL_TIME'])-convert_to_datetime(row['DEPARTURE_TIME'])) if np.isnan(row['ELAPSED_TIME']) else row['ELAPSED_TIME'],
    axis=1
)
#both datetime but want an int
# arrival_delay = arrival_time - scheduled arrival
flights['ARRIVAL_DELAY'] = flights.apply(
    lambda row: convert_to_minute(convert_to_datetime(row['ARRIVAL_TIME'])-convert_to_datetime(row['SCHEDULED_ARRIVAL'])) if np.isnan(row['ARRIVAL_DELAY']) else row['ARRIVAL_DELAY'],
    axis=1
)
#all int
# air time = elapsed_time - taxi_out- taxi_in
flights['AIR_TIME'] = flights.apply(
    lambda row: row['ELAPSED_TIME']-row['TAXI_OUT']-row['TAXI_IN'] if np.isnan(row['AIR_TIME']) else row['AIR_TIME'],
    axis=1
)

#make sure my dataset is clean
print(flights['DEPARTURE_TIME'].isnull().sum())
print(flights.isnull().values.any())
print(flights.isnull().sum())

0
False
DAY_OF_WEEK                           0
SCHEDULED_DEPARTURE                   0
DEPARTURE_TIME                        0
DEPARTURE_DELAY                       0
TAXI_OUT                              0
SCHEDULED_TIME                        0
ELAPSED_TIME                          0
AIR_TIME                              0
DISTANCE                              0
TAXI_IN                               0
SCHEDULED_ARRIVAL                     0
ARRIVAL_TIME                          0
ARRIVAL_DELAY                         0
DATE                                  0
weekend                               0
holiday                               0
long_flight                           0
airline_avg_arrival_delay             0
airline_avg_departure_delay           0
source_airport_avg_departure_delay    0
destination_airport_avg_delay         0
dtype: int64


In [8]:

def get_minutes(value):
    value = convert_to_datetime(value)
    hour = value.hour
    minute = value.minute
    return hour*60+minute

#get minutes
flights['ARRIVAL_TIME'] = flights.apply(
    lambda row: get_minutes(row['ARRIVAL_TIME']),
    axis=1
)

#get minutes
flights['DEPARTURE_TIME'] = flights.apply(
    lambda row: get_minutes(row['DEPARTURE_TIME']),
    axis=1
)

#get minutes
flights['SCHEDULED_ARRIVAL'] = flights.apply(
    lambda row: get_minutes(row['SCHEDULED_ARRIVAL']),
    axis=1
)

#get minutes
flights['SCHEDULED_DEPARTURE'] = flights.apply(
    lambda row: get_minutes(row['SCHEDULED_DEPARTURE']),
    axis=1
)

In [10]:
#save to csv
flights.to_csv('flight_predictions_final.csv', index = False, header=True)

In [11]:
#get the target variable
y = np.where(flights['ARRIVAL_DELAY']>10, 1, 0)

#only need a subset
flights = flights.iloc[:,:10000] 

#split data
X_train, X_test, y_train, y_test = train_test_split(flights, y, test_size=0.2, random_state=0)

In [13]:

#set parameters
parameters = {'kernel':['linear'], 'C':[1], 'max_iter': [1000]}
#parameters = {'kernel':('linear', 'rbf'), 'C':[0.5, 1, 10, 50],'degree':['3', '4','5'], 'max_iter': ['500','1000','2000']}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)


In [17]:
print("Best estimator found by grid search:")
print(clf.best_estimator_)
#print("Best parameter (CV score=%0.3f):" % search.best_score_)
#print(search.best_params_)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Best estimator found by grid search:
SVC(C=1, kernel='linear', max_iter=1000)
              precision    recall  f1-score   support

           0       0.96      0.98      0.97    891183
           1       0.93      0.84      0.88    254131

    accuracy                           0.95   1145314
   macro avg       0.94      0.91      0.93   1145314
weighted avg       0.95      0.95      0.95   1145314

[[875105  16078]
 [ 39942 214189]]
