In [1]:
import pandas as pd
import numpy as np

In [199]:
df=pd.read_csv('flights_rand-copy1.csv')
df.shape

(10000, 42)

In [200]:
df_flights=df[['fl_date', 'air_time','mkt_unique_carrier', 'branded_code_share', 'mkt_carrier','mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num', 
              'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name', 'dest_airport_id', 'dest', 'dest_city_name', 
              'crs_dep_time','dep_delay','crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance','arr_delay','carrier_delay', 
              'weather_delay', 'nas_delay', 'security_delay','late_aircraft_delay','cancelled','taxi_in', 'taxi_out','diverted']].copy()

In [201]:
df_flights.isnull().sum()

fl_date                   0
air_time                202
mkt_unique_carrier        0
branded_code_share        0
mkt_carrier               0
mkt_carrier_fl_num        0
op_unique_carrier         0
tail_num                 35
op_carrier_fl_num         0
origin_airport_id         0
origin                    0
origin_city_name          0
dest_airport_id           0
dest                      0
dest_city_name            0
crs_dep_time              0
dep_delay               162
crs_arr_time              0
dup                       0
crs_elapsed_time          0
flights                   0
distance                  0
arr_delay               199
carrier_delay          8128
weather_delay          8128
nas_delay              8128
security_delay         8128
late_aircraft_delay    8128
cancelled                 0
taxi_in                 180
taxi_out                171
diverted                  0
dtype: int64

In [202]:
#Dealing with missing values

carr_delay_mean=df_flights['carrier_delay'].astype('float').mean(axis=0)
wea_delay_mean=df_flights['weather_delay'].astype('float').mean(axis=0)
nas_delay_mean=df_flights['nas_delay'].astype('float').mean(axis=0)
sec_delay_mean=df_flights['security_delay'].astype('float').mean(axis=0)
aircraft_delay_mean=df_flights['late_aircraft_delay'].astype('float').mean(axis=0)


df_flights['carrier_delay'].replace(np.nan, carr_delay_mean, inplace=True)
df_flights['weather_delay'].replace(np.nan, wea_delay_mean, inplace=True)
df_flights['nas_delay'].replace(np.nan, nas_delay_mean, inplace=True)
df_flights['security_delay'].replace(np.nan, sec_delay_mean, inplace=True)
df_flights['late_aircraft_delay'].replace(np.nan, aircraft_delay_mean, inplace=True)

df_flights=df_flights.dropna()

# Feature Engineering

In [203]:
df_flights.head()

Unnamed: 0,fl_date,air_time,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,...,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,cancelled,taxi_in,taxi_out,diverted
0,2018-07-27,43.0,DL,DL_CODESHARE,DL,5799,CP,N615CZ,5799,14869,...,-2.0,19.652778,4.308761,16.317842,0.152778,26.074786,0.0,5.0,18.0,0.0
1,2018-10-08,91.0,UA,UA_CODESHARE,UA,6129,YV,N88325,6129,12266,...,20.0,20.0,0.0,0.0,0.0,0.0,0.0,2.0,22.0,0.0
2,2018-10-31,36.0,WN,WN,WN,809,WN,N7841A,809,13232,...,-19.0,19.652778,4.308761,16.317842,0.152778,26.074786,0.0,6.0,11.0,0.0
3,2019-10-15,221.0,AA,AA,AA,1830,AA,N556UW,1830,12889,...,-2.0,19.652778,4.308761,16.317842,0.152778,26.074786,0.0,15.0,20.0,0.0
4,2018-06-19,187.0,AS,AS_CODESHARE,AS,3300,OO,N176SY,3300,14831,...,-16.0,19.652778,4.308761,16.317842,0.152778,26.074786,0.0,5.0,14.0,0.0


In [204]:
# Converting date and time columns
df_flights['fl_date'] = pd.to_datetime(df_flights['fl_date'], errors='coerce')
df_flights['month'] = df_flights['fl_date'].dt.month
df_flights['day_of_week'] = df_flights['fl_date'].dt.dayofweek
df_flights['day_of_month'] = df_flights['fl_date'].dt.day
df_flights['year'] = df_flights['fl_date'].dt.year

In [205]:
df_flights = df_flights.drop('fl_date', axis = 1)

In [206]:
#Converting to air_time and arr_delay to category
def flight_duration(x):
    if x <=180:
        return 'Short'
    elif x >180 and x<360:
        return 'Medium'
    elif x>360:
        return 'Long'

df_flights['flight_duration_type']=df_flights['air_time'].apply(lambda x: flight_duration(x))
df_flights['flight_duration_type'].value_counts()

Short     8463
Medium    1295
Long        38
Name: flight_duration_type, dtype: int64

In [207]:
# Separating State name from city
df_flights['origin_state']=df_flights['origin_city_name'].apply(lambda x: x.split(', ')[1])
df_flights['destination_state']=df_flights['dest_city_name'].apply(lambda x: x.split(', ')[1])

In [208]:
# Finding the mean of departure and arrival delays based on different carriers
df_flights['dep_delay_mean_by_carrier']=df_flights.groupby(['op_unique_carrier'])['dep_delay'].transform(np.mean)
df_flights['arr_delay_mean_by_carrier']=df_flights.groupby(['op_unique_carrier'])['arr_delay'].transform(np.mean)

In [209]:
df_flights.columns

Index(['air_time', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time',
       'dep_delay', 'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights',
       'distance', 'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay',
       'security_delay', 'late_aircraft_delay', 'cancelled', 'taxi_in',
       'taxi_out', 'diverted', 'month', 'day_of_week', 'day_of_month', 'year',
       'flight_duration_type', 'origin_state', 'destination_state',
       'dep_delay_mean_by_carrier', 'arr_delay_mean_by_carrier'],
      dtype='object')

In [210]:
# Finding the mean of different delay types based on different carriers
df_flights['weather_delay_mean_by_carrier']=df_flights.groupby(['op_unique_carrier'])['weather_delay'].transform(np.mean)
df_flights['security_delay_mean_by_carrier']=df_flights.groupby(['op_unique_carrier'])['security_delay'].transform(np.mean)
df_flights['carrier_delay_mean_by_carrier']=df_flights.groupby(['op_unique_carrier'])['carrier_delay'].transform(np.mean)
df_flights['late_aircraft_delay_mean_by_carrier']=df_flights.groupby(['op_unique_carrier'])['late_aircraft_delay'].transform(np.mean)
df_flights['nas_delay_mean_by_carrier']=df_flights.groupby(['op_unique_carrier'])['nas_delay'].transform(np.mean)

In [211]:
#make the types categories
df_flights["mkt_carrier"] = df_flights["mkt_carrier"].astype("category")
df_flights["op_unique_carrier"] = df_flights["op_unique_carrier"].astype("category")
df_flights["tail_num"] = df_flights["tail_num"].astype("category")
df_flights["op_carrier_fl_num"] = df_flights["op_carrier_fl_num"].astype("category")
df_flights["origin_airport_id"] = df_flights["origin_airport_id"].astype("category")
df_flights["dest_airport_id"] = df_flights["dest_airport_id"].astype("category")
df_flights["mkt_carrier_fl_num"] = df_flights["mkt_carrier_fl_num"].astype("category")

In [212]:
# ENCODE AIRPORTS AND TAILNUM
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
df_flights['mkt_carrier'] = encoder.fit_transform(df_flights[['mkt_carrier']])
df_flights['mkt_unique_carrier'] = encoder.fit_transform(df_flights[['mkt_unique_carrier']])
df_flights['mkt_carrier_fl_num'] = encoder.fit_transform(df_flights[['mkt_carrier_fl_num']])
df_flights['op_unique_carrier'] = encoder.fit_transform(df_flights[['op_unique_carrier']])
df_flights['tail_num'] = encoder.fit_transform(df_flights[['tail_num']])
df_flights['op_carrier_fl_num'] = encoder.fit_transform(df_flights[['op_carrier_fl_num']])
df_flights['origin_airport_id'] = encoder.fit_transform(df_flights[['origin_airport_id']])
df_flights['dest_airport_id'] = encoder.fit_transform(df_flights[['dest_airport_id']])

In [213]:
df_flights.isnull().sum()

air_time                               0
mkt_unique_carrier                     0
branded_code_share                     0
mkt_carrier                            0
mkt_carrier_fl_num                     0
op_unique_carrier                      0
tail_num                               0
op_carrier_fl_num                      0
origin_airport_id                      0
origin                                 0
origin_city_name                       0
dest_airport_id                        0
dest                                   0
dest_city_name                         0
crs_dep_time                           0
dep_delay                              0
crs_arr_time                           0
dup                                    0
crs_elapsed_time                       0
flights                                0
distance                               0
arr_delay                              0
carrier_delay                          0
weather_delay                          0
nas_delay       

In [214]:
# Converting categorical data by getting dummies


dummy_durtypes=pd.get_dummies(df_flights[['flight_duration_type']])

df_flights=pd.concat([df_flights, dummy_durtypes], axis=1).drop(['flight_duration_type'], axis=1)

In [223]:
df_flights=df_flights.drop(['mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
                            'mkt_carrier_fl_num', 'op_unique_carrier','op_carrier_fl_num', 
                            'origin', 'origin_city_name','dup','dest', 'dest_city_name',
                            'year', 'origin_state', 'destination_state','carrier_delay', 
                            'weather_delay', 'nas_delay','security_delay', 'late_aircraft_delay', 
                            'cancelled','diverted',
                           ], axis=1)

In [224]:
df_flights

Unnamed: 0,air_time,tail_num,origin_airport_id,dest_airport_id,crs_dep_time,dep_delay,crs_arr_time,crs_elapsed_time,flights,distance,...,dep_delay_mean_by_carrier,arr_delay_mean_by_carrier,weather_delay_mean_by_carrier,security_delay_mean_by_carrier,carrier_delay_mean_by_carrier,late_aircraft_delay_mean_by_carrier,nas_delay_mean_by_carrier,flight_duration_type_Long,flight_duration_type_Medium,flight_duration_type_Short
0,43.0,2211.0,287.0,130.0,1100,-4.0,1204,64.0,1.0,188.0,...,11.600000,10.043478,3.297139,0.116908,18.142995,29.439836,16.999740,0,0,1
1,91.0,3726.0,146.0,82.0,955,21.0,1051,116.0,1.0,667.0,...,12.891892,9.324324,4.958387,0.122694,20.968254,28.820678,15.772630,0,0,1
2,36.0,3035.0,190.0,78.0,1640,-2.0,1850,70.0,1.0,228.0,...,10.197954,3.136582,3.825787,0.182636,19.046355,26.018101,14.793534,0,0,1
3,221.0,1986.0,166.0,56.0,750,0.0,1508,258.0,1.0,1916.0,...,10.648601,6.359266,3.864474,0.119792,19.645578,25.666157,16.052539,0,1,0
4,187.0,429.0,284.0,17.0,938,-6.0,1514,216.0,1.0,1476.0,...,8.499504,4.539148,5.548190,0.126583,19.578516,25.583272,16.969986,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,69.0,3732.0,146.0,171.0,1445,-10.0,1619,94.0,1.0,429.0,...,12.891892,9.324324,4.958387,0.122694,20.968254,28.820678,15.772630,0,0,1
9996,69.0,1466.0,295.0,151.0,1400,0.0,1532,92.0,1.0,400.0,...,1.155039,1.108527,3.775484,0.126723,17.696490,23.410869,15.775264,0,0,1
9997,173.0,3266.0,235.0,132.0,855,9.0,1527,212.0,1.0,1488.0,...,10.648601,6.359266,3.864474,0.119792,19.645578,25.666157,16.052539,0,0,1
9998,80.0,522.0,82.0,31.0,1650,44.0,1740,110.0,1.0,562.0,...,10.197954,3.136582,3.825787,0.182636,19.046355,26.018101,14.793534,0,0,1


In [226]:
df_flights.dtypes

air_time                               float64
tail_num                               float64
origin_airport_id                      float64
dest_airport_id                        float64
crs_dep_time                             int64
dep_delay                              float64
crs_arr_time                             int64
crs_elapsed_time                       float64
flights                                float64
distance                               float64
arr_delay                              float64
taxi_in                                float64
taxi_out                               float64
month                                    int64
day_of_week                              int64
day_of_month                             int64
dep_delay_mean_by_carrier              float64
arr_delay_mean_by_carrier              float64
weather_delay_mean_by_carrier          float64
security_delay_mean_by_carrier         float64
carrier_delay_mean_by_carrier          float64
late_aircraft

# Modeling

In [260]:
X=df_flights.drop('dep_delay', axis=1)
y=df_flights['dep_delay']

In [261]:
# Spliting into Train and Test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.25, random_state=100, shuffle=True)

In [262]:
#Random Forest Regressors
from sklearn.ensemble import RandomForestRegressor

clf=RandomForestRegressor(n_estimators=40, max_depth=6, random_state=0)
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)


In [263]:

from sklearn.metrics import mean_absolute_error
print('Mean Absolue Error: ', mean_absolute_error(y_test, y_pred))

from sklearn.metrics import mean_squared_error
print('Mean Squared Error: ', mean_squared_error(y_test, y_pred))

from sklearn.metrics import r2_score
print('R2_score: ', r2_score(y_test, y_pred))

Mean Absolue Error:  5.476428141999186
Mean Squared Error:  76.80533743922796
R2_score:  0.9558792480994297


In [264]:
# Logistic Regression

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [265]:

clf=LogisticRegression()
clf.fit(X_train, y_train)
y_pred=clf.predict(X_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [259]:

print('Mean Absolue Error: ', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error: ', mean_squared_error(y_test, y_pred))
print('R2_score: ', r2_score(y_test, y_pred))


Mean Absolue Error:  1.1656373910893074e-13
Mean Squared Error:  2.3205206998678408e-26
R2_score:  1.0


In [266]:
#Linear Regression
from sklearn.linear_model import LinearRegression

clf=LinearRegression()
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

In [267]:

print('Mean Absolue Error: ', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error: ', mean_squared_error(y_test, y_pred))
print('R2_score: ', r2_score(y_test, y_pred))

Mean Absolue Error:  1.1656373910893074e-13
Mean Squared Error:  2.3205206998678408e-26
R2_score:  1.0


In [268]:
# Naives Bayes, GaussianNB Naive Bayes
from sklearn.naive_bayes import GaussianNB

clf=GaussianNB()
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

In [269]:

print('Mean Absolue Error: ', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error: ', mean_squared_error(y_test, y_pred))
print('R2_score: ', r2_score(y_test, y_pred))

Mean Absolue Error:  15.526530612244898
Mean Squared Error:  831.1404081632653
R2_score:  0.522552195384525
