# Training set: 15 Dec 2018 - 31 Dec 2018

## Variable transformation, adding new features, models

In [1]:
import numpy as np
import pandas as pd


from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

In [2]:
train_set = pd.read_csv('../data/train_dec2018.csv')
train_set.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,fl_datetime
0,2018-12-22,UA,UA_CODESHARE,UA,3807,ZW,N467AW,3807,11721,FNT,...,,,,,,,,,,2018-12-22
1,2018-12-19,DL,DL,DL,2402,DL,N338DN,2402,10423,AUS,...,,,,,,,,,,2018-12-19
2,2018-12-23,UA,UA_CODESHARE,UA,4357,EV,N11544,4357,11267,DAY,...,,,,,,,,,,2018-12-23
3,2018-12-16,WN,WN,WN,4483,WN,N275WN,4483,14771,SFO,...,,,,,,,,,,2018-12-16
4,2018-12-18,AA,AA,AA,2539,AA,N812NN,2539,14122,PIT,...,,,,,,,,,,2018-12-18


In [3]:
train_set.shape

(40605, 43)

In [4]:
# train_set['arr_delay'].quantile(q=0.99) -> 184

# for training we will remove arr_delay > 200

train_set = train_set.drop(train_set.index[train_set['arr_delay'] > 200])
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40277 entries, 0 to 40604
Data columns (total 43 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   fl_date              40277 non-null  object 
 1   mkt_unique_carrier   40277 non-null  object 
 2   branded_code_share   40277 non-null  object 
 3   mkt_carrier          40277 non-null  object 
 4   mkt_carrier_fl_num   40277 non-null  int64  
 5   op_unique_carrier    40277 non-null  object 
 6   tail_num             40277 non-null  object 
 7   op_carrier_fl_num    40277 non-null  int64  
 8   origin_airport_id    40277 non-null  int64  
 9   origin               40277 non-null  object 
 10  origin_city_name     40277 non-null  object 
 11  dest_airport_id      40277 non-null  int64  
 12  dest                 40277 non-null  object 
 13  dest_city_name       40277 non-null  object 
 14  crs_dep_time         40277 non-null  int64  
 15  dep_time             40277 non-null 

In [5]:
# New column for flight date - datetime type

train_set['fl_datetime'] = pd.to_datetime(train_set['fl_date'])

In [6]:
# Drop columns

to_drop = ['mkt_unique_carrier', 'branded_code_share', 'mkt_carrier', 'mkt_carrier_fl_num', 
           'origin_city_name', 'dest', 'dest_city_name', 'dep_time', 
           'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'arr_time', 
           'cancelled', 'cancellation_code', 'diverted', 'dup', 'actual_elapsed_time', 
           'air_time', 'flights', 'carrier_delay', 'weather_delay', 'nas_delay', 
           'security_delay', 'late_aircraft_delay', 'first_dep_time', 'total_add_gtime', 
           'longest_add_gtime', 'no_name']

train_set.drop(to_drop, axis=1, inplace=True)

In [7]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40277 entries, 0 to 40604
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   fl_date            40277 non-null  object        
 1   op_unique_carrier  40277 non-null  object        
 2   tail_num           40277 non-null  object        
 3   op_carrier_fl_num  40277 non-null  int64         
 4   origin_airport_id  40277 non-null  int64         
 5   origin             40277 non-null  object        
 6   dest_airport_id    40277 non-null  int64         
 7   crs_dep_time       40277 non-null  int64         
 8   crs_arr_time       40277 non-null  int64         
 9   arr_delay          40277 non-null  float64       
 10  crs_elapsed_time   40277 non-null  float64       
 11  distance           40277 non-null  float64       
 12  fl_datetime        40277 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(5), object(4)
memory 

### crs_dep_time and crs_arr_time converted from hhmm format (int) to minutes (int)

In [8]:
from modules.data_prep import hhmm_to_min


# Add new columns (convert hhmm time into minutes)
train_set['crs_dep_time_min'] = train_set['crs_dep_time'].map(lambda x: hhmm_to_min(x))
train_set['crs_arr_time_min'] = train_set['crs_arr_time'].map(lambda x: hhmm_to_min(x))

In [9]:
# New columns - Log transformations (did not use them for models)

train_set['crs_elapsed_time_log'] = train_set['crs_elapsed_time'].apply(np.log1p)
train_set['distance_log'] = train_set['distance'].apply(np.log1p)
train_set['crs_dep_time_min_log'] = train_set['crs_dep_time_min'].apply(np.log1p)
train_set['crs_arr_time_min_log'] = train_set['crs_arr_time_min'].apply(np.log1p)

In [10]:
# Reset index

train_set.reset_index(drop=True, inplace=True)
train_set.shape

(40277, 19)

In [11]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40277 entries, 0 to 40276
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   fl_date               40277 non-null  object        
 1   op_unique_carrier     40277 non-null  object        
 2   tail_num              40277 non-null  object        
 3   op_carrier_fl_num     40277 non-null  int64         
 4   origin_airport_id     40277 non-null  int64         
 5   origin                40277 non-null  object        
 6   dest_airport_id       40277 non-null  int64         
 7   crs_dep_time          40277 non-null  int64         
 8   crs_arr_time          40277 non-null  int64         
 9   arr_delay             40277 non-null  float64       
 10  crs_elapsed_time      40277 non-null  float64       
 11  distance              40277 non-null  float64       
 12  fl_datetime           40277 non-null  datetime64[ns]
 13  crs_dep_time_min

### Day of week - one-hot encoded

In [12]:
# New column - Day of week (0 - Monday)
train_set['fl_dayofweek'] = train_set['fl_datetime'].dt.dayofweek

# One-hot encoding for 'fl_dayofweek'
enc_dayofweek = OneHotEncoder()
dayofweek_onehot = pd.DataFrame(enc_dayofweek.fit_transform(train_set[['fl_dayofweek']]).toarray())
dayofweek_onehot.columns = enc_dayofweek.get_feature_names_out(['fl_dayofweek'])
dayofweek_onehot.head(3)


Unnamed: 0,fl_dayofweek_0,fl_dayofweek_1,fl_dayofweek_2,fl_dayofweek_3,fl_dayofweek_4,fl_dayofweek_5,fl_dayofweek_6
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
dayofweek_onehot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40277 entries, 0 to 40276
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   fl_dayofweek_0  40277 non-null  float64
 1   fl_dayofweek_1  40277 non-null  float64
 2   fl_dayofweek_2  40277 non-null  float64
 3   fl_dayofweek_3  40277 non-null  float64
 4   fl_dayofweek_4  40277 non-null  float64
 5   fl_dayofweek_5  40277 non-null  float64
 6   fl_dayofweek_6  40277 non-null  float64
dtypes: float64(7)
memory usage: 2.2 MB


In [14]:
# Concatenate 2 df
train_set1 = pd.concat([train_set, dayofweek_onehot], axis=1)
train_set1.shape

(40277, 27)

### Airports (origin and dest) - ordinal encoding

In [15]:
# DF with origin airports ranked by percentage of departure delays (dep_delay > 0)

origin_airports = pd.read_csv('CSV Files/airport_features.csv', usecols=['origin_airport_id', 'Ranking'])
origin_airports.head(3)

Unnamed: 0,origin_airport_id,Ranking
0,12899,1
1,11525,2
2,14582,3


In [16]:
# Add columns with airport codes for origin and dest airports

train_set2 = pd.merge(train_set1, origin_airports, on='origin_airport_id', how='left')
train_set2.rename(columns={'Ranking': 'origin_airport_rank'}, inplace=True)

train_set3 = pd.merge(train_set2, origin_airports, 
                      left_on='dest_airport_id', right_on='origin_airport_id', 
                      how='left')
train_set3.rename(columns={'Ranking': 'dest_airport_rank'}, inplace=True)
train_set3.drop('origin_airport_id_y', axis=1, inplace=True)

train_set3.head(3)

Unnamed: 0,fl_date,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id_x,origin,dest_airport_id,crs_dep_time,crs_arr_time,arr_delay,...,fl_dayofweek,fl_dayofweek_0,fl_dayofweek_1,fl_dayofweek_2,fl_dayofweek_3,fl_dayofweek_4,fl_dayofweek_5,fl_dayofweek_6,origin_airport_rank,dest_airport_rank
0,2018-12-22,ZW,N467AW,3807,11721,FNT,13930,1645,1706,-1.0,...,5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,155,312
1,2018-12-19,DL,N338DN,2402,10423,AUS,10397,1220,1523,-3.0,...,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,323,321
2,2018-12-23,EV,N11544,4357,11267,DAY,11618,1135,1328,5.0,...,6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,173,334


### Carrier Features

In [17]:
# DataFrame with new carrier features
carriers = pd.read_csv('CSV Files/carrier_features.csv')

# Rename carrier features (add prefix c_)
carriers.columns = [f'c_{col}' for col in carriers.columns]
carriers.head(3)

Unnamed: 0,c_op_unique_carrier,c_avg_delay,c_avg_carrier_delay,c_total_flights,c_avg_delay_if_del,c_avg_carrier_delay_if_del,c_delayed_flights,c_percent_delayed
0,9E,3.788254,20.54492,502191,50.899457,20.54492,143729,28.62
1,9K,-1.413897,26.77027,1661,20.508046,26.77027,435,26.19
2,AA,6.209128,21.546057,1862100,38.495859,21.546057,686758,36.88


In [18]:
# Add columns with carrier features

train_set4 = pd.merge(train_set3, carriers, 
                      left_on='op_unique_carrier', right_on='c_op_unique_carrier', 
                      how='left')
train_set4.drop('c_op_unique_carrier', axis=1, inplace=True)

train_set4.head(3)

Unnamed: 0,fl_date,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id_x,origin,dest_airport_id,crs_dep_time,crs_arr_time,arr_delay,...,fl_dayofweek_6,origin_airport_rank,dest_airport_rank,c_avg_delay,c_avg_carrier_delay,c_total_flights,c_avg_delay_if_del,c_avg_carrier_delay_if_del,c_delayed_flights,c_percent_delayed
0,2018-12-22,ZW,N467AW,3807,11721,FNT,13930,1645,1706,-1.0,...,0.0,155,312,7.347722,23.122728,208050,55.110798,23.122728,64875,31.18
1,2018-12-19,DL,N338DN,2402,10423,AUS,10397,1220,1523,-3.0,...,0.0,323,321,0.464917,23.45528,1939694,36.608088,23.45528,545674,28.13
2,2018-12-23,EV,N11544,4357,11267,DAY,11618,1135,1328,5.0,...,1.0,173,334,11.460218,27.317656,335976,54.186518,27.317656,120632,35.9


In [19]:
train_set4_2 = train_set4.drop(train_set4.index[train_set4['c_percent_delayed'] > 95])

In [20]:
train_set4_2.shape

(40277, 36)

### Tail Number Features - Age of plane

In [21]:
# DataFrame with new carrier features
tail_num = pd.read_csv('CSV Files/tail_num_features.csv', usecols=['tail_num', 'age'])
tail_num.columns = ['tail_num', 'age_of_aircraft']
tail_num = tail_num.drop_duplicates(subset='tail_num', keep='last')

tail_num.head(3)

Unnamed: 0,tail_num,age_of_aircraft
0,8805,11
1,8809,11
2,N101HQ,13


In [22]:
#merge tail_num_features into df_train
train_set5 = pd.merge(train_set4_2, tail_num, how='left', on='tail_num')

#fill in nan with mean of column
train_set5['age_of_aircraft'] = train_set5['age_of_aircraft'].fillna(train_set5['age_of_aircraft'].mean())

#change age_of_aircraft to int
train_set5['age_of_aircraft'] = train_set5['age_of_aircraft'].astype(int)

train_set5.head(3)


Unnamed: 0,fl_date,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id_x,origin,dest_airport_id,crs_dep_time,crs_arr_time,arr_delay,...,origin_airport_rank,dest_airport_rank,c_avg_delay,c_avg_carrier_delay,c_total_flights,c_avg_delay_if_del,c_avg_carrier_delay_if_del,c_delayed_flights,c_percent_delayed,age_of_aircraft
0,2018-12-22,ZW,N467AW,3807,11721,FNT,13930,1645,1706,-1.0,...,155,312,7.347722,23.122728,208050,55.110798,23.122728,64875,31.18,18
1,2018-12-19,DL,N338DN,2402,10423,AUS,10397,1220,1523,-3.0,...,323,321,0.464917,23.45528,1939694,36.608088,23.45528,545674,28.13,2
2,2018-12-23,EV,N11544,4357,11267,DAY,11618,1135,1328,5.0,...,173,334,11.460218,27.317656,335976,54.186518,27.317656,120632,35.9,18


In [23]:
train_set5_2 = train_set5.drop(train_set5.index[train_set5['age_of_aircraft'] > 50])

In [24]:
train_set5_2.shape

(40276, 37)

### Flight number features

In [25]:
# DataFrame with new features related to flight numbers
flight_num = pd.read_csv('CSV Files/flight_num_features.csv')

# Rename carrier features (add prefix fn_)
flight_num.columns = ['op_carrier_fl_num', 'round'] + [f'fn_{col}' for col in flight_num.columns[2:]]

# Drop duplicates
flight_num = flight_num.drop_duplicates(subset=['op_carrier_fl_num','round'], keep='first')

flight_num.head(3)

Unnamed: 0,op_carrier_fl_num,round,fn_avg_sum_of_delays,fn_avg_del_total,fn_total_flights,fn_avg_del_if_delayed,fn_times_delayed,fn_percent_delayed
0,1,7,38.729885,-1.540519,1276,19.237986,437,34.25
1,1,8,38.44186,-12.23347,1725,20.782051,390,22.61
2,1,9,39.125,-5.8125,80,18.85,20,25.0


In [26]:
from modules.data_prep import add_round


# New column in training set - 'crs_dep_time' rounded to hour
train_set6  = add_round(train_set5_2).copy()
train_set6['round'] = train_set6['round'].astype(int)

# Add columns with flight number related features
train_set7 = pd.merge(train_set6, flight_num, 
                      on=['op_carrier_fl_num', 'round'], 
                      how='left')

train_set7.head(3)


Unnamed: 0,fl_date,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id_x,origin,dest_airport_id,crs_dep_time,crs_arr_time,arr_delay,...,c_delayed_flights,c_percent_delayed,age_of_aircraft,round,fn_avg_sum_of_delays,fn_avg_del_total,fn_total_flights,fn_avg_del_if_delayed,fn_times_delayed,fn_percent_delayed
0,2018-12-22,ZW,N467AW,3807,11721,FNT,13930,1645,1706,-1.0,...,64875,31.18,18,16,68.030769,5.180952,329.0,43.990741,108.0,32.83
1,2018-12-19,DL,N338DN,2402,10423,AUS,10397,1220,1523,-3.0,...,545674,28.13,2,12,58.934426,5.816794,395.0,25.144578,166.0,42.03
2,2018-12-23,EV,N11544,4357,11267,DAY,11618,1135,1328,5.0,...,120632,35.9,18,11,59.684211,-2.434043,246.0,37.075758,66.0,26.83


In [27]:
train_set7.shape

(40276, 44)

In [28]:
train_set8 = train_set7.drop(train_set7.index[train_set7['fn_avg_del_total'] > 200])

In [29]:
# Check the number of rows
print(train_set.shape)
print(train_set2.shape)
print(train_set3.shape)
print(train_set4.shape)
print(train_set5.shape)
print(train_set6.shape)
print(train_set7.shape)
print(train_set8.shape)

(40277, 20)
(40277, 28)
(40277, 29)
(40277, 36)
(40277, 37)
(40276, 38)
(40276, 44)
(40276, 44)


In [30]:
train_set8.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40276 entries, 0 to 40275
Data columns (total 44 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   fl_date                     40276 non-null  object        
 1   op_unique_carrier           40276 non-null  object        
 2   tail_num                    40276 non-null  object        
 3   op_carrier_fl_num           40276 non-null  int64         
 4   origin_airport_id_x         40276 non-null  int64         
 5   origin                      40276 non-null  object        
 6   dest_airport_id             40276 non-null  int64         
 7   crs_dep_time                40276 non-null  int64         
 8   crs_arr_time                40276 non-null  int64         
 9   arr_delay                   40276 non-null  float64       
 10  crs_elapsed_time            40276 non-null  float64       
 11  distance                    40276 non-null  float64   

In [31]:
# Remove rows with null values

train_set8.dropna(inplace=True)

In [32]:
train_set8.describe()

Unnamed: 0,op_carrier_fl_num,origin_airport_id_x,dest_airport_id,crs_dep_time,crs_arr_time,arr_delay,crs_elapsed_time,distance,crs_dep_time_min,crs_arr_time_min,...,c_delayed_flights,c_percent_delayed,age_of_aircraft,round,fn_avg_sum_of_delays,fn_avg_del_total,fn_total_flights,fn_avg_del_if_delayed,fn_times_delayed,fn_percent_delayed
count,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,...,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0
mean,2908.130689,12707.430809,12716.101036,1326.928468,1487.951449,2.533662,142.999378,796.797306,807.070884,904.442834,...,464614.353775,34.437001,12.89747,13.128672,65.25284,5.496133,305.756947,38.427981,105.405463,35.032647
std,1905.641014,1522.980423,1521.652775,488.912887,514.916821,31.980643,74.044971,599.352494,294.204918,309.205365,...,307315.556237,3.321089,6.3315,4.888873,26.820175,9.615432,246.6399,17.345376,91.783534,20.00195
min,1.0,10135.0,10135.0,3.0,1.0,-102.0,24.0,31.0,3.0,1.0,...,1580.0,28.13,0.0,0.0,0.0,-33.181818,1.0,0.0,0.0,0.0
25%,1245.0,11292.0,11292.0,919.0,1104.0,-15.0,90.0,354.0,559.0,664.0,...,178897.0,32.92,7.0,9.0,50.480263,-1.100168,105.0,27.6,34.0,26.99
50%,2563.0,12892.0,12892.0,1320.0,1515.0,-5.0,125.0,632.0,800.0,915.0,...,529793.0,35.75,14.0,13.0,62.069767,4.173729,244.0,36.173913,80.0,34.31
75%,4566.0,14057.0,14098.0,1730.0,1915.0,9.0,173.0,1034.0,1050.0,1155.0,...,686758.0,36.36,17.0,17.0,75.244874,10.552946,457.0,46.509106,154.0,41.78
max,9370.0,16218.0,16218.0,2359.0,2359.0,200.0,703.0,4983.0,1439.0,1439.0,...,970215.0,49.42,41.0,23.0,950.0,91.555556,1805.0,518.0,864.0,3300.0


In [33]:
train_set8.corr()['arr_delay']

op_carrier_fl_num             0.051539
origin_airport_id_x           0.003365
dest_airport_id              -0.006130
crs_dep_time                  0.121599
crs_arr_time                  0.088232
arr_delay                     1.000000
crs_elapsed_time             -0.064847
distance                     -0.040503
crs_dep_time_min              0.121602
crs_arr_time_min              0.088338
crs_elapsed_time_log         -0.055774
distance_log                 -0.026804
crs_dep_time_min_log          0.114056
crs_arr_time_min_log          0.030445
fl_dayofweek                  0.068125
fl_dayofweek_0               -0.074155
fl_dayofweek_1               -0.097292
fl_dayofweek_2               -0.031564
fl_dayofweek_3                0.115061
fl_dayofweek_4                0.153557
fl_dayofweek_5               -0.007928
fl_dayofweek_6               -0.047004
origin_airport_rank           0.066700
dest_airport_rank             0.042710
c_avg_delay                   0.074447
c_avg_carrier_delay      

In [34]:
train_set8.columns

Index(['fl_date', 'op_unique_carrier', 'tail_num', 'op_carrier_fl_num',
       'origin_airport_id_x', 'origin', 'dest_airport_id', 'crs_dep_time',
       'crs_arr_time', 'arr_delay', 'crs_elapsed_time', 'distance',
       'fl_datetime', 'crs_dep_time_min', 'crs_arr_time_min',
       'crs_elapsed_time_log', 'distance_log', 'crs_dep_time_min_log',
       'crs_arr_time_min_log', 'fl_dayofweek', 'fl_dayofweek_0',
       'fl_dayofweek_1', 'fl_dayofweek_2', 'fl_dayofweek_3', 'fl_dayofweek_4',
       'fl_dayofweek_5', 'fl_dayofweek_6', 'origin_airport_rank',
       'dest_airport_rank', 'c_avg_delay', 'c_avg_carrier_delay',
       'c_total_flights', 'c_avg_delay_if_del', 'c_avg_carrier_delay_if_del',
       'c_delayed_flights', 'c_percent_delayed', 'age_of_aircraft', 'round',
       'fn_avg_sum_of_delays', 'fn_avg_del_total', 'fn_total_flights',
       'fn_avg_del_if_delayed', 'fn_times_delayed', 'fn_percent_delayed'],
      dtype='object')

In [35]:
# Numeric features (will be scaled)

num_feat = ['crs_elapsed_time', 'distance', 'crs_dep_time_min', 'crs_arr_time_min',
            'origin_airport_rank','dest_airport_rank', 'c_avg_delay', 'c_avg_carrier_delay',
            'c_total_flights', 'c_avg_delay_if_del', 'c_avg_carrier_delay_if_del',
            'c_delayed_flights', 'c_percent_delayed', 'age_of_aircraft', 'round',
            'fn_avg_sum_of_delays', 'fn_avg_del_total', 'fn_total_flights',
            'fn_avg_del_if_delayed', 'fn_times_delayed', 'fn_percent_delayed']

In [36]:
# Standard scaling of numeric features

scaler_standard = StandardScaler()
df_scaled_standard = pd.DataFrame(scaler_standard.fit_transform(train_set8[num_feat].astype(float)))
df_scaled_standard.columns = train_set8[num_feat].columns
df_scaled_standard.head()

Unnamed: 0,crs_elapsed_time,distance,crs_dep_time_min,crs_arr_time_min,origin_airport_rank,dest_airport_rank,c_avg_delay,c_avg_carrier_delay,c_total_flights,c_avg_delay_if_del,...,c_delayed_flights,c_percent_delayed,age_of_aircraft,round,fn_avg_sum_of_delays,fn_avg_del_total,fn_total_flights,fn_avg_del_if_delayed,fn_times_delayed,fn_percent_delayed
0,-0.837331,-0.957374,0.672768,0.393133,-1.824396,0.391795,0.528777,0.603633,-1.311693,1.749491,...,-1.300762,-0.980715,0.805906,0.587326,0.103577,-0.032779,0.09424,0.32071,0.028268,-0.110123
1,-0.270101,0.027034,-0.227976,0.060016,0.54249,0.517882,-1.442034,0.674844,0.663977,-0.381722,...,0.26377,-1.8991,-1.721173,-0.230868,-0.235587,0.033349,0.36184,-0.765828,0.660198,0.349838
2,-0.405156,-0.440143,-0.380933,-0.311909,-1.570801,0.700007,1.706341,1.501916,-1.16574,1.643029,...,-1.119327,0.440523,0.805906,-0.435417,-0.207631,-0.824744,-0.242287,-0.07796,-0.429336,-0.410097
3,-0.715782,-0.638693,-0.63586,-0.628855,0.64111,0.952181,-0.558664,-0.763074,1.547357,-1.235778,...,1.645237,0.395357,0.016194,-0.639966,0.210463,0.238664,-0.517996,0.549965,-0.483812,-0.038129
4,0.688788,0.45083,1.128238,1.191964,-1.091788,0.742036,0.202753,0.266011,0.575448,-0.164281,...,0.722861,0.735611,-0.299691,1.200972,0.201698,0.359993,0.13884,0.47421,0.202594,0.071862


In [37]:
df_scaled_standard.shape

(40164, 21)

In [38]:
df_scaled_standard.describe()

Unnamed: 0,crs_elapsed_time,distance,crs_dep_time_min,crs_arr_time_min,origin_airport_rank,dest_airport_rank,c_avg_delay,c_avg_carrier_delay,c_total_flights,c_avg_delay_if_del,...,c_delayed_flights,c_percent_delayed,age_of_aircraft,round,fn_avg_sum_of_delays,fn_avg_del_total,fn_total_flights,fn_avg_del_if_delayed,fn_times_delayed,fn_percent_delayed
count,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,...,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0,40164.0
mean,-1.811092e-16,-7.202463e-17,1.797686e-16,-1.560681e-16,9.544313000000001e-17,-2.513523e-16,-4.982113e-16,-5.142286e-16,-1.765621e-16,8.622721000000001e-17,...,-1.2892340000000001e-17,1.159868e-15,6.289232e-16,-1.251641e-16,3.166364e-16,6.869374e-17,6.009976e-17,-2.135789e-16,7.563056e-17,1.80696e-16
std,1.000012,1.000012,1.000012,1.000012,1.000012,1.000012,1.000012,1.000012,1.000012,1.000012,...,1.000012,1.000012,1.000012,1.000012,1.000012,1.000012,1.000012,1.000012,1.000012,1.000012
min,-1.607143,-1.277724,-2.733064,-2.921858,-3.994041,-3.965204,-1.443994,-1.690326,-1.545415,-2.558413,...,-1.506725,-1.8991,-2.037058,-2.685452,-2.433006,-4.022537,-1.235651,-2.215487,-1.148428,-1.751483
25%,-0.7157817,-0.738802,-0.8432013,-0.777625,-0.5141555,-0.5048222,-0.5586637,-0.7630739,-0.9667288,-0.3817218,...,-0.9297314,-0.456784,-0.9314608,-0.8445144,-0.5508078,-0.6860205,-0.813978,-0.6242653,-0.7779865,-0.4020981
50%,-0.2430901,-0.2749623,-0.02403417,0.03414332,0.3875152,0.3917949,0.198003,0.05163238,0.2872445,-0.1642812,...,0.2120929,0.3953568,0.1741362,-0.02631978,-0.1186835,-0.1375311,-0.2503963,-0.1299537,-0.2768011,-0.03612926
75%,0.4051726,0.3957699,0.8257243,0.8103361,0.6974645,0.700007,0.4737443,0.6748438,0.6639769,0.657332,...,0.7228609,0.5790338,0.6479635,0.7918749,0.3725613,0.5259125,0.6132217,0.4659009,0.5294538,0.337339
max,7.563074,6.984629,2.147948,1.728831,1.275097,1.246383,5.09573,1.576323,1.547357,2.830288,...,1.645237,4.511528,4.438582,2.019167,32.98853,8.950247,6.078748,27.64875,8.265142,163.2345


In [39]:
df_scaled_standard.corr()

Unnamed: 0,crs_elapsed_time,distance,crs_dep_time_min,crs_arr_time_min,origin_airport_rank,dest_airport_rank,c_avg_delay,c_avg_carrier_delay,c_total_flights,c_avg_delay_if_del,...,c_delayed_flights,c_percent_delayed,age_of_aircraft,round,fn_avg_sum_of_delays,fn_avg_del_total,fn_total_flights,fn_avg_del_if_delayed,fn_times_delayed,fn_percent_delayed
crs_elapsed_time,1.0,0.975213,-0.025619,0.015563,0.1378,0.175973,-0.075099,-0.054112,0.072508,-0.043137,...,0.067642,0.021823,-0.152871,-0.025901,-0.051109,-0.083363,0.108966,-0.047793,0.119849,0.005028
distance,0.975213,1.0,-0.004985,0.019288,0.165718,0.166751,-0.088691,-0.079224,0.101945,-0.090959,...,0.101745,0.051811,-0.152223,-0.005613,-0.068376,-0.078164,0.107356,-0.067411,0.119275,0.013781
crs_dep_time_min,-0.025619,-0.004985,1.0,0.687172,0.08297,-0.080123,0.002547,-0.006967,0.003637,-0.005559,...,0.004334,0.010126,0.000365,0.998221,0.022355,0.4646,-0.072125,0.238639,0.106178,0.285297
crs_arr_time_min,0.015563,0.019288,0.687172,1.0,0.087031,-0.05394,-0.008451,0.006422,0.004905,0.002121,...,0.000845,-0.019085,-0.013932,0.687681,0.033235,0.3936,-0.025035,0.210767,0.132385,0.216726
origin_airport_rank,0.1378,0.165718,0.08297,0.087031,1.0,0.013022,-0.066823,-0.169959,0.271753,-0.152298,...,0.288255,0.108053,-0.079403,0.083659,-0.122814,0.071551,-0.025113,-0.085274,0.028087,0.086819
dest_airport_rank,0.175973,0.166751,-0.080123,-0.05394,0.013022,1.0,-0.077369,-0.169856,0.269975,-0.15804,...,0.28555,0.100461,-0.082397,-0.079706,-0.025405,0.022292,-0.008237,-0.017779,0.008508,0.033645
c_avg_delay,-0.075099,-0.088691,0.002547,-0.008451,-0.066823,-0.077369,1.0,0.298113,-0.430664,0.732556,...,-0.353023,0.668566,-0.048385,0.00211,0.168124,0.202044,0.08492,0.216985,0.118372,0.049912
c_avg_carrier_delay,-0.054112,-0.079224,-0.006967,0.006422,-0.169959,-0.169856,0.298113,1.0,-0.06136,0.50221,...,-0.111014,-0.172995,0.109108,-0.006627,0.13128,0.045726,-0.020948,0.124921,-0.02042,-0.024375
c_total_flights,0.072508,0.101945,0.003637,0.004905,0.271753,0.269975,-0.430664,-0.06136,1.0,-0.524743,...,0.987534,-0.100972,0.092716,0.004094,-0.093102,-0.042135,-0.191838,-0.101305,-0.186187,0.010578
c_avg_delay_if_del,-0.043137,-0.090959,-0.005559,0.002121,-0.152298,-0.15804,0.732556,0.50221,-0.524743,1.0,...,-0.530524,0.029288,0.037983,-0.004787,0.208911,0.112362,0.078038,0.242409,0.079831,-0.015923


In [43]:
# DF with one-hot encoded day of week

df_onehot = train_set8[['fl_dayofweek_0', 'fl_dayofweek_1', 'fl_dayofweek_2', 'fl_dayofweek_3', 
                       'fl_dayofweek_4', 'fl_dayofweek_5', 'fl_dayofweek_6']].copy()
df_onehot.reset_index(drop=True, inplace=True)
df_onehot.head(3)

Unnamed: 0,fl_dayofweek_0,fl_dayofweek_1,fl_dayofweek_2,fl_dayofweek_3,fl_dayofweek_4,fl_dayofweek_5,fl_dayofweek_6
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [44]:
df_onehot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40164 entries, 0 to 40163
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   fl_dayofweek_0  40164 non-null  float64
 1   fl_dayofweek_1  40164 non-null  float64
 2   fl_dayofweek_2  40164 non-null  float64
 3   fl_dayofweek_3  40164 non-null  float64
 4   fl_dayofweek_4  40164 non-null  float64
 5   fl_dayofweek_5  40164 non-null  float64
 6   fl_dayofweek_6  40164 non-null  float64
dtypes: float64(7)
memory usage: 2.1 MB


In [46]:
# DF with both scaled and one-hot encoded features

df_prepared = pd.concat([df_scaled_standard, df_onehot], axis=1)
df_prepared.head(3)

Unnamed: 0,crs_elapsed_time,distance,crs_dep_time_min,crs_arr_time_min,origin_airport_rank,dest_airport_rank,c_avg_delay,c_avg_carrier_delay,c_total_flights,c_avg_delay_if_del,...,fn_avg_del_if_delayed,fn_times_delayed,fn_percent_delayed,fl_dayofweek_0,fl_dayofweek_1,fl_dayofweek_2,fl_dayofweek_3,fl_dayofweek_4,fl_dayofweek_5,fl_dayofweek_6
0,-0.837331,-0.957374,0.672768,0.393133,-1.824396,0.391795,0.528777,0.603633,-1.311693,1.749491,...,0.32071,0.028268,-0.110123,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.270101,0.027034,-0.227976,0.060016,0.54249,0.517882,-1.442034,0.674844,0.663977,-0.381722,...,-0.765828,0.660198,0.349838,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.405156,-0.440143,-0.380933,-0.311909,-1.570801,0.700007,1.706341,1.501916,-1.16574,1.643029,...,-0.07796,-0.429336,-0.410097,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [47]:
df_prepared.shape

(40164, 28)

In [79]:
df_prepared.to_csv('train_prepared_dec2018.csv', index=False)

### Features and target

In [49]:
# Arrays X_train and y_train

X_train = df_prepared.values
y_train = train_set8['arr_delay'].values

## Modelling

In [51]:
# Function to evaluate the model and compute the RMSE

from sklearn.metrics import mean_squared_error

def compute_rmse(model, X, y):
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    rmse = np.sqrt(mse)
    return rmse



# Function to display the cross-validation scores

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

## Linear Regression (28 features)

In [52]:
# Train a Linear Regression model

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Training error (RMSE) for the Linear Regression model
lin_rmse = compute_rmse(lin_reg, X_train, y_train)
print("RMSE (training):", lin_rmse)

# R-squared
lin_r2 = lin_reg.score(X_train, y_train)
print("R-squared (training):", lin_r2)

RMSE (training): 30.09987921334408
R-squared (training): 0.11413828472671683


In [53]:
# Evaluation using cross-validation

from sklearn.model_selection import cross_val_score

lin_scores = cross_val_score(lin_reg, X_train, y_train, scoring="neg_mean_squared_error", cv=5)

lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [29.89221437 30.33311762 29.58339474 29.89731278 31.2170237 ]
Mean: 30.184612643051317
Standard deviation: 0.5687578129891071


## Lasso Regression (28 features)

In [54]:
# GridSearchCV for the Lasso Regression model

from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

lasso_reg = Lasso(tol=0.05, random_state=42)

param_grid = [{'alpha': [0.001, 0.01, 0.1, 1, 10]}]

grid_search_lasso = GridSearchCV(lasso_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search_lasso.fit(X_train, y_train)

# The best parameter and the corresponding mean RMSE score (for cross-validation)
print(grid_search_lasso.best_params_)
print(np.sqrt(-grid_search_lasso.best_score_))

{'alpha': 0.01}
30.185691839771994


In [60]:
# The best Lasso model
best_lasso = grid_search_lasso.best_estimator_

# Training error
lasso_rmse = compute_rmse(best_lasso, X_train, y_train)
print(lasso_rmse)

# R-squared
lasso_r2 = best_lasso.score(X_train, y_train)
print("R-squared (training):", lasso_r2)

30.10192194223637
R-squared (training): 0.11401804260142667


In [56]:
best_lasso.coef_

array([-13.38807324,  11.71864325,   0.79604251,   0.0801481 ,
         1.20215677,   1.56443177,   0.67975701,   0.        ,
        -7.89851355,  -0.03958595,   0.        ,   7.71059172,
         0.66131835,   0.55924944,   0.        ,  -1.06929887,
         6.31185875,  -0.13994402,  -1.12283819,  -0.        ,
         0.0808501 ,  -2.38488724,  -6.2906988 ,   0.        ,
        11.96463496,  15.15221748,   1.75124143,  -0.91756195])

In [57]:
best_lasso.intercept_

0.05531244309617733

In [59]:
sorted(zip(best_lasso.coef_, df_prepared.columns), reverse=True)

[(15.152217482521207, 'fl_dayofweek_4'),
 (11.964634956669638, 'fl_dayofweek_3'),
 (11.718643250844295, 'distance'),
 (7.71059171835983, 'c_delayed_flights'),
 (6.311858750361537, 'fn_avg_del_total'),
 (1.7512414315660314, 'fl_dayofweek_5'),
 (1.5644317731425275, 'dest_airport_rank'),
 (1.2021567709189573, 'origin_airport_rank'),
 (0.7960425097410015, 'crs_dep_time_min'),
 (0.6797570056971616, 'c_avg_delay'),
 (0.6613183473234763, 'c_percent_delayed'),
 (0.5592494436897218, 'age_of_aircraft'),
 (0.08085009864097782, 'fn_percent_delayed'),
 (0.08014809600286889, 'crs_arr_time_min'),
 (0.0, 'round'),
 (-0.0, 'fn_times_delayed'),
 (0.0, 'fl_dayofweek_2'),
 (0.0, 'c_avg_carrier_delay_if_del'),
 (0.0, 'c_avg_carrier_delay'),
 (-0.0395859530933704, 'c_avg_delay_if_del'),
 (-0.13994401619101884, 'fn_total_flights'),
 (-0.9175619538420284, 'fl_dayofweek_6'),
 (-1.0692988655414781, 'fn_avg_sum_of_delays'),
 (-1.1228381879886689, 'fn_avg_del_if_delayed'),
 (-2.384887238535924, 'fl_dayofweek_0'),

In [70]:
# Choose more important features from Lasso results
features_from_lasso = ['fl_dayofweek_4', 'fl_dayofweek_3', 'c_delayed_flights', 
             'fn_avg_del_total', 'fl_dayofweek_5', 'dest_airport_rank', 
             'origin_airport_rank', 'crs_dep_time_min','c_avg_delay',
             'c_percent_delayed', 'age_of_aircraft', 'fl_dayofweek_6', 
             'fn_avg_sum_of_delays', 'fn_avg_del_if_delayed', 
             'fl_dayofweek_0', 'fl_dayofweek_1', 'c_total_flights', 
             'crs_elapsed_time']


# Check correlations between features
df_prepared[features_from_lasso].corr()

# Highly correlated: c_delayed_flights and c_total_flights (0.99)
# fn_avg_del_total and fn_avg_del_if_delayed (0.68)
# c_avg_delay and c_percent_delayed (0.67)
# fn_avg_sum_of_delays and fn_avg_del_if_delayed (0.84)

Unnamed: 0,fl_dayofweek_4,fl_dayofweek_3,c_delayed_flights,fn_avg_del_total,fl_dayofweek_5,dest_airport_rank,origin_airport_rank,crs_dep_time_min,c_avg_delay,c_percent_delayed,age_of_aircraft,fl_dayofweek_6,fn_avg_sum_of_delays,fn_avg_del_if_delayed,fl_dayofweek_0,fl_dayofweek_1,c_total_flights,crs_elapsed_time
fl_dayofweek_4,1.0,-0.144712,0.003689,0.015208,-0.170882,-0.004815,-5.3e-05,0.006163,0.003574,-0.003566,-0.003702,-0.181589,-0.000142,0.003302,-0.168952,-0.133298,0.004011,0.009643
fl_dayofweek_3,-0.144712,1.0,-0.001025,0.012311,-0.169826,0.00764,-0.003177,0.007309,-0.001305,-0.003668,0.001349,-0.180467,0.008427,0.010776,-0.167908,-0.132474,-0.000711,0.001432
c_delayed_flights,0.003689,-0.001025,1.0,-0.020184,-0.007764,0.28555,0.288255,0.004334,-0.353023,0.032433,0.060798,0.001742,-0.093089,-0.095291,-0.000592,0.001764,0.987534,0.067642
fn_avg_del_total,0.015208,0.012311,-0.020184,1.0,0.005642,0.022292,0.071551,0.4646,0.202044,0.162032,0.011369,0.01542,0.411239,0.62894,-0.031667,-0.008569,-0.042135,-0.083363
fl_dayofweek_5,-0.170882,-0.169826,-0.007764,0.005642,1.0,0.007011,0.000326,0.001706,0.010478,0.002284,0.002033,-0.213104,0.003543,0.004541,-0.198273,-0.156431,-0.008005,0.008366
dest_airport_rank,-0.004815,0.00764,0.28555,0.022292,0.007011,1.0,0.013022,-0.080123,-0.077369,0.100461,-0.082397,0.005496,-0.025405,-0.017779,-0.008888,0.002976,0.269975,0.175973
origin_airport_rank,-5.3e-05,-0.003177,0.288255,0.071551,0.000326,0.013022,1.0,0.08297,-0.066823,0.108053,-0.079403,-0.002034,-0.122814,-0.085274,0.001549,0.004784,0.271753,0.1378
crs_dep_time_min,0.006163,0.007309,0.004334,0.4646,0.001706,-0.080123,0.08297,1.0,0.002547,0.010126,0.000365,0.020424,0.022355,0.238639,-0.037723,0.002969,0.003637,-0.025619
c_avg_delay,0.003574,-0.001305,-0.353023,0.202044,0.010478,-0.077369,-0.066823,0.002547,1.0,0.668566,-0.048385,-0.006916,0.168124,0.216985,0.003637,0.002039,-0.430664,-0.075099
c_percent_delayed,-0.003566,-0.003668,0.032433,0.162032,0.002284,0.100461,0.108053,0.010126,0.668566,1.0,-0.161915,-0.008362,0.007641,0.047541,0.01625,0.006157,-0.100972,0.021823


In [73]:
# Reduce number of features to 15

features2 = ['fl_dayofweek_4', 'fl_dayofweek_3', 'c_delayed_flights', 
             'fn_avg_del_total', 'fl_dayofweek_5', 'dest_airport_rank', 
             'origin_airport_rank', 'crs_dep_time_min', 
             'c_percent_delayed', 'age_of_aircraft', 'fl_dayofweek_6',   
             'fl_dayofweek_0', 'fl_dayofweek_1', 'crs_elapsed_time', 
             'fn_percent_delayed']

# New X_train
X_train2 = df_prepared[features2].values

In [78]:
len(features2)

15

## Linear Regression (15 features)

In [74]:
# Train a Linear Regression model

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train2, y_train)

# Training error (RMSE) for the Linear Regression model
lin_rmse = compute_rmse(lin_reg, X_train2, y_train)
print("RMSE (training):", lin_rmse)

# R-squared
lin_r2 = lin_reg.score(X_train2, y_train)
print("R-squared (training):", lin_r2)

RMSE (training): 30.274760510666404
R-squared (training): 0.10381460920883834


In [75]:
# Evaluation using cross-validation

from sklearn.model_selection import cross_val_score

lin_scores = cross_val_score(lin_reg, X_train, y_train, scoring="neg_mean_squared_error", cv=5)

lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores: [29.89221437 30.33311762 29.58339474 29.89731278 31.2170237 ]
Mean: 30.184612643051317
Standard deviation: 0.5687578129891071


## Random Forest Regressor (28 features)

In [62]:
# Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=28, max_depth=5, random_state=42)
forest_reg.fit(X_train, y_train)

# Training error
forest_rmse = compute_rmse(forest_reg, X_train, y_train)
print(forest_rmse)

# R-squared
forest_r2 = forest_reg.score(X_train, y_train)
print("R-squared (training):", forest_r2)

29.941181310267282
R-squared (training): 0.12345485298830938


In [63]:
# Cross-validation

forest_scores = cross_val_score(forest_reg, X_train, y_train,
                                scoring="neg_mean_squared_error", cv=5)

forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [30.01579563 30.4217756  29.61812399 30.05874872 30.99697878]
Mean: 30.222284544210062
Standard deviation: 0.4635384437474098


In [64]:
# Tuning the Random Forest using grid search

param_grid_rf = [
    {'n_estimators': [25, 30, 35], 'max_depth': [3, 5]}
  ]

rf_reg = RandomForestRegressor(random_state=42)

grid_search_rf = GridSearchCV(rf_reg, param_grid_rf, cv=5,
                              scoring='neg_mean_squared_error', 
                              return_train_score=True)

grid_search_rf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_depth': [3, 5], 'n_estimators': [25, 30, 35]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [65]:
# The best parameters

grid_search_rf.best_params_

{'max_depth': 5, 'n_estimators': 35}

In [66]:
# The RMSEs and parameters
# For the parameters {'max_depth': 5, 'n_estimators': 35}: the RMSE is approximately 30.23

cvres = grid_search_rf.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

30.665521835387153 {'max_depth': 3, 'n_estimators': 25}
30.669167806625158 {'max_depth': 3, 'n_estimators': 30}
30.67057996908184 {'max_depth': 3, 'n_estimators': 35}
30.22951654449727 {'max_depth': 5, 'n_estimators': 25}
30.22675294238862 {'max_depth': 5, 'n_estimators': 30}
30.22621734433504 {'max_depth': 5, 'n_estimators': 35}


In [67]:
# Feature importances
feature_importances = grid_search_rf.best_estimator_.feature_importances_

# Sorted feature importances
sorted(zip(feature_importances, df_prepared.columns), reverse=True)

[(0.32965193847319646, 'fn_avg_del_total'),
 (0.208059655335528, 'fl_dayofweek_4'),
 (0.1780999348038997, 'fl_dayofweek_3'),
 (0.06060394012126225, 'c_percent_delayed'),
 (0.0519865517796944, 'fn_percent_delayed'),
 (0.025567230171371132, 'c_delayed_flights'),
 (0.0197637559955449, 'dest_airport_rank'),
 (0.01852686929991707, 'origin_airport_rank'),
 (0.015130842948328133, 'crs_dep_time_min'),
 (0.013309943594072273, 'fl_dayofweek_1'),
 (0.009920348142546309, 'fn_avg_sum_of_delays'),
 (0.009624792861402137, 'crs_arr_time_min'),
 (0.009484144823830537, 'c_avg_carrier_delay_if_del'),
 (0.009279722471192136, 'c_total_flights'),
 (0.008436191794345129, 'c_avg_delay_if_del'),
 (0.006833070199133288, 'c_avg_carrier_delay'),
 (0.005173204942215467, 'fn_avg_del_if_delayed'),
 (0.004987882694655571, 'crs_elapsed_time'),
 (0.004098037951050005, 'c_avg_delay'),
 (0.0032210462630938223, 'age_of_aircraft'),
 (0.0027540145149839218, 'distance'),
 (0.0026972518523190312, 'fn_total_flights'),
 (0.0019

In [68]:
# The best Random Forest model
best_rf = grid_search_rf.best_estimator_

# Training error
rf_rmse = compute_rmse(best_rf, X_train, y_train)
print(rf_rmse)

# R-squared
rf_r2 = best_rf.score(X_train, y_train)
print("R-squared (training):", rf_r2)

29.944254131993326
R-squared (training): 0.12327492654138228


## Random Forest (15 features)

In [76]:
# Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=35, max_depth=5, random_state=42)
forest_reg.fit(X_train2, y_train)

# Training error
forest_rmse = compute_rmse(forest_reg, X_train2, y_train)
print(forest_rmse)

# R-squared
forest_r2 = forest_reg.score(X_train2, y_train)
print("R-squared (training):", forest_r2)

29.96846526763013
R-squared (training): 0.12185661831970251


In [77]:
# Cross-validation

forest_scores = cross_val_score(forest_reg, X_train2, y_train,
                                scoring="neg_mean_squared_error", cv=5)

forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [30.02957013 30.45089333 29.60908208 30.06877209 30.97486167]
Mean: 30.226635857973214
Standard deviation: 0.4594043561295737
