Repeating earlier models but without categorical features

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from data_cleaning import clean_flights_df, clean_passengers_df, avg_passengers, clean_fuel_df, avg_carrier_arr_delay, avg_fuel_use, avg_taxi_time


### Import datasets

In [3]:
flights_df = clean_flights_df('data/flights_sample.csv')
flights_df.head()



Unnamed: 0,mkt_unique_carrier,op_unique_carrier,origin,dest,crs_dep_time,dep_time,dep_delay,taxi_out,taxi_in,crs_arr_time,arr_time,arr_delay,crs_elapsed_time,actual_elapsed_time,distance,year,month,day_of_month,day_of_week,dep_hour
0,UA,UA,SFO,LAX,630,627.0,-3.0,16.0,13.0,816,753.0,-23.0,106.0,86.0,337,2019,7,17,2,6
1,WN,WN,BWI,BOS,1340,1400.0,20.0,36.0,5.0,1510,1609.0,59.0,90.0,129.0,369,2018,6,9,5,13
2,B6,B6,BDL,MCO,600,551.0,-9.0,16.0,9.0,906,845.0,-21.0,186.0,174.0,1050,2019,11,6,2,6
3,WN,WN,LAS,OMA,950,958.0,8.0,16.0,2.0,1425,1417.0,-8.0,155.0,139.0,1099,2019,12,24,1,9
4,WN,WN,MCO,MDW,2045,2042.0,-3.0,11.0,5.0,2235,2231.0,-4.0,170.0,169.0,990,2018,1,14,6,20


In [4]:
flights_df = flights_df.sample(frac=.33, random_state=13)
flights_df.reset_index(inplace=True, drop=True)
flights_df.shape

(1544903, 20)

In [5]:
passengers_df = clean_passengers_df('data/passengers.csv')
fuel_df = clean_fuel_df('data/fuel_consumption.csv')

### Train Test Split and Feature Engineering

In [6]:
X_train, X_test, y_train, y_test = train_test_split(flights_df, flights_df['arr_delay'], test_size=.3, random_state=13)
print(X_test.shape, y_test.shape)
print(X_train.shape, y_train.shape)

(463471, 20) (463471,)
(1081432, 20) (1081432,)


In [7]:
# add average passengers and avg fuel use columns

X_train = avg_passengers(X_train, passengers_df)
X_train = avg_fuel_use(X_train, fuel_df)

In [8]:
# create average taxi in and out columns.
taxi_dep_mean_dict = round(X_train.groupby(X_train['dep_time']//100)['taxi_out'].mean(), 2).to_dict()
taxi_arr_mean_dict = round(X_train.groupby(X_train['arr_time']//100)['taxi_in'].mean(), 2).to_dict()

X_train['arr_hour'] = X_train['arr_time'] // 100
X_train['taxi_dep_mean'] = X_train['dep_hour'].map(taxi_dep_mean_dict)
X_train['taxi_arr_mean'] = X_train['arr_hour'].map(taxi_arr_mean_dict)
X_train.drop('arr_hour', axis=1, inplace=True)

In [9]:
# create avg_carrier_delay column
avg_carrier_arr_delay_dict = round(X_train.groupby('op_unique_carrier')['arr_delay'].mean(), 2).to_dict()
X_train['avg_carrier_arr_delay'] = X_train['op_unique_carrier'].map(avg_carrier_arr_delay_dict)

In [11]:
X_train.columns.tolist()

['mkt_unique_carrier',
 'op_unique_carrier',
 'origin',
 'dest',
 'crs_dep_time',
 'dep_time',
 'dep_delay',
 'taxi_out',
 'taxi_in',
 'crs_arr_time',
 'arr_time',
 'arr_delay',
 'crs_elapsed_time',
 'actual_elapsed_time',
 'distance',
 'year',
 'month',
 'day_of_month',
 'day_of_week',
 'dep_hour',
 'monthly_avg_passengers',
 'avg_monthly_fuel_gallons',
 'avg_monthly_fuel_cost',
 'taxi_dep_mean',
 'taxi_arr_mean',
 'avg_carrier_arr_delay']

In [10]:
# drop columns from train data that won't be used in model
cat_cols = X_train.select_dtypes(exclude=np.number).columns.tolist()
cat_cols

['mkt_unique_carrier', 'op_unique_carrier', 'origin', 'dest']

In [12]:
X_train.drop(columns=cat_cols, inplace=True)

#### Add Engineered features to test data

In [13]:
X_test = avg_passengers(X_test, passengers_df)
X_test = avg_fuel_use(X_test, fuel_df)

In [14]:
# map average delays and taxi times from training data onto test data
X_test['taxi_dep_mean'] = X_test['dep_hour'].map(taxi_dep_mean_dict)

X_test['arr_hour'] = X_test['crs_arr_time'] // 100
X_test['taxi_arr_mean'] = X_test['arr_hour'].map(taxi_arr_mean_dict)
X_test.drop('arr_hour', axis=1, inplace=True)

In [15]:
X_test['avg_carrier_arr_delay'] = X_test['op_unique_carrier'].map(avg_carrier_arr_delay_dict)

In [16]:
X_test.drop(columns=cat_cols, inplace=True)

In [17]:
# check if engineered features created any null values
X_train.isna().sum().sum()

1

In [18]:
X_train[X_train.isna().any(axis=1)]

Unnamed: 0,crs_dep_time,dep_time,dep_delay,taxi_out,taxi_in,crs_arr_time,arr_time,arr_delay,crs_elapsed_time,actual_elapsed_time,...,month,day_of_month,day_of_week,dep_hour,monthly_avg_passengers,avg_monthly_fuel_gallons,avg_monthly_fuel_cost,taxi_dep_mean,taxi_arr_mean,avg_carrier_arr_delay
436696,623,812.0,109.0,18.0,7.0,819,1012.0,113.0,116.0,120.0,...,11,24,5,6,,19589589.0,33303519.0,16.77,7.85,11.07


In [19]:
X_train.dropna(inplace=True)
y_train.drop(index=436696, inplace=True)

In [20]:
X_test.isna().sum().sum()

1

In [21]:
X_test[X_test.isna().any(axis=1)]

Unnamed: 0,crs_dep_time,dep_time,dep_delay,taxi_out,taxi_in,crs_arr_time,arr_time,arr_delay,crs_elapsed_time,actual_elapsed_time,...,month,day_of_month,day_of_week,dep_hour,monthly_avg_passengers,avg_monthly_fuel_gallons,avg_monthly_fuel_cost,taxi_dep_mean,taxi_arr_mean,avg_carrier_arr_delay
205109,904,1033.0,89.0,10.0,6.0,1110,1247.0,97.0,126.0,134.0,...,11,28,2,9,,19589589.0,33303519.0,18.49,7.29,11.07


In [22]:
X_test.dropna(inplace=True)
y_test.drop(index=205109, inplace=True)

In [23]:
print(X_test.shape, y_test.shape)
print(X_train.shape, y_train.shape)

(463470, 22) (463470,)
(1081431, 22) (1081431,)


In [24]:
X_test.drop(columns=['dep_time', 'dep_delay', 'taxi_in', 'taxi_out', 'arr_time', 'arr_delay', 'actual_elapsed_time'], inplace=True)
X_train.drop(columns=['dep_time', 'dep_delay', 'taxi_in', 'taxi_out', 'arr_time', 'arr_delay', 'actual_elapsed_time'], inplace=True)

In [21]:
X_train.head()

Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,year,month,day_of_month,day_of_week,dep_hour,monthly_avg_passengers,avg_monthly_fuel_gallons,avg_monthly_fuel_cost,taxi_dep_mean,taxi_arr_mean,avg_carrier_arr_delay
0,1916,2122,126.0,585,2019,5,9,3,19,2752.0,296278877.0,527459997.0,17.54,7.97,6.02
1,1950,2112,82.0,335,2018,10,14,6,19,2761.0,300692722.0,524396865.0,17.54,7.97,7.5
2,1105,1700,235.0,1620,2019,8,22,3,11,5910.0,21472000.0,36842373.0,17.31,7.72,11.07
3,516,555,39.0,102,2018,2,26,0,5,7369.0,18977858.0,32022703.0,16.05,7.04,0.74
4,2210,20,130.0,802,2018,6,6,2,22,3182.0,178747802.0,342753536.0,17.12,7.17,3.61


### Scale Numeric Features

In [25]:
X_train.reset_index(inplace=True, drop=True)
X_train.head()

Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,year,month,day_of_month,day_of_week,dep_hour,monthly_avg_passengers,avg_monthly_fuel_gallons,avg_monthly_fuel_cost,taxi_dep_mean,taxi_arr_mean,avg_carrier_arr_delay
0,1916,2122,126.0,585,2019,5,9,3,19,2752.0,296278877.0,527459997.0,17.54,7.97,6.02
1,1950,2112,82.0,335,2018,10,14,6,19,2761.0,300692722.0,524396865.0,17.54,7.97,7.5
2,1105,1700,235.0,1620,2019,8,22,3,11,5910.0,21472000.0,36842373.0,17.31,7.72,11.07
3,516,555,39.0,102,2018,2,26,0,5,7369.0,18977858.0,32022703.0,16.05,7.04,0.74
4,2210,20,130.0,802,2018,6,6,2,22,3182.0,178747802.0,342753536.0,17.12,7.17,3.61


In [26]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [27]:
X_test.reset_index(inplace=True, drop=True)

In [28]:
X_test_scaled = scaler.transform(X_test)

## Model Training

### RidgeCV

In [29]:
from sklearn.linear_model import RidgeCV

rcv = RidgeCV()

rcv.fit(X_train_scaled, y_train)

In [30]:
y_pred = rcv.predict(X_test_scaled)

In [31]:
print('RMSE: ', mean_squared_error(y_test, y_pred, squared=False))
print('R^2: ', r2_score(y_test, y_pred))

RMSE:  49.412532664649156
R^2:  0.010185577073144825


### AdaBoost

In [32]:
from sklearn.ensemble import AdaBoostRegressor

abr = AdaBoostRegressor()
abr.fit(X_train_scaled, y_train)

In [33]:
y_pred = abr.predict(X_test_scaled)
print('RMSE: ', mean_squared_error(y_test, y_pred, squared=False))
print('R^2: ', r2_score(y_test, y_pred))

RMSE:  398.71808835995006
R^2:  -63.44829400822293
