## Using XGBRegressor() on data scaled by MinMaxScaler(), instead of StandardScaler()

## 10000 rows of data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [10]:
X = pd.read_csv('X_test.csv')
y = pd.read_csv('y_test.csv')
y = y['yFT']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

xgboost = XGBRegressor()
xgboost.fit(X_train, y_train)
y_pred= xgboost.predict(X_test)

print('Training score: ', xgboost.score(X_train, y_train))
print('RMSE: ', mean_squared_error(y_test, y_pred))
print('R-squared score: ', r2_score(y_test, y_pred))

## 100000 rows of data

In [18]:
X = pd.read_csv('X_test100000.csv')
ydf = pd.read_csv('y_test100000.csv')

y = ydf['yFT']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

xgboost = XGBRegressor()
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

In [19]:
print('Training R-Square', xgboost.score(X_train, y_train))
print('RMSE: ', mean_squared_error(y_test, y_pred))
print('R-squared score: ', r2_score(y_test, y_pred))

Training R-Square 0.16326952350050172
RMSE:  0.9684616081650654
R-squared score:  0.0340012678853745


# No dummy (categorical) variables
- MinMaxScaler for all variables including feature variable
- 100000 rows
- Month and unique carrier ID columns dropped
- Origin and dest columns replaced with avg. arrival delay (unscaled)

In [11]:
def replace_origin_dest(df):
    # df: X features dataframe without one-hot encoding

    # Find the average delay times by origin location, and store the values in a dictionary
    origin = pd.read_csv('origin_arr_delay.txt', delimiter = '\t', names = ['origin', 'avg_delay'])
    origin = pd.Series(origin.avg_delay.values, index = origin.origin).to_dict()
    
    # Find the average delay times by destination location, and store the values in a dictionary
    dest = pd.read_csv('dest_arr_delay.txt', delimiter = '\t', names = ['dest', 'avg_delay'])
    dest = pd.Series(dest.avg_delay.values, index = dest.dest).to_dict()
    
    # Replace the values in the "origin" and "dest" columns with the average arrival delay time
    df['origin'] = df['origin'].replace(origin)
    df['dest'] = df['dest'].replace(dest)
    
    return df

In [15]:
X = pd.read_csv('X_test100000noDum.csv', compression = 'gzip')
y = pd.read_csv('y_test100000noDum.csv', compression = 'gzip')
X = X.drop(['op_unique_carrier', 'month'], axis = 1)
X = replace_origin_dest(X)
y = y['yFT']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

xgboost = XGBRegressor()
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

print('Training R-Square', xgboost.score(X_train, y_train))
print('RMSE: ', mean_squared_error(y_test, y_pred))
print('R-squared score: ', r2_score(y_test, y_pred))

Training R-Square 0.17528500134304825
RMSE:  0.02781708895978555
R-squared score:  0.016622917851691343


## 100000 rows, standard scaling, no outlier removal

In [11]:
X = pd.read_pickle('X_standard', compression = 'gzip')
y = pd.read_pickle('y', compression = 'gzip')
X_train, X_test, y_train, y_test = train_test_split(X, y,
                            test_size = 0.2, random_state = 1)
xgboost = XGBRegressor(n_estimators = 100, learning_rate = 0.1,
                      reg_alpha = 8)
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

print('Train R2 score: ', xgboost.score(X_train, y_train))
print('Test R2 score: ', r2_score(y_test, y_pred))

Train R2 score:  0.14355700876265898
Test R2 score:  0.005752405341934663


## 100000 rows, min-max scaling, no outlier removal

In [10]:
X = pd.read_pickle('X_minmax', compression = 'gzip')
y = pd.read_pickle('y', compression = 'gzip')
X_train, X_test, y_train, y_test = train_test_split(X, y,
                            test_size = 0.2, random_state = 1)
xgboost = XGBRegressor(n_estimators = 100, learning_rate = 0.1,
                      reg_alpha = 8)
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

print('Train R2 score: ', xgboost.score(X_train, y_train))
print('Test R2 score: ', r2_score(y_test, y_pred))

Train R2 score:  0.14355700876265898
Test R2 score:  0.005692622235527689


## 100000 rows, robust scaling, no outlier removal

In [12]:
X = pd.read_pickle('X_robust', compression = 'gzip')
y = pd.read_pickle('y', compression = 'gzip')
X_train, X_test, y_train, y_test = train_test_split(X, y,
                            test_size = 0.2, random_state = 1)
xgboost = XGBRegressor(n_estimators = 100, learning_rate = 0.1,
                      reg_alpha = 8)
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

print('Train R2 score: ', xgboost.score(X_train, y_train))
print('Test R2 score: ', r2_score(y_test, y_pred))

Train R2 score:  0.14355700876265898
Test R2 score:  0.005653810087580369


## 100000 rows, power transformation, no outlier removal

In [14]:
X = pd.read_pickle('X_powertrans', compression = 'gzip')
y = pd.read_pickle('y', compression = 'gzip')
X_train, X_test, y_train, y_test = train_test_split(X, y,
                            test_size = 0.2, random_state = 1)
xgboost = XGBRegressor(n_estimators = 100, learning_rate = 0.1,
                      reg_alpha = 8)
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

print('Train R2 score: ', xgboost.score(X_train, y_train))
print('Test R2 score: ', r2_score(y_test, y_pred))

Train R2 score:  0.14355700876265898
Test R2 score:  0.0056373997010491506


## 100000 rows, standard scaler, month (1-12) and day of week (0-6) as categorical numerical variables

In [13]:
X = pd.read_pickle('X_fl_datetime_standard', compression = 'gzip')
X['month'] = X['fl_date'].dt.month
X['day_of_week'] = X['fl_date'].dt.dayofweek
X = X.drop('fl_date', axis = 1)
y = pd.read_pickle('y', compression = 'gzip')

In [14]:
X

Unnamed: 0,crs_dep_timeFT,crs_arr_timeFT,crs_elapsed_timeFT,distanceFT,op_unique_carrier,origin,dest,month,day_of_week
0,0.404950,0.475464,-0.252310,-0.371468,0.538777,1.766642,-0.046907,4,0
1,-1.030286,-1.074137,-0.808416,-0.651979,-0.541576,-1.089924,-0.722955,7,6
2,-0.825252,0.088064,0.706974,0.748875,0.484104,-1.104809,-0.330836,4,1
3,0.609983,0.475464,-0.794514,-0.889988,0.484104,0.018107,0.555854,11,2
4,-0.005118,0.088064,-0.474752,-0.597577,0.210159,-0.329730,1.621178,10,5
...,...,...,...,...,...,...,...,...,...
99995,0.404950,0.281764,-0.808416,-0.704681,-0.541576,-0.861272,0.260061,9,1
99996,1.635151,-2.817438,-0.224504,-0.087557,0.448690,1.766642,3.889518,10,0
99997,1.020050,1.250264,-0.599876,-0.388469,1.202815,-0.065834,0.349519,1,1
99998,0.815017,0.862864,-0.766708,-0.925690,0.569946,0.513858,-1.006962,6,1


In [15]:
y

0        59.0
1       -20.0
2        -1.0
3       -18.0
4       -15.0
         ... 
99995    -6.0
99996    -9.0
99997   -18.0
99998   -12.0
99999    -1.0
Name: arr_delay, Length: 98047, dtype: float64

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                            test_size = 0.2, random_state = 999)
xgboost = XGBRegressor(n_estimators = 100, learning_rate = 0.1,
                      reg_alpha = 8)
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

print('Train R2 score: ', xgboost.score(X_train, y_train))
print('Test R2 score: ', r2_score(y_test, y_pred))

Train R2 score:  0.14907160320446422
Test R2 score:  0.017677814376295342


## 100000 rows, min-max scaler, month (1-12) and day of week (0-6) as categorical numerical variables

In [9]:
X = pd.read_pickle('X_fl_datetime_minmax', compression = 'gzip')
X['month'] = X['fl_date'].dt.month
X['day_of_week'] = X['fl_date'].dt.dayofweek
X = X.drop('fl_date', axis = 1)
y = pd.read_pickle('y', compression = 'gzip')

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                            test_size = 0.2, random_state = 1)
xgboost = XGBRegressor(n_estimators = 100, learning_rate = 0.1,
                      reg_alpha = 8)
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

print('Train R2 score: ', xgboost.score(X_train, y_train))
print('Test R2 score: ', r2_score(y_test, y_pred))

Train R2 score:  0.15977622435474026
Test R2 score:  0.009102573118755508


## 100000 rows, standard scaler, month and day of week encoded as dummy variables

In [14]:
X = pd.read_pickle('X_fl_datetime_standard', compression = 'gzip')
X['month'] = X['fl_date'].dt.month
X['day_of_week'] = X['fl_date'].dt.dayofweek
X = X.drop('fl_date', axis = 1)
y = pd.read_pickle('y', compression = 'gzip')

In [20]:
X['month'] = X['month'].astype(str)
X['day_of_week'] = X['day_of_week'].astype(str)
X = pd.get_dummies(X)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                            test_size = 0.2, random_state = 1)
xgboost = XGBRegressor(n_estimators = 100, learning_rate = 0.1,
                      reg_alpha = 8)
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

print('Train R2 score: ', xgboost.score(X_train, y_train))
print('Test R2 score: ', r2_score(y_test, y_pred))

Train R2 score:  0.17279104639136944
Test R2 score:  0.005538443479018329


## 100000 rows, min-max scaler, month and day of week encoded as dummy variables

In [26]:
X = pd.read_pickle('X_fl_datetime_minmax', compression = 'gzip')
X['month'] = X['fl_date'].dt.month
X['day_of_week'] = X['fl_date'].dt.dayofweek
X = X.drop('fl_date', axis = 1)
y = pd.read_pickle('y', compression = 'gzip')

In [27]:
X['month'] = X['month'].astype(str)
X['day_of_week'] = X['day_of_week'].astype(str)
X = pd.get_dummies(X)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                            test_size = 0.2, random_state = 999)
xgboost = XGBRegressor(n_estimators = 100, learning_rate = 0.1,
                      reg_alpha = 8)
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

print('Train R2 score: ', xgboost.score(X_train, y_train))
print('Test R2 score: ', r2_score(y_test, y_pred))

Train R2 score:  0.14907160320446422
Test R2 score:  0.017677814376295342


## PCA dimensionality reduction of features (experimenting)

In [31]:
X = pd.read_pickle('X_fl_datetime_standard', compression = 'gzip')
X['month'] = X['fl_date'].dt.month
X['day_of_week'] = X['fl_date'].dt.dayofweek
X = X.drop('fl_date', axis = 1)
y = pd.read_pickle('y', compression = 'gzip')

In [32]:
X['month'] = X['month'].astype(str)
X['day_of_week'] = X['day_of_week'].astype(str)
X = pd.get_dummies(X)

In [40]:
X.shape

(98047, 26)

In [33]:
from sklearn.decomposition import PCA

In [58]:
pca = PCA(n_components = 5)
X_pca = pca.fit(X.T)
X_pca.components_.shape

(5, 98047)

In [59]:
X_reduced = X_pca.components_.T
X_reduced.shape

(98047, 5)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y,
                            test_size = 0.2, random_state = 1)
xgboost = XGBRegressor()
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

print('Train R2 score: ', xgboost.score(X_train, y_train))
print('Test R2 score: ', r2_score(y_test, y_pred))

Train R2 score:  0.29347805260637816
Test R2 score:  -0.04922182023826038


## Try PCA again with all original categorical features as dummy variables

In [72]:
X = pd.read_pickle('X_all_dummy', compression = 'gzip')
X['month'] = X['fl_date'].dt.month
X['day_of_week'] = X['fl_date'].dt.dayofweek
X = X.drop('fl_date', axis = 1)
y = pd.read_pickle('y', compression = 'gzip')

In [73]:
X['month'] = X['month'].astype(str)
X['day_of_week'] = X['day_of_week'].astype(str)
X = pd.get_dummies(X)

In [74]:
X.shape

(98047, 789)

In [81]:
pca = PCA(n_components = 8)
X_pca = pca.fit(X.T)
print(X_pca.components_.shape)
X_reduced = X_pca.components_.T
print(X_reduced.shape)

(8, 98047)
(98047, 8)


In [82]:
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y,
                            test_size = 0.2, random_state = 1)
xgboost = XGBRegressor()
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

print('Train R2 score: ', xgboost.score(X_train, y_train))
print('Test R2 score: ', r2_score(y_test, y_pred))

Train R2 score:  0.324311607831399
Test R2 score:  -0.03628132015786467


## 100000 rows, extra y_cat column in X

In [83]:
X = pd.read_pickle('X_cat', compression = 'gzip')
y = pd.read_pickle('y', compression = 'gzip')

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                            test_size = 0.2, random_state = 1)
xgboost = XGBRegressor(max_depth = 1)
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

print('Train R2 score: ', xgboost.score(X_train, y_train))
print('Test R2 score: ', r2_score(y_test, y_pred))
print('RMSE: ', mean_squared_error(y_test, y_pred))

Train R2 score:  0.027056561459246287
Test R2 score:  0.024163787554979166
RMSE:  2275.89509915173


## XGBRegressor() grid search

In [106]:
parameters = {
            'colsample_bytree': [0.2, 0.3, 0.4],
            'learning_rate': [0.01, 0.1, 1, 10],
            'max_depth': [3, 4, 5, 6, 7],
            'reg_lambda': [1, 10, 20],
            'n_estimators': [10, 100, 1000, 10000]
             }

In [None]:
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(XGBRegressor(), parameters, verbose=3, n_jobs=-1) #if running overnight, use n_jobs=-1 for max speed.
clf.fit(X,y)

Fitting 5 folds for each of 3500 candidates, totalling 17500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed: 18.4min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 26.3min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 38.3min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed: 52.7min


## Final test set

In [19]:
X_final = pd.read_csv('Final DF Pred', index_col = 0)
X_final = X_final.rename(columns = {'dayWeek': 'day_of_week'})

In [38]:
xgboost = XGBRegressor(n_estimators = 100, learning_rate = 0.1,
                      reg_alpha = 8)
xgboost.fit(X, y)
y_pred = xgboost.predict(X_final)

In [44]:
df_final = X_final
df_final['predicted_delay'] = y_pred
df_final

Unnamed: 0,crs_dep_timeFT,crs_arr_timeFT,crs_elapsed_timeFT,distanceFT,op_unique_carrier,origin,dest,month,day_of_week,predicted_delay
0,1.002153,0.847341,-0.660447,-0.717995,-0.570204,-1.057898,1.382909,1,2,12.721513
1,-0.420511,-0.298153,-0.728836,-0.717995,-0.570204,-1.057898,1.382909,1,2,9.296444
2,1.408629,1.229172,-1.002392,-0.768505,-0.570204,-1.057898,-0.946535,1,2,2.880046
3,-0.014036,-0.107237,-0.934003,-0.768505,-0.570204,-1.057898,-0.946535,1,2,0.629726
4,-0.826987,-0.870900,-0.865614,-0.768505,-0.570204,-1.057898,-0.946535,1,2,-4.869950
...,...,...,...,...,...,...,...,...,...,...
150618,0.595678,0.656425,0.515841,0.408392,0.185286,0.481501,-0.440436,1,1,4.241131
150619,-1.233462,-1.252732,-0.414247,-0.468809,0.185286,0.481501,0.191313,1,1,-2.170033
150620,-0.826987,-0.679984,-0.523670,-0.468809,0.185286,0.016765,-0.695755,1,1,0.679016
150621,1.408629,1.420088,-0.728836,-0.603504,0.185286,1.336250,0.191313,1,1,6.247889


In [45]:
df_final.to_csv('submission.csv', compression = 'gzip')