In [1]:
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

import sys
!{sys.executable} -m pip install psycopg2-binary
import pandas as pd
import getpass
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import math
import xgboost as xg
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Load the preprocessed flight-price table and discard the 'Unnamed: 0' column
# (presumably the row index that was written out with the CSV -- TODO confirm).
df_clean = pd.read_csv('preprocessed.csv').drop(columns=['Unnamed: 0'])
df_clean

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price,date,DOW
0,Air_India,Delhi,Evening,one,Night,Mumbai,Business,24.75,1,42220,2022-02-11,4
1,Air_India,Delhi,Night,one,Night,Mumbai,Business,26.50,1,44450,2022-02-11,4
2,Air_India,Delhi,Evening,one,Night,Mumbai,Business,6.67,1,46690,2022-02-11,4
3,Vistara,Delhi,Evening,zero,Night,Mumbai,Business,2.17,1,50264,2022-02-11,4
4,Air_India,Delhi,Night,one,Afternoon,Mumbai,Business,17.75,1,50669,2022-02-11,4
...,...,...,...,...,...,...,...,...,...,...,...,...
270629,Vistara,Chennai,Early_Morning,one,Night,Hyderabad,Economy,13.83,49,7697,2022-03-31,3
270630,Vistara,Chennai,Early_Morning,one,Night,Hyderabad,Economy,13.83,49,7709,2022-03-31,3
270631,Vistara,Chennai,Afternoon,one,Morning,Hyderabad,Economy,20.58,49,8640,2022-03-31,3
270632,Vistara,Chennai,Morning,one,Morning,Hyderabad,Economy,23.33,49,8640,2022-03-31,3


In [4]:
# Sanity check: number of missing values in each column.
df_clean.isna().sum(axis=0)

airline             0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
date                0
DOW                 0
dtype: int64

In [5]:
# One-hot encode the categorical features. DOW is numeric in the file but is
# expanded into indicator columns as well.
# NOTE: this overwrites df_clean, so the cell cannot be re-run on its own --
# re-run from the loading cell to get the un-encoded columns back.
categorical_cols = [
    'airline',
    'source_city',
    'departure_time',
    'stops',
    'arrival_time',
    'destination_city',
    'class',
    'DOW',
]
df_clean = pd.get_dummies(df_clean, columns=categorical_cols)

In [6]:
# Separate the target (price) from the features; `date` is left out of the
# feature matrix as well.
y = df_clean['price']
X = df_clean.drop(columns=['price', 'date'])

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [7]:
# Standardize the features. The scaling statistics are learned from the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# Define the five final models that are compared by cross-validation.

# One shared seed for every stochastic learner (bootstrap sampling, row/column
# subsampling, weight initialisation) so that a Restart & Run All reproduces
# the same scores. The value matches the random_state used by train_test_split.
RANDOM_STATE = 1

# Linear Regression (deterministic, no seed needed)
LR = LinearRegression()

# Random Forest: the grid-search version is the final one.
# Lab version, kept for reference only:
#   RandomForestRegressor(n_estimators=500, max_depth=9, max_features=22, min_samples_leaf=38)
RF = RandomForestRegressor(n_estimators=1600, min_samples_split=5, min_samples_leaf=1,
                           max_depth=90, bootstrap=True, random_state=RANDOM_STATE)

# Gradient Boosting: the lab version is the final one. Note that it is fitted
# with XGBoost's regressor, not scikit-learn's GradientBoostingRegressor.
# Grid-search version, kept for reference only:
#   GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, max_depth=10,
#                             max_leaf_nodes=50, random_state=1)
# max_depth=0 lifts the depth limit; with grow_policy='lossguide' the tree size
# is then bounded by max_leaves instead.
GB = xg.XGBRegressor(tree_method='approx', grow_policy='lossguide', max_depth=0,
                     learning_rate=0.4824817069404312, max_leaves=22, min_child_weight=3,
                     n_estimators=1365, subsample=0.9840850004419296,
                     random_state=RANDOM_STATE)

# XGBoost
XGB = xg.XGBRegressor(colsample_bytree=0.7519942256585419, gamma=0.9346178525692137,
                      learning_rate=0.4640891043041282, max_depth=11, max_leaves=68,
                      min_child_weight=100, n_estimators=866, reg_alpha=0.16376286230324713,
                      reg_lambda=0.2736328304636606, random_state=RANDOM_STATE)

# Neural Network: a single hidden layer of 10 logistic units
NN = MLPRegressor(hidden_layer_sizes=(10,), activation='logistic', solver='lbfgs',
                  alpha=0.01, max_iter=1000, random_state=RANDOM_STATE)

# All 5 models; model_list[i] is the display name of models[i]
model_list = ['Linear Regression', 'Random Forest', 'Gradient Boosting', 'XGBoost', 'Neural Network']
models = [LR, RF, GB, XGB, NN]

In [13]:
# Cross-validation results: 5-fold CV of every model on the training split.
from sklearn.model_selection import cross_validate, KFold

# Metrics gathered for each model. scikit-learn exposes the error metrics in
# negated form ("neg_*", higher is better), hence the abs() when storing them.
scoring = ['r2', 'neg_mean_absolute_percentage_error', 'neg_root_mean_squared_error']

# Seeded, shuffled folds: every model is scored on exactly the same splits.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): X_train was standardized on the whole training split before
# this cell, so the scaler has already seen every validation fold. The effect
# is likely small, but wrapping each model in a Pipeline with its own
# StandardScaler would remove it.

# The two lists are parallel; fail loudly if they ever get out of sync.
assert len(model_list) == len(models), "model_list and models must have the same length"

# model name -> [mean R^2, mean MAPE, mean RMSE] over the folds
cv_results = {}
for name, model in zip(model_list, models):
    print("Fitting " + name)
    scores = cross_validate(model, X_train, y_train, cv=cv, scoring=scoring)
    cv_results[name] = [
        scores['test_r2'].mean(),
        abs(scores['test_neg_mean_absolute_percentage_error'].mean()),
        abs(scores['test_neg_root_mean_squared_error'].mean()),
    ]

FittingLinear Regression
FittingRandom Forest
FittingGradient Boosting
FittingXGBoost
FittingNeural Network


In [12]:
# Tabulate the cross-validation metrics. result_df has one row per metric and
# one column per model; it is shown transposed (one row per model).
metric_names = ['R^2', 'MAPE', 'RMSE']
result_df = pd.DataFrame(cv_results, index=metric_names)
result_df.transpose()

Unnamed: 0,R^2,MAPE,RMSE
Linear Regression,0.946894,0.330292,5174.104818
Random Forest,0.993202,0.06828,1851.164694
Gradient Boosting,0.992715,0.098921,1916.161067
XGBoost,0.992982,0.100083,1880.8205
Neural Network,0.959601,0.264327,4512.224561


#### Note About Random Forest & Gradient Boosting Parameters
**Two versions of the tuned random forest were evaluated:**

1: {n_estimators=1600, min_samples_split=5, min_samples_leaf=1, max_depth=90, bootstrap=True}

2: {n_estimators=500, max_depth=9, max_features=22, min_samples_leaf=38}

CV results indicate much more robust performance for version 1; the final results recorded here and in the report are therefore from version 1.

**Two versions of the tuned gradient boosting model were evaluated:**

1: {n_estimators=1000, learning_rate=0.1, max_depth=10, max_leaf_nodes=50, random_state=1}

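2: {learning_rate=0.4824817069404312, max_leaves=22, min_child_weight=3, n_estimators=1365, subsample=0.9840850004419296}, fitted with XGBoost's `XGBRegressor` using tree_method='approx', grow_policy='lossguide', max_depth=0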

CV results indicate similar performance for the two versions; the final results recorded here and in the report are from version 2.