In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

from pycaret.regression import *

In [3]:
from sklearn.metrics import r2_score

In [4]:
train = pd.read_csv('train_prep.csv')
test = pd.read_csv('test_prep.csv')

## Data Setup

In [5]:
reg1 = setup(data = train, target = 'Fare')

Unnamed: 0,Description,Value
0,session_id,3546
1,Target,Fare
2,Original Data,"(7500, 33)"
3,Missing Values,False
4,Numeric Features,10
5,Categorical Features,22
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(5249, 66)"


## Model Selection

In [4]:
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,sklearn.linear_model._base.LinearRegression,True
lasso,Lasso Regression,sklearn.linear_model._coordinate_descent.Lasso,True
ridge,Ridge Regression,sklearn.linear_model._ridge.Ridge,True
en,Elastic Net,sklearn.linear_model._coordinate_descent.Elast...,True
lar,Least Angle Regression,sklearn.linear_model._least_angle.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model._least_angle.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model._omp.OrthogonalMatchingPu...,True
br,Bayesian Ridge,sklearn.linear_model._bayes.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model._bayes.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model._passive_aggressive.Passi...,True


In [11]:
best_specific = compare_models(include = ['dt','rf','xgboost','catboost','gbr','et','lightgbm','ada','svm'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,326.3,460400.0,655.9,0.99,0.0559,0.0413,2.154
lightgbm,Light Gradient Boosting Machine,494.7,1066000.0,1008.0,0.9766,0.0741,0.0562,0.258
xgboost,Extreme Gradient Boosting,583.6,1390000.0,1166.0,0.9693,0.0903,0.0672,1.187
gbr,Gradient Boosting Regressor,819.2,2583000.0,1584.0,0.9429,0.1318,0.0956,0.323
rf,Random Forest Regressor,953.0,3413000.0,1836.0,0.9231,0.1302,0.1003,1.213
et,Extra Trees Regressor,997.7,3907000.0,1965.0,0.9124,0.1332,0.1051,1.176
dt,Decision Tree Regressor,1256.0,7466000.0,2713.0,0.8274,0.1781,0.128,1.127
ada,AdaBoost Regressor,5229.0,32310000.0,5681.0,0.2619,0.7174,0.9924,0.235
svm,Support Vector Regression,4233.0,47840000.0,6899.0,-0.0692,0.647,0.5475,1.024


## Model Creation + Tuning

In [6]:
cat = create_model('catboost')
# tune hyperparameters of decision tree

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,341.5,509200.0,713.6,0.9904,0.054,0.0392
1,348.4,379800.0,616.3,0.9903,0.0582,0.0446
2,298.9,301400.0,549.0,0.9929,0.0542,0.0384
3,317.2,310800.0,557.5,0.9918,0.0579,0.0427
4,313.6,371800.0,609.7,0.9898,0.0546,0.0402
5,313.1,306900.0,554.0,0.9938,0.054,0.0383
6,365.0,1014000.0,1007.0,0.9809,0.0551,0.0411
7,316.1,500400.0,707.4,0.989,0.0529,0.0394
8,350.2,690000.0,830.7,0.9848,0.0597,0.0427
9,333.8,344100.0,586.6,0.9914,0.0558,0.0421


In [None]:
tuned_cat = tune_model(cat)
# # tune hyperparameters with increased n_iter
# tuned_dt = tune_model(dt, n_iter = 50)
# # tune hyperparameters to optimize AUC
# tuned_dt = tune_model(dt, optimize = 'AUC') #default is 'Accuracy'

In [7]:
lgm = create_model('lightgbm')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,530.9,1143000.0,1069.0,0.9784,0.0728,0.055
1,496.4,833800.0,913.1,0.9787,0.0729,0.0563
2,469.1,831100.0,911.6,0.9804,0.071,0.0538
3,417.4,633100.0,795.7,0.9832,0.0701,0.0529
4,458.8,666200.0,816.2,0.9818,0.072,0.0554
5,519.7,985700.0,992.8,0.98,0.0781,0.0573
6,516.6,2290000.0,1513.0,0.9569,0.0715,0.0527
7,448.6,1369000.0,1170.0,0.9698,0.0713,0.0531
8,487.6,1384000.0,1176.0,0.9695,0.0739,0.0553
9,504.1,939700.0,969.4,0.9766,0.0757,0.0567


In [6]:
tuned_lgm = tune_model(lgm)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,450.8,568800.0,754.2,0.9842,0.0709,0.0553
1,497.3,883200.0,939.8,0.9809,0.0746,0.0563
2,552.6,1198000.0,1094.0,0.9799,0.0697,0.0537
3,478.3,1954000.0,1398.0,0.9626,0.0719,0.0527
4,527.5,1164000.0,1079.0,0.9763,0.0779,0.0598
5,546.4,1622000.0,1274.0,0.969,0.0774,0.0573
6,447.9,1329000.0,1153.0,0.9662,0.0712,0.0524
7,470.9,731400.0,855.2,0.9798,0.0751,0.0578
8,486.6,791300.0,889.6,0.9804,0.0717,0.0554
9,466.2,697900.0,835.4,0.981,0.069,0.0531


In [8]:
xgb = create_model('xgboost')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,599.9,1305000.0,1142.0,0.9754,0.0839,0.0639
1,656.3,1468000.0,1211.0,0.9626,0.089,0.0706
2,574.9,997100.0,998.5,0.9765,0.0844,0.0643
3,520.0,983600.0,991.8,0.974,0.0824,0.0637
4,546.1,1089000.0,1044.0,0.9702,0.0846,0.0629
5,629.7,1484000.0,1218.0,0.9699,0.09,0.0682
6,629.8,1996000.0,1413.0,0.9625,0.0849,0.0655
7,545.4,1819000.0,1349.0,0.9599,0.0835,0.0616
8,624.0,1868000.0,1367.0,0.9589,0.0914,0.068
9,578.3,1156000.0,1075.0,0.9712,0.0848,0.0649


In [16]:
tuned_xgb = tune_model(xgb)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,649.1,1800000.0,1342.0,0.9665,0.1018,0.076
1,742.3,2177000.0,1476.0,0.9501,0.1077,0.0822
2,713.3,1567000.0,1252.0,0.969,0.1154,0.0835
3,636.2,1243000.0,1115.0,0.9722,0.0946,0.0738
4,617.3,1291000.0,1136.0,0.9621,0.1089,0.0778
5,592.1,1013000.0,1007.0,0.9727,0.1071,0.0797
6,717.6,1739000.0,1319.0,0.9618,0.1125,0.083
7,621.2,993400.0,996.7,0.9785,0.1035,0.0748
8,666.0,1815000.0,1347.0,0.9633,0.1021,0.0769
9,674.6,1273000.0,1128.0,0.9694,0.112,0.0786


In [9]:
x_train = train.drop('Fare', axis=1)

## Train Score Check

In [10]:
r2_score(predict_model(cat,train)['Label'], train['Fare'])

0.9957941755255529

In [11]:
r2_score(predict_model(lgm,train)['Label'], train['Fare'])

0.9885609633193176

In [12]:
r2_score(predict_model(xgb,train)['Label'], train['Fare'])

0.9901627655389185

## Stacking

In [13]:
stacker = stack_models(estimator_list = [cat,lgm], meta_model = xgb)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,357.5,546500.0,739.2,0.9897,0.054,0.0399
1,356.8,642200.0,801.4,0.9836,0.0602,0.0449
2,300.5,313800.0,560.1,0.9926,0.0555,0.0397
3,285.9,288500.0,537.1,0.9924,0.0559,0.0407
4,327.0,394200.0,627.9,0.9892,0.0575,0.0425
5,310.3,322300.0,567.7,0.9935,0.052,0.0376
6,362.6,993900.0,996.9,0.9813,0.057,0.0422
7,300.3,264500.0,514.3,0.9942,0.055,0.0407
8,313.2,382600.0,618.5,0.9916,0.0574,0.0415
9,310.2,250500.0,500.5,0.9938,0.0535,0.04


## Blending

In [15]:
blender_hard = blend_models(estimator_list = [cat,lgm,xgb])

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,436.8,809400.0,899.7,0.9847,0.0621,0.0461
1,445.7,665800.0,816.0,0.983,0.0645,0.0505
2,396.5,546000.0,738.9,0.9871,0.0622,0.0459
3,372.5,495300.0,703.7,0.9869,0.0625,0.0473
4,391.2,572200.0,756.5,0.9844,0.062,0.0467
5,430.6,706900.0,840.8,0.9857,0.065,0.0473
6,458.7,1563000.0,1250.0,0.9706,0.0632,0.0475
7,380.2,1029000.0,1014.0,0.9773,0.0614,0.0447
8,442.0,1130000.0,1063.0,0.9751,0.0668,0.0496
9,425.7,622800.0,789.2,0.9845,0.064,0.0487


## Model Finalizing + Prediction

In [16]:
final_model = finalize_model(stacker)

In [17]:
pred = predict_model(final_model, test)

In [18]:
pred

Unnamed: 0,Flight Time,Class,is_dr,fdYear,fdMonth,fdWeek,fdDay,fdDayofweek,fdIs_month_end,fdIs_month_start,...,bdIs_year_end,bdIs_year_start,age,is_bdbd,is_fdbd,depgap,is_weekend,distance,flight_duration,Label
0,19,10,0,2016,4,14,4,0,0,0,...,0,0,12,0,0,47,0,417,55,1685.927979
1,17,10,1,2016,3,10,8,1,0,0,...,0,0,55,0,0,43,0,417,60,2628.825195
2,13,25,1,2016,1,2,11,0,0,0,...,0,0,50,0,0,13,0,417,60,8375.293945
3,13,10,1,2016,10,41,13,3,0,0,...,0,0,58,0,0,31,0,1190,130,5227.619141
4,17,10,0,2016,9,38,24,5,0,0,...,0,0,16,0,0,21,1,1760,155,7957.410156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,11,10,0,2016,5,20,16,0,0,0,...,0,0,60,0,0,8,0,1028,95,4749.652832
2496,20,25,1,2016,7,27,10,6,0,0,...,0,0,37,0,0,11,1,1190,130,34723.687500
2497,16,10,0,2016,7,26,3,6,0,0,...,0,0,13,0,0,43,1,1760,155,7733.168457
2498,2,25,1,2016,1,2,11,0,0,0,...,0,0,56,0,0,14,0,417,60,7197.387207
