### Importing the required libraries

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')


# statistical tests libraries
from scipy.stats import levene
from scipy.stats import mannwhitneyu
from statsmodels.stats.anova import anova_lm
import statsmodels.formula.api as sfa

### Load dataset

In [4]:
data= pd.read_csv("capstone_encoded.csv")

# Modeling

In [16]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [17]:
cols=['Model Name','Train R2 Score','Test R2 Score','Train RMSE','Test RMSE','MAPE']

result=pd.DataFrame(columns=cols)

### Outlier Model

In [18]:
# Removing the outliers (as we already see above in the file.)
from sklearn.model_selection import train_test_split
car_o = data.loc[data.price>57000]

x = car_o.drop('price',axis =1)
y = car_o.price

xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size = .30, random_state = 42)

xtrain.shape, xtest.shape, ytrain.shape, ytest.shape

cols=['Model Name','Train R2 Score','Test R2 Score','Train RMSE','Test RMSE','MAPE']

result2=pd.DataFrame(columns=cols)

### Decision Tree

In [19]:
dt=DecisionTreeRegressor(random_state = 42)


# Model fitting
model_dt = dt.fit(xtrain,ytrain)


# Model prediction
pred_test = model_dt.predict(xtest)
pred_train = model_dt.predict(xtrain)


#R2 score:
r2score_train=r2_score(ytrain,pred_train)
r2score_test=r2_score(ytest,pred_test)


# train and test rmse:
rmse_train=np.sqrt(mean_squared_error(ytrain,pred_train))
rmse_test=np.sqrt(mean_squared_error(ytest,pred_test))

# MAPE:
mape = mean_absolute_percentage_error(ytest, pred_test)

mlrmodel=pd.Series({'Model Name':'Decision Tree','Train R2 Score':r2score_train,'Test R2 Score':r2score_test,
                    'Train RMSE': rmse_train,'Test RMSE': rmse_test,'MAPE':mape})

result2=result2.append(mlrmodel,ignore_index=True)

result2

Unnamed: 0,Model Name,Train R2 Score,Test R2 Score,Train RMSE,Test RMSE,MAPE
0,Decision Tree,1.0,-0.131432,0.0,173871500.0,4.946363


### Ridge

In [20]:
ridge=Ridge()

# Model fitting
model_ridge = ridge.fit(xtrain,ytrain)


# Model prediction
pred_test = model_ridge.predict(xtest)
pred_train = model_ridge.predict(xtrain)


#R2 score:
r2score_train=r2_score(ytrain,pred_train)
r2score_test=r2_score(ytest,pred_test)


# train and test rmse:
rmse_train=np.sqrt(mean_squared_error(ytrain,pred_train))
rmse_test=np.sqrt(mean_squared_error(ytest,pred_test))

# MAPE:
mape = mean_absolute_percentage_error(ytest, pred_test)

mlrmodel=pd.Series({'Model Name':'Ridge','Train R2 Score':r2score_train,'Test R2 Score':r2score_test,
                    'Train RMSE': rmse_train,'Test RMSE': rmse_test,'MAPE':mape})

result2=result2.append(mlrmodel,ignore_index=True)

result2

Unnamed: 0,Model Name,Train R2 Score,Test R2 Score,Train RMSE,Test RMSE,MAPE
0,Decision Tree,1.0,-0.131432,0.0,173871500.0,4.946363
1,Ridge,0.038847,0.046709,66212090.0,159597700.0,87.023791


### Lasso

In [21]:
lasso = Lasso()


# Model fitting
model_lasso = lasso.fit(xtrain,ytrain)


# Model prediction
pred_test = model_lasso.predict(xtest)
pred_train = model_lasso.predict(xtrain)


#R2 score:
r2score_train=r2_score(ytrain,pred_train)
r2score_test=r2_score(ytest,pred_test)


# train and test rmse:
rmse_train=np.sqrt(mean_squared_error(ytrain,pred_train))
rmse_test=np.sqrt(mean_squared_error(ytest,pred_test))

# MAPE:
mape = mean_absolute_percentage_error(ytest, pred_test)

mlrmodel=pd.Series({'Model Name':'Lasso','Train R2 Score':r2score_train,'Test R2 Score':r2score_test,
                    'Train RMSE': rmse_train,'Test RMSE': rmse_test,'MAPE':mape})

result2=result2.append(mlrmodel,ignore_index=True)

result2

Unnamed: 0,Model Name,Train R2 Score,Test R2 Score,Train RMSE,Test RMSE,MAPE
0,Decision Tree,1.0,-0.131432,0.0,173871500.0,4.946363
1,Ridge,0.038847,0.046709,66212090.0,159597700.0,87.023791
2,Lasso,0.03885,0.04683,66212000.0,159587700.0,87.147648


### Elastic Net

In [22]:
en = ElasticNet()


# Model fitting
model_en = en.fit(xtrain,ytrain)


# Model prediction
pred_test = model_en.predict(xtest)
pred_train = model_en.predict(xtrain)


#R2 score:
r2score_train=r2_score(ytrain,pred_train)
r2score_test=r2_score(ytest,pred_test)


# train and test rmse:
rmse_train=np.sqrt(mean_squared_error(ytrain,pred_train))
rmse_test=np.sqrt(mean_squared_error(ytest,pred_test))

# MAPE:
mape = mean_absolute_percentage_error(ytest, pred_test)

mlrmodel=pd.Series({'Model Name':'Elastic Net','Train R2 Score':r2score_train,'Test R2 Score':r2score_test,
                    'Train RMSE': rmse_train,'Test RMSE': rmse_test,'MAPE':mape})

result2=result2.append(mlrmodel,ignore_index=True)

result2

Unnamed: 0,Model Name,Train R2 Score,Test R2 Score,Train RMSE,Test RMSE,MAPE
0,Decision Tree,1.0,-0.131432,0.0,173871500.0,4.946363
1,Ridge,0.038847,0.046709,66212090.0,159597700.0,87.023791
2,Lasso,0.03885,0.04683,66212000.0,159587700.0,87.147648
3,Elastic Net,0.024806,0.035263,66693980.0,160553000.0,69.960886


### Random Forest

In [23]:
rf = RandomForestRegressor(random_state=0)


# Model fitting
model_rf = rf.fit(xtrain,ytrain)


# Model prediction
pred_test = model_rf.predict(xtest)
pred_train = model_rf.predict(xtrain)


#R2 score:
r2score_train=r2_score(ytrain,pred_train)
r2score_test=r2_score(ytest,pred_test)


# train and test rmse:
rmse_train=np.sqrt(mean_squared_error(ytrain,pred_train))
rmse_test=np.sqrt(mean_squared_error(ytest,pred_test))

# MAPE:
mape = mean_absolute_percentage_error(ytest, pred_test)

mlrmodel=pd.Series({'Model Name':'Random Forest','Train R2 Score':r2score_train,'Test R2 Score':r2score_test,
                    'Train RMSE': rmse_train,'Test RMSE': rmse_test,'MAPE':mape})

result2=result2.append(mlrmodel,ignore_index=True)

result2

Unnamed: 0,Model Name,Train R2 Score,Test R2 Score,Train RMSE,Test RMSE,MAPE
0,Decision Tree,1.0,-0.131432,0.0,173871500.0,4.946363
1,Ridge,0.038847,0.046709,66212090.0,159597700.0,87.023791
2,Lasso,0.03885,0.04683,66212000.0,159587700.0,87.147648
3,Elastic Net,0.024806,0.035263,66693980.0,160553000.0,69.960886
4,Random Forest,0.738073,0.09296,34564600.0,155678000.0,12.631994


### AdaBoost Regressor

In [24]:
ada = AdaBoostRegressor(random_state=0)


# Model fitting
model_ada = ada.fit(xtrain,ytrain)


# Model prediction
pred_test = model_ada.predict(xtest)
pred_train = model_ada.predict(xtrain)


#R2 score:
r2score_train=r2_score(ytrain,pred_train)
r2score_test=r2_score(ytest,pred_test)


# train and test rmse:
rmse_train=np.sqrt(mean_squared_error(ytrain,pred_train))
rmse_test=np.sqrt(mean_squared_error(ytest,pred_test))

# MAPE:
mape = mean_absolute_percentage_error(ytest, pred_test)

mlrmodel=pd.Series({'Model Name':'AdaBoost','Train R2 Score':r2score_train,'Test R2 Score':r2score_test,
                    'Train RMSE': rmse_train,'Test RMSE': rmse_test,'MAPE':mape})

result2=result2.append(mlrmodel,ignore_index=True)

result2

Unnamed: 0,Model Name,Train R2 Score,Test R2 Score,Train RMSE,Test RMSE,MAPE
0,Decision Tree,1.0,-0.131432,0.0,173871500.0,4.946363
1,Ridge,0.038847,0.046709,66212090.0,159597700.0,87.023791
2,Lasso,0.03885,0.04683,66212000.0,159587700.0,87.147648
3,Elastic Net,0.024806,0.035263,66693980.0,160553000.0,69.960886
4,Random Forest,0.738073,0.09296,34564600.0,155678000.0,12.631994
5,AdaBoost,0.996375,0.02487,4066214.0,161415600.0,9.886909


### Gradient Boosting Regressor

In [25]:
gbm = GradientBoostingRegressor(random_state=0)


# Model fitting
model_gbm = gbm.fit(xtrain,ytrain)


# Model prediction
pred_test = model_gbm.predict(xtest)
pred_train = model_gbm.predict(xtrain)


#R2 score:
r2score_train=r2_score(ytrain,pred_train)
r2score_test=r2_score(ytest,pred_test)


# train and test rmse:
rmse_train=np.sqrt(mean_squared_error(ytrain,pred_train))
rmse_test=np.sqrt(mean_squared_error(ytest,pred_test))

# MAPE:
mape = mean_absolute_percentage_error(ytest, pred_test)

mlrmodel=pd.Series({'Model Name':'Gradient Boost','Train R2 Score':r2score_train,'Test R2 Score':r2score_test,
                    'Train RMSE': rmse_train,'Test RMSE': rmse_test,'MAPE':mape})

result2=result2.append(mlrmodel,ignore_index=True)

result2

Unnamed: 0,Model Name,Train R2 Score,Test R2 Score,Train RMSE,Test RMSE,MAPE
0,Decision Tree,1.0,-0.131432,0.0,173871500.0,4.946363
1,Ridge,0.038847,0.046709,66212090.0,159597700.0,87.023791
2,Lasso,0.03885,0.04683,66212000.0,159587700.0,87.147648
3,Elastic Net,0.024806,0.035263,66693980.0,160553000.0,69.960886
4,Random Forest,0.738073,0.09296,34564600.0,155678000.0,12.631994
5,AdaBoost,0.996375,0.02487,4066214.0,161415600.0,9.886909
6,Gradient Boost,0.99661,-0.095808,3932100.0,171112300.0,15.947204


### XGBoost Regressor

In [26]:
xgb = XGBRegressor()


# Model fitting
model_xgb = xgb.fit(xtrain,ytrain)


# Model prediction
pred_test = model_xgb.predict(xtest)
pred_train = model_xgb.predict(xtrain)


#R2 score:
r2score_train=r2_score(ytrain,pred_train)
r2score_test=r2_score(ytest,pred_test)


# train and test rmse:
rmse_train=np.sqrt(mean_squared_error(ytrain,pred_train))
rmse_test=np.sqrt(mean_squared_error(ytest,pred_test))

# MAPE:
mape = mean_absolute_percentage_error(ytest, pred_test)

mlrmodel=pd.Series({'Model Name':'XGBoost','Train R2 Score':r2score_train,'Test R2 Score':r2score_test,
                    'Train RMSE': rmse_train,'Test RMSE': rmse_test,'MAPE':mape})

result2=result2.append(mlrmodel,ignore_index=True)

result2

Unnamed: 0,Model Name,Train R2 Score,Test R2 Score,Train RMSE,Test RMSE,MAPE
0,Decision Tree,1.0,-0.131432,0.0,173871500.0,4.946363
1,Ridge,0.038847,0.046709,66212090.0,159597700.0,87.023791
2,Lasso,0.03885,0.04683,66212000.0,159587700.0,87.147648
3,Elastic Net,0.024806,0.035263,66693980.0,160553000.0,69.960886
4,Random Forest,0.738073,0.09296,34564600.0,155678000.0,12.631994
5,AdaBoost,0.996375,0.02487,4066214.0,161415600.0,9.886909
6,Gradient Boost,0.99661,-0.095808,3932100.0,171112300.0,15.947204
7,XGBoost,0.999999,-0.128183,59999.62,173621700.0,13.998462


### CatBoost Regressor

In [27]:
cat = CatBoostRegressor(random_state=0)


# Model fitting
model_cat = cat.fit(xtrain,ytrain)


# Model prediction
pred_test = model_cat.predict(xtest)
pred_train = model_cat.predict(xtrain)


#R2 score:
r2score_train=r2_score(ytrain,pred_train)
r2score_test=r2_score(ytest,pred_test)


# train and test rmse:
rmse_train=np.sqrt(mean_squared_error(ytrain,pred_train))
rmse_test=np.sqrt(mean_squared_error(ytest,pred_test))

# MAPE:
mape = mean_absolute_percentage_error(ytest, pred_test)

mlrmodel=pd.Series({'Model Name':'CatBoost','Train R2 Score':r2score_train,'Test R2 Score':r2score_test,
                    'Train RMSE': rmse_train,'Test RMSE': rmse_test,'MAPE':mape})

result2=result2.append(mlrmodel,ignore_index=True)

result2

Learning rate set to 0.050855
0:	learn: 66869899.6648324	total: 151ms	remaining: 2m 31s
1:	learn: 66208270.9922721	total: 155ms	remaining: 1m 17s
2:	learn: 65559545.2282139	total: 158ms	remaining: 52.4s
3:	learn: 64917368.8714541	total: 161ms	remaining: 40.2s
4:	learn: 64291477.5806429	total: 165ms	remaining: 32.8s
5:	learn: 63671485.5178199	total: 168ms	remaining: 27.8s
6:	learn: 63062889.8874905	total: 171ms	remaining: 24.3s
7:	learn: 62465777.5071199	total: 175ms	remaining: 21.7s
8:	learn: 61873990.3176765	total: 178ms	remaining: 19.6s
9:	learn: 61291985.2563485	total: 182ms	remaining: 18s
10:	learn: 60719617.2963315	total: 185ms	remaining: 16.6s
11:	learn: 60159658.4242417	total: 188ms	remaining: 15.5s
12:	learn: 59434836.2910298	total: 194ms	remaining: 14.7s
13:	learn: 58729996.9554005	total: 198ms	remaining: 13.9s
14:	learn: 58191109.5734814	total: 201ms	remaining: 13.2s
15:	learn: 57663228.4486850	total: 204ms	remaining: 12.6s
16:	learn: 57143775.6716975	total: 208ms	remaining: 

153:	learn: 17359240.6100985	total: 585ms	remaining: 3.21s
154:	learn: 17146340.8183984	total: 587ms	remaining: 3.2s
155:	learn: 17075291.1729735	total: 590ms	remaining: 3.19s
156:	learn: 16910486.5253683	total: 593ms	remaining: 3.18s
157:	learn: 16842288.4455132	total: 596ms	remaining: 3.17s
158:	learn: 16654974.8814785	total: 598ms	remaining: 3.17s
159:	learn: 16451230.3718466	total: 601ms	remaining: 3.15s
160:	learn: 16250210.9386772	total: 604ms	remaining: 3.15s
161:	learn: 16032090.8453264	total: 606ms	remaining: 3.13s
162:	learn: 15968932.9543501	total: 609ms	remaining: 3.13s
163:	learn: 15887981.9403078	total: 611ms	remaining: 3.12s
164:	learn: 15693912.2082139	total: 614ms	remaining: 3.11s
165:	learn: 15502606.2417671	total: 617ms	remaining: 3.1s
166:	learn: 15444583.8870053	total: 620ms	remaining: 3.09s
167:	learn: 15257008.6888302	total: 623ms	remaining: 3.09s
168:	learn: 15200579.8553842	total: 626ms	remaining: 3.08s
169:	learn: 15015713.3948368	total: 629ms	remaining: 3.07s

302:	learn: 4535451.6015909	total: 978ms	remaining: 2.25s
303:	learn: 4494410.2714540	total: 981ms	remaining: 2.25s
304:	learn: 4467926.0428850	total: 983ms	remaining: 2.24s
305:	learn: 4428103.4751422	total: 986ms	remaining: 2.23s
306:	learn: 4399894.7867965	total: 988ms	remaining: 2.23s
307:	learn: 4361452.2731983	total: 991ms	remaining: 2.23s
308:	learn: 4334435.1506487	total: 994ms	remaining: 2.22s
309:	learn: 4308115.0854593	total: 996ms	remaining: 2.22s
310:	learn: 4282473.7070453	total: 999ms	remaining: 2.21s
311:	learn: 4246235.0175651	total: 1s	remaining: 2.21s
312:	learn: 4221455.5315964	total: 1s	remaining: 2.2s
313:	learn: 4201212.3878216	total: 1.01s	remaining: 2.2s
314:	learn: 4159709.3955835	total: 1.01s	remaining: 2.19s
315:	learn: 4140160.4123875	total: 1.01s	remaining: 2.19s
316:	learn: 4099857.5879097	total: 1.01s	remaining: 2.19s
317:	learn: 4077527.8266259	total: 1.02s	remaining: 2.18s
318:	learn: 4055349.9879403	total: 1.02s	remaining: 2.17s
319:	learn: 4022121.17

452:	learn: 1943381.1886822	total: 1.37s	remaining: 1.66s
453:	learn: 1930876.8942694	total: 1.37s	remaining: 1.65s
454:	learn: 1925529.8224302	total: 1.38s	remaining: 1.65s
455:	learn: 1919852.0091297	total: 1.38s	remaining: 1.65s
456:	learn: 1914000.1366973	total: 1.38s	remaining: 1.64s
457:	learn: 1902127.9855949	total: 1.38s	remaining: 1.64s
458:	learn: 1887067.8008161	total: 1.39s	remaining: 1.63s
459:	learn: 1863181.5029089	total: 1.39s	remaining: 1.63s
460:	learn: 1857585.5490694	total: 1.39s	remaining: 1.63s
461:	learn: 1842728.3448882	total: 1.39s	remaining: 1.62s
462:	learn: 1829831.2660385	total: 1.4s	remaining: 1.62s
463:	learn: 1824409.5903877	total: 1.4s	remaining: 1.62s
464:	learn: 1810952.9738299	total: 1.4s	remaining: 1.61s
465:	learn: 1802862.5444631	total: 1.41s	remaining: 1.61s
466:	learn: 1790396.8126647	total: 1.41s	remaining: 1.61s
467:	learn: 1768542.1669954	total: 1.41s	remaining: 1.6s
468:	learn: 1756080.6740404	total: 1.41s	remaining: 1.6s
469:	learn: 1748804

600:	learn: 1034972.8117495	total: 1.77s	remaining: 1.17s
601:	learn: 1032224.4000877	total: 1.77s	remaining: 1.17s
602:	learn: 1029523.8866168	total: 1.77s	remaining: 1.17s
603:	learn: 1024280.6589813	total: 1.78s	remaining: 1.16s
604:	learn: 1019600.6089674	total: 1.78s	remaining: 1.16s
605:	learn: 1014906.2696037	total: 1.78s	remaining: 1.16s
606:	learn: 1009344.0749100	total: 1.78s	remaining: 1.16s
607:	learn: 1005688.2953290	total: 1.79s	remaining: 1.15s
608:	learn: 1002739.9974486	total: 1.79s	remaining: 1.15s
609:	learn: 997989.7412669	total: 1.79s	remaining: 1.15s
610:	learn: 996924.7744773	total: 1.79s	remaining: 1.14s
611:	learn: 991621.1428004	total: 1.8s	remaining: 1.14s
612:	learn: 986381.5237984	total: 1.8s	remaining: 1.14s
613:	learn: 984068.0163071	total: 1.8s	remaining: 1.13s
614:	learn: 982074.9194553	total: 1.81s	remaining: 1.13s
615:	learn: 977270.0979519	total: 1.81s	remaining: 1.13s
616:	learn: 976112.8551535	total: 1.81s	remaining: 1.13s
617:	learn: 974618.776043

812:	learn: 573506.9008000	total: 2.36s	remaining: 543ms
813:	learn: 572183.0828416	total: 2.36s	remaining: 540ms
814:	learn: 570778.7702383	total: 2.37s	remaining: 537ms
815:	learn: 569318.7888063	total: 2.37s	remaining: 534ms
816:	learn: 568605.3303465	total: 2.37s	remaining: 531ms
817:	learn: 566815.6129665	total: 2.37s	remaining: 528ms
818:	learn: 565505.0634867	total: 2.38s	remaining: 525ms
819:	learn: 564219.2829060	total: 2.38s	remaining: 522ms
820:	learn: 563017.7979741	total: 2.38s	remaining: 519ms
821:	learn: 561084.7571229	total: 2.38s	remaining: 517ms
822:	learn: 559674.8409806	total: 2.39s	remaining: 514ms
823:	learn: 557945.6143617	total: 2.39s	remaining: 511ms
824:	learn: 555711.1611118	total: 2.39s	remaining: 508ms
825:	learn: 554854.8000495	total: 2.4s	remaining: 505ms
826:	learn: 553730.0236258	total: 2.4s	remaining: 502ms
827:	learn: 553227.4050619	total: 2.4s	remaining: 499ms
828:	learn: 551844.9385546	total: 2.4s	remaining: 496ms
829:	learn: 551100.1595089	total: 2

Unnamed: 0,Model Name,Train R2 Score,Test R2 Score,Train RMSE,Test RMSE,MAPE
0,Decision Tree,1.0,-0.131432,0.0,173871500.0,4.946363
1,Ridge,0.038847,0.046709,66212090.0,159597700.0,87.023791
2,Lasso,0.03885,0.04683,66212000.0,159587700.0,87.147648
3,Elastic Net,0.024806,0.035263,66693980.0,160553000.0,69.960886
4,Random Forest,0.738073,0.09296,34564600.0,155678000.0,12.631994
5,AdaBoost,0.996375,0.02487,4066214.0,161415600.0,9.886909
6,Gradient Boost,0.99661,-0.095808,3932100.0,171112300.0,15.947204
7,XGBoost,0.999999,-0.128183,59999.62,173621700.0,13.998462
8,CatBoost,0.999966,0.036259,393357.4,160470100.0,8.9496


* For outlier dataset none of the model performs well, So we will go with the Car dataset without outliers.