In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
data=pd.read_csv("data_feat.csv",index_col=['Unnamed: 0'])

X=data.drop('Price',axis=1)
y=data['Price']

X.head(2)
y.head(2)

Unnamed: 0,Airline,Destination,Duration,Total_Stops,Dep_Hour,Dep_Day,Dep_Min,Journey_DOW,Source_Banglore,Source_Kolkata,Source_Delhi,Source_Chennai,Source_Mumbai
0,3,2,170,0,22,24,20,6,1,0,0,0,0
1,8,3,445,2,5,5,50,5,0,1,0,0,0


0    3897.0
1    7662.0
Name: Price, dtype: float64

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test,y_train,y_test= train_test_split(
   X, y, test_size=0.25, random_state=42)

In [4]:
from sklearn.ensemble import RandomForestRegressor

RF_model= RandomForestRegressor()
RF_model.fit(X_train,y_train)

In [5]:
y_pred=RF_model.predict(X_test)
y_pred

array([15672.41,  5641.19,  9018.7 , ...,  4024.72,  7414.86,  6762.  ])

In [6]:
from sklearn import metrics

In [7]:
metrics.r2_score(y_test,y_pred)

0.8072514621791214

Next we tune the hyper parameters. For a Random Forest, the hyper parameters are:
1. Number of trees to be aggregated over.
2. Number of features to consider while performing a split
3. Number of samples used to split a node.
4. Maximum depth of the tree.

In [38]:
from sklearn.model_selection import RandomizedSearchCV

RF_reg = RandomForestRegressor()

n_trees = np.linspace(start=100,stop=1200,num=6).astype(int)
max_feat =['log2', 'sqrt']
min_samples = np.linspace(start=10,stop=1200,num=120).astype(int)
max_depth =np.linspace(start=5,stop=25,num=6).astype(int)

#The dictionary keys should match with the arguments taken by RandomForestRegressor()

param_dict = {'n_estimators':n_trees,
              'max_features':max_feat,
              'max_depth':max_depth,
              'min_samples_split':min_samples
              }
param_dict

{'n_estimators': array([ 100,  320,  540,  760,  980, 1200]),
 'max_features': ['log2', 'sqrt'],
 'max_depth': array([ 5,  9, 13, 17, 21, 25]),
 'min_samples_split': array([  10,   20,   30,   40,   50,   60,   70,   80,   90,  100,  110,
         120,  130,  140,  150,  160,  170,  180,  190,  200,  210,  220,
         230,  240,  250,  260,  270,  280,  290,  300,  310,  320,  330,
         340,  350,  360,  370,  380,  390,  400,  410,  420,  430,  440,
         450,  460,  470,  480,  490,  500,  510,  520,  530,  540,  550,
         560,  570,  580,  590,  600,  610,  620,  630,  640,  650,  660,
         670,  680,  690,  700,  710,  720,  730,  740,  750,  760,  770,
         780,  790,  800,  810,  820,  830,  840,  850,  860,  870,  880,
         890,  900,  910,  920,  930,  940,  950,  960,  970,  980,  990,
        1000, 1010, 1020, 1030, 1040, 1050, 1060, 1070, 1080, 1090, 1100,
        1110, 1120, 1130, 1140, 1150, 1160, 1170, 1180, 1190, 1200])}

In [39]:
rf_cv= RandomizedSearchCV(estimator=RF_reg, param_distributions=param_dict,n_jobs=-1,verbose=1,n_iter=50,cv=3)

In [40]:
rf_cv.fit(X_train,y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [41]:
rf_cv.best_estimator_
rf_best= rf_cv.best_estimator_
rf_cv.best_score_

0.7497351172361646

In [42]:
rf_pred = rf_best.predict(X_test)
metrics.r2_score(rf_pred,y_test)

0.6520917742098761

Next we train a  Lasso Regression Model and evaluate it's performance.

In [62]:
from sklearn.linear_model import Lasso

Lass_reg = Lasso()
alpha_range= {'alpha':np.linspace(start=10,stop=200,num=20)}
Lasso_CV =RandomizedSearchCV(estimator=Lass_reg,param_distributions=alpha_range,n_iter=150,cv=5,refit=True)
Lasso_CV.fit(X_train,y_train)



In [64]:
Lasso_best = Lasso_CV.best_estimator_
Lasso_CV.best_params_
Lasso_CV.best_score_

{'alpha': 10.0}

0.573849625822491

Random Forest Performs better, as far as R2 scores are concerned.

Saving the Model.

In [68]:
import pickle

file = open(r'rf_pickle.pkl','wb')
pickle.dump(rf_best,file)