In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
#loading the excel file into DataFrane
df1=pd.read_excel('flightdata-train.xlsx')
df2=pd.read_excel('flightdata-test.xlsx')

In [3]:
df1.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [4]:
df2.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info
0,Jet Airways,6/06/2019,Delhi,Cochin,DEL → BOM → COK,17:30,04:25 07 Jun,10h 55m,1 stop,No info
1,IndiGo,12/05/2019,Kolkata,Banglore,CCU → MAA → BLR,06:20,10:20,4h,1 stop,No info
2,Jet Airways,21/05/2019,Delhi,Cochin,DEL → BOM → COK,19:15,19:00 22 May,23h 45m,1 stop,In-flight meal not included
3,Multiple carriers,21/05/2019,Delhi,Cochin,DEL → BOM → COK,08:00,21:00,13h,1 stop,No info
4,Air Asia,24/06/2019,Banglore,Delhi,BLR → DEL,23:55,02:45 25 Jun,2h 50m,non-stop,No info


In [5]:
df1.dtypes

Airline            object
Date_of_Journey    object
Source             object
Destination        object
Route              object
Dep_Time           object
Arrival_Time       object
Duration           object
Total_Stops        object
Additional_Info    object
Price               int64
dtype: object

In [6]:
df1.shape

(10683, 11)

In [7]:
df1['Airline'].unique()

array(['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'], dtype=object)

In [8]:
df1['Source'].unique()

array(['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai'], dtype=object)

In [9]:
df1['Destination'].unique()

array(['New Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Delhi', 'Hyderabad'],
      dtype=object)

In [10]:
df1['Route'].unique()

array(['BLR → DEL', 'CCU → IXR → BBI → BLR', 'DEL → LKO → BOM → COK',
       'CCU → NAG → BLR', 'BLR → NAG → DEL', 'CCU → BLR',
       'BLR → BOM → DEL', 'DEL → BOM → COK', 'DEL → BLR → COK',
       'MAA → CCU', 'CCU → BOM → BLR', 'DEL → AMD → BOM → COK',
       'DEL → PNQ → COK', 'DEL → CCU → BOM → COK', 'BLR → COK → DEL',
       'DEL → IDR → BOM → COK', 'DEL → LKO → COK',
       'CCU → GAU → DEL → BLR', 'DEL → NAG → BOM → COK',
       'CCU → MAA → BLR', 'DEL → HYD → COK', 'CCU → HYD → BLR',
       'DEL → COK', 'CCU → DEL → BLR', 'BLR → BOM → AMD → DEL',
       'BOM → DEL → HYD', 'DEL → MAA → COK', 'BOM → HYD',
       'DEL → BHO → BOM → COK', 'DEL → JAI → BOM → COK',
       'DEL → ATQ → BOM → COK', 'DEL → JDH → BOM → COK',
       'CCU → BBI → BOM → BLR', 'BLR → MAA → DEL',
       'DEL → GOI → BOM → COK', 'DEL → BDQ → BOM → COK',
       'CCU → JAI → BOM → BLR', 'CCU → BBI → BLR', 'BLR → HYD → DEL',
       'DEL → TRV → COK', 'CCU → IXR → DEL → BLR',
       'DEL → IXU → BOM → COK', 'CCU 

In [11]:
#2 null values present
df1.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [12]:
#since the count is less ,hence dropping the nul values
df1=df1.dropna(subset=['Route'])
df1=df1.dropna(subset=['Total_Stops'])

In [13]:
df1.dropna(inplace=True)

In [14]:
df1.shape

(10682, 11)

In [15]:
#Dropping the duplicate rows
duplicate=df1[df1.duplicated()]

In [16]:
duplicate.shape

(220, 11)

In [17]:
df1.drop_duplicates(keep='first',inplace=True)

In [18]:
df1.shape

(10462, 11)

In [19]:
df1['Additional_Info'].value_counts()

No info                         8182
In-flight meal not included     1926
No check-in baggage included     318
1 Long layover                    19
Change airports                    7
Business class                     4
No Info                            3
Red-eye flight                     1
1 Short layover                    1
2 Long layover                     1
Name: Additional_Info, dtype: int64

In [20]:
df1['Additional_Info'].value_counts()

No info                         8182
In-flight meal not included     1926
No check-in baggage included     318
1 Long layover                    19
Change airports                    7
Business class                     4
No Info                            3
Red-eye flight                     1
1 Short layover                    1
2 Long layover                     1
Name: Additional_Info, dtype: int64

In [21]:
#Replacing No info with NO Info as they both are similar
df1['Additional_Info']=df1['Additional_Info'].replace({'No info':'No Info'})
df2['Additional_Info']=df2['Additional_Info'].replace({'No info':'No Info'})

In [22]:
#Replacing Delhi with New Delhi as they both are the same state
df1["Destination"] = df1["Destination"].replace({'New Delhi': 'Delhi'})
df2["Destination"] = df2["Destination"].replace({'New Delhi': 'Delhi'})

In [23]:
df1['Total_Stops'].value_counts()

1 stop      5625
non-stop    3475
2 stops     1318
3 stops       43
4 stops        1
Name: Total_Stops, dtype: int64

In [24]:
#mapping teh values for total_stops
title_mapping={'1 stop':'1','non-stop':'2','2 stops':'2','3 stops':'3','4 stops':'4'}
df1["Total_Stops"]=df1["Total_Stops"].map(title_mapping)
df2["Total_Stops"]=df2["Total_Stops"].map(title_mapping)

In [25]:
df1.head(10)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,2,No Info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No Info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2,No Info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No Info,6218
4,IndiGo,01/03/2019,Banglore,Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No Info,13302
5,SpiceJet,24/06/2019,Kolkata,Banglore,CCU → BLR,09:00,11:25,2h 25m,2,No Info,3873
6,Jet Airways,12/03/2019,Banglore,Delhi,BLR → BOM → DEL,18:55,10:25 13 Mar,15h 30m,1,In-flight meal not included,11087
7,Jet Airways,01/03/2019,Banglore,Delhi,BLR → BOM → DEL,08:00,05:05 02 Mar,21h 5m,1,No Info,22270
8,Jet Airways,12/03/2019,Banglore,Delhi,BLR → BOM → DEL,08:55,10:25 13 Mar,25h 30m,1,In-flight meal not included,11087
9,Multiple carriers,27/05/2019,Delhi,Cochin,DEL → BOM → COK,11:25,19:15,7h 50m,1,No Info,8625


In [26]:
pd.to_datetime(df1['Date_of_Journey'], format = '%d/%m/%Y').dt.day_name()

0           Sunday
1        Wednesday
2           Sunday
3           Sunday
4           Friday
           ...    
10678      Tuesday
10679     Saturday
10680     Saturday
10681       Friday
10682     Thursday
Name: Date_of_Journey, Length: 10462, dtype: object

In [27]:
#Conertion the Date of Journey to day month and Year and also checking whether the day of week is a weekend or not
df1['isWeekend']=((pd.to_datetime(df1['Date_of_Journey'], format = '%d/%m/%Y').dt.dayofweek)//5==1).astype(int)

In [28]:
df2['isWeekend']=((pd.to_datetime(df2['Date_of_Journey'], format = '%d/%m/%Y').dt.dayofweek)//5==1).astype(int)

In [29]:
df1['Day']=pd.to_datetime(df1['Date_of_Journey'], format = '%d/%m/%Y').dt.day_name()

In [30]:
df1['Month']=pd.to_datetime(df1['Date_of_Journey'], format = '%d/%m/%Y').dt.month

In [31]:
df2['Day']=pd.to_datetime(df2['Date_of_Journey'], format = '%d/%m/%Y').dt.day_name()
df2['Month']=pd.to_datetime(df2['Date_of_Journey'], format = '%d/%m/%Y').dt.month

In [32]:
df1.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,isWeekend,Day,Month
0,IndiGo,24/03/2019,Banglore,Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,2,No Info,3897,1,Sunday,3
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No Info,7662,0,Wednesday,5
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2,No Info,13882,1,Sunday,6
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No Info,6218,1,Sunday,5
4,IndiGo,01/03/2019,Banglore,Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No Info,13302,0,Friday,3


In [33]:
#Extracting the hour and minute from the Dep_time and Arrival_time
df1['Depart_Time_Hour'] = pd.to_datetime(df1.Dep_Time).dt.hour
df1['Depart_Time_Minute'] = pd.to_datetime(df1.Dep_Time).dt.minute
df1['Arrival_Time_Hour'] = pd.to_datetime(df1.Arrival_Time).dt.hour
df1['Arrival_Time_Minute'] = pd.to_datetime(df1.Arrival_Time).dt.minute


In [34]:
df2['Depart_Time_Hour'] = pd.to_datetime(df2.Dep_Time).dt.hour
df2['Depart_Time_Minute'] = pd.to_datetime(df2.Dep_Time).dt.minute
df2['Arrival_Time_Hour'] = pd.to_datetime(df2.Arrival_Time).dt.hour
df2['Arrival_Time_Minute'] = pd.to_datetime(df2.Arrival_Time).dt.minute

In [35]:
#Converting the duration into minutes
duration = list(df1["Duration"])
for i in range (len(duration)):
    if(duration[i].split())!=2:
        if 'h' in duration[i]:
            duration[i]=duration[i].strip() + ' 0m'
        else:
            if 'm' in duration[i]:
                duration[i]='0h {}'.format(duration[i].strip())
dur_hours=[]
dur_minutes=[]
dur_seconds=[]

for i in range (len(duration)):
    dur_minutes.append(int(duration[i].split()[0][:-1])*60 + int(duration[i].split()[1][:-1]))
df1["Duration_minutes"] = dur_minutes

In [36]:
duration = list(df2["Duration"])
for i in range (len(duration)):
    if(duration[i].split())!=2:
        if 'h' in duration[i]:
            duration[i]=duration[i].strip() + ' 0m'
        else:
            if 'm' in duration[i]:
                duration[i]='0h {}'.format(duration[i].strip())
dur_hours=[]
dur_minutes=[]
dur_seconds=[]

for i in range (len(duration)):
    dur_minutes.append(int(duration[i].split()[0][:-1])*60 + int(duration[i].split()[1][:-1]))
df2["Duration_minutes"] = dur_minutes

In [37]:
df1.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,isWeekend,Day,Month,Depart_Time_Hour,Depart_Time_Minute,Arrival_Time_Hour,Arrival_Time_Minute,Duration_minutes
0,IndiGo,24/03/2019,Banglore,Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,2,No Info,3897,1,Sunday,3,22,20,1,10,170
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No Info,7662,0,Wednesday,5,5,50,13,15,445
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2,No Info,13882,1,Sunday,6,9,25,4,25,1140
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No Info,6218,1,Sunday,5,18,5,23,30,325
4,IndiGo,01/03/2019,Banglore,Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No Info,13302,0,Friday,3,16,50,21,35,285


In [38]:
df1.drop(['Date_of_Journey','Dep_Time','Arrival_Time','Duration'],axis=1,inplace=True)
df2.drop(['Date_of_Journey','Dep_Time','Arrival_Time','Duration'],axis=1,inplace=True)

In [39]:
x = df1.drop(["Price"], axis=1)

In [40]:
x.head()

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,isWeekend,Day,Month,Depart_Time_Hour,Depart_Time_Minute,Arrival_Time_Hour,Arrival_Time_Minute,Duration_minutes
0,IndiGo,Banglore,Delhi,BLR → DEL,2,No Info,1,Sunday,3,22,20,1,10,170
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,2,No Info,0,Wednesday,5,5,50,13,15,445
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,2,No Info,1,Sunday,6,9,25,4,25,1140
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,1,No Info,1,Sunday,5,18,5,23,30,325
4,IndiGo,Banglore,Delhi,BLR → NAG → DEL,1,No Info,0,Friday,3,16,50,21,35,285


In [41]:
y = np.log(df1["Price"])

In [42]:
x_cat=x[['Airline','Source','Destination','Route','Additional_Info','Day']]

In [43]:
x_cat.head()

Unnamed: 0,Airline,Source,Destination,Route,Additional_Info,Day
0,IndiGo,Banglore,Delhi,BLR → DEL,No Info,Sunday
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,No Info,Wednesday
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,No Info,Sunday
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,No Info,Sunday
4,IndiGo,Banglore,Delhi,BLR → NAG → DEL,No Info,Friday


In [44]:
x_cat.shape

(10462, 6)

In [45]:
#Encoding the categorical variable using LabelEncoder
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
x_cat=x_cat.apply(le.fit_transform)

In [46]:
x_cat.head()

Unnamed: 0,Airline,Source,Destination,Route,Additional_Info,Day
0,3,0,2,18,6,3
1,1,3,0,84,6,6
2,4,2,1,118,6,3
3,3,3,0,91,6,3
4,3,0,2,29,6,0


In [47]:
x_numerical = x.select_dtypes(include=['int64','int32','float'])

In [48]:
x_numerical.shape

(10462, 7)

In [49]:
x_numerical.head()

Unnamed: 0,isWeekend,Month,Depart_Time_Hour,Depart_Time_Minute,Arrival_Time_Hour,Arrival_Time_Minute,Duration_minutes
0,1,3,22,20,1,10,170
1,0,5,5,50,13,15,445
2,1,6,9,25,4,25,1140
3,1,5,18,5,23,30,325
4,0,3,16,50,21,35,285


In [50]:
x.dtypes

Airline                object
Source                 object
Destination            object
Route                  object
Total_Stops            object
Additional_Info        object
isWeekend               int32
Day                    object
Month                   int64
Depart_Time_Hour        int64
Depart_Time_Minute      int64
Arrival_Time_Hour       int64
Arrival_Time_Minute     int64
Duration_minutes        int64
dtype: object

In [51]:
df2.head()

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,isWeekend,Day,Month,Depart_Time_Hour,Depart_Time_Minute,Arrival_Time_Hour,Arrival_Time_Minute,Duration_minutes
0,Jet Airways,Delhi,Cochin,DEL → BOM → COK,1,No Info,0,Thursday,6,17,30,4,25,655
1,IndiGo,Kolkata,Banglore,CCU → MAA → BLR,1,No Info,1,Sunday,5,6,20,10,20,240
2,Jet Airways,Delhi,Cochin,DEL → BOM → COK,1,In-flight meal not included,0,Tuesday,5,19,15,19,0,1425
3,Multiple carriers,Delhi,Cochin,DEL → BOM → COK,1,No Info,0,Tuesday,5,8,0,21,0,780
4,Air Asia,Banglore,Delhi,BLR → DEL,2,No Info,0,Monday,6,23,55,2,45,170


In [52]:
xt=df2

In [53]:
xt.head()

Unnamed: 0,Airline,Source,Destination,Route,Total_Stops,Additional_Info,isWeekend,Day,Month,Depart_Time_Hour,Depart_Time_Minute,Arrival_Time_Hour,Arrival_Time_Minute,Duration_minutes
0,Jet Airways,Delhi,Cochin,DEL → BOM → COK,1,No Info,0,Thursday,6,17,30,4,25,655
1,IndiGo,Kolkata,Banglore,CCU → MAA → BLR,1,No Info,1,Sunday,5,6,20,10,20,240
2,Jet Airways,Delhi,Cochin,DEL → BOM → COK,1,In-flight meal not included,0,Tuesday,5,19,15,19,0,1425
3,Multiple carriers,Delhi,Cochin,DEL → BOM → COK,1,No Info,0,Tuesday,5,8,0,21,0,780
4,Air Asia,Banglore,Delhi,BLR → DEL,2,No Info,0,Monday,6,23,55,2,45,170


In [54]:
xt_cat=x[['Airline','Source','Destination','Route','Additional_Info','Day']]

In [55]:
xt.shape

(2671, 14)

In [56]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
xt_cat=xt_cat.apply(le.fit_transform)

In [57]:
xt.dtypes

Airline                object
Source                 object
Destination            object
Route                  object
Total_Stops            object
Additional_Info        object
isWeekend               int32
Day                    object
Month                   int64
Depart_Time_Hour        int64
Depart_Time_Minute      int64
Arrival_Time_Hour       int64
Arrival_Time_Minute     int64
Duration_minutes        int64
dtype: object

In [58]:
xt_numerical=xt.select_dtypes(include=['int32','int64','float'])
xt_numerical.head()

Unnamed: 0,isWeekend,Month,Depart_Time_Hour,Depart_Time_Minute,Arrival_Time_Hour,Arrival_Time_Minute,Duration_minutes
0,0,6,17,30,4,25,655
1,1,5,6,20,10,20,240
2,0,5,19,15,19,0,1425
3,0,5,8,0,21,0,780
4,0,6,23,55,2,45,170


In [59]:
x = pd.concat([x_cat, x_numerical], axis=1)
xt =pd.concat([xt_cat, x_numerical], axis=1)

In [60]:
print(x.shape)
print(xt.shape)

(10462, 13)
(10462, 13)


In [61]:
max_rscore=0
for r_state in range(42,101):
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.30,random_state=r_state)
    lr=LinearRegression()
    lr.fit(x_train,y_train)
    pred=lr.predict(x_test)
    r_scr=r2_score(y_test,pred)
    print("R2_score corresponding to random state:",r_state,"is:",r_scr)
    if r_scr>max_rscore:
        max_rscore=r_scr
        final_r_state=r_state 
print()
print("max accuracy score corresponding to r_state is",final_r_state,"is:",max_rscore)

R2_score corresponding to random state: 42 is: 0.44134217543511756
R2_score corresponding to random state: 43 is: 0.42619566952061705
R2_score corresponding to random state: 44 is: 0.4294951664176596
R2_score corresponding to random state: 45 is: 0.4579255865840447
R2_score corresponding to random state: 46 is: 0.4293071893428301
R2_score corresponding to random state: 47 is: 0.44165974212626713
R2_score corresponding to random state: 48 is: 0.4517891737115123
R2_score corresponding to random state: 49 is: 0.45709619165559734
R2_score corresponding to random state: 50 is: 0.44676026066432706
R2_score corresponding to random state: 51 is: 0.4354488626441195
R2_score corresponding to random state: 52 is: 0.44985947453995623
R2_score corresponding to random state: 53 is: 0.4237309731095982
R2_score corresponding to random state: 54 is: 0.43091292908302226
R2_score corresponding to random state: 55 is: 0.4285994084407434
R2_score corresponding to random state: 56 is: 0.4451590666917974
R2_

In [62]:
#Applying Random Forest Regressor to Check the score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import f1_score
rfc=RandomForestRegressor()
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': [5,10, 15, 20, 25, None],    
    'min_samples_leaf': [1, 2, 4, 6],
    'min_samples_split': [2, 5, 10, 20],
    'n_estimators': [25, 50, 100, 200]}

grid_search = GridSearchCV(rfc, params, cv = 3)

In [63]:
grid_search.fit(x_train,y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [5, 10, 15, 20, 25

In [64]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=5,
                      min_weight_fraction_leaf=0.0, n_estimators=200,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [65]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
rfc = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=5,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
rfc.fit(x_train, y_train) 

def evaluate(x, y, dataset):
    pred = rfc.predict(x)
    score = rfc.score(x, y)
    rmsle=(np.sqrt(mean_squared_log_error(y,pred)))
    print("The score is:",score)
    print("The root mean squared error is:",rmsle)
        
evaluate(x_train, y_train, 'training')
evaluate(x_test, y_test, 'test')

The score is: 0.9642031175241266
The root mean squared error is: 0.00970502511665252
The score is: 0.8705386540272394
The root mean squared error is: 0.018503458470903567


In [66]:
y_predictions = rfc.predict(xt)

In [67]:
#Flight Price for the Test dataset
y_predictions = pd.DataFrame(y_predictions, columns = ['Prices'])
print(y_predictions)

         Prices
0      8.621118
1      9.041455
2      9.538023
3      8.772184
4      9.315361
...         ...
10457  8.340388
10458  8.400045
10459  8.885856
10460  9.205662
10461  9.408699

[10462 rows x 1 columns]


In [68]:
from sklearn.externals import joblib
joblib.dump(model_new,'flight_prices__regression.pkl')

NameError: name 'model_new' is not defined