In [146]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso


In [111]:
train_data = pd.read_excel('/home/hasan/DATA SET/Airline Ticket Price/Data_Train.xlsx')

In [112]:
train_data.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [113]:
train_data.shape

(10683, 11)

In [114]:
train_data.dtypes

Airline            object
Date_of_Journey    object
Source             object
Destination        object
Route              object
Dep_Time           object
Arrival_Time       object
Duration           object
Total_Stops        object
Additional_Info    object
Price               int64
dtype: object

### Feature Engineering

##### Date_of_Journey column

In [115]:
#creating invidual column of Day, Month, Year
train_data['Day'] = train_data['Date_of_Journey'].str.split('/').str[0]
train_data['Month'] = train_data['Date_of_Journey'].str.split('/').str[1]
train_data['Year'] = train_data['Date_of_Journey'].str.split('/').str[2]

#converting to integer type of Day, Month, Year column
train_data['Day'] = train_data['Day'].astype(int)
train_data['Month'] = train_data['Month'].astype(int)
train_data['Year'] = train_data['Year'].astype(int)

#dropping Date_of_Journey column
train_data.drop(['Date_of_Journey'], axis=1, inplace=True)

In [116]:
train_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day,Month,Year
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3,2019
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5,2019
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,9,6,2019
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,12,5,2019
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,1,3,2019


##### Arrival_Time column

In [117]:
train_data['Arrival_Time'] = train_data['Arrival_Time'].str.split(' ').str[0]

In [118]:
train_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day,Month,Year
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10,2h 50m,non-stop,No info,3897,24,3,2019
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5,2019
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25,19h,2 stops,No info,13882,9,6,2019
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218,12,5,2019
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302,1,3,2019


##### null value checks

In [119]:
#checking null values
train_data.isnull().sum()

Airline            0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
Day                0
Month              0
Year               0
dtype: int64

In [120]:
train_data.dropna(inplace=True)

##### Total_Stops column

In [121]:
train_data['Total_Stops'].unique()

array(['non-stop', '2 stops', '1 stop', '3 stops', '4 stops'],
      dtype=object)

In [122]:
#replacing "non stop" with "0 stop"
train_data['Total_Stops'] = train_data['Total_Stops'].replace('non-stop', '0 stops')
train_data['Total_Stops'] = train_data['Total_Stops'].str.split(' ').str[0]
train_data['Total_Stops'] = train_data['Total_Stops'].astype(int)

In [123]:
train_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Day,Month,Year
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10,2h 50m,0,No info,3897,24,3,2019
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No info,7662,1,5,2019
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25,19h,2,No info,13882,9,6,2019
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1,No info,6218,12,5,2019
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1,No info,13302,1,3,2019


##### Arrival_Time column

In [124]:
#creating new column for Arrival_Time column
train_data['Arrival Hour'] = train_data['Arrival_Time'].str.split(':').str[0]
train_data['Arrival Minute'] = train_data['Arrival_Time'].str.split(':').str[1]

#converting to integer
train_data['Arrival Hour'] = train_data['Arrival Hour'].astype(int)
train_data['Arrival Minute'] = train_data['Arrival Minute'].astype(int)

#dropping Arrival_Time column
train_data.drop(['Arrival_Time'], axis=1, inplace=True)

In [125]:
train_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Duration,Total_Stops,Additional_Info,Price,Day,Month,Year,Arrival Hour,Arrival Minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,2h 50m,0,No info,3897,24,3,2019,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,7h 25m,2,No info,7662,1,5,2019,13,15
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,19h,2,No info,13882,9,6,2019,4,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,18:05,5h 25m,1,No info,6218,12,5,2019,23,30
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,16:50,4h 45m,1,No info,13302,1,3,2019,21,35


##### Dep_Time column

In [126]:
#splitting Dep_Time column
train_data['Dep Hour'] = train_data['Dep_Time'].str.split(':').str[0]
train_data['Dep Minute'] = train_data['Dep_Time'].str.split(':').str[1]

#changing type to integer
train_data['Dep Hour'] = train_data['Dep Hour'].astype(int)
train_data['Dep Minute'] = train_data['Dep Minute'].astype(int)

#dropping Dep_Time column
train_data.drop(['Dep_Time'], axis=1, inplace=True)


In [127]:
train_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Day,Month,Year,Arrival Hour,Arrival Minute,Dep Hour,Dep Minute
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,0,No info,3897,24,3,2019,1,10,22,20
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2,No info,7662,1,5,2019,13,15,5,50
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2,No info,13882,9,6,2019,4,25,9,25
3,IndiGo,Kolkata,Banglore,CCU → NAG → BLR,5h 25m,1,No info,6218,12,5,2019,23,30,18,5
4,IndiGo,Banglore,New Delhi,BLR → NAG → DEL,4h 45m,1,No info,13302,1,3,2019,21,35,16,50


##### "Route" column

In [128]:
train_data['Route'] = train_data['Route'].str.replace(' →','')

In [129]:
train_data.head()

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Day,Month,Year,Arrival Hour,Arrival Minute,Dep Hour,Dep Minute
0,IndiGo,Banglore,New Delhi,BLR DEL,2h 50m,0,No info,3897,24,3,2019,1,10,22,20
1,Air India,Kolkata,Banglore,CCU IXR BBI BLR,7h 25m,2,No info,7662,1,5,2019,13,15,5,50
2,Jet Airways,Delhi,Cochin,DEL LKO BOM COK,19h,2,No info,13882,9,6,2019,4,25,9,25
3,IndiGo,Kolkata,Banglore,CCU NAG BLR,5h 25m,1,No info,6218,12,5,2019,23,30,18,5
4,IndiGo,Banglore,New Delhi,BLR NAG DEL,4h 45m,1,No info,13302,1,3,2019,21,35,16,50


##### Dropping "Duration" and "Additional_Info" column

In [130]:
train_data.drop(['Duration','Additional_Info'], axis=1, inplace=True)

##### Making dummy data for "Airline", "Source", "Destination", "Route" column

In [131]:
airline = pd.get_dummies(train_data['Airline'])
source = pd.get_dummies(train_data['Source'])
destination = pd.get_dummies(train_data['Destination'])
route = pd.get_dummies(train_data['Route'])


In [132]:
train_df = pd.concat([airline, source, destination, route, train_data], axis=1)

In [133]:
train_df.drop(['Airline','Source','Destination','Route'], axis=1, inplace=True)

In [134]:
train_df.head()

Unnamed: 0,Air Asia,Air India,GoAir,IndiGo,Jet Airways,Jet Airways Business,Multiple carriers,Multiple carriers Premium economy,SpiceJet,Trujet,...,MAA CCU,Total_Stops,Price,Day,Month,Year,Arrival Hour,Arrival Minute,Dep Hour,Dep Minute
0,0,0,0,1,0,0,0,0,0,0,...,0,0,3897,24,3,2019,1,10,22,20
1,0,1,0,0,0,0,0,0,0,0,...,0,2,7662,1,5,2019,13,15,5,50
2,0,0,0,0,1,0,0,0,0,0,...,0,2,13882,9,6,2019,4,25,9,25
3,0,0,0,1,0,0,0,0,0,0,...,0,1,6218,12,5,2019,23,30,18,5
4,0,0,0,1,0,0,0,0,0,0,...,0,1,13302,1,3,2019,21,35,16,50


##### Dividing dataset for Feature and Label

In [135]:
X = train_df.drop(['Price'], axis=1)
y = train_df['Price']

In [136]:
Xtrain,xtest, Ytrain,ytest = train_test_split(X,y,test_size=.25, random_state=0)

##### Define Algorithm

In [137]:
rfr = RandomForestRegressor(n_estimators=120, random_state=42)
rfr.fit(Xtrain,Ytrain)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=120,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [140]:
predict = rfr.predict(xtest)
predict

array([12519.59652778,  5858.775     ,  3559.92      , ...,
        7773.55      ,  6870.29708333,  6511.43333333])

In [141]:
rfr.score(xtest,ytest)

0.8127533061877287

##### score suing cross_val_score

In [144]:
#using cross_val_score
acc = cross_val_score(rfr, X,y, cv=5)
acc

array([0.81683958, 0.82058652, 0.84286669, 0.82095355, 0.81930947])