# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Importing DataSet

In [2]:
dataset=pd.read_excel("Data_Train.xlsx")

In [3]:
df = pd.DataFrame(dataset)


In [4]:
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


# Data Preprocessing 

In [5]:
df.shape

(10683, 11)

In [6]:
df.isnull().sum()

Airline            0
Date_of_Journey    0
Source             0
Destination        0
Route              1
Dep_Time           0
Arrival_Time       0
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
dtype: int64

In [7]:
#removing Null value rows
dataset.dropna(inplace= True)
df = pd.DataFrame(dataset)

In [8]:
df['Additional_Info'].value_counts()


No info                         8344
In-flight meal not included     1982
No check-in baggage included     320
1 Long layover                    19
Change airports                    7
Business class                     4
No Info                            3
2 Long layover                     1
1 Short layover                    1
Red-eye flight                     1
Name: Additional_Info, dtype: int64

In [9]:
#removing Unnecessary columns
df.drop(columns= ["Additional_Info"], inplace =True ,axis=1)
df.drop(columns= ["Route"], inplace =True ,axis=1)

In [10]:
df.columns

Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Dep_Time',
       'Arrival_Time', 'Duration', 'Total_Stops', 'Price'],
      dtype='object')

In [11]:
#Converting the Date_of_Journey column to datetime object followed by exctraction of date and month
df['Date_of_Journey'] = pd.to_datetime(df['Date_of_Journey'])


In [12]:
print (df['Date_of_Journey'].dtype)

datetime64[ns]


In [13]:
df['month'] = df['Date_of_Journey'].dt.month
df['day'] = df['Date_of_Journey'].dt.day
df.drop(columns = {"Date_of_Journey"}, inplace= True, axis=1)
df.drop(columns = {'Arrival_Time'}, inplace = True, axis=1)

In [14]:
df.head()

Unnamed: 0,Airline,Source,Destination,Dep_Time,Duration,Total_Stops,Price,month,day
0,IndiGo,Banglore,New Delhi,22:20,2h 50m,non-stop,3897,3,24
1,Air India,Kolkata,Banglore,05:50,7h 25m,2 stops,7662,1,5
2,Jet Airways,Delhi,Cochin,09:25,19h,2 stops,13882,9,6
3,IndiGo,Kolkata,Banglore,18:05,5h 25m,1 stop,6218,12,5
4,IndiGo,Banglore,New Delhi,16:50,4h 45m,1 stop,13302,1,3


In [15]:
df['Total_Stops'].value_counts()

1 stop      5625
non-stop    3491
2 stops     1520
3 stops       45
4 stops        1
Name: Total_Stops, dtype: int64

In [16]:
# Replacing the Total_Stops columns to numeric data
df['Total_Stops'] = df['Total_Stops'].replace(['non-stop'],'0')
df['Total_Stops'] = df['Total_Stops'].replace(['1 stop'],'1')
df['Total_Stops'] = df['Total_Stops'].replace(['2 stops'],'2')
df['Total_Stops'] = df['Total_Stops'].replace(['3 stops'],'3')
df['Total_Stops'] = df['Total_Stops'].replace(['4 stops'],'4')

In [17]:
df.head()

Unnamed: 0,Airline,Source,Destination,Dep_Time,Duration,Total_Stops,Price,month,day
0,IndiGo,Banglore,New Delhi,22:20,2h 50m,0,3897,3,24
1,Air India,Kolkata,Banglore,05:50,7h 25m,2,7662,1,5
2,Jet Airways,Delhi,Cochin,09:25,19h,2,13882,9,6
3,IndiGo,Kolkata,Banglore,18:05,5h 25m,1,6218,12,5
4,IndiGo,Banglore,New Delhi,16:50,4h 45m,1,13302,1,3


In [18]:
#Rounding off the Dep_Time Column to only hours
time = pd.DatetimeIndex(df['Dep_Time'])

In [19]:
mn=time.minute
hr=time.hour

In [20]:
k =0
hour=([])
for m in mn:
    if m>=30:
        x = 1 + hr[k]
    else:
        x=hr[k]
    
    if x>=24:
        x=x-24
    hour.append(x)
    k +=1

In [21]:
df['Dep_Hour']=hour
df.drop(columns={"Dep_Time"}, inplace= True, axis=1)


In [22]:
df.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Price,month,day,Dep_Hour
0,IndiGo,Banglore,New Delhi,2h 50m,0,3897,3,24,22
1,Air India,Kolkata,Banglore,7h 25m,2,7662,1,5,6
2,Jet Airways,Delhi,Cochin,19h,2,13882,9,6,9
3,IndiGo,Kolkata,Banglore,5h 25m,1,6218,12,5,18
4,IndiGo,Banglore,New Delhi,4h 45m,1,13302,1,3,17


In [23]:
duration=df["Duration"]
duration = list(duration)
for i in range(len(duration)):
  if (len(duration[i].split()) !=2):
    if "h" in duration[i]:
      duration[i]=duration[i]+ " 0m"
    else:
      duration[i]= "0h "+ duration[i]



In [24]:
#extracting the duration in hours and mins and converting it to floating value of hours
duration_hours=[]
duration_mins=[]

for i in range(len(duration)):
  duration_hours.append(duration[i].split(sep='h')[0])
  duration_mins.append(duration[i].split(sep=' ')[-1].split(sep='m')[0])



In [25]:
k=0
x=[None]*len(duration_hours)
for hr in duration_hours:
    x[k]=(int(hr)*60+int(duration_mins[k]))
    k=k+1

In [26]:
df['Duration']=x

In [27]:
df.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Price,month,day,Dep_Hour
0,IndiGo,Banglore,New Delhi,170,0,3897,3,24,22
1,Air India,Kolkata,Banglore,445,2,7662,1,5,6
2,Jet Airways,Delhi,Cochin,1140,2,13882,9,6,9
3,IndiGo,Kolkata,Banglore,325,1,6218,12,5,18
4,IndiGo,Banglore,New Delhi,285,1,13302,1,3,17


In [28]:
print(df.columns.tolist())

['Airline', 'Source', 'Destination', 'Duration', 'Total_Stops', 'Price', 'month', 'day', 'Dep_Hour']


In [29]:
# Reaggangement of Columns
df=df[['Airline', 'Source', 'Destination', 'Duration', 'Total_Stops', 'month', 'day', 'Dep_Hour', 'Price']]

In [30]:
df.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,month,day,Dep_Hour,Price
0,IndiGo,Banglore,New Delhi,170,0,3,24,22,3897
1,Air India,Kolkata,Banglore,445,2,1,5,6,7662
2,Jet Airways,Delhi,Cochin,1140,2,9,6,9,13882
3,IndiGo,Kolkata,Banglore,325,1,12,5,18,6218
4,IndiGo,Banglore,New Delhi,285,1,1,3,17,13302


In [31]:
df['Airline'].value_counts()

Jet Airways                          3849
IndiGo                               2053
Air India                            1751
Multiple carriers                    1196
SpiceJet                              818
Vistara                               479
Air Asia                              319
GoAir                                 194
Multiple carriers Premium economy      13
Jet Airways Business                    6
Vistara Premium economy                 3
Trujet                                  1
Name: Airline, dtype: int64

In [32]:
df['Source'].value_counts()

Delhi       4536
Kolkata     2871
Banglore    2197
Mumbai       697
Chennai      381
Name: Source, dtype: int64

In [33]:
df['Destination'].value_counts()

Cochin       4536
Banglore     2871
Delhi        1265
New Delhi     932
Hyderabad     697
Kolkata       381
Name: Destination, dtype: int64

In [34]:
df.shape

(10682, 9)

In [35]:
df['Destination'] = df['Destination'].replace(['New Delhi'],'Delhi')

In [36]:
df['Destination'].value_counts()

Cochin       4536
Banglore     2871
Delhi        2197
Hyderabad     697
Kolkata       381
Name: Destination, dtype: int64

In [37]:
#Applying One hot encoding on columns=["Airline", "Source", "Destination"]
encoded_df=pd.get_dummies(data=df, columns=["Airline", "Source", "Destination"], drop_first=True)

In [38]:
encoded_df.shape

(10682, 25)

In [39]:
encoded_df.columns

Index(['Duration', 'Total_Stops', 'month', 'day', 'Dep_Hour', 'Price',
       'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo',
       'Airline_Jet Airways', 'Airline_Jet Airways Business',
       'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
       'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai',
       'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad',
       'Destination_Kolkata'],
      dtype='object')

In [40]:
# Rearrangement of Columns 
encoded_df=encoded_df[['Duration', 'Total_Stops', 'month', 'day', 'Dep_Hour',
       'Airline_Air India', 'Airline_GoAir', 'Airline_IndiGo',
       'Airline_Jet Airways', 'Airline_Jet Airways Business',
       'Airline_Multiple carriers',
       'Airline_Multiple carriers Premium economy', 'Airline_SpiceJet',
       'Airline_Trujet', 'Airline_Vistara', 'Airline_Vistara Premium economy',
       'Source_Chennai', 'Source_Delhi', 'Source_Kolkata', 'Source_Mumbai',
       'Destination_Cochin', 'Destination_Delhi', 'Destination_Hyderabad',
       'Destination_Kolkata', 'Price']]

In [41]:
e_df=pd.DataFrame(encoded_df)
e_df.shape

(10682, 25)

In [42]:
#Splitting dataset into X and Y followed by X_test,X_train,y_test,y_train
X=e_df.iloc[:,:-1].values
y=e_df.iloc[:,-1].values

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# Training and Testing of Regression Model

### DecisionTreeClassifier

In [44]:
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
regressor1=DecisionTreeClassifier()
regressor1.fit(X_train,y_train)

DecisionTreeClassifier()

In [45]:
regressor1.score(X_test,y_test)

0.29761347683668693

In [46]:
preds1 = regressor1.predict(X_test)
r1_score=mean_squared_error(y_test, preds1)

In [47]:
r1_score

8518200.388862891

### ExtraTreesRegressor

In [48]:
from sklearn.ensemble import ExtraTreesRegressor
regressor2= ExtraTreesRegressor()
regressor2.fit(X_train,y_train)

ExtraTreesRegressor()

In [49]:
regressor2.score(X_test,y_test)

0.7418433572191793

In [50]:
preds2 = regressor2.predict(X_test)
r2_score=mean_squared_error(y_test, preds2)

In [51]:
r2_score

5294970.977149526

### RandomForestRegressor

In [52]:
from sklearn.ensemble import RandomForestRegressor
regressor3= RandomForestRegressor()
regressor3.fit(X_train,y_train)

RandomForestRegressor()

In [53]:
regressor3.score(X_test,y_test)

0.7914474936930852

In [54]:
preds3 = regressor3.predict(X_test)
r3_score=mean_squared_error(y_test, preds3)

In [55]:
r3_score

4277555.890918751

### XGBRegressor

In [56]:
from xgboost import XGBRegressor
regressor4 = XGBRegressor()
regressor4.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [57]:
regressor4.score(X_test,y_test)

0.8300710926053367

In [58]:
preds4 = regressor4.predict(X_test)
r4_score=mean_squared_error(y_test, preds4)

In [59]:
r4_score

3485359.2111413917

### As observed from the score of various Regressors, we find that XGBRegressor gives the best score.
### So we will be implementing XGBRegressor in our Web App which will be hosted using Heroku

In [60]:
from xgboost import XGBRegressor
model = XGBRegressor()
model.fit(X,y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [61]:
import pickle
filename = 'finalized_model.pkl'
pickle.dump(model, open(filename, 'wb'))

In [62]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.9138419727179724


## The final XGBRegressor model is saved as pickle file named- finalized_model.pkl