In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score 
from sklearn.metrics import accuracy_score

import mlflow
import warnings
warnings.filterwarnings('ignore')

import pickle

In [3]:
df = pd.read_csv('flight.csv')
df.head(10)

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955
5,5,Vistara,UK-945,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.33,1,5955
6,6,Vistara,UK-927,Delhi,Morning,zero,Morning,Mumbai,Economy,2.08,1,6060
7,7,Vistara,UK-951,Delhi,Afternoon,zero,Evening,Mumbai,Economy,2.17,1,6060
8,8,GO_FIRST,G8-334,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.17,1,5954
9,9,GO_FIRST,G8-336,Delhi,Afternoon,zero,Evening,Mumbai,Economy,2.25,1,5954


In [4]:
df.shape

(300153, 12)

#Data Cleaning
1. Handling Missing values
2. Handling Missing values
3. Handling Duplicates
4. Check data type
5. Understand the dataset

In [5]:
#Check features with nan value
df.isnull().sum()

Unnamed: 0          0
airline             0
flight              0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64

In [6]:
#df['flight'].unique.sum()
df['flight'].unique().sum()

'SG-8709SG-8157I5-764UK-995UK-963UK-945UK-927UK-951G8-334G8-336G8-392G8-3386E-50016E-62026E-5496E-6278AI-887AI-665I5-747G8-266G8-101G8-103AI-4416E-5328UK-9336E-2046I5-744SG-81696E-5041G8-1656E-2373UK-813UK-817UK-819UK-801UK-815AI-453SG-2976AI-504AI-502AI-506AI-803AI-479SG-339UK-955UK-627I5-784AI-9643AI-540AI-429AI-439AI-96456E-21936E-21686E-1526E-369UK-899AI-764UK-747UK-809UK-7376E-2338G8-237UK-871AI-762G8-1404AI-512AI-537UK-9776E-184SG-30026E-2102AI-801UK-637UK-835AI-531UK-705UK-707UK-673AI-839UK-879G8-191AI-767AI-401AI-473G8-213AI-409UK-837AI-877SG-8803UK-985UK-953G8-346G8-330G8-3236E-2186E-6722AI-868AI-805AI-624G8-188AI-6366E-2022AI-469AI-542AI-560UK-683AI-4036E-2154AI-9843AI-5446E-282UK-859UK-829AI-98576E-607AI-885SG-2277G8-719G8-119G8-717UK-706AI-4656E-5018UK-839UK-833AI-406AI-9809G8-286UK-847AI-411G8-113SG-8483AI-4756E-1816E-2092AI-99116E-50636E-7403UK-855AI-807G8-300AI-811AI-431SG-1089SG-1061SG-10636E-2054AI-678G8-1010AI-435AI-499AI-9887AI-483G8-357AI-451AI-471SG-1091UK-981UK-97

In [7]:
pd.set_option('display.max_columns', None )  #you can put any number in place of 'None' to see specific number of columns.


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        300153 non-null  int64  
 1   airline           300153 non-null  object 
 2   flight            300153 non-null  object 
 3   source_city       300153 non-null  object 
 4   departure_time    300153 non-null  object 
 5   stops             300153 non-null  object 
 6   arrival_time      300153 non-null  object 
 7   destination_city  300153 non-null  object 
 8   class             300153 non-null  object 
 9   duration          300153 non-null  float64
 10  days_left         300153 non-null  int64  
 11  price             300153 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 27.5+ MB


In [9]:
backup=df.copy()

In [10]:
## Remove Unnecessary Columns
df.drop('Unnamed: 0', axis=1, inplace=True)
df.drop('flight', axis=1, inplace=True)

In [11]:
df.rename(columns={'class': 'classes'}, inplace=True)

In [12]:
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of Numerical Feature columns :', len(num_features))
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print('Num of Categorical Feature columns :', len(cat_features))
discrete_features=[feature for feature in num_features if len(df[feature].unique())<=25]
print('Num of Discrete Feature columns :',len(discrete_features))
continuous_features=[feature for feature in num_features if feature not in discrete_features]
print('Num of Continuous Feature columns :',len(continuous_features))

Num of Numerical Feature columns : 3
Num of Categorical Feature columns : 7
Num of Discrete Feature columns : 0
Num of Continuous Feature columns : 3


In [13]:
#number of features for categorical columns:
print('airline:', len(df['airline'].unique()))
print('source_city:', len(df['source_city'].unique()))
print('departure_time:', len(df['departure_time'].unique()))
print('stops:', len(df['stops'].unique()))
print('arrival_time:', len(df['arrival_time'].unique()))
print('destination_city:', len(df['destination_city'].unique()))
print('classes:', len(df['classes'].unique()))

airline: 6
source_city: 6
departure_time: 6
stops: 3
arrival_time: 6
destination_city: 6
classes: 2


In [14]:
print(df['airline'].unique())
print(df['source_city'].unique())
print(df['departure_time'].unique())
print(df['stops'].unique())
print(df['arrival_time'].unique())
print(df['destination_city'].unique())
print(df['classes'].unique())

['SpiceJet' 'AirAsia' 'Vistara' 'GO_FIRST' 'Indigo' 'Air_India']
['Delhi' 'Mumbai' 'Bangalore' 'Kolkata' 'Hyderabad' 'Chennai']
['Evening' 'Early_Morning' 'Morning' 'Afternoon' 'Night' 'Late_Night']
['zero' 'one' 'two_or_more']
['Night' 'Morning' 'Early_Morning' 'Afternoon' 'Evening' 'Late_Night']
['Mumbai' 'Bangalore' 'Kolkata' 'Hyderabad' 'Chennai' 'Delhi']
['Economy' 'Business']


In [15]:
df=df.reset_index(drop=True)

In [16]:
df

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,classes,duration,days_left,price
0,SpiceJet,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955
...,...,...,...,...,...,...,...,...,...,...
300148,Vistara,Chennai,Morning,one,Evening,Hyderabad,Business,10.08,49,69265
300149,Vistara,Chennai,Afternoon,one,Night,Hyderabad,Business,10.42,49,77105
300150,Vistara,Chennai,Early_Morning,one,Night,Hyderabad,Business,13.83,49,79099
300151,Vistara,Chennai,Early_Morning,one,Evening,Hyderabad,Business,10.00,49,81585


In [17]:
df.to_csv('flight_cleaned.csv')

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300153 non-null  object 
 1   source_city       300153 non-null  object 
 2   departure_time    300153 non-null  object 
 3   stops             300153 non-null  object 
 4   arrival_time      300153 non-null  object 
 5   destination_city  300153 non-null  object 
 6   classes           300153 non-null  object 
 7   duration          300153 non-null  float64
 8   days_left         300153 non-null  int64  
 9   price             300153 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 22.9+ MB


In [19]:
# Select the categorical and numerical column names
X = df[['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'classes','duration', 'days_left']]
y = df['price']

In [20]:
X

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,classes,duration,days_left
0,SpiceJet,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1
1,SpiceJet,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1
2,AirAsia,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1
3,Vistara,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1
4,Vistara,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1
...,...,...,...,...,...,...,...,...,...
300148,Vistara,Chennai,Morning,one,Evening,Hyderabad,Business,10.08,49
300149,Vistara,Chennai,Afternoon,one,Night,Hyderabad,Business,10.42,49
300150,Vistara,Chennai,Early_Morning,one,Night,Hyderabad,Business,13.83,49
300151,Vistara,Chennai,Early_Morning,one,Evening,Hyderabad,Business,10.00,49


In [21]:
y.shape

(300153,)

In [22]:
# Split the dataset into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [23]:
ohe=OneHotEncoder()
ohe.fit(X[['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'classes']])

In [24]:
column_trans=make_column_transformer((OneHotEncoder(categories=ohe.categories_),['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'classes']),
                                    remainder='passthrough')

In [25]:
dtr=DecisionTreeRegressor()

In [26]:
pipe=make_pipeline(column_trans,dtr)

In [27]:
pipe.fit(X_train,y_train)

In [28]:
y_pred  = pipe.predict(X_test)

In [29]:
r2_score(y_test,y_pred)

0.975865170214415

In [30]:
scores=[]
for i in range(2):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=i)
    dtr=DecisionTreeRegressor()
    pipe=make_pipeline(column_trans,dtr)
    pipe.fit(X_train,y_train)
    y_pred=pipe.predict(X_test)
    scores.append(r2_score(y_test,y_pred))

In [31]:
np.argmax(scores)

0

In [32]:
scores[np.argmax(scores)]

0.9770523790428041

In [33]:
pipe.predict(pd.DataFrame(columns=X_test.columns,data=np.array(['SpiceJet','Delhi','Early_Morning','zero','Morning','Mumbai','Economy',2.33,1]).reshape(1,9)))

array([5953.])

In [39]:
y

0          5953
1          5953
2          5956
3          5955
4          5955
          ...  
300148    69265
300149    77105
300150    79099
300151    81585
300152    81585
Name: price, Length: 300153, dtype: int64

The best model is found at a certain random state

In [35]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=np.argmax(scores))
dtr=DecisionTreeRegressor()
pipe=make_pipeline(column_trans,dtr)
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
r2_score(y_test,y_pred)

0.9769923349512573

In [36]:
pickle.dump(pipe,open('Model.pkl','wb'))

In [37]:
pipe.predict(pd.DataFrame(columns=['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'classes','duration', 'days_left'],data=np.array(['SpiceJet','Delhi','Early_Morning','zero','Morning','Mumbai','Economy',2.33,1]).reshape(1,9)))

array([5953.])

In [41]:
pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':y_pred,'Difference':y_test-y_pred})
pred_df.head(50)

Unnamed: 0,Actual Value,Predicted Value,Difference
44712,10721,11088.0,-367.0
233644,66928,66928.0,0.0
121467,6909,6489.0,420.0
185846,2271,2586.0,-315.0
163599,4363,4363.0,0.0
192119,4357,4357.0,0.0
284224,74981,79461.0,-4480.0
287622,79349,67029.0,12320.0
134801,4914,4499.0,415.0
17827,4020,4400.0,-380.0


In [40]:
print(df['airline'].unique())
print(df['source_city'].unique())
print(df['departure_time'].unique())
print(df['stops'].unique())
print(df['arrival_time'].unique())
print(df['destination_city'].unique())
print(df['classes'].unique())

['SpiceJet' 'AirAsia' 'Vistara' 'GO_FIRST' 'Indigo' 'Air_India']
['Delhi' 'Mumbai' 'Bangalore' 'Kolkata' 'Hyderabad' 'Chennai']
['Evening' 'Early_Morning' 'Morning' 'Afternoon' 'Night' 'Late_Night']
['zero' 'one' 'two_or_more']
['Night' 'Morning' 'Early_Morning' 'Afternoon' 'Evening' 'Late_Night']
['Mumbai' 'Bangalore' 'Kolkata' 'Hyderabad' 'Chennai' 'Delhi']
['Economy' 'Business']


In [38]:
# Print unique values sorted alphabetically
print(sorted(df['airline'].unique()))
print(sorted(df['source_city'].unique()))
print(sorted(df['departure_time'].unique()))
print(sorted(df['stops'].unique()))
print(sorted(df['arrival_time'].unique()))
print(sorted(df['destination_city'].unique()))
print(sorted(df['classes'].unique()))

['AirAsia', 'Air_India', 'GO_FIRST', 'Indigo', 'SpiceJet', 'Vistara']
['Bangalore', 'Chennai', 'Delhi', 'Hyderabad', 'Kolkata', 'Mumbai']
['Afternoon', 'Early_Morning', 'Evening', 'Late_Night', 'Morning', 'Night']
['one', 'two_or_more', 'zero']
['Afternoon', 'Early_Morning', 'Evening', 'Late_Night', 'Morning', 'Night']
['Bangalore', 'Chennai', 'Delhi', 'Hyderabad', 'Kolkata', 'Mumbai']
['Business', 'Economy']
