## 1. Importing Libraries & Data Understanding

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, KFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from category_encoders import BinaryEncoder
from sklearn.linear_model import LinearRegression # for modeling
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
import joblib

C:\Users\moham\anaconda3\envs\NEWENV\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\moham\anaconda3\envs\NEWENV\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [2]:
df_train = pd.read_excel('Train.xlsx')
df_test = pd.read_excel('Test.xlsx')

In [3]:
df = pd.concat([df_train, df_test])

In [4]:
print(df.shape)
df.head(3)

(13354, 11)


Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897.0
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662.0
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13354 entries, 0 to 2670
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          13354 non-null  object 
 1   Date_of_Journey  13354 non-null  object 
 2   Source           13354 non-null  object 
 3   Destination      13354 non-null  object 
 4   Route            13353 non-null  object 
 5   Dep_Time         13354 non-null  object 
 6   Arrival_Time     13354 non-null  object 
 7   Duration         13354 non-null  object 
 8   Total_Stops      13353 non-null  object 
 9   Additional_Info  13354 non-null  object 
 10  Price            10683 non-null  float64
dtypes: float64(1), object(10)
memory usage: 1.2+ MB


## 2. Data Cleaning & Preparation

### 2.1 Duplication and Null Handling

In [6]:
df.duplicated().sum()

246

In [7]:
df.drop_duplicates(inplace= True)
df.dropna(inplace= True)
df.reset_index(drop= True, inplace= True)
df.duplicated().sum()

0

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10462 entries, 0 to 10461
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          10462 non-null  object 
 1   Date_of_Journey  10462 non-null  object 
 2   Source           10462 non-null  object 
 3   Destination      10462 non-null  object 
 4   Route            10462 non-null  object 
 5   Dep_Time         10462 non-null  object 
 6   Arrival_Time     10462 non-null  object 
 7   Duration         10462 non-null  object 
 8   Total_Stops      10462 non-null  object 
 9   Additional_Info  10462 non-null  object 
 10  Price            10462 non-null  float64
dtypes: float64(1), object(10)
memory usage: 899.2+ KB


In [9]:
# df.drop(['Route', 'Additional_Info'], axis= 1, inplace= True)

### 2.2 Type Transformation

In [10]:
df['Date_of_Journey'] = pd.to_datetime(df['Date_of_Journey'], format= '%d/%m/%Y')

In [11]:
df.Total_Stops.value_counts()

1 stop      5625
non-stop    3475
2 stops     1318
3 stops       43
4 stops        1
Name: Total_Stops, dtype: int64

In [12]:
def stops_num(x):
    return (int(0) if x == 'non-stop' else int(x[0]))

In [13]:
df['Total_Stops'] = df['Total_Stops'].apply(stops_num)

In [14]:
def arrival_time(x):
    return x[:5]

In [15]:
df['Arrival_Time'] = df['Arrival_Time'].apply(arrival_time)

In [16]:
df.head(2)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10,2h 50m,0,No info,3897.0
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2,No info,7662.0


In [17]:
df.Duration.unique()

array(['2h 50m', '7h 25m', '19h', '5h 25m', '4h 45m', '2h 25m', '15h 30m',
       '21h 5m', '25h 30m', '7h 50m', '13h 15m', '2h 35m', '2h 15m',
       '12h 10m', '26h 35m', '4h 30m', '22h 35m', '23h', '20h 35m',
       '5h 10m', '15h 20m', '2h 55m', '13h 20m', '15h 10m', '5h 45m',
       '5h 55m', '13h 25m', '22h', '5h 30m', '10h 25m', '5h 15m',
       '2h 30m', '6h 15m', '11h 55m', '11h 5m', '8h 30m', '22h 5m',
       '2h 45m', '12h', '16h 5m', '19h 55m', '3h 15m', '25h 20m', '3h',
       '16h 15m', '15h 5m', '6h 30m', '25h 5m', '12h 25m', '27h 20m',
       '10h 15m', '10h 30m', '1h 30m', '1h 25m', '26h 30m', '7h 20m',
       '13h 30m', '5h', '19h 5m', '14h 50m', '2h 40m', '22h 10m',
       '9h 35m', '10h', '21h 20m', '18h 45m', '12h 20m', '18h', '9h 15m',
       '17h 30m', '16h 35m', '12h 15m', '7h 30m', '24h', '8h 55m',
       '7h 10m', '14h 30m', '30h 20m', '15h', '12h 45m', '10h 10m',
       '15h 25m', '14h 5m', '20h 15m', '23h 10m', '18h 10m', '16h',
       '2h 20m', '8h', '16h 5

In [18]:
df[df.Duration == '5m']

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
6397,Air India,2019-03-06,Mumbai,Hyderabad,BOM → GOI → PNQ → HYD,16:50,16:55,5m,2,No info,17327.0


In [19]:
df = df[df.Duration != '5m']

In [20]:
df['Month'] = df.Date_of_Journey.dt.month_name()
df['Day'] = df.Date_of_Journey.dt.day
df['Day_Name'] = df.Date_of_Journey.dt.day_name()

In [21]:
def duration_manip(x):
    if ('h' in x) & ('m' in x):
        x.strip()
        hrs = x.split(' ')[0].replace('h', '')
        mins = x.split(' ')[1].replace('m', '')
    elif ('h' in x) & ('m' not in x):
        hrs = x.strip().replace('h', '')
        mins = 0
    else:
        hrs = 0
        mins = x.strip().replace('m', '')

    return (int(hrs), int(mins))  

In [22]:
df['Duration_Hrs'], df['Duration_Mins'] = zip(*df.Duration.apply(duration_manip))

In [23]:
df['Duration'] = round((df['Duration_Hrs'] + (df['Duration_Mins'] / 60)), 2)

In [24]:
# airline_counts = df['Airline'].value_counts()
# other_airlines = airline_counts[airline_counts < 100].index.tolist()

In [25]:
# df.loc[df['Airline'].isin(other_airlines), 'Airline'] = 'others'

In [26]:
df.reset_index(drop= True, inplace= True)
print(df.shape)
df.head(2)

(10461, 16)


Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Month,Day,Day_Name,Duration_Hrs,Duration_Mins
0,IndiGo,2019-03-24,Banglore,New Delhi,BLR → DEL,22:20,01:10,2.83,0,No info,3897.0,March,24,Sunday,2,50
1,Air India,2019-05-01,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7.42,2,No info,7662.0,May,1,Wednesday,7,25


## 3. Data Preprocessing

In [27]:
df['Duration'] = df['Duration'] * 60

In [28]:
df.drop(['Date_of_Journey','Route','Dep_Time','Arrival_Time','Additional_Info','Day','Duration_Hrs','Duration_Mins'], axis= 1, inplace= True)
df.reset_index(inplace= True, drop= True)

In [29]:
print(df.shape)
df.head(2)

(10461, 8)


Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Price,Month,Day_Name
0,IndiGo,Banglore,New Delhi,169.8,0,3897.0,March,Sunday
1,Air India,Kolkata,Banglore,445.2,2,7662.0,May,Wednesday


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10461 entries, 0 to 10460
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Airline      10461 non-null  object 
 1   Source       10461 non-null  object 
 2   Destination  10461 non-null  object 
 3   Duration     10461 non-null  float64
 4   Total_Stops  10461 non-null  int64  
 5   Price        10461 non-null  float64
 6   Month        10461 non-null  object 
 7   Day_Name     10461 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 653.9+ KB


In [31]:
df.select_dtypes(include= 'object').columns

Index(['Airline', 'Source', 'Destination', 'Month', 'Day_Name'], dtype='object')

In [32]:
for i in df.select_dtypes(include= 'object').columns:
    print(f'{i} \n {df[i].nunique()}')

Airline 
 12
Source 
 5
Destination 
 6
Month 
 4
Day_Name 
 7


In [33]:
ohe_cols = ['Source', 'Destination', 'Month', 'Day_Name']
be_cols = ['Airline']

In [34]:
encoder = ColumnTransformer(transformers= [('ohe', OneHotEncoder(sparse_output= False, drop= 'first'), ohe_cols), ('be', BinaryEncoder(), be_cols)], remainder= 'passthrough')

In [35]:
linear_regression = LinearRegression()
random_forest = RandomForestRegressor()
knn = KNeighborsRegressor()
decision_tree = DecisionTreeRegressor()
xgboost = XGBRegressor()
models = {
          'LinearRegression': linear_regression,
          'RandomForestRegressor': random_forest,
          'knn': knn,
          'DecisionTreeRegressor': decision_tree,
          'XGBRegressor': xgboost
          }

In [36]:
kfold = KFold(n_splits= 5)

In [37]:
X = df.drop('Price', axis= 1)
y = df['Price']

In [38]:
for model_name, model in models.items():
    steps = []
    steps.append(('encoder', encoder))
    steps.append(('scaler', RobustScaler()))
    steps.append(('model', model))
    pipeline = Pipeline(steps= steps)
    results = cross_validate(pipeline, X, y, scoring= 'r2', cv= kfold, return_train_score= True)
    print(model_name)
    print(f'Train Score {results["train_score"].mean()}')
    print('_' * 10)
    print(f'Test Score {results["test_score"].mean()}')
    print('_' * 30)
    print('\n')

LinearRegression
Train Score 0.5432953361432835
__________
Test Score 0.5406773331402492
______________________________


RandomForestRegressor
Train Score 0.8856535946226586
__________
Test Score 0.6652427561113103
______________________________


knn
Train Score 0.7763004999462892
__________
Test Score 0.6580443738282837
______________________________


DecisionTreeRegressor
Train Score 0.9048710899116383
__________
Test Score 0.55737084082837
______________________________


XGBRegressor
Train Score 0.8462848897968138
__________
Test Score 0.7114076436631266
______________________________




In [39]:
params = {
    'model__n_estimators': [100, 150, 200],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.1, 0.3, 0.4],
    'model__reg_lambda': [1, 3, 4]
}
model = XGBRegressor()
steps = []
steps.append(('encoder', encoder))
steps.append(('scaler', RobustScaler()))
steps.append(('model', model))
pipeline = Pipeline(steps= steps)
grid = GridSearchCV(estimator= pipeline, param_grid= params, scoring= 'r2', cv= kfold, return_train_score= True, n_jobs= -1)
grid.fit(X, y)
grid.best_params_

{'model__learning_rate': 0.1,
 'model__max_depth': 7,
 'model__n_estimators': 100,
 'model__reg_lambda': 4}

In [40]:
model = grid.best_estimator_
model

In [41]:
results_best = cross_validate(model, X, y, scoring= 'r2', cv= kfold, return_train_score= True)
print(f'Train Score {results["train_score"].mean()}')
print('_' * 10)
print(f'Test Score {results["test_score"].mean()}')

Train Score 0.8462848897968138
__________
Test Score 0.7114076436631266


In [42]:
joblib.dump(model, "Model.pkl")
joblib.dump(X.columns, "Inputs.pkl")

['Inputs.pkl']

In [43]:
X_sample = pd.DataFrame(X.iloc[2]).T
X_sample

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Month,Day_Name
2,Jet Airways,Delhi,Cochin,1140.0,2,June,Sunday


In [44]:
y.iloc[2]

13882.0

In [45]:
df.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Price,Month,Day_Name
0,IndiGo,Banglore,New Delhi,169.8,0,3897.0,March,Sunday
1,Air India,Kolkata,Banglore,445.2,2,7662.0,May,Wednesday
2,Jet Airways,Delhi,Cochin,1140.0,2,13882.0,June,Sunday
3,IndiGo,Kolkata,Banglore,325.2,1,6218.0,May,Sunday
4,IndiGo,Banglore,New Delhi,285.0,1,13302.0,March,Friday


In [46]:
y_pred = model.predict(X_sample)
y_pred[0]

12607.05

In [47]:
%%writefile app_airline.py
import streamlit as st
import pandas as pd
import joblib


Inputs = joblib.load("Inputs.pkl")
Model = joblib.load("Model.pkl")
def prediction(Airline, Source, Destination, Duration, Total_Stops, Month, Day_Name):
    test_df = pd.DataFrame(columns= Inputs)
    test_df.at[0,"Airline"] = Airline
    test_df.at[0,"Source"] = Source
    test_df.at[0,"Destination"] = Destination
    test_df.at[0,"Duration"] = Duration
    test_df.at[0,"Total_Stops"] = Total_Stops
    test_df.at[0,"Month"] = Month
    test_df.at[0,"Day_Name"] = Day_Name
    result = Model.predict(test_df)[0]
    return result

def main():
    st.title("Airline Price")
    Airline = st.selectbox("Airline" , ['IndiGo', 'Air India', 'Jet Airways', 'SpiceJet',
       'Multiple carriers', 'GoAir', 'Vistara', 'Air Asia',
       'Vistara Premium economy', 'Jet Airways Business',
       'Multiple carriers Premium economy', 'Trujet'])
    Source = st.selectbox("Source" , ['Banglore', 'Kolkata', 'Delhi', 'Chennai', 'Mumbai'])
    Destination = st.selectbox("Destination" , ['New Delhi', 'Banglore', 'Cochin', 'Kolkata', 'Delhi', 'Hyderabad'])
    Duration = st.slider("Duration" , min_value= 60 , max_value=5000 , value=0,step=10) 
    Total_Stops = st.slider("Total_Stops" , min_value= 0 , max_value=5 , value=0,step=1)   
    Month = st.selectbox("Month" , ['March', 'May', 'June', 'April'])
    Day_Name = st.selectbox("Day_Name" , ['Sunday', 'Wednesday', 'Friday', 'Monday', 'Tuesday', 'Saturday', 'Thursday'])

    if st.button("predict"):
        result = prediction(Airline, Source, Destination, Duration, Total_Stops, Month, Day_Name)
        st.text(f"The Price will be {result}")

if __name__ == '__main__':
    main()

Overwriting app_airline.py


In [48]:
!pip install pipreqs



In [49]:
!pipreqs ./



In [50]:
import category_encoders

print(category_encoders.__version__)

2.6.1


In [51]:
import sklearn

print(sklearn.__version__)

1.2.2
