In [1]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import explained_variance_score

from math import sqrt

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from xgboost import XGBRegressor

In [2]:
# Read the merged flight data and drop the "Unnamed: 0" column (presumably a saved index).
# NOTE(review): this is a machine-specific absolute path; the notebook only runs where it exists.
csv_path = r'C:\Users\kogla\OneDrive\SWE599-Flight-Ticket-Price-Estimation-Project\merged.csv'
df = pd.read_csv(csv_path)
df = df.drop(columns="Unnamed: 0")
df.head()

Unnamed: 0,company,departure_airport,arrival_airport,departure_time,arrival_time,departure_date,price_try,departure_week_day,checked_week_day,departure_day,departure_month,remaining_day_to_flight,arrival_city,departure_airport_name,departure_hour,part_of_day,is_weekend,part_of_month,duration(min)
0,AnadoluJet,SAW,ADB,07:30:00,08:40:00,2022-11-14,506.99,Monday,Tuesday,14,11,13,Izmir,Sabiha Gökçen Airport,7,Early Morning,Weekday,Middle,70
1,AnadoluJet,SAW,ADB,09:20:00,10:30:00,2022-11-14,506.99,Monday,Tuesday,14,11,13,Izmir,Sabiha Gökçen Airport,9,Morning,Weekday,Middle,70
2,AnadoluJet,SAW,ADB,20:55:00,22:05:00,2022-11-14,506.99,Monday,Tuesday,14,11,13,Izmir,Sabiha Gökçen Airport,20,Evening,Weekday,Middle,70
3,Pegasus,SAW,ADB,23:35:00,00:45:00,2022-11-14,539.99,Monday,Tuesday,14,11,13,Izmir,Sabiha Gökçen Airport,23,Night,Weekday,Middle,70
4,Pegasus,SAW,ADB,09:10:00,10:20:00,2022-11-14,542.99,Monday,Tuesday,14,11,13,Izmir,Sabiha Gökçen Airport,9,Morning,Weekday,Middle,70


In [3]:
# Remove columns whose information is already carried by another column:
#   departure_airport -> departure_airport_name
#   arrival_airport   -> arrival_city
#   departure_date    -> departure_day / departure_month
#   departure_time    -> departure_hour
# arrival_time is dropped because it is not necessary.

redundant_cols = ['departure_airport', 'arrival_airport', 'departure_date', 'departure_time', 'arrival_time']
df = df.drop(columns=redundant_cols)

In [4]:
# Cast the text columns to the categorical dtype and the price to int64.
cols_to_categorize = [
    'company', 'departure_airport_name', 'arrival_city', 'part_of_day',
    'departure_week_day', 'part_of_month', 'is_weekend', 'checked_week_day',
]
for col_name in cols_to_categorize:
    df[col_name] = df[col_name].astype('category')
# The integer cast truncates the fractional part of the price (e.g. 506.99 -> 506).
df["price_try"] = df["price_try"].astype("int64")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129513 entries, 0 to 129512
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype   
---  ------                   --------------   -----   
 0   company                  129513 non-null  category
 1   price_try                129513 non-null  int64   
 2   departure_week_day       129513 non-null  category
 3   checked_week_day         129513 non-null  category
 4   departure_day            129513 non-null  int64   
 5   departure_month          129513 non-null  int64   
 6   remaining_day_to_flight  129513 non-null  int64   
 7   arrival_city             129513 non-null  category
 8   departure_airport_name   129513 non-null  category
 9   departure_hour           129513 non-null  int64   
 10  part_of_day              129513 non-null  category
 11  is_weekend               129513 non-null  category
 12  part_of_month            129513 non-null  category
 13  duration(min)            129513 non-null  in

In [5]:
# Preview the first five rows after the column drops and dtype casts.
df.head(n=5)

Unnamed: 0,company,price_try,departure_week_day,checked_week_day,departure_day,departure_month,remaining_day_to_flight,arrival_city,departure_airport_name,departure_hour,part_of_day,is_weekend,part_of_month,duration(min)
0,AnadoluJet,506,Monday,Tuesday,14,11,13,Izmir,Sabiha Gökçen Airport,7,Early Morning,Weekday,Middle,70
1,AnadoluJet,506,Monday,Tuesday,14,11,13,Izmir,Sabiha Gökçen Airport,9,Morning,Weekday,Middle,70
2,AnadoluJet,506,Monday,Tuesday,14,11,13,Izmir,Sabiha Gökçen Airport,20,Evening,Weekday,Middle,70
3,Pegasus,539,Monday,Tuesday,14,11,13,Izmir,Sabiha Gökçen Airport,23,Night,Weekday,Middle,70
4,Pegasus,542,Monday,Tuesday,14,11,13,Izmir,Sabiha Gökçen Airport,9,Morning,Weekday,Middle,70


In [6]:
# Summary statistics of the target column.
df.price_try.describe()

count    129513.000000
mean        874.649935
std         286.457553
min         432.000000
25%         734.000000
50%         869.000000
75%        1006.000000
max        4258.000000
Name: price_try, dtype: float64

In [7]:
# Names of all columns stored with the 'category' dtype.
categorical_cols = list(df.select_dtypes(include='category').columns)

In [8]:
# One-hot encode the categorical columns and append the indicator columns
# to the remaining, non-categorical columns.
non_categorical_part = df.drop(columns=categorical_cols)
dummies_data = pd.get_dummies(df[categorical_cols])
df = pd.concat([non_categorical_part, dummies_data], axis=1)

In [9]:
# Preview the first five rows of the one-hot encoded frame.
df.head(n=5)

Unnamed: 0,price_try,departure_day,departure_month,remaining_day_to_flight,departure_hour,duration(min),company_AnadoluJet,company_Pegasus,company_Türk Hava Yolları,departure_week_day_Friday,...,part_of_day_Evening,part_of_day_Late Night,part_of_day_Morning,part_of_day_Night,part_of_day_Noon,is_weekend_Weekday,is_weekend_Weekend,part_of_month_Beginning,part_of_month_End,part_of_month_Middle
0,506,14,11,13,7,70,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,506,14,11,13,9,70,1,0,0,0,...,0,0,1,0,0,1,0,0,0,1
2,506,14,11,13,20,70,1,0,0,0,...,1,0,0,0,0,1,0,0,0,1
3,539,14,11,13,23,70,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1
4,542,14,11,13,9,70,0,1,0,0,...,0,0,1,0,0,1,0,0,0,1


In [10]:
# Number of rows in the encoded frame.
len(df)

129513

In [11]:
# Separate the target (price_try) from the feature columns.
y = df['price_try']
X = df.drop(columns='price_try')

In [12]:
# Preprocessing step for the ML algorithms: put all features on the same scale
# and remove each feature's mean so the data is centered around zero.
# The scaler is fitted on the full feature matrix X.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [13]:
# Compress the range of the price by taking its natural logarithm.
y_log = y.pipe(np.log)

In [14]:
# Split the *unscaled* features first, then fit the scaler on the training rows only.
# Fitting the scaler on all rows before splitting lets test-set means/variances
# leak into the training features. test_size and random_state are unchanged, so
# the same rows end up in the train and test partitions as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=0)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)  # numpy array, as before
X_test = scaler.transform(X_test_raw)    # scaled with training statistics only

In [15]:
# Collects one entry per model: model name -> [test RMSE, test R²].
model_comparison = dict()

In [16]:
# Collects the random-forest variants: label -> [test RMSE, test R²].
optimization_comparison = dict()

## MACHINE LEARNING ALGORITHMS

### 1) KNN Regressor

In [None]:
# Hyper-parameter grid for the KNN regressor.
# leaf_size is deliberately not tuned: it only influences the speed and memory of
# the neighbour-search tree, not which neighbours are found. Searching over
# range(3, 20) multiplied the number of candidates by 17 (612 instead of 36,
# each fitted 10 times) without being able to change the selected model.
param_grid = {
    "n_neighbors": list(range(1, 10)),
    "weights": ['uniform', 'distance'],
    "algorithm": ["auto"],
    "p": [1, 2],
}
# Exhaustive 10-fold cross-validated search, scored by negated mean squared error.
knn_regressor = GridSearchCV(KNeighborsRegressor(), param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
knn_regressor.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the KNN grid search.
knn_regressor.best_params_

In [None]:
# Predict on the training split and on the held-out test split with the tuned KNN model.
y_train_pred, y_test_pred = map(knn_regressor.predict, (X_train, X_test))

In [None]:
# Training-split metrics of the KNN model (y_train holds log-prices).
actual_train = y_train.values
train_scores = [
    ("Root Mean Squared Error (RMSE): ", sqrt(mse(actual_train, y_train_pred))),
    ("R-squared: ", r2_score(actual_train, y_train_pred)),
    ("Explained Variance Score:", explained_variance_score(actual_train, y_train_pred)),
]
print("Train Results for KNN Regressor Model:")
print('-' * 50)
for label, score in train_scores:
    print(label, round(score, 2))

In [None]:
# Test-split metrics of the KNN model (y_test holds log-prices).
test_scores = [
    ("Root Mean Squared Error (RMSE): ", sqrt(mse(y_test, y_test_pred))),
    ("R-squared: ", r2_score(y_test, y_test_pred)),
    ("Explained Variance Score:", explained_variance_score(y_test, y_test_pred)),
]
print("Test Results for KNN Regressor Model:")
print('-' * 50)
for label, score in test_scores:
    print(label, round(score, 2))

In [None]:
# Record [test RMSE, test R²], both rounded to two decimals, for the comparison table.
model_comparison["KNN Regressor"] = [
    round(metric, 2)
    for metric in (sqrt(mse(y_test, y_test_pred)), r2_score(y_test, y_test_pred))
]

If the test RMSE is greater than the train RMSE, the model is OVERFITTING the data.

If the test RMSE is less than the train RMSE, the model is UNDERFITTING the data.

Here: 159.8 (test) > 123.02 (train) => OVERFITTING

In [None]:
# Actual vs. predicted test values with a degree-1 least-squares trend line.
test_r2 = r2_score(y_test, y_test_pred)
plt.scatter(y_test, y_test_pred)
plt.xlabel('Actual values')
plt.ylabel('Predicted values')
plt.title('R\u00B2 = %0.2f' % test_r2)

trend_line = np.poly1d(np.polyfit(y_test, y_test_pred, 1))
x_unique = np.unique(y_test)
plt.plot(x_unique, trend_line(x_unique), color="#34495E")

plt.show()

### 2) Ridge Regressor

In [None]:
# Tune Ridge's alpha with a 5-fold grid search scored by negated mean absolute error.
params = {'alpha': [0.001, 0.1, 1, 10, 100, 1000]}
ridge_regressor = GridSearchCV(
    estimator=Ridge(),
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
)
ridge_regressor.fit(X_train, y_train)

In [None]:
# Alpha value that scored best in the Ridge grid search.
ridge_regressor.best_params_

In [None]:
# Predict on the training split and on the held-out test split with the tuned Ridge model.
y_train_pred, y_test_pred = map(ridge_regressor.predict, (X_train, X_test))

In [None]:
# Training-split metrics of the Ridge model (y_train holds log-prices).
actual_train = y_train.values
train_scores = [
    ("Root mean squared error: ", sqrt(mse(actual_train, y_train_pred))),
    ("R-squared: ", r2_score(actual_train, y_train_pred)),
    ("Explained Variance Score:", explained_variance_score(actual_train, y_train_pred)),
]
print("Train Results for Ridge Regressor Model:")
print('-' * 50)
for label, score in train_scores:
    print(label, round(score, 2))

In [None]:
# Test-split metrics of the Ridge model (y_test holds log-prices).
test_scores = [
    ("Root mean squared error: ", sqrt(mse(y_test, y_test_pred))),
    ("R-squared: ", r2_score(y_test, y_test_pred)),
    ("Explained Variance Score:", explained_variance_score(y_test, y_test_pred)),
]
print("Test Results for Ridge Regressor Model:")
print('-' * 50)
for label, score in test_scores:
    print(label, round(score, 2))

In [None]:
# Record [test RMSE, test R²], both rounded to two decimals, for the comparison table.
model_comparison["Ridge Regressor"] = [
    round(metric, 2)
    for metric in (sqrt(mse(y_test, y_test_pred)), r2_score(y_test, y_test_pred))
]

In [None]:
# Actual vs. predicted test values with a degree-1 least-squares trend line.
test_r2 = r2_score(y_test, y_test_pred)
plt.scatter(y_test, y_test_pred)
plt.xlabel('Actual values')
plt.ylabel('Predicted values')
plt.title('R\u00B2 = %0.2f' % test_r2)

trend_line = np.poly1d(np.polyfit(y_test, y_test_pred, 1))
x_unique = np.unique(y_test)
plt.plot(x_unique, trend_line(x_unique), color="#34495E")

plt.show()

### 3) Lasso Regressor

In [None]:
# Tune Lasso's alpha with a 15-fold grid search scored by negated mean absolute error.
params = {'alpha': [0.001, 0.01, 1, 10, 1000]}
lasso_regressor = GridSearchCV(
    estimator=Lasso(),
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=15,
    n_jobs=-1,
)
lasso_regressor.fit(X_train, y_train)

In [None]:
# Alpha value that scored best in the Lasso grid search.
lasso_regressor.best_params_

In [None]:
# Predict on the training split and on the held-out test split with the tuned Lasso model.
y_train_pred, y_test_pred = map(lasso_regressor.predict, (X_train, X_test))

In [None]:
# Training-split metrics of the Lasso model (y_train holds log-prices).
actual_train = y_train.values
train_scores = [
    ("Root mean squared error: ", sqrt(mse(actual_train, y_train_pred))),
    ("R-squared: ", r2_score(actual_train, y_train_pred)),
    ("Explained Variance Score:", explained_variance_score(actual_train, y_train_pred)),
]
print("Train Results for Lasso Regressor Model:")
print('-' * 50)
for label, score in train_scores:
    print(label, round(score, 2))

In [None]:
# Test-split metrics of the Lasso model (y_test holds log-prices).
test_scores = [
    ("Root mean squared error: ", sqrt(mse(y_test, y_test_pred))),
    ("R-squared: ", r2_score(y_test, y_test_pred)),
    ("Explained Variance Score:", explained_variance_score(y_test, y_test_pred)),
]
print("Test Results for Lasso Regressor Model:")
print('-' * 50)
for label, score in test_scores:
    print(label, round(score, 2))

In [None]:
# Record [test RMSE, test R²], both rounded to two decimals, for the comparison table.
model_comparison["Lasso Regressor"] = [
    round(metric, 2)
    for metric in (sqrt(mse(y_test, y_test_pred)), r2_score(y_test, y_test_pred))
]

In [None]:
# Actual vs. predicted test values with a degree-1 least-squares trend line.
test_r2 = r2_score(y_test, y_test_pred)
plt.scatter(y_test, y_test_pred)
plt.xlabel('Actual values')
plt.ylabel('Predicted values')
plt.title('R\u00B2 = %0.2f' % test_r2)

trend_line = np.poly1d(np.polyfit(y_test, y_test_pred, 1))
x_unique = np.unique(y_test)
plt.plot(x_unique, trend_line(x_unique), color="#34495E")

plt.show()

### 4) Decision Tree Regressor

In [None]:
# Tune the tree depth (10..19) with a 10-fold grid search.
# No `scoring` is passed, so the search ranks candidates with the estimator's own score method.
depth = [*range(10, 20)]
param_grid = {'max_depth': depth}
tree = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=param_grid, cv=10)
tree.fit(X_train, y_train)

In [None]:
# max_depth value that scored best in the decision-tree grid search.
tree.best_params_

In [None]:
# Predict on the training split and on the held-out test split with the tuned decision tree.
y_train_pred, y_test_pred = map(tree.predict, (X_train, X_test))

In [None]:
# Training-split metrics of the decision tree (y_train holds log-prices).
actual_train = y_train.values
train_scores = [
    ("Root mean squared error: ", sqrt(mse(actual_train, y_train_pred))),
    ("R-squared: ", r2_score(actual_train, y_train_pred)),
    ("Explained Variance Score:", explained_variance_score(actual_train, y_train_pred)),
]
print("Train Results for Decision Tree Regressor Model:")
print('-' * 50)
for label, score in train_scores:
    print(label, round(score, 2))

In [None]:
# Test-split metrics of the decision tree (y_test holds log-prices).
test_scores = [
    ("Root mean squared error: ", sqrt(mse(y_test, y_test_pred))),
    ("R-squared: ", r2_score(y_test, y_test_pred)),
    ("Explained Variance Score:", explained_variance_score(y_test, y_test_pred)),
]
print("Test Results for Decision Tree Regressor Model:")
print('-' * 50)
for label, score in test_scores:
    print(label, round(score, 2))

In [None]:
# Record [test RMSE, test R²] for the comparison table.
# The key is the row label shown in that table, so the "Desicion" misspelling is corrected here.
model_comparison["Decision Tree Regressor"] = [
    round(sqrt(mse(y_test, y_test_pred)), 2),
    round(r2_score(y_test, y_test_pred), 2),
]

### 5) Random Forest Regressor

In [None]:
# Search space for the random forest: 2 x 2 x 2 = 8 distinct combinations.
tuned_params = {'n_estimators': [100, 500], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}
# n_iter cannot usefully exceed the number of distinct combinations; asking for 20
# out of 8 only triggers a scikit-learn warning, so the request is capped.
n_candidates = int(np.prod([len(values) for values in tuned_params.values()]))
# Both the forest and the search are seeded so that best_params_ and the reported
# metrics are reproducible, matching the seeded train/test split.
random_regressor = RandomizedSearchCV(
    RandomForestRegressor(random_state=0),
    tuned_params,
    n_iter=min(20, n_candidates),
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    random_state=0,
)
random_regressor.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the random-forest search.
random_regressor.best_params_

In [None]:
# Predict on the training split and on the held-out test split with the tuned random forest.
y_train_pred, y_test_pred = map(random_regressor.predict, (X_train, X_test))

In [None]:
# Training-split metrics of the random forest (y_train holds log-prices).
actual_train = y_train.values
train_scores = [
    ("Root mean squared error: ", sqrt(mse(actual_train, y_train_pred))),
    ("R-squared: ", r2_score(actual_train, y_train_pred)),
    ("Explained Variance Score:", explained_variance_score(actual_train, y_train_pred)),
]
print("Train Results for Random Forest Regressor Model:")
print('-' * 50)
for label, score in train_scores:
    print(label, round(score, 2))

In [None]:
# Test-split metrics of the random forest (y_test holds log-prices).
test_scores = [
    ("Root mean squared error: ", sqrt(mse(y_test, y_test_pred))),
    ("R-squared: ", r2_score(y_test, y_test_pred)),
    ("Explained Variance Score:", explained_variance_score(y_test, y_test_pred)),
]
print("Test Results for Random Forest Regressor Model:")
print('-' * 50)
for label, score in test_scores:
    print(label, round(score, 2))

In [None]:
# Record [test RMSE, test R²], both rounded to two decimals, for the comparison table.
model_comparison["Random Forest Regressor"] = [
    round(metric, 2)
    for metric in (sqrt(mse(y_test, y_test_pred)), r2_score(y_test, y_test_pred))
]

In [None]:
# Baseline entry for the optimisation table: [test RMSE, test R²] of the full-feature forest.
optimization_comparison["Random Forest Regressor"] = [
    round(metric, 2)
    for metric in (sqrt(mse(y_test, y_test_pred)), r2_score(y_test, y_test_pred))
]

### 6) XGBoost Regressor

In [None]:
# Search space for XGBoost: 5 x 3 x 5 x 5 = 375 combinations, of which 20 are sampled.
tuned_params = {
    'max_depth': [1, 2, 3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300, 400, 500],
    'reg_lambda': [0.001, 0.1, 1.0, 10.0, 100.0],
}
# The search is seeded so the same 20 candidates are drawn on every run; without a
# seed, best_params_ and the reported metrics change between executions.
model = RandomizedSearchCV(
    XGBRegressor(random_state=0),
    tuned_params,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    random_state=0,
)
model.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the XGBoost search.
model.best_params_

In [None]:
# Predict on the training split and on the held-out test split with the tuned XGBoost model.
y_train_pred, y_test_pred = map(model.predict, (X_train, X_test))

In [None]:
# Training-split metrics of the XGBoost model (y_train holds log-prices).
actual_train = y_train.values
train_scores = [
    ("Root mean squared error: ", sqrt(mse(actual_train, y_train_pred))),
    ("R-squared: ", r2_score(actual_train, y_train_pred)),
    ("Explained Variance Score:", explained_variance_score(actual_train, y_train_pred)),
]
print("Train Results for XGBoost Regressor Model:")
print('-' * 50)
for label, score in train_scores:
    print(label, round(score, 2))

In [None]:
# Test-split metrics of the XGBoost model (y_test holds log-prices).
test_scores = [
    ("Root mean squared error: ", sqrt(mse(y_test, y_test_pred))),
    ("R-squared: ", r2_score(y_test, y_test_pred)),
    ("Explained Variance Score:", explained_variance_score(y_test, y_test_pred)),
]
print("Test Results for XGBoost Regressor Model:")
print('-' * 50)
for label, score in test_scores:
    print(label, round(score, 2))

In [None]:
# Record [test RMSE, test R²], both rounded to two decimals, for the comparison table.
model_comparison["XGBoost Regressor"] = [
    round(metric, 2)
    for metric in (sqrt(mse(y_test, y_test_pred)), r2_score(y_test, y_test_pred))
]

### 7) SVR

In [None]:
# Support vector regressor with fixed hyper-parameters (no search is run for this model).
svr_settings = {"C": 1.0, "epsilon": 0.2}
svr = SVR(**svr_settings)
svr.fit(X_train, y_train)

In [None]:
# Predict on the training split and on the held-out test split with the fitted SVR.
y_train_pred, y_test_pred = map(svr.predict, (X_train, X_test))

In [None]:
# Training-split metrics of the SVR model (y_train holds log-prices).
# The header previously read "XGBoost Regressor" although y_train_pred comes from svr.
print("Train Results for SVR Model:")
print(50 * '-')
print("Root mean squared error: ", round(sqrt(mse(y_train.values, y_train_pred)),2))
print("R-squared: ", round(r2_score(y_train.values, y_train_pred),2))
print("Explained Variance Score:", round(explained_variance_score(y_train.values, y_train_pred),2))

In [None]:
# Test-split metrics of the SVR model (y_test holds log-prices).
# The header previously read "XGBoost Regressor" although y_test_pred comes from svr.
print("Test Results for SVR Model:")
print(50 * '-')
print("Root mean squared error: ", round(sqrt(mse(y_test, y_test_pred)),2))
print("R-squared: ", round(r2_score(y_test, y_test_pred),2))
print("Explained Variance Score:", round(explained_variance_score(y_test, y_test_pred),2))

In [None]:
# Record [test RMSE, test R²], both rounded to two decimals, for the comparison table.
model_comparison["SVR"] = [
    round(metric, 2)
    for metric in (sqrt(mse(y_test, y_test_pred)), r2_score(y_test, y_test_pred))
]

## ML Algorithm Comparison

In [None]:
# Turn model_comparison (model name -> [test RMSE, test R²]) into a table, one row per model.
# The stored error values are sqrt(MSE), so the column is labelled RMSE rather than MSE.
df_comparison = pd.DataFrame.from_dict(model_comparison).T
df_comparison.columns = ['RMSE', 'R2 Score']
# Best model (highest R²) first.
df_comparison = df_comparison.sort_values('R2 Score', ascending=False)
df_comparison.style.background_gradient(cmap='Blues')

### OPTIMIZING RANDOM FOREST REGRESSOR

In [None]:
df_opt = df.drop(["is_weekend","departure_month","part_of_month","checked_week_day","departure_week_day"], axis=1)

In [None]:
categorical_cols_opt = df_opt.select_dtypes(include=['category']).columns.to_list()

In [None]:
dummies_data = pd.get_dummies(df_opt[categorical_cols_opt])
df_updated_opt = pd.concat([df_opt.drop(categorical_cols_opt, axis=1), dummies_data], axis=1)

In [None]:
# Separate the target (price_try) from the reduced feature set.
y_opt = df_updated_opt['price_try']

X_opt = df_updated_opt.drop(columns='price_try')

In [None]:
# Re-fit the shared scaler on the reduced feature set and standardise it.
X_opt_scled = scaler.fit(X_opt).transform(X_opt)

In [None]:
# Natural logarithm of the price, as for the full-feature models.
y_opt_log = y_opt.pipe(np.log)

In [None]:
# Split the *unscaled* reduced features first, then fit a scaler on the training rows only.
# Fitting the scaler on all rows before splitting lets test-set means/variances
# leak into the training features. test_size and random_state are unchanged, so
# the same rows end up in the train and test partitions as before.
X_train_opt_raw, X_test_opt_raw, y_train_opt, y_test_opt = train_test_split(X_opt, y_opt_log, test_size=0.2, random_state=0)
scaler_opt = StandardScaler().fit(X_train_opt_raw)
X_train_opt = scaler_opt.transform(X_train_opt_raw)  # numpy array, as before
X_test_opt = scaler_opt.transform(X_test_opt_raw)    # scaled with training statistics only

In [None]:
# Re-run the random-forest hyper-parameter search on the reduced feature set.
# This refits the same search object, so its earlier full-feature fit is replaced.
random_regressor.fit(X_train_opt, y_train_opt)

In [None]:
# Predict on the reduced-feature training and test splits with the refitted forest.
y_train_pred_opt, y_test_pred_opt = map(random_regressor.predict, (X_train_opt, X_test_opt))

In [None]:
# Training-split metrics of the reduced-feature random forest (y_train_opt holds log-prices).
actual_train_opt = y_train_opt.values
train_scores_opt = [
    ("Root mean squared error: ", sqrt(mse(actual_train_opt, y_train_pred_opt))),
    ("R-squared: ", r2_score(actual_train_opt, y_train_pred_opt)),
    ("Explained Variance Score:", explained_variance_score(actual_train_opt, y_train_pred_opt)),
]
print("Train Results for Optimized Random Forest Regressor Model:")
print('-' * 50)
for label, score in train_scores_opt:
    print(label, round(score, 2))

In [None]:
# Test-split metrics of the reduced-feature random forest (y_test_opt holds log-prices).
# The header previously read "XGBoost Regressor"; it now matches the train report for this model.
print("Test Results for Optimized Random Forest Regressor Model:")
print(50 * '-')
print("Root mean squared error: ", round(sqrt(mse(y_test_opt, y_test_pred_opt)),2))
print("R-squared: ", round(r2_score(y_test_opt, y_test_pred_opt),2))
print("Explained Variance Score:", round(explained_variance_score(y_test_opt, y_test_pred_opt),2))

In [None]:
# Record [test RMSE, test R²] of the reduced-feature forest for the optimisation table.
optimization_comparison["(Drop Least Imp Features)Random Forest Regressor"] = [
    round(metric, 2)
    for metric in (sqrt(mse(y_test_opt, y_test_pred_opt)), r2_score(y_test_opt, y_test_pred_opt))
]

In [None]:
# Turn optimization_comparison (label -> [test RMSE, test R²]) into a table, one row per variant.
# The stored error values are sqrt(MSE), so the column is labelled RMSE rather than MSE.
# Note: the name is rebound from a dict to a DataFrame here.
optimization_comparison = pd.DataFrame.from_dict(optimization_comparison).T
optimization_comparison.columns = ['RMSE', 'R2 Score']
# Best variant (highest R²) first.
optimization_comparison = optimization_comparison.sort_values('R2 Score', ascending=False)
optimization_comparison.style.background_gradient(cmap='Blues')