In [3]:
import pandas
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
import math

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas


In [2]:
def get_outlier_indices(df, features):
    outlier_indices = []

    for c in features:
        Q1 = df[c].quantile(0.25)
        Q3 = df[c].quantile(0.75)
        IQR = Q3 - Q1

        outlier_step = 1.5 * IQR

        outlier_list_col = df[(df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)].index
        outlier_indices.extend(outlier_list_col)

    outlier_indices = list(set(outlier_indices))
    return outlier_indices

def remove_outliers(df, features):
    outlier_indices = get_outlier_indices(df, features)
    df_cleaned = df.drop(outlier_indices)
    df_cleaned.reset_index(drop=True, inplace=True)
    return df_cleaned

In [4]:
df = pandas.read_csv('Clean_Dataset.csv')
columns_to_drop = ['Unnamed: 0', 'flight']
df = df.drop(columns=columns_to_drop)
le = LabelEncoder()
df.flight_class = df.flight_class.replace({'Economy' : 1,'Business' :2})
df['airline'] = le.fit_transform(df['airline'])
df['source_city'] = le.fit_transform(df['source_city'])
df['destination_city'] = le.fit_transform(df['destination_city'])
df['departure_time'] = le.fit_transform(df['departure_time'])
df['arrival_time'] = le.fit_transform(df['arrival_time'])
df['stops'] = le.fit_transform(df['stops'])
# # One-hot encoding
# columns_to_onehot = ['airline', 'source_city', 'destination_city', 'departure_time', 'arrival_time', 'stops']

# ohe = OneHotEncoder(drop='first', sparse_output=False)
# one_hot_encoded = ohe.fit_transform(df[columns_to_onehot])

# df_encoded = pandas.DataFrame(one_hot_encoded, columns=ohe.get_feature_names_out(columns_to_onehot))
# df = pandas.concat([df.drop(columns=columns_to_onehot), df_encoded], axis=1)
df = remove_outliers(df, ['duration', 'price'])
X = df.drop(columns='price')
y = df.price
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

df_business = df[df['flight_class'] == 2]
X_business = df_business.drop(columns='price')
y_business = df_business.price
X_train_business, X_test_business, y_train_business, y_test_business = train_test_split(X_business, y_business, test_size=0.20, random_state=0)

df_economy = df[df['flight_class'] == 1]
X_economy = df_economy.drop(columns='price')
y_economy = df_economy.price
X_train_economy, X_test_economy, y_train_economy, y_test_economy = train_test_split(X_economy, y_economy, test_size=0.20, random_state=0)



  df.flight_class = df.flight_class.replace({'Economy' : 1,'Business' :2})


## Linear regression

In [5]:
lr = LinearRegression().fit(X, y)
y_pred_lr = lr.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred_lr)
rmse = math.sqrt(mse)
mae = metrics.mean_absolute_error(y_test, y_pred_lr)
print("MSE: " + str(mse))
print("RMSE: " + str(rmse))
print("MAE: " + str(mae))

MSE: 48220128.129044436
RMSE: 6944.071437495761
MAE: 4635.282149073735


## Linear regression for business class

In [6]:
lr_business = LinearRegression().fit(X_business, y_business)
y_pred_lr_business = lr_business.predict(X_test_business)
mse_business = metrics.mean_squared_error(y_test_business, y_pred_lr_business)
rmse_business = math.sqrt(mse_business)
mae_business = metrics.mean_absolute_error(y_test_business, y_pred_lr_business)
print("MSE: " + str(mse_business))
print("RMSE: " + str(rmse_business))
print("MAE: " + str(mae_business))

MSE: 94337423.44612947
RMSE: 9712.745412401659
MAE: 7376.578364521865


## Linear regression for economy class

In [7]:
lr_economy = LinearRegression().fit(X_economy, y_economy)
y_pred_lr_economy = lr_economy.predict(X_test_economy)
mse_economy = metrics.mean_squared_error(y_test_economy, y_pred_lr_economy)
rmse_economy = math.sqrt(mse_economy)
mae_economy = metrics.mean_absolute_error(y_test_economy, y_pred_lr_economy)
print("MSE: " + str(mse_economy))
print("RMSE: " + str(rmse_economy))
print("MAE: " + str(mae_economy))

MSE: 8168976.152494205
RMSE: 2858.1420805296234
MAE: 2065.8963861229067


## Random forest

In [8]:
# param_grid = {
#     'n_estimators': [100, 200, 300, 400, 500],
#     'max_features': [2, 4, 6, 8],
# }
# rf = RandomForestRegressor(n_jobs=-1)
# grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
# grid_search.fit(X_train, y_train)
# best_params = grid_search.best_params_
# print(best_params)
final_rf = RandomForestRegressor(n_estimators=500, max_features=8, n_jobs=-1).fit(X_train, y_train)
y_pred_rf = final_rf.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred_rf)
rmse = math.sqrt(mse)
mae = metrics.mean_absolute_error(y_test, y_pred_rf)

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)

MSE: 7311588.434043385
RMSE: 2703.9949027399043
MAE: 1070.847516833957


## Random forest for business class

In [37]:
# param_grid = {
#     'n_estimators': [100, 200, 300, 400, 500],
#     'max_features': [2, 4, 6, 8],
# }
# rf = RandomForestRegressor(n_jobs=-1)
# grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
# grid_search.fit(X_train_business, y_train_business)
# best_params = grid_search.best_params_
# print(best_params)
final_rf = RandomForestRegressor(n_estimators=500, max_features=8, n_jobs=-1).fit(X_train_business, y_train_business)
y_pred_rf_business = final_rf.predict(X_test_business)
mse = metrics.mean_squared_error(y_test_business, y_pred_rf_business)
rmse = math.sqrt(mse)
mae = metrics.mean_absolute_error(y_test_business, y_pred_rf_business)

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)

MSE: 18960993.733425427
RMSE: 4354.422319140098
MAE: 2059.5050704621267


## Random forest for economy class

In [41]:
# param_grid = {
#     'n_estimators': [100, 200, 300, 400, 500],
#     'max_features': [2, 4, 6, 8],
# }
# rf = RandomForestRegressor(n_jobs=-1)
# grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
# grid_search.fit(X_train_economy, y_train_economy)
# best_params = grid_search.best_params_
# print(best_params)
final_rf = RandomForestRegressor(n_estimators=500, max_features=8, n_jobs=-1).fit(X_train_economy, y_train_economy)
y_pred_rf = final_rf.predict(X_test_economy)
mse = metrics.mean_squared_error(y_test_economy, y_pred_rf)
rmse = math.sqrt(mse)
mae = metrics.mean_absolute_error(y_test_economy, y_pred_rf)

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)

MSE: 1908115.2719808498
RMSE: 1381.3454571470706
MAE: 614.64680957267


## Multilayer Perceptron

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import models, layers
from sklearn.metrics import mean_squared_error
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = models.Sequential()

# param_grid = {
#     'neurons1': [64, 128, 256],
#     'neurons2': [32, 64, 128],
#     'activation': ['relu', 'tanh', 'sigmoid'],
# }

# grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
# grid_search_result = grid_search.fit(X_train_scaled, y_train)
# best_params = grid_search_result.best_params_
# print(best_params)

model.add(layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='linear'))

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
test_loss, test_mae = model.evaluate(X_test_scaled, y_test)
print(f'Test MAE: {test_mae}')

y_pred = model.predict(X_test_scaled)

# Calculate and print MSE and RMSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Test MSE: {mse}')
print(f'Test RMSE: {rmse}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test MAE: 2627.634765625
Test MSE: 19842597.290863883
Test RMSE: 4454.503035228945


## Multilayer Perceptron for business class

In [43]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_business)
X_test_scaled = scaler.transform(X_test_business)

model = models.Sequential()

# param_grid = {
#     'neurons1': [64, 128, 256],
#     'neurons2': [32, 64, 128],
#     'activation': ['relu', 'tanh', 'sigmoid'],
# }

# grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
# grid_search_result = grid_search.fit(X_train_scaled, y_train_business)
# best_params = grid_search_result.best_params_
# print(best_params)

model.add(layers.Dense(128, activation='relu', input_shape=(X_train_business.shape[1],)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='linear'))

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model.fit(X_train_scaled, y_train_business, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
test_loss, test_mae = model.evaluate(X_test_scaled, y_test_business)
print(f'Test MAE: {test_mae}')

y_pred = model.predict(X_test_scaled)

# Calculate and print MSE and RMSE
mse = mean_squared_error(y_test_business, y_pred)
rmse = np.sqrt(mse)

print(f'Test MSE: {mse}')
print(f'Test RMSE: {rmse}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test MAE: 6884.03369140625
Test MSE: 85973699.30518411
Test RMSE: 9272.200348632687


## Multilayer Perceptron for economy class

In [47]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_economy)
X_test_scaled = scaler.transform(X_test_economy)

model = models.Sequential()

# param_grid = {
#     'neurons1': [64, 128, 256],
#     'neurons2': [32, 64, 128],
#     'activation': ['relu', 'tanh', 'sigmoid'],
# }

# grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
# grid_search_result = grid_search.fit(X_train_scaled, y_train_economy)
# best_params = grid_search_result.best_params_
# print(best_params)

model.add(layers.Dense(128, activation='relu', input_shape=(X_train_economy.shape[1],)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='linear'))

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model.fit(X_train_scaled, y_train_economy, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
test_loss, test_mae = model.evaluate(X_test_scaled, y_test_economy)
print(f'Test MAE: {test_mae}')

y_pred = model.predict(X_test_scaled)

# Calculate and print MSE and RMSE
mse = mean_squared_error(y_test_economy, y_pred)
rmse = np.sqrt(mse)

print(f'Test MSE: {mse}')
print(f'Test RMSE: {rmse}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test MAE: 1623.48583984375
Test MSE: 5618096.273775256
Test RMSE: 2370.2523649972923


## Bagging Regression Tree

In [17]:
base_regressor = DecisionTreeRegressor(random_state=42)

# Create a Bagging Regressor with Decision Trees as base estimators
bagging_regressor = BaggingRegressor(base_regressor, n_estimators=10, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [5, 10, 15],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.5, 0.7, 1.0]
}

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(bagging_regressor, param_grid, cv=5, scoring='neg_mean_squared_error')
tree_bg = grid_search.fit(X_train, y_train)

parameters = {
    'splitter' : ['best', 'random'],
    'max_features' : [None, 8, 7, 6, 5],
}


tree = DecisionTreeRegressor()
tree_cv = GridSearchCV(estimator=tree, param_grid=parameters, cv=20).fit(X_train, y_train)

tree = DecisionTreeRegressor(**tree_cv.best_params_)
bagging_regressor = BaggingRegressor(base_regressor,**tree_bg.best_params_, random_state=42)
# Fit the Bagging Regressor on the training data
bagging_regressor.fit(X_train, y_train)
y_pred = bagging_regressor.predict(X_test)

mse = metrics.mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
mae = metrics.mean_absolute_error(y_test, y_pred)
print("MSE: " + str(mse))
print("RMSE: " + str(rmse))
print("MAE: " + str(mae))

Tuned Hyper Parameters :
{'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 15}
Tuned Hyper Parameters :
{'max_features': None, 'splitter': 'best'}
MSE: 7295633.8596384935
RMSE: 2701.043105846053
MAE: 1145.898434661537


## Bagging Regression Tree for economy class

In [10]:
base_regressor = DecisionTreeRegressor(random_state=42)

# Create a Bagging Regressor with Decision Trees as base estimators
bagging_regressor = BaggingRegressor(base_regressor, n_estimators=10, random_state=42)

# Define the parameter grid for hyperparameter tuning
# param_grid = {
#     'n_estimators': [5, 10, 15],
#     'max_samples': [0.5, 0.7, 1.0],
#     'max_features': [0.5, 0.7, 1.0]
# }

# # Use GridSearchCV for hyperparameter tuning
# grid_search = GridSearchCV(bagging_regressor, param_grid, cv=5, scoring='neg_mean_squared_error')
# tree_bg = grid_search.fit(X_train_economy, y_train_economy)

# parameters = {
#     'splitter' : ['best', 'random'],
#     'max_features' : [None, 8, 7, 6, 5],
# }


# tree = DecisionTreeRegressor()
# tree_cv = GridSearchCV(estimator=tree, param_grid=parameters, cv=20).fit(X_train, y_train)

tree = DecisionTreeRegressor(splitter='best',max_features=None)
bagging_regressor = BaggingRegressor(base_regressor,n_estimators=15,max_samples=0.5,max_features=1.0, random_state=42)
# Fit the Bagging Regressor on the training data
bagging_regressor.fit(X_train_economy, y_train_economy)
y_pred = bagging_regressor.predict(X_test_economy)

mse = metrics.mean_squared_error(y_test_economy, y_pred)
rmse = math.sqrt(mse)
mae = metrics.mean_absolute_error(y_test_economy, y_pred)
print("MSE: " + str(mse))
print("RMSE: " + str(rmse))
print("MAE: " + str(mae))

MSE: 2028511.0323822298
RMSE: 1424.2580638291047
MAE: 627.2349961072184


## Bagging Regression Tree for business class

In [13]:
base_regressor = DecisionTreeRegressor(random_state=42)

# Create a Bagging Regressor with Decision Trees as base estimators
bagging_regressor = BaggingRegressor(base_regressor, n_estimators=10, random_state=42)

# Define the parameter grid for hyperparameter tuning
# param_grid = {
#     'n_estimators': [5, 10, 15],
#     'max_samples': [0.5, 0.7, 1.0],
#     'max_features': [0.5, 0.7, 1.0]
# }

# # Use GridSearchCV for hyperparameter tuning
# grid_search = GridSearchCV(bagging_regressor, param_grid, cv=5, scoring='neg_mean_squared_error')
# tree_bg = grid_search.fit(X_train_business, y_train_business)

# parameters = {
#     'splitter' : ['best', 'random'],
#     'max_features' : [None, 8, 7, 6, 5],
# }


# tree = DecisionTreeRegressor()
# tree_cv = GridSearchCV(estimator=tree, param_grid=parameters, cv=20).fit(X_train, y_train)

tree = DecisionTreeRegressor(splitter='best',max_features=None)
bagging_regressor = BaggingRegressor(base_regressor,n_estimators=15,max_samples=0.5,max_features=1.0, random_state=42)
# Fit the Bagging Regressor on the training data
bagging_regressor.fit(X_train_business, y_train_business)
y_pred = bagging_regressor.predict(X_test_business)

mse = metrics.mean_squared_error(y_test_business, y_pred)
rmse = math.sqrt(mse)
mae = metrics.mean_absolute_error(y_test_business, y_pred)
print("MSE: " + str(mse))
print("RMSE: " + str(rmse))
print("MAE: " + str(mae))

MSE: 18615411.670074083
RMSE: 4314.55810832049
MAE: 2161.473036317215


## Results for whole dataset

In [14]:
data = { 'Algorithm': ['Linear Regression', 'Random Forest', 'Multilayer Perceptron', 'Bagging Regression Tree', 'SVM Regression', 'Lasso Regression', 'XGBoost'],
         'MSE':[8168976, 7311588, 19842597, 7295633,0,0,0],
         'RMSE':[2858, 2703, 4454, 2701,0,0,0],
               'MAE':[2065, 1070, 2627, 1145,0,0,0]}
dataframe = pandas.DataFrame(data)
print(dataframe.to_string(index=False))

              Algorithm      MSE  RMSE  MAE
      Linear Regression  8168976  2858 2065
          Random Forest  7311588  2703 1070
  Multilayer Perceptron 19842597  4454 2627
Bagging Regression Tree  7295633  2701 1145
         SVM Regression        0     0    0
       Lasso Regression        0     0    0
                XGBoost        0     0    0


## Results for business class

In [15]:
data = { 'Algorithm': ['Linear Regression', 'Random Forest', 'Multilayer Perceptron', 'Bagging Regression Tree', 'SVM Regression', 'Lasso Regression', 'XGBoost'],
         'MSE':[94337423, 18960993, 85973699, 18615411,0,0,0],
         'RMSE':[9712, 4354, 9272, 4314,0,0,0],
         'MAE':[7376, 2059, 6884, 2161,0,0,0]}
dataframe = pandas.DataFrame(data)
print(dataframe.to_string(index=False))

              Algorithm      MSE  RMSE  MAE
      Linear Regression 94337423  9712 7376
          Random Forest 18960993  4354 2059
  Multilayer Perceptron 85973699  9272 6884
Bagging Regression Tree 18615411  4314 2161
         SVM Regression        0     0    0
       Lasso Regression        0     0    0
                XGBoost        0     0    0


## Results for economy class

In [16]:
data = { 'Algorithm': ['Linear Regression', 'Random Forest', 'Multilayer Perceptron', 'Bagging Regression Tree', 'SVM Regression', 'Lasso Regression', 'XGBoost'],
         'MSE':[8168976, 1908115, 5618096, 2028511,0,0,0],
         'RMSE':[2858, 1381, 2370, 1424,0,0,0],
         'MAE':[2065, 614, 1623, 627,0,0,0]}
dataframe = pandas.DataFrame(data)
print(dataframe.to_string(index=False))

              Algorithm     MSE  RMSE  MAE
      Linear Regression 8168976  2858 2065
          Random Forest 1908115  1381  614
  Multilayer Perceptron 5618096  2370 1623
Bagging Regression Tree 2028511  1424  627
         SVM Regression       0     0    0
       Lasso Regression       0     0    0
                XGBoost       0     0    0
