In [4]:
import pandas
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import math

In [5]:
def get_outlier_indices(df, features):
    outlier_indices = []

    for c in features:
        Q1 = df[c].quantile(0.25)
        Q3 = df[c].quantile(0.75)
        IQR = Q3 - Q1

        outlier_step = 1.5 * IQR

        outlier_list_col = df[(df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)].index
        outlier_indices.extend(outlier_list_col)

    outlier_indices = list(set(outlier_indices))
    return outlier_indices

def remove_outliers(df, features):
    outlier_indices = get_outlier_indices(df, features)
    df_cleaned = df.drop(outlier_indices)
    df_cleaned.reset_index(drop=True, inplace=True)
    return df_cleaned

In [7]:
df = pandas.read_csv('Clean_Dataset.csv')
columns_to_drop = ['Unnamed: 0', 'flight']
df = df.drop(columns=columns_to_drop)
le = LabelEncoder()
df['class'] = le.fit_transform(df['class'])

# One-hot encoding
columns_to_onehot = ['airline', 'source_city', 'destination_city', 'departure_time', 'arrival_time', 'stops']

ohe = OneHotEncoder(drop='first', sparse_output=False)
one_hot_encoded = ohe.fit_transform(df[columns_to_onehot])

df_encoded = pandas.DataFrame(one_hot_encoded, columns=ohe.get_feature_names_out(columns_to_onehot))
df = pandas.concat([df.drop(columns=columns_to_onehot), df_encoded], axis=1)
df = remove_outliers(df, ['duration', 'price'])
X = df.drop(columns='price')
y = df.price
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

## Linear regression

In [18]:
lr = LinearRegression().fit(X, y)
y_pred_lr = lr.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred_lr)
rmse = math.sqrt(mse)
mae = metrics.mean_absolute_error(y_test, y_pred_lr)
print("MSE: " + str(mse))
print("RMSE: " + str(rmse))
print("MAE: " + str(mae))

MSE: 44809694.61106912
RMSE: 6694.004377879441
MAE: 4558.017302847958
