In [2]:
import os
import pandas as pd
from sklearn.model_selection import KFold
import math
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler

**Load the preprocessed data**

In [3]:
# Resolve both input CSVs against the kernel's current working directory.
current_dir = os.getcwd()

relative_path_train = os.path.join('..', 'data', 'preprocessed_train_data.csv')
relative_path_test = os.path.join('..', 'data', 'preprocessed_test_data.csv')

train_csv_path = os.path.join(current_dir, relative_path_train)
test_csv_path = os.path.join(current_dir, relative_path_test)

# Read the two files into separate frames; they are concatenated in a later cell.
preprocessed_train_data = pd.read_csv(train_csv_path)
preprocessed_test_data = pd.read_csv(test_csv_path)

Combine the train and test data so that both sets are binned with the same quintile boundaries

In [4]:
# Stack the test rows beneath the train rows (row-wise concatenation).
# Row labels are kept as-is, so the labels of the two inputs may repeat in the result.
combined_data = pd.concat(
    objs=[preprocessed_train_data, preprocessed_test_data],
    axis="index",
)

In [5]:
# Show the column labels of the combined frame.
combined_data.columns

Index(['Age', 'Flight Distance', 'Inflight wifi service',
       'Ease of Online booking', 'Food and drink', 'Online boarding',
       'Seat comfort', 'Inflight entertainment', 'On-board service',
       'Leg room service', 'Baggage handling', 'Checkin service',
       'Inflight service', 'Cleanliness', 'Departure Delay in Minutes',
       'satisfaction', 'Customer Type_Loyal Customer',
       'Customer Type_disloyal Customer', 'Type of Travel_Business travel',
       'Type of Travel_Personal Travel', 'Class_Business', 'Class_Eco',
       'Class_Eco Plus'],
      dtype='object')

Group the continuous columns (`Age`, `Flight Distance`, `Departure Delay in Minutes`) into quintile bins

In [6]:
# Columns that are replaced by quintile-group codes in the cells below.
continuous_variables = [
    'Age',
    'Flight Distance',
    'Departure Delay in Minutes',
]

In [7]:
# Bin each continuous column into quintile groups and record the value range each group covers.
# The cut points are computed on `combined_data`, i.e. from the train and test rows together.
# NOTE(review): the frames are read from the *preprocessed* CSVs - confirm these three columns
# still hold raw (un-normalized) values; the range titles are only readable in original units.

# Single source of truth for the bin count: it drives both `pd.qcut` and the title lookup below.
N_QUINTILES = 5

# One list of range titles per continuous column, in the order of `continuous_variables`.
range_titles = []

for col in continuous_variables:
    # labels=False stores integer bin codes; duplicates='drop' discards repeated bin edges,
    # so a column with many identical values can end up with fewer than N_QUINTILES groups.
    combined_data[f'{col}_quintile'] = pd.qcut(
        combined_data[col], q=N_QUINTILES, labels=False, duplicates='drop'
    )

    # "<min> - <max>" of the column's values inside each quintile group.
    quintile_series = combined_data.groupby(f'{col}_quintile')[col].apply(
        lambda x: f'{x.min()} - {x.max()}'
    )

    # Groups that do not exist get an empty title so every column has N_QUINTILES entries.
    quintile_ranges = [
        quintile_series.loc[quintile] if quintile in quintile_series.index else ''
        for quintile in range(N_QUINTILES)
    ]
    range_titles.append(quintile_ranges)

In [8]:
# Show the column labels again to check that the `*_quintile` columns were added.
combined_data.columns

Index(['Age', 'Flight Distance', 'Inflight wifi service',
       'Ease of Online booking', 'Food and drink', 'Online boarding',
       'Seat comfort', 'Inflight entertainment', 'On-board service',
       'Leg room service', 'Baggage handling', 'Checkin service',
       'Inflight service', 'Cleanliness', 'Departure Delay in Minutes',
       'satisfaction', 'Customer Type_Loyal Customer',
       'Customer Type_disloyal Customer', 'Type of Travel_Business travel',
       'Type of Travel_Personal Travel', 'Class_Business', 'Class_Eco',
       'Class_Eco Plus', 'Age_quintile', 'Flight Distance_quintile',
       'Departure Delay in Minutes_quintile'],
      dtype='object')

In [9]:
# Drop the continuous columns now that their quintile codes are stored in the `*_quintile` columns.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second execution finds the columns already gone and is a no-op rather than a KeyError.
combined_data = combined_data.drop(columns=continuous_variables, errors='ignore')

In [10]:
# Show the column labels again to check that the continuous columns were removed.
combined_data.columns

Index(['Inflight wifi service', 'Ease of Online booking', 'Food and drink',
       'Online boarding', 'Seat comfort', 'Inflight entertainment',
       'On-board service', 'Leg room service', 'Baggage handling',
       'Checkin service', 'Inflight service', 'Cleanliness', 'satisfaction',
       'Customer Type_Loyal Customer', 'Customer Type_disloyal Customer',
       'Type of Travel_Business travel', 'Type of Travel_Personal Travel',
       'Class_Business', 'Class_Eco', 'Class_Eco Plus', 'Age_quintile',
       'Flight Distance_quintile', 'Departure Delay in Minutes_quintile'],
      dtype='object')

Split the combined data back into the train and test sets

In [11]:
# Undo the earlier concatenation by position: the first len(train) rows are the train set,
# everything after that boundary is the test set.
train_row_count = len(preprocessed_train_data)
new_preprocessed_train_data = combined_data.iloc[:train_row_count]
new_preprocessed_test_data = combined_data.iloc[train_row_count:]

In [12]:
# Display the re-split training frame.
new_preprocessed_train_data

Unnamed: 0,Inflight wifi service,Ease of Online booking,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,...,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus,Age_quintile,Flight Distance_quintile,Departure Delay in Minutes_quintile
0,0.203579,0.173776,1.352264,-0.185532,1.183099,1.231704,0.479403,-0.266840,0.311769,0.549799,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0,1,2
1,0.203579,0.173776,-1.656326,-0.185532,-1.849315,-1.769081,-1.849161,1.253380,-0.535045,-1.821012,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0,0,1
2,-0.549533,-0.541060,1.352264,1.296496,1.183099,1.231704,0.479403,-0.266840,0.311769,0.549799,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1,3,0
3,-0.549533,1.603448,-0.904178,-0.926545,-1.091211,-1.018885,-1.072973,1.253380,-0.535045,-1.821012,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0,1,2
4,0.203579,0.173776,0.600117,1.296496,1.183099,-0.268688,-0.296785,0.493270,0.311769,-0.240472,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,-0.549533,-0.541060,-0.904178,-0.926545,-1.091211,-1.018885,-0.296785,-1.787061,0.311769,-1.030742,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0,0,2
103900,0.956691,0.888612,-0.904178,0.555482,1.183099,1.231704,1.255590,1.253380,1.158582,1.340069,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3,4,0
103901,-1.302646,-1.255895,0.600117,-1.667559,1.183099,0.481508,-0.296785,-1.026951,0.311769,1.340069,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1,3,2
103902,-1.302646,-1.255895,-1.656326,-1.667559,-1.849315,-1.769081,0.479403,1.253380,-2.228672,1.340069,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0,2,0


In [13]:
# Display the re-split test frame.
new_preprocessed_test_data

Unnamed: 0,Inflight wifi service,Ease of Online booking,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,...,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus,Age_quintile,Flight Distance_quintile,Departure Delay in Minutes_quintile
0,1.708162,0.175511,-0.159769,0.554015,-0.332044,1.231122,1.263057,1.255180,1.164466,-1.032618,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3,0,3
1,-1.294891,0.175511,1.340321,0.554015,1.179517,0.485408,0.483410,0.497920,0.314005,-0.244906,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1,4,0
2,-0.544128,-0.533962,-0.909815,-0.919613,-1.087825,-1.006021,0.483410,-1.773862,-0.536456,-1.032618,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0,0,0
3,-2.045654,-1.952907,-0.159769,0.554015,0.423737,-1.751735,-1.855531,-1.773862,-2.237379,-0.244906,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,2,4,0
4,-0.544128,0.884984,0.590276,-1.656427,-1.087825,-1.006021,-1.075884,-1.016601,-1.386918,0.542807,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,3,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25308,0.206636,0.175511,0.590276,-0.182799,0.423737,0.485408,-0.296237,-1.016601,0.314005,0.542807,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1,1,0
25309,0.957399,0.884984,0.590276,0.554015,0.423737,0.485408,0.483410,1.255180,1.164466,1.330519,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0,2,0
25310,-0.544128,-1.243434,-0.909815,-1.656427,-1.087825,-1.006021,0.483410,-0.259341,0.314005,1.330519,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0,2,0
25311,0.206636,0.175511,0.590276,0.554015,0.423737,0.485408,-0.296237,-1.016601,1.164466,0.542807,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0,3,0


**Write the train and test data to CSV files**

In [12]:
# Write the binned training split to CSV; index=False leaves the DataFrame index out of the file.
relative_path = os.path.join('..', 'data', 'preprocessed_train_data_after_grouping.csv')
train_output_path = os.path.join(current_dir, relative_path)
new_preprocessed_train_data.to_csv(train_output_path, index=False)

In [13]:
# Write the binned test split to CSV; index=False leaves the DataFrame index out of the file.
relative_path = os.path.join('..', 'data', 'preprocessed_test_data_after_grouping.csv')
test_output_path = os.path.join(current_dir, relative_path)
new_preprocessed_test_data.to_csv(test_output_path, index=False)