In [36]:
#!pip install xgboost
#!pip install catboost
#!pip install tabulate

In [1]:
import os
import pandas as pd
from sklearn.calibration import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.discriminant_analysis import StandardScaler

**Load The Data**

In [2]:
current_dir = os.getcwd() 
relative_path_train = os.path.join('..', 'data', 'train_data_after_EDA.csv')
relative_path_test = os.path.join('..', 'data', 'test_data_after_EDA.csv')

preprocessed_train_data = pd.read_csv(os.path.join(current_dir, relative_path_train))
preprocessed_test_data = pd.read_csv(os.path.join(current_dir, relative_path_test))

**Encode target variable**

In [3]:
# Initialize a LabelEncoder instance
le = LabelEncoder()

# Fit the LabelEncoder on the combined data of both train and test datasets
combined_data = pd.concat([preprocessed_train_data, preprocessed_test_data])
col_encoded = le.fit_transform(combined_data['satisfaction'])

# Transform the 'satisfaction' column in both train and test datasets using the fitted LabelEncoder
preprocessed_train_data['satisfaction'] = col_encoded[:len(preprocessed_train_data)]
preprocessed_test_data['satisfaction'] = col_encoded[len(preprocessed_train_data):]

**Drop unnecessary columns**

In [4]:
# Columns that didn't affect the satisfaction

drop_columns = ['Gender','Gate location','Departure/Arrival time convenient']
preprocessed_train_data.drop(drop_columns,axis=1,inplace=True)
preprocessed_train_data.head()

Unnamed: 0,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Ease of Online booking,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,satisfaction
0,Loyal Customer,13,Personal Travel,Eco Plus,460,3,3,5,3,5,5,4,3,4,4,5,5,25,0
1,disloyal Customer,25,Business travel,Business,235,3,3,1,3,1,1,1,5,3,1,4,1,1,0
2,Loyal Customer,26,Business travel,Business,1142,2,2,5,5,5,5,4,3,4,4,4,5,0,1
3,Loyal Customer,25,Business travel,Business,562,2,5,2,2,2,2,2,5,3,1,4,2,11,0
4,Loyal Customer,61,Business travel,Business,214,3,3,4,5,5,3,3,4,4,3,3,3,0,1


In [5]:
# Columns that didn't affect the satisfaction

drop_columns = ['Gender','Gate location','Departure/Arrival time convenient']
preprocessed_test_data.drop(drop_columns,axis=1,inplace=True)
preprocessed_test_data.head()

Unnamed: 0,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Ease of Online booking,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,satisfaction
0,Loyal Customer,52,Business travel,Eco,160,5,3,3,4,3,5,5,5,5,2,5,5,3.931826,1
1,Loyal Customer,36,Business travel,Business,2863,1,3,5,4,5,4,4,4,4,3,4,5,0.0,1
2,disloyal Customer,20,Business travel,Eco,192,2,2,2,2,2,2,4,1,3,2,2,2,0.0,0
3,Loyal Customer,44,Business travel,Business,3377,0,0,3,4,4,1,1,1,1,3,1,4,0.0,1
4,Loyal Customer,49,Business travel,Eco,1182,2,4,4,1,2,2,2,2,2,4,2,4,0.0,1


**Data Encoding**

In [6]:
# Initialize a OneHotEncoder instance
encoder = OneHotEncoder(sparse=False)  # Set sparse=False for easier handling

columns_to_encode = ['Customer Type', 'Type of Travel', 'Class']

# Fit the OneHotEncoder on the combined data
combined_data = pd.concat([preprocessed_train_data, preprocessed_test_data])
encoded_data = encoder.fit_transform(combined_data[columns_to_encode])

# Get the encoded column names
encoded_column_names = []
for i, column in enumerate(columns_to_encode):
  categories = encoder.categories_[i]
  encoded_column_names.extend([column + '_' + str(category) for category in categories])

# Separate encoded data for train and test sets
encoded_train_data = encoded_data[:len(preprocessed_train_data)]
encoded_test_data = encoded_data[len(preprocessed_train_data):]

# Create new DataFrames for encoded features
encoded_train_df = pd.DataFrame(encoded_train_data, columns=encoded_column_names)
encoded_test_df = pd.DataFrame(encoded_test_data, columns=encoded_column_names)

# Drop original categorical columns and add encoded features
preprocessed_train_data.drop(columns_to_encode, axis=1, inplace=True)
preprocessed_train_data = pd.concat([preprocessed_train_data, encoded_train_df], axis=1)

preprocessed_test_data.drop(columns_to_encode, axis=1, inplace=True)
preprocessed_test_data = pd.concat([preprocessed_test_data, encoded_test_df], axis=1)



In [7]:
preprocessed_test_data.head()

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Ease of Online booking,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,...,Cleanliness,Departure Delay in Minutes,satisfaction,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus
0,52,160,5,3,3,4,3,5,5,5,...,5,3.931826,1,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,36,2863,1,3,5,4,5,4,4,4,...,5,0.0,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,20,192,2,2,2,2,2,2,4,1,...,2,0.0,0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
3,44,3377,0,0,3,4,4,1,1,1,...,4,0.0,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,49,1182,2,4,4,1,2,2,2,2,...,4,0.0,1,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [8]:
preprocessed_train_data.head()

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Ease of Online booking,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,...,Cleanliness,Departure Delay in Minutes,satisfaction,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus
0,13,460,3,3,5,3,5,5,4,3,...,5,25,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,25,235,3,3,1,3,1,1,1,5,...,1,1,0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2,26,1142,2,2,5,5,5,5,4,3,...,5,0,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,25,562,2,5,2,2,2,2,2,5,...,2,11,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,61,214,3,3,4,5,5,3,3,4,...,3,0,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0


**Standardization**

Standardization scales features by subtracting the mean and then dividing by the standard deviation.

This results in features that have a mean of 0 and a standard deviation of 1.

In [9]:
columns_to_scale = ['Age', 'Flight Distance', 'Inflight wifi service',
       'Ease of Online booking', 'Food and drink', 'Online boarding',
       'Seat comfort', 'Inflight entertainment', 'On-board service',
       'Leg room service', 'Baggage handling', 'Checkin service',
       'Inflight service', 'Cleanliness', 'Departure Delay in Minutes']

In [10]:
scaler = StandardScaler()
# Standarize train data
scaled_values = scaler.fit_transform(preprocessed_train_data[columns_to_scale])
preprocessed_train_data[columns_to_scale] = scaled_values

In [11]:
aggregated_train = preprocessed_train_data[columns_to_scale].agg(['mean', 'min', 'max', 'median', 'std']).style.background_gradient(cmap='Blues')
aggregated_train

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Ease of Online booking,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes
mean,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0
min,-2.142239,-1.161768,-2.055758,-1.970731,-2.408473,-2.408573,-2.607418,-2.519278,-2.625348,-2.547172,-2.228672,-2.611283,-3.096504,-2.504332,-0.387532
max,3.018235,3.804423,1.709804,1.603448,1.352264,1.296496,1.183099,1.231704,1.25559,1.25338,1.158582,1.340069,1.156436,1.30587,41.254377
median,0.041039,-0.347441,0.203579,0.173776,-0.152031,-0.185532,0.424996,0.481508,0.479403,0.49327,0.311769,-0.240472,0.305848,-0.218211,-0.387532
std,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005


In [12]:
# Standarize Test data
scaled_values = scaler.fit_transform(preprocessed_test_data[columns_to_scale])
preprocessed_test_data[columns_to_scale] = scaled_values

In [14]:
aggregated_test = preprocessed_test_data[columns_to_scale].agg(['mean', 'min', 'max', 'median', 'std']).style.background_gradient(cmap='Blues')
aggregated_test

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Ease of Online booking,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes
mean,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0
min,-2.141219,-1.195433,-2.045654,-1.952907,-2.409906,-2.393241,-1.843606,-2.497449,-2.635178,-2.531122,-2.237379,-1.82033,-3.089359,-2.481636,-0.761127
max,2.992966,2.831393,1.708162,1.594457,1.340321,1.290829,1.179517,1.231122,1.263057,1.25518,1.164466,1.330519,1.145456,1.301968,3.622441
median,0.030936,-0.337506,0.206636,0.175511,-0.159769,-0.182799,0.423737,0.485408,0.48341,0.49792,0.314005,-0.244906,0.298493,-0.211474,-0.761127
std,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002


**Write the train and test data to CSV files**

In [15]:
relative_path = os.path.join('..', 'data', 'preprocessed_train_data.csv')
preprocessed_train_data.to_csv(os.path.join(current_dir, relative_path), index=False) # exclude the DataFrame index from being saved to the CSV file.

In [16]:
relative_path = os.path.join('..', 'data', 'preprocessed_test_data.csv')
preprocessed_test_data.to_csv(os.path.join(current_dir, relative_path), index=False) # exclude the DataFrame index from being saved to the CSV file.