In [33]:
import numpy as np
import pandas as pd

# Visualizing
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [23]:
# Download the competition files: training set, test set and sample submission
train_data, test_data, sample_solution = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/ioa2205/Airline-satisfaction/main/train_dataset.csv',
        'https://raw.githubusercontent.com/ioa2205/Airline-satisfaction/main/test_dataset.csv',
        'https://raw.githubusercontent.com/ioa2205/Airline-satisfaction/main/sample_submission.csv',
    )
)

In [26]:
# Replace missing values with 0 in the numeric columns only.
# A blanket fillna(0) would also write the integer 0 into object (categorical)
# columns, leaving them with mixed str/int values that the one-hot encoder
# used later cannot handle. Non-numeric columns are therefore left untouched.
# Reassigning (instead of inplace=True) keeps the step explicit and chainable.
train_data = train_data.fillna(
    {col: 0 for col in train_data.select_dtypes(include='number').columns}
)
test_data = test_data.fillna(
    {col: 0 for col in test_data.select_dtypes(include='number').columns}
)

In [27]:
# Dimensions of the training frame as (rows, columns)
train_data.shape


(10000, 24)

In [5]:
# Preview the first rows of the training data
train_data.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,1,Male,disloyal Customer,33,Business travel,Eco,571,2,3,2,...,4,3,1,3,4,3,4,10,3.0,0
1,2,Female,Loyal Customer,49,Business travel,Business,1431,4,1,4,...,5,5,5,5,3,5,3,0,0.0,1
2,3,Female,Loyal Customer,43,Business travel,Eco,867,1,4,4,...,1,1,1,1,1,1,2,0,18.0,0
3,4,Female,Loyal Customer,27,Business travel,Business,1550,3,3,3,...,2,4,4,5,5,4,2,0,0.0,1
4,5,Male,Loyal Customer,11,Personal Travel,Eco,526,3,4,3,...,4,5,2,5,3,5,4,0,10.0,0


In [28]:
# Choose the categorical (object-dtype) columns that need to be encoded.
# NOTE: this cell overwrites train_data/test_data. Once the object columns are
# dropped below there is nothing left to encode, so reload the data before
# running this cell a second time.
categorical_cols = [col for col in train_data.columns if train_data[col].dtype == 'object']

# One-hot encode the categorical data. The encoder is fitted on the training
# set only; handle_unknown='ignore' makes a category that appears only in the
# test set encode as all zeros instead of raising inside transform().
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_values = encoder.fit_transform(train_data[categorical_cols])
encoded_names = encoder.get_feature_names_out(categorical_cols)

# Reuse the source frame's index so the column-wise concat below lines rows up
# by label even if the frame does not carry a default 0..n-1 index.
encoded_categorical = pd.DataFrame(encoded_values, columns=encoded_names, index=train_data.index)

# Drop original categorical columns and concatenate encoded columns
train_data = train_data.drop(categorical_cols, axis=1)
train_data = pd.concat([train_data, encoded_categorical], axis=1)

# Repeat the same for test data, using the encoder fitted on the training set
test_encoded_categorical = pd.DataFrame(
    encoder.transform(test_data[categorical_cols]),
    columns=encoded_names,
    index=test_data.index,
)
test_data = test_data.drop(categorical_cols, axis=1)
test_data = pd.concat([test_data, test_encoded_categorical], axis=1)

# Scale numerical features using StandardScaler(); the scaler is fitted on the
# training data and the same transformation is then applied to the test data.
scaler = StandardScaler()
numerical_cols = ['Age','Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
train_data[numerical_cols] = scaler.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = scaler.transform(test_data[numerical_cols])

# Rank every column by the absolute value of its correlation with the target
train_data.corrwith(train_data['satisfaction']).abs().sort_values(ascending=False)

Unnamed: 0,0
satisfaction,1.0
Class_Business,0.524424
Online boarding,0.504986
Type of Travel_Business travel,0.482266
Type of Travel_Personal Travel,0.482266
Class_Eco,0.478398
Inflight entertainment,0.4009
Seat comfort,0.342294
On-board service,0.326426
Leg room service,0.324368


In [34]:
# Separate the predictors from the target; the 'id' column is dropped as well
X = train_data.drop(columns=['id', 'satisfaction'])
y = train_data['satisfaction']

# Hold out 20% of the rows for validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a Random Forest Classifier on the training portion
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows: hard labels for the report, and the second
# predict_proba column as the score for ROC-AUC
y_pred = model.predict(X_val)
val_scores = model.predict_proba(X_val)[:, 1]

# Report the validation metrics
print(classification_report(y_val, y_pred))
print("ROC-AUC:", roc_auc_score(y_val, val_scores))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      1002
           1       0.96      0.94      0.95       998

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000

ROC-AUC: 0.9904324617298469


In [36]:
# Predict on the test features (everything except the row identifier)
X_test = test_data.drop("id", axis=1)
test_predictions = model.predict(X_test)

# Pair each id with its predicted label and write the submission file
submission = test_data[["id"]].assign(satisfaction=test_predictions)
submission.to_csv("submission.csv", index=False)

In [43]:
# I also tested this dataset with another model (XGBoost). The results were almost the same.

# pip install xgboost

# from xgboost import XGBClassifier
# from sklearn.model_selection import RandomizedSearchCV

# xgb = XGBClassifier(eval_metric="logloss", random_state=42, use_label_encoder=False)
# xgb.fit(X_train, y_train)

# # Evaluate the model
# y_pred = xgb.predict(X_val)
# print(classification_report(y_val, y_pred))

# param_grid = {
#     "n_estimators": [100, 200, 300, 400],
#     "learning_rate": [0.01, 0.05, 0.1, 0.2],
#     "max_depth": [3, 5, 7, 10],
#     "subsample": [0.6, 0.8, 1.0],
#     "colsample_bytree": [0.6, 0.8, 1.0],
#     "min_child_weight": [1, 3, 5]
# }

# random_search = RandomizedSearchCV(
#     estimator=XGBClassifier(eval_metric="logloss", random_state=42, use_label_encoder=False),
#     param_distributions=param_grid,
#     n_iter=50,
#     scoring="accuracy",
#     cv=3,
#     verbose=1,
#     random_state=42
# )

# random_search.fit(X_train, y_train)

# # Best hyperparameters
# print("Best Parameters:", random_search.best_params_)

# best_xgb = random_search.best_estimator_
# y_pred_optimized = best_xgb.predict(X_val)

# print(classification_report(y_val, y_pred_optimized))