In [56]:
import xgboost
import catboost
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn import metrics
from sklearn import ensemble
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import tree

import matplotlib.pyplot as plt
import seaborn as sns
from catboost.utils import get_confusion_matrix

In [3]:
df = pd.read_csv('data/AirPass.csv')

In [5]:
df.isnull().sum()

Unnamed: 0                             0
id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             310
satisfaction    

In [7]:
df['Arrival Delay in Minutes'].fillna(df['Arrival Delay in Minutes'].median(), inplace=True)

In [8]:
df['Arrival Delay in Minutes'].mean()

15.133392362180475

In [12]:
df['satisfaction'].value_counts()

neutral or dissatisfied    58879
satisfied                  45025
Name: satisfaction, dtype: int64

In [13]:
gen = df.groupby('Gender')['satisfaction'].value_counts()
gen

Gender  satisfaction           
Female  neutral or dissatisfied    30193
        satisfied                  22534
Male    neutral or dissatisfied    28686
        satisfied                  22491
Name: satisfaction, dtype: int64

In [19]:
df.groupby('Type of Travel')['satisfaction'].value_counts()

Type of Travel   satisfaction           
Business travel  satisfied                  41746
                 neutral or dissatisfied    29909
Personal Travel  neutral or dissatisfied    28970
                 satisfied                   3279
Name: satisfaction, dtype: int64

In [20]:
df.groupby('Class')['satisfaction'].value_counts()

Class     satisfaction           
Business  satisfied                  34480
          neutral or dissatisfied    15185
Eco       neutral or dissatisfied    38044
          satisfied                   8701
Eco Plus  neutral or dissatisfied     5650
          satisfied                   1844
Name: satisfaction, dtype: int64

In [21]:
df['satisfaction'] = df['satisfaction'].map({'neutral or dissatisfied':0 , 'satisfied':1})
df['Customer Type'] = df['Customer Type'].map({'Loyal Customer':1, 'disloyal Customer':0})
df['Type of Travel'] = df['Type of Travel'].map({'Personal Travel':0, 'Business travel':1})

In [22]:
dfd = pd.get_dummies(df)
dfd.shape

(103904, 28)

In [30]:
dfd.columns

Index(['Unnamed: 0', 'id', 'Customer Type', 'Age', 'Type of Travel',
       'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction', 'Gender_Female', 'Gender_Male', 'Class_Business',
       'Class_Eco', 'Class_Eco Plus'],
      dtype='object')

In [33]:
dfd.drop('Unnamed: 0', axis=1, inplace=True)

In [34]:
dfd.shape

(103904, 27)

In [35]:
X = dfd.drop('satisfaction', axis=1)
y = dfd['satisfaction']

In [36]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=26)

In [37]:
X_test.shape

(20781, 26)

In [39]:
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [40]:
print(X_test_scaled[0][0])

0.9408251379303


In [43]:
def fit_predict_and_print_metrics(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(metrics.f1_score(y_train, y_train_pred))
    print(metrics.f1_score(y_test, y_test_pred)) 

In [44]:
model = linear_model.LogisticRegression()

fit_predict_and_print_metrics(model, X_train_scaled, X_test_scaled, y_train, y_test)

0.8538275813094731
0.8546883773161146


In [46]:
dt = tree.DecisionTreeClassifier(random_state=26)
adabost = ensemble.AdaBoostClassifier(
    base_estimator=dt,
    learning_rate=0.01,
    random_state=26
)

fit_predict_and_print_metrics(adabost, X_train_scaled, X_test_scaled, y_train, y_test)

1.0
0.9398901098901099


In [48]:
params = {
    "n_estimators":2**np.arange(8),
    "learning_rate":0.1**np.arange(3)
}

gbd = ensemble.GradientBoostingClassifier()
gs = model_selection.GridSearchCV(
    estimator=gbd,
    param_grid=params,
    cv=3,
    scoring=metrics.make_scorer(metrics.f1_score)
)
gs.fit(X_train_scaled, y_train)

In [49]:
gs.best_params_

{'learning_rate': 1.0, 'n_estimators': 128}

In [50]:
gs.best_score_

0.9491271464770464

In [51]:
model = xgboost.XGBClassifier()
fit_predict_and_print_metrics(model, X_train_scaled, X_test_scaled, y_train, y_test)

0.9752680003376382
0.9579785161685312


In [52]:
model = catboost.CatBoostClassifier()

fit_predict_and_print_metrics(model, X_train_scaled, X_test_scaled, y_train, y_test)

Learning rate set to 0.068023
0:	learn: 0.6018089	total: 58.8ms	remaining: 58.7s
1:	learn: 0.5020769	total: 72.2ms	remaining: 36s
2:	learn: 0.4472534	total: 83.8ms	remaining: 27.9s
3:	learn: 0.4096076	total: 98.8ms	remaining: 24.6s
4:	learn: 0.3625895	total: 114ms	remaining: 22.8s
5:	learn: 0.3353514	total: 128ms	remaining: 21.2s
6:	learn: 0.3077176	total: 143ms	remaining: 20.3s
7:	learn: 0.2921075	total: 156ms	remaining: 19.4s
8:	learn: 0.2790148	total: 169ms	remaining: 18.6s
9:	learn: 0.2644624	total: 182ms	remaining: 18s
10:	learn: 0.2486163	total: 195ms	remaining: 17.6s
11:	learn: 0.2333942	total: 210ms	remaining: 17.3s
12:	learn: 0.2255868	total: 224ms	remaining: 17s
13:	learn: 0.2155886	total: 238ms	remaining: 16.8s
14:	learn: 0.2061542	total: 254ms	remaining: 16.7s
15:	learn: 0.2008751	total: 284ms	remaining: 17.4s
16:	learn: 0.1953162	total: 314ms	remaining: 18.2s
17:	learn: 0.1889280	total: 343ms	remaining: 18.7s
18:	learn: 0.1829713	total: 365ms	remaining: 18.9s
19:	learn: 0.

In [57]:
get_confusion_matrix(model, catboost.Pool(X_train_scaled, y_train))

array([[46668.,   538.],
       [ 1257., 34660.]])

In [61]:
pd.DataFrame({'feature_importance': model.get_feature_importance(), 
              'feature_names':scaler.get_feature_names_out()}).sort_values(by=['feature_importance'], 
                                                           ascending=False)

Unnamed: 0,feature_importance,feature_names
5,25.406075,Inflight wifi service
3,18.991249,Type of Travel
10,7.043878,Online boarding
1,6.895336,Customer Type
23,5.309154,Class_Business
16,3.855697,Checkin service
2,3.745753,Age
15,3.474255,Baggage handling
8,3.453294,Gate location
11,2.963778,Seat comfort
