In [44]:
from matplotlib import pyplot as plt
from scipy import stats
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

data_train = pd.read_csv('./train.csv')
data_test = pd.read_csv('./test.csv')
print(data_train.shape)
data_train.head(1).transpose()

(3000, 24)


Unnamed: 0,0
id,1
Gender,Female
Customer Type,disloyal Customer
Age,22
Type of Travel,Business travel
Class,Eco
Flight Distance,1599
Seat comfort,3
Departure/Arrival time convenient,0
Food and drink,3


In [45]:
numeric_feature = ['Age', 'Flight Distance', 'Departure Delay in Minutes', 'Arrival Delay in Minutes']
ordinal_feature = ['Seat comfort', 'Departure/Arrival time convenient', 'Food and drink', 'Gate location', 
           'Inflight wifi service', 'Inflight entertainment', 'Online support', 'Ease of Online booking', 
           'On-board service', 'Leg room service', 'Baggage handling', 'Checkin service', 'Cleanliness', 'Online boarding']
categorical_feature = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

inflight_feature = ['Seat comfort','Food and drink','Inflight wifi service','Inflight entertainment','Leg room service','Cleanliness']
outflight_feature = ['Departure/Arrival time convenient'
                              ,'Gate location'
                              ,'Online support'
                              ,'Ease of Online booking'
                              ,'On-board service'
                              ,'Baggage handling'
                              ,'Checkin service'
                              ,'Online boarding']

# online booking and boarding

In [46]:
data = []
for i in range(len(data_train)):
    if data_train['Flight Distance'].iloc[i] >= 3000:
        data.append(2)
    elif data_train['Flight Distance'].iloc[i] >= 1000:
        data.append(1)
    else:
        data.append(0)

data_train['distance'] = data

In [47]:
data_train["Ease of Online booking"][data_train["Ease of Online booking"] == 0] = 3
data_train["Online boarding"][data_train["Online boarding"] == 0] = 3

In [48]:
for i in range(len(data_train)):
    for feature in ordinal_feature:
        if data_train[feature].iloc[i] == 0:
            data_train[feature].iloc[i] = np.nan

for i in range(len(data_test)):
    for feature in ordinal_feature:
        if data_test[feature].iloc[i] == 0:
            data_test[feature].iloc[i] = np.nan

In [49]:
for feature in ordinal_feature:
    if data_train[feature].isnull().sum() > 0:
        print(feature, data_train[feature].isnull().sum())
print("================")
for feature in ordinal_feature:
    if data_test[feature].isnull().sum() > 0:
        print(feature, data_test[feature].isnull().sum())

Seat comfort 106
Departure/Arrival time convenient 154
Food and drink 129
Inflight wifi service 2
Inflight entertainment 73
Leg room service 11
Seat comfort 64
Departure/Arrival time convenient 90
Food and drink 80
Inflight wifi service 1
Inflight entertainment 39
Leg room service 10


# Inflight wifi service

In [50]:
import datawig

target_feature = "Inflight wifi service"

imputer = datawig.SimpleImputer(input_columns=['Online boarding', 'Ease of Online booking', 'Online support'],
                                output_column=target_feature)
imputer.fit(train_df=data_train, num_epochs=50)

null_train = data_train[data_train[target_feature].isnull()]
null_imputed = imputer.predict(null_train)
imputed_train = pd.DataFrame(null_imputed)

n = 0
for i in range(len(data_train)):
    if np.isnan(data_train[target_feature].iloc[i]) == True:
        data_train[target_feature].iloc[i] = int(round(imputed_train[target_feature+"_imputed"].iloc[n]))
        n += 1
        
null_train = data_test[data_test[target_feature].isnull()]
null_imputed = imputer.predict(null_train)
imputed_train = pd.DataFrame(null_imputed)

n = 0
for i in range(len(data_test)):
    if np.isnan(data_test[target_feature].iloc[i]) == True:
        data_test[target_feature].iloc[i] = int(round(imputed_train[target_feature+"_imputed"].iloc[n]))
        n += 1

# Leg room service

In [51]:
import datawig

target_feature = "Leg room service"

imputer = datawig.SimpleImputer(input_columns=['Baggage handling', 'On-board service', 'Cleanliness'],
                                output_column=target_feature)
imputer.fit(train_df=data_train, num_epochs=50)

null_train = data_train[data_train[target_feature].isnull()]
null_imputed = imputer.predict(null_train)
imputed_train = pd.DataFrame(null_imputed)

n = 0
for i in range(len(data_train)):
    if np.isnan(data_train[target_feature].iloc[i]) == True:
        data_train[target_feature].iloc[i] = int(round(imputed_train[target_feature+"_imputed"].iloc[n]))
        n += 1
        
null_train = data_test[data_test[target_feature].isnull()]
null_imputed = imputer.predict(null_train)
imputed_train = pd.DataFrame(null_imputed)

n = 0
for i in range(len(data_test)):
    if np.isnan(data_test[target_feature].iloc[i]) == True:
        data_test[target_feature].iloc[i] = int(round(imputed_train[target_feature+"_imputed"].iloc[n]))
        n += 1

In [52]:
data_train['Delayed'] = (data_train['Departure Delay in Minutes'] 
                         + data_train['Arrival Delay in Minutes']).apply(lambda x : 0 if x > 0 else 1)
data_test['Delayed'] = (data_test['Departure Delay in Minutes'] 
                         + data_test['Arrival Delay in Minutes']).apply(lambda x : 0 if x > 0 else 1)

In [53]:
data_train = data_train.drop(['Departure/Arrival time convenient', 'Departure Delay in Minutes', 'id'], axis = 1)
data_test = data_test.drop(['Departure/Arrival time convenient', 'Departure Delay in Minutes', 'id'], axis = 1)

In [54]:
#log 변환 실행

data_train['Arrival Delay in Minutes'] = np.log1p(data_train['Arrival Delay in Minutes'])

#test 데이터에도 변환 실행

data_test['Arrival Delay in Minutes'] = np.log1p(data_test['Arrival Delay in Minutes'])

In [55]:
from pycaret.classification import *
clf1 = setup(data = data_train,
             silent = True,
             target = 'target',
             session_id = 20201809,
             #ordinal_features = ordinal_dic,
             #categorical_imputation = 'mode',
             #imputation_type = 'iterative',
             #iterative_imputation_iters = 20,
             #categorical_iterative_imputer = 'rf',
             #fix_imbalance = True,
             #categorical_features = ['Age'],
             #remove_outliers = True,
             #feature_selection = True,
             #create_clusters = True,
             n_jobs = 7)

Unnamed: 0,Description,Value
0,session_id,20201809
1,Target,target
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(3000, 23)"
5,Missing Values,True
6,Numeric Features,5
7,Categorical Features,17
8,Ordinal Features,False
9,High Cardinality Features,False


In [56]:
best = compare_models(n_select=3, fold = 10) #상위 모델 3개를 선택합니다

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9362,0.9822,0.9496,0.9393,0.944,0.8697,0.8707,0.084
xgboost,Extreme Gradient Boosting,0.9357,0.9819,0.9529,0.9354,0.9438,0.8686,0.8696,0.213
catboost,CatBoost Classifier,0.9328,0.982,0.947,0.9357,0.9411,0.8629,0.8635,0.854
gbc,Gradient Boosting Classifier,0.9233,0.9744,0.9352,0.9303,0.9324,0.8437,0.8446,0.101
et,Extra Trees Classifier,0.9209,0.9755,0.9386,0.9237,0.9308,0.8385,0.8394,0.189
rf,Random Forest Classifier,0.9161,0.9728,0.9319,0.9217,0.9264,0.829,0.8299,0.199
lr,Logistic Regression,0.8942,0.9577,0.9016,0.9114,0.9059,0.7851,0.7864,0.134
ridge,Ridge Classifier,0.8909,0.0,0.8999,0.9075,0.9033,0.7782,0.7792,0.006
dt,Decision Tree Classifier,0.8904,0.8876,0.9092,0.9001,0.9039,0.7764,0.7782,0.008
lda,Linear Discriminant Analysis,0.8904,0.9563,0.8999,0.9067,0.9029,0.7772,0.7782,0.01


In [57]:
lr = create_model('lr')
lr = tune_model(lr)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9238,0.9756,0.9076,0.9558,0.931,0.8461,0.8475
1,0.9,0.9543,0.8739,0.9455,0.9083,0.7987,0.8017
2,0.9095,0.9739,0.9496,0.8968,0.9224,0.8141,0.816
3,0.8857,0.942,0.8655,0.9279,0.8957,0.7697,0.772
4,0.9095,0.9766,0.9412,0.9032,0.9218,0.8146,0.8155
5,0.8571,0.9343,0.8739,0.8739,0.8739,0.7091,0.7091
6,0.8667,0.9419,0.8739,0.8889,0.8814,0.7292,0.7293
7,0.9333,0.967,0.958,0.9268,0.9421,0.8635,0.8642
8,0.8905,0.9427,0.916,0.8934,0.9046,0.7761,0.7765
9,0.8947,0.9654,0.9237,0.8934,0.9083,0.7848,0.7854


In [58]:
lr

LogisticRegression(C=0.381, class_weight={}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=20201809, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [59]:
catboost = create_model('catboost', learning_rate = 0.1, iterations = 460, max_depth = 8)
predict_model(catboost).head(20)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9476,0.9876,0.9328,0.9737,0.9528,0.894,0.8951
1,0.9524,0.9843,0.9328,0.9823,0.9569,0.9038,0.9053
2,0.919,0.9862,0.9748,0.8923,0.9317,0.8328,0.8377
3,0.9095,0.9743,0.9328,0.9098,0.9212,0.8151,0.8154
4,0.9381,0.9829,0.958,0.9344,0.9461,0.8735,0.8738
5,0.919,0.9855,0.9412,0.918,0.9295,0.8345,0.8349
6,0.9429,0.9744,0.9496,0.9496,0.9496,0.8836,0.8836
7,0.9476,0.9833,0.958,0.95,0.954,0.8932,0.8932
8,0.9286,0.9812,0.9328,0.9407,0.9367,0.8547,0.8548
9,0.9282,0.9813,0.9492,0.9256,0.9372,0.8535,0.8538


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,CatBoost Classifier,0.9279,0.9811,0.9645,0.9059,0.9343,0.8545,0.8566


Unnamed: 0,Age,Flight Distance,Inflight wifi service,Leg room service,Arrival Delay in Minutes,Gender_Female,Customer Type_Loyal Customer,Type of Travel_Business travel,Class_Business,Class_Eco,...,Online boarding_3,Online boarding_4,Online boarding_5,distance_0,distance_1,distance_2,Delayed_0,target,Label,Score
0,47.0,974.0,1.0,4.0,0.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0.7153
1,38.0,1236.0,5.0,4.0,2.890372,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1,1,1.0
2,26.0,2065.0,3.0,4.0,2.890372,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0.9053
3,25.0,2934.0,4.0,4.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1,1,0.9992
4,14.0,3119.0,4.0,5.0,2.70805,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,1,0.9924
5,43.0,4113.0,4.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1,0,0.8437
6,26.0,2233.0,4.0,5.0,4.127134,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0,0,0.9523
7,37.0,4188.0,5.0,4.0,1.791759,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0,0,0.9592
8,48.0,2522.0,4.0,5.0,1.94591,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1,1,1.0
9,36.0,534.0,5.0,1.0,2.397895,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0.9089


In [60]:
lightgbm = create_model('lightgbm',boosting_type='gbdt', class_weight=None, colsample_bytree=1,
               importance_type='split', learning_rate=0.08, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=240, n_jobs=6, num_leaves=32, objective=None,
               random_state=20201809, reg_alpha=0.0, reg_lambda=0.0,
               silent=True, subsample=1.0, subsample_for_bin=5000,
               subsample_freq=0)
predict_model(lightgbm).head(20)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9476,0.99,0.9328,0.9737,0.9528,0.894,0.8951
1,0.9381,0.9789,0.916,0.9732,0.9437,0.8751,0.8771
2,0.9286,0.9862,0.9748,0.9062,0.9393,0.8528,0.8562
3,0.9143,0.9719,0.9328,0.9174,0.925,0.825,0.8252
4,0.9429,0.9775,0.9748,0.928,0.9508,0.8827,0.8843
5,0.9095,0.9819,0.916,0.9237,0.9198,0.816,0.816
6,0.9381,0.9752,0.9412,0.9492,0.9451,0.8741,0.8742
7,0.9524,0.9812,0.958,0.958,0.958,0.903,0.903
8,0.9381,0.9824,0.9328,0.9569,0.9447,0.8744,0.8748
9,0.9187,0.9831,0.9407,0.9174,0.9289,0.8339,0.8343


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.9223,0.9817,0.9457,0.9115,0.9283,0.8436,0.8443


Unnamed: 0,Age,Flight Distance,Inflight wifi service,Leg room service,Arrival Delay in Minutes,Gender_Female,Customer Type_Loyal Customer,Type of Travel_Business travel,Class_Business,Class_Eco,...,Online boarding_3,Online boarding_4,Online boarding_5,distance_0,distance_1,distance_2,Delayed_0,target,Label,Score
0,47.0,974.0,1.0,4.0,0.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0.8806
1,38.0,1236.0,5.0,4.0,2.890372,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1,1,1.0
2,26.0,2065.0,3.0,4.0,2.890372,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0.9515
3,25.0,2934.0,4.0,4.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1,1,0.9999
4,14.0,3119.0,4.0,5.0,2.70805,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,1,0.9864
5,43.0,4113.0,4.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1,0,0.9265
6,26.0,2233.0,4.0,5.0,4.127134,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0,0,0.9833
7,37.0,4188.0,5.0,4.0,1.791759,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0,0,0.9878
8,48.0,2522.0,4.0,5.0,1.94591,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1,1,1.0
9,36.0,534.0,5.0,1.0,2.397895,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0.9853


In [61]:
xgboost = create_model('xgboost', base_score=0.4, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=8,
              min_child_weight=0.7, monotone_constraints='()',
              n_estimators=500, n_jobs=6, num_parallel_tree=1,
              objective='binary:logistic', random_state=20201809, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='auto',
              use_label_encoder=True, validate_parameters=1, verbosity=0)
predict_model(xgboost).head(20)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9571,0.991,0.9328,0.9911,0.961,0.9135,0.9156
1,0.9286,0.9779,0.916,0.9561,0.9356,0.8555,0.8565
2,0.9476,0.9901,0.9832,0.9286,0.9551,0.8924,0.8945
3,0.9095,0.9772,0.9328,0.9098,0.9212,0.8151,0.8154
4,0.9524,0.9805,0.9748,0.9431,0.9587,0.9025,0.9032
5,0.9143,0.9813,0.9244,0.9244,0.9244,0.8255,0.8255
6,0.9333,0.9746,0.9496,0.9339,0.9417,0.8639,0.8641
7,0.9429,0.9802,0.958,0.9421,0.95,0.8833,0.8835
8,0.919,0.9812,0.9244,0.9322,0.9283,0.8354,0.8354
9,0.9139,0.9816,0.9322,0.9167,0.9244,0.8244,0.8245


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.9279,0.981,0.9478,0.919,0.9332,0.8548,0.8553


Unnamed: 0,Age,Flight Distance,Inflight wifi service,Leg room service,Arrival Delay in Minutes,Gender_Female,Customer Type_Loyal Customer,Type of Travel_Business travel,Class_Business,Class_Eco,...,Online boarding_3,Online boarding_4,Online boarding_5,distance_0,distance_1,distance_2,Delayed_0,target,Label,Score
0,47.0,974.0,1.0,4.0,0.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0.7871
1,38.0,1236.0,5.0,4.0,2.890372,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1,1,1.0
2,26.0,2065.0,3.0,4.0,2.890372,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0.9871
3,25.0,2934.0,4.0,4.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1,1,1.0
4,14.0,3119.0,4.0,5.0,2.70805,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,1,0.9963
5,43.0,4113.0,4.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1,0,0.8971
6,26.0,2233.0,4.0,5.0,4.127134,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0,0,0.9612
7,37.0,4188.0,5.0,4.0,1.791759,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0,0,0.9806
8,48.0,2522.0,4.0,5.0,1.94591,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1,1,1.0
9,36.0,534.0,5.0,1.0,2.397895,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0.9054


In [74]:
et = create_model('et', bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='entropy', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=210, n_jobs=7,
                     oob_score=False, random_state=20201809,
                     warm_start=False)
predict_model(et).head(20)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9429,0.9799,0.9328,0.9652,0.9487,0.8842,0.8849
1,0.9429,0.9768,0.9328,0.9652,0.9487,0.8842,0.8849
2,0.9238,0.9806,0.958,0.912,0.9344,0.8436,0.8451
3,0.9095,0.9664,0.9244,0.9167,0.9205,0.8155,0.8156
4,0.9381,0.9867,0.9496,0.9417,0.9456,0.8738,0.8738
5,0.919,0.9701,0.9328,0.925,0.9289,0.835,0.835
6,0.9095,0.9741,0.9244,0.9167,0.9205,0.8155,0.8156
7,0.9381,0.9759,0.958,0.9344,0.9461,0.8735,0.8738
8,0.9286,0.9734,0.9496,0.9262,0.9378,0.854,0.8544
9,0.9043,0.966,0.9492,0.8889,0.918,0.8034,0.8059


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9168,0.9766,0.9562,0.8945,0.9243,0.8321,0.8343


Unnamed: 0,Age,Flight Distance,Inflight wifi service,Leg room service,Arrival Delay in Minutes,Gender_Female,Customer Type_Loyal Customer,Type of Travel_Business travel,Class_Business,Class_Eco,...,Cleanliness_5,Online boarding_1,Online boarding_2,Online boarding_3,Online boarding_4,Online boarding_5,Delayed_0,target,Label,Score
0,47.0,974.0,1.0,4.0,0.0,1.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0,0,0.5571
1,38.0,1236.0,5.0,4.0,2.890372,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,1,1.0
2,26.0,2065.0,3.0,4.0,2.890372,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0.5429
3,25.0,2934.0,4.0,4.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,1,0.9952
4,14.0,3119.0,4.0,5.0,2.70805,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1,1,0.9143
5,43.0,4113.0,4.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0,0.7714
6,26.0,2233.0,4.0,5.0,4.127134,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0.781
7,37.0,4188.0,5.0,4.0,1.791759,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0.8
8,48.0,2522.0,4.0,5.0,1.94591,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1,1,0.9667
9,36.0,534.0,5.0,1.0,2.397895,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0.7238


In [76]:
stack_3 = stack_models(estimator_list = [et, lightgbm, xgboost, catboost], meta_model = lr)
predict_model(stack_3).head(20)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9476,0.9937,0.9328,0.9737,0.9528,0.894,0.8951
1,0.9429,0.9806,0.9244,0.9735,0.9483,0.8845,0.886
2,0.9381,0.9861,0.9664,0.9274,0.9465,0.8731,0.8742
3,0.9143,0.966,0.9244,0.9244,0.9244,0.8255,0.8255
4,0.9286,0.976,0.9664,0.9127,0.9388,0.8532,0.8552
5,0.919,0.9681,0.9328,0.925,0.9289,0.835,0.835
6,0.9286,0.9692,0.9412,0.9333,0.9372,0.8544,0.8544
7,0.9524,0.9807,0.958,0.958,0.958,0.903,0.903
8,0.9238,0.9791,0.9328,0.9328,0.9328,0.8449,0.8449
9,0.9282,0.9831,0.9407,0.9328,0.9367,0.8538,0.8539


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Stacking Classifier,0.9312,0.9794,0.952,0.9212,0.9363,0.8615,0.8621


Unnamed: 0,Age,Flight Distance,Inflight wifi service,Leg room service,Arrival Delay in Minutes,Gender_Female,Customer Type_Loyal Customer,Type of Travel_Business travel,Class_Business,Class_Eco,...,Cleanliness_5,Online boarding_1,Online boarding_2,Online boarding_3,Online boarding_4,Online boarding_5,Delayed_0,target,Label,Score
0,47.0,974.0,1.0,4.0,0.0,1.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0,0,0.5681
1,38.0,1236.0,5.0,4.0,2.890372,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,1,0.9915
2,26.0,2065.0,3.0,4.0,2.890372,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0.8376
3,25.0,2934.0,4.0,4.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,1,0.9857
4,14.0,3119.0,4.0,5.0,2.70805,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1,1,0.9505
5,43.0,4113.0,4.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0,0.8419
6,26.0,2233.0,4.0,5.0,4.127134,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0.9579
7,37.0,4188.0,5.0,4.0,1.791759,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0,0.9646
8,48.0,2522.0,4.0,5.0,1.94591,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1,1,0.9911
9,36.0,534.0,5.0,1.0,2.397895,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0.8218


In [77]:
final_stack_3 = finalize_model(stack_3)
prediction = predict_model(final_stack_3, data = data_test)

In [78]:
print(prediction["Label"])

0       1
1       0
2       1
3       1
4       1
       ..
1995    0
1996    1
1997    0
1998    1
1999    1
Name: Label, Length: 2000, dtype: int32


In [79]:
submission = pd.read_csv("./sample_submission.csv")
submission.head()

Unnamed: 0,id,target
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


In [82]:
submission['target'] = prediction['Label']
submission.head()

Unnamed: 0,id,target
0,1,1
1,2,0
2,3,1
3,4,1
4,5,1


In [81]:
submission.to_csv("flight_predict9.csv",index=False)