In [87]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier



In [88]:
PCA = False
if PCA:
    X = pd.read_csv("msia420PA_project/Dimension Reduction/PCA_dat.csv", index_col=0) # PCA Model
else:
    X = pd.read_csv('data.csv') # full model
    X.drop(['booking_status', 'Booking_ID'], axis = 1, inplace = True)

y = pd.read_csv("data.csv")['booking_status']

In [89]:
le = LabelEncoder()
y = pd.DataFrame(le.fit_transform(y))

In [90]:
X = pd.get_dummies(X)

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=None, train_size=0.3, random_state=1)

In [92]:
y_test.value_counts()

1    17064
0     8329
dtype: int64

In [93]:
from collections import defaultdict
from sklearn import linear_model
from sklearn import neighbors
from sklearn import neural_network

result = defaultdict(int)
MLA_compare = pd.DataFrame()
modelList = [
    # linear model
    linear_model.LogisticRegression(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    # knn
    neighbors.KNeighborsClassifier(),
    #tree
    DecisionTreeClassifier(),
    GradientBoostingClassifier(),
    RandomForestClassifier(),
    xgb.XGBClassifier(),
    CatBoostClassifier(),
    #NN
    neural_network.MLPClassifier()

]

def modeling(models, X_train,y_train, X_test, y_test, metric):
    row_index = 0
    for classifier in models:
        pipeline = make_pipeline(StandardScaler(), classifier)
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        classifier_metric = metric(y_test, y_pred)
        result[classifier.__class__.__name__] = classifier_metric
        row_index+=1
    return result


In [94]:
performance = modeling(modelList, X_train,y_train, X_test, y_test,roc_auc_score)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Learning rate set to 0.02855
0:	learn: 0.6728159	total: 60.9ms	remaining: 1m
1:	learn: 0.6542934	total: 67ms	remaining: 33.4s
2:	learn: 0.6372579	total: 84ms	remaining: 27.9s
3:	learn: 0.6207151	total: 98.2ms	remaining: 24.5s
4:	learn: 0.6056729	total: 110ms	remaining: 21.9s
5:	learn: 0.5929589	total: 126ms	remaining: 20.9s
6:	learn: 0.5785463	total: 131ms	remaining: 18.6s
7:	learn: 0.5681950	total: 142ms	remaining: 17.7s
8:	learn: 0.5567664	total: 151ms	remaining: 16.7s
9:	learn: 0.5472977	total: 165ms	remaining: 16.4s
10:	learn: 0.5381386	total: 193ms	remaining: 17.4s
11:	learn: 0.5306355	total: 200ms	remaining: 16.4s
12:	learn: 0.5219766	total: 205ms	remaining: 15.5s
13:	learn: 0.5138563	total: 210ms	remaining: 14.8s
14:	learn: 0.5065350	total: 215ms	remaining: 14.1s
15:	learn: 0.4992767	total: 220ms	remaining: 13.5s
16:	learn: 0.4931755	total: 226ms	remaining: 13.1s
17:	learn: 0.4880581	total: 230ms	remaining: 12.6s
18:	learn: 0.4816486	total: 234ms	remaining: 12.1s
19:	learn: 0.47

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Learning rate set to 0.02855
0:	learn: 0.6728159	total: 5.92ms	remaining: 5.92s
1:	learn: 0.6542934	total: 19.4ms	remaining: 9.66s
2:	learn: 0.6372579	total: 26.1ms	remaining: 8.66s
3:	learn: 0.6207151	total: 29.8ms	remaining: 7.42s
4:	learn: 0.6056729	total: 34.4ms	remaining: 6.84s
5:	learn: 0.5929589	total: 38.7ms	remaining: 6.42s
6:	learn: 0.5785463	total: 46.1ms	remaining: 6.54s
7:	learn: 0.5681950	total: 50.9ms	remaining: 6.31s
8:	learn: 0.5567664	total: 55.3ms	remaining: 6.09s
9:	learn: 0.5472977	total: 59.3ms	remaining: 5.87s
10:	learn: 0.5381386	total: 71.3ms	remaining: 6.41s
11:	learn: 0.5306355	total: 76.5ms	remaining: 6.3s
12:	learn: 0.5219766	total: 81.1ms	remaining: 6.16s
13:	learn: 0.5138563	total: 85.4ms	remaining: 6.02s
14:	learn: 0.5065350	total: 90ms	remaining: 5.91s
15:	learn: 0.4992767	total: 94.8ms	remaining: 5.83s
16:	learn: 0.4931755	total: 98.6ms	remaining: 5.7s
17:	learn: 0.4880581	total: 103ms	remaining: 5.63s
18:	learn: 0.4816486	total: 117ms	remaining: 6.06s

  y = column_or_1d(y, warn=True)


In [95]:
sorted(performance.items(), key=lambda x:x[1])

[('RidgeClassifierCV', 0.7434586308368397),
 ('SGDClassifier', 0.7497951994108667),
 ('LogisticRegression', 0.7581825988332498),
 ('KNeighborsClassifier', 0.7976130675152204),
 ('GradientBoostingClassifier', 0.8130585675296584),
 ('MLPClassifier', 0.8226697995475228),
 ('DecisionTreeClassifier', 0.8237694114300899),
 ('CatBoostClassifier', 0.8523719570463562),
 ('XGBClassifier', 0.8605311822625966),
 ('RandomForestClassifier', 0.8622871481074519)]

[('RidgeClassifierCV', 0.7434586308368397),
 ('LogisticRegression', 0.7581825988332498),
 ('SGDClassifier', 0.7623736705956295),
 ('KNeighborsClassifier', 0.7976130675152204),
 ('GradientBoostingClassifier', 0.8130585675296584),
 ('MLPClassifier', 0.8174178244979936),
 ('DecisionTreeClassifier', 0.8253831127207245),
 ('CatBoostClassifier', 0.8523719570463562),
 ('XGBClassifier', 0.8605311822625966),
 ('RandomForestClassifier', 0.863086857205128)]