# Test out different models

In [33]:
# load packages
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier


In [34]:
# Switch between the two prepared CSVs: True reads with_anomaly.csv,
# False reads without_anomaly.csv.
anomoly = False
X = pd.read_csv(
    "https://raw.githubusercontent.com/KelvinYQC/msia420PA_project/main/Data/with_anomaly.csv"
    if anomoly
    else "https://raw.githubusercontent.com/KelvinYQC/msia420PA_project/main/Data/without_anomaly.csv"
)
# Split the target off the feature frame: pop() returns the column as `y`
# and removes it from `X` in one step.
y = X.pop('booking_status')

In [35]:
# Hold-out split with a fixed seed so the partition is reproducible.
# NOTE(review): train_size=0.3 fits on 30% of the rows and evaluates on the
# remaining 70% -- confirm this is intended rather than test_size=0.3.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.3,
    random_state=1,
)

In [36]:
# Class counts of the held-out labels; the recorded output shows roughly a
# 2:1 ratio between class 0 and class 1, i.e. the dataset is somewhat imbalanced.
y_test.value_counts()

0    14975
1     7215
Name: booking_status, dtype: int64

In [37]:
from sklearn.svm import SVC
from collections import defaultdict
from sklearn import linear_model
from sklearn import neighbors
from sklearn import neural_network

# Model name -> metric value; defaultdict(int) yields 0 for names never scored.
result = defaultdict(int)
# Empty frame reserved for a comparison table; not populated in the cells shown here.
MLA_compare = pd.DataFrame()

# Candidate classifiers, all with library-default hyperparameters.
# Each class appears exactly once: scores are keyed by class name, so a
# duplicate entry would only be fitted twice and overwrite its own score.
modelList = [
    # linear models
    linear_model.LogisticRegression(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    # nearest neighbours
    neighbors.KNeighborsClassifier(),
    # support vector machine
    SVC(),
    # trees and tree ensembles
    DecisionTreeClassifier(),
    GradientBoostingClassifier(),
    RandomForestClassifier(),
    xgb.XGBClassifier(),
    CatBoostClassifier(verbose = False),
    # neural network
    neural_network.MLPClassifier(),
]

def _positive_class_scores(pipeline, X):
    """Return continuous scores for ``X`` if the fitted pipeline offers them.

    Tries ``predict_proba`` first (second column, i.e. the positive class of a
    binary problem), then ``decision_function``, and finally falls back to the
    hard labels from ``predict``.
    """
    if hasattr(pipeline, "predict_proba"):
        return pipeline.predict_proba(X)[:, 1]
    if hasattr(pipeline, "decision_function"):
        return pipeline.decision_function(X)
    return pipeline.predict(X)


def modeling(models, X_train,y_train, X_test, y_test, metric, use_scores=False):
    """Fit each classifier behind a StandardScaler and score it on the test split.

    Parameters
    ----------
    models : iterable of estimators
        Each one is wrapped in ``make_pipeline(StandardScaler(), estimator)``
        and fitted on the training data. The estimator objects passed in are
        fitted in place.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used for scoring.
    metric : callable
        Called as ``metric(y_test, y_pred)``; its return value is stored.
    use_scores : bool, default False
        When False (the original behaviour) ``y_pred`` is the output of
        ``predict``, i.e. hard class labels. When True, continuous scores are
        used where the model provides them, which is what ranking metrics such
        as ROC AUC expect. The score path assumes a binary target.

    Returns
    -------
    dict
        Maps each classifier's class name to its metric value. Two estimators
        of the same class share one key, so the later one overwrites the earlier.
    """
    # Fresh dict per call: results of separate calls no longer alias each other
    # or carry over stale entries through a shared module-level container.
    scores = {}
    for classifier in models:
        pipeline = make_pipeline(StandardScaler(), classifier)
        pipeline.fit(X_train, y_train)
        if use_scores:
            y_pred = _positive_class_scores(pipeline, X_test)
        else:
            y_pred = pipeline.predict(X_test)
        scores[classifier.__class__.__name__] = metric(y_test, y_pred)
    return scores


In [38]:
# Fit every candidate model and score it on the held-out split.
# NOTE(review): modeling() hands the metric the output of predict(), so the
# ROC AUC stored here is computed from hard class labels, not probabilities.
performance = modeling(
    models=modelList,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    metric=roc_auc_score,
)



In [39]:
# Order the models from lowest to highest score, so the best model is listed last.
sorted(performance.items(), key=lambda name_and_score: name_and_score[1])

[('SGDClassifier', 0.730818400267482),
 ('RidgeClassifierCV', 0.7442300808577937),
 ('LogisticRegression', 0.7607287729491403),
 ('KNeighborsClassifier', 0.8023580534431953),
 ('GradientBoostingClassifier', 0.811148495355507),
 ('MLPClassifier', 0.8225210185143407),
 ('DecisionTreeClassifier', 0.826992967026356),
 ('CatBoostClassifier', 0.8495884223764023),
 ('XGBClassifier', 0.8567963468798193),
 ('RandomForestClassifier', 0.8614405390365324)]

We found that XGBoost, random forest, and CatBoost perform best. This may be because the dataset contains many categorical features. We will further fine-tune the best-performing models to come up with a final model.