# Tunning Logistic and NN model

In [36]:
# load package
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [37]:
# load the data
anomoly  = False
# load data either by anomoly or not
if anomoly:
    X=pd.read_csv("https://raw.githubusercontent.com/KelvinYQC/msia420PA_project/main/Data/with_anomaly.csv")
else:
    X=pd.read_csv("https://raw.githubusercontent.com/KelvinYQC/msia420PA_project/main/Data/without_anomaly.csv")
y = X['booking_status']
X.drop(['booking_status'], axis = 1, inplace = True)


# Baseline model-- Logistic Regression


## Modeling

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train.shape


(21708, 22)

In [39]:
LR_model = LogisticRegression(random_state=0).fit(X_train, y_train)


In [40]:
# score = correct predictions / total number of data
score = LR_model.score(X_test, y_test)
print(score)


0.8015524174693724


In [41]:
y_pred = LR_model.predict(X_test)
# y_pred_prob = LR_model.predict_proba(X_test)
# y_pred_prob
# print(roc_auc_score(y, LR_model.predict_proba(X_test)[:, 1]))

In [42]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86      7287
           1       0.72      0.62      0.66      3406

    accuracy                           0.80     10693
   macro avg       0.78      0.75      0.76     10693
weighted avg       0.80      0.80      0.80     10693



## Logistic regression with ridge

In [43]:
from sklearn.linear_model import RidgeClassifierCV
LR_ridge = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X_train, y_train)
LR_ridge.score(X_test, y_test)

0.8013653792200505

In [44]:
y_pred_ridge = LR_ridge.predict(X_test)
y_pred_ridge

array([0, 0, 1, ..., 0, 0, 0])

In [45]:
print(classification_report(y_test, y_pred_ridge))


              precision    recall  f1-score   support

           0       0.82      0.90      0.86      7287
           1       0.73      0.59      0.65      3406

    accuracy                           0.80     10693
   macro avg       0.78      0.75      0.76     10693
weighted avg       0.80      0.80      0.80     10693



# Second Model: Neural Network

In [46]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_standardize = scaler.fit_transform(X_train)
X_test_standardize = scaler.fit_transform(X_test)



In [47]:
nn1 = MLPClassifier(solver='lbfgs', 
                    # alpha=1e-5,
                    # hidden_layer_sizes=(6,), 
                    random_state=123)
nn1.fit(X_train_standardize,y_train)
y_predNN = nn1.predict(X_test_standardize)

In [48]:
print(classification_report(y_test,y_predNN))

              precision    recall  f1-score   support

           0       0.89      0.90      0.89      7287
           1       0.78      0.76      0.77      3406

    accuracy                           0.86     10693
   macro avg       0.83      0.83      0.83     10693
weighted avg       0.85      0.86      0.85     10693



In [63]:
params = {'hidden_layer_sizes': [(30,),(50,),(70,),(100,)],
         'learning_rate_init': [0.0001, 0.001,0.01, 0.1,1]}
nn_model = MLPClassifier()


In [64]:
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingRandomSearchCV

gs_nn1 = HalvingRandomSearchCV(
    nn_model, params, scoring="roc_auc", n_jobs=-1, factor=4, cv = 10
)
# gs_nn1 = GridSearchCV(nn_model,
#                       param_grid=params,
#                       scoring='roc_auc',
#                       cv=10)


In [65]:
gs_nn1.fit(X_train_standardize,y_train)
print(gs_nn1.best_params_)

Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 219, in __call__
    return self._score(
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 384, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/metrics/_ranking.py", line 571, in roc_auc_score
    return _average_binary_score(
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/metrics/_base.py", line 75, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "/opt/homebrew/lib/python3.10/site-packages/sklearn/metrics/_ranking.py", line 339, in _binary_roc_auc_score
    raise ValueError(
ValueError: Only one class present in y_true. ROC

{'learning_rate_init': 0.001, 'hidden_layer_sizes': (30,)}


In [66]:
gs_knn_pred = gs_nn1.predict(X_test_standardize)

In [67]:
print(classification_report(y_test,gs_knn_pred))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89      7287
           1       0.77      0.74      0.75      3406

    accuracy                           0.85     10693
   macro avg       0.82      0.82      0.82     10693
weighted avg       0.84      0.85      0.85     10693



In [68]:
print('The final auc score for NN is: ')
print(round(roc_auc_score(y_test, gs_nn1.predict_proba(X_test_standardize)[:, 1]), 3))

The final auc score for NN is: 
0.909
