# Import Module

In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, log_loss, f1_score, roc_auc_score
import optuna
import datetime as dt
import pickle

# Import Data

In [5]:
test = pd.read_parquet('test.parquet', engine='pyarrow')
train = pd.read_parquet('train.parquet', engine='pyarrow')
valid = pd.read_parquet('valid.parquet', engine='pyarrow')
test.head()

Unnamed: 0,is_canceled,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,booking_changes,adr,required_car_parking_spaces,...,assigned_room_type_I,assigned_room_type_K,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,season_Spring,season_Summer,season_Winter
0,0,0.074579,-0.793444,0.293007,-0.258811,-0.08708,-0.161708,-0.37404,0.177462,-0.254353,...,-0.053689,-0.039663,-0.373801,-0.038007,-0.068571,-1.729525,1.932726,1.623363,-0.673744,-0.456737
1,0,-0.932919,-0.263044,-1.784639,-0.258811,-0.08708,-0.161708,5.460749,-1.535995,-0.254353,...,-0.053689,-0.039663,-0.373801,-0.038007,-0.068571,0.578193,-0.517404,1.623363,-0.673744,-0.456737
2,0,0.074579,-0.793444,0.293007,-0.258811,-0.08708,-0.161708,-0.37404,0.134625,-0.254353,...,-0.053689,-0.039663,-0.373801,-0.038007,-0.068571,0.578193,-0.517404,-0.616005,1.484243,-0.456737
3,0,-0.932919,0.267356,0.293007,-0.258811,-0.08708,-0.161708,-0.37404,-0.722103,-0.254353,...,-0.053689,-0.039663,-0.373801,-0.038007,-0.068571,-1.729525,1.932726,-0.616005,-0.673744,-0.456737
4,1,0.074579,-0.793444,0.293007,-0.258811,-0.08708,-0.161708,-0.37404,-0.850612,-0.254353,...,-0.053689,-0.039663,2.675218,-0.038007,-0.068571,-1.729525,1.932726,-0.616005,1.484243,-0.456737


# Model Training

In [6]:
# Define features (exclude is_canceled and reservation_status)
features = [col for col in train.columns if col not in ['is_canceled']]

X_train = train[features]
y_train = train['is_canceled']

X_valid = valid[features]
y_valid = valid['is_canceled']

X_test = test[features]
y_test = test['is_canceled']

In [None]:
# Train logistic regression
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

# Predict and evaluate
y_pred_train = log_reg.predict(X_train)
y_pred_test = log_reg.predict(X_test)
y_prob_train = log_reg.predict_proba(X_train)[:, 1]  # Probability for positive class (cancellation)
y_prob_test = log_reg.predict_proba(X_test)[:, 1]

# Print results
print("Logistic Regression Results:")
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("Training F1-Score:", f1_score(y_train, y_pred_train))
print("Test F1-Score:", f1_score(y_test, y_pred_test))
print("Training AUC-ROC:", roc_auc_score(y_train, y_prob_train))
print("Test AUC-ROC:", roc_auc_score(y_test, y_prob_test))
print("Training Log Loss:", log_loss(y_train, log_reg.predict_proba(X_train)))
print("Test Log Loss:", log_loss(y_test, log_reg.predict_proba(X_test)))
print("Classification Report (Test):\n", classification_report(y_test, y_pred_test))

Logistic Regression Results:
Training Accuracy: 0.8301225545191311
Test Accuracy: 0.8346470185058259
Training F1-Score: 0.7539197325450016
Test F1-Score: 0.7656060238037407
Training AUC-ROC: 0.9075547472844792
Test AUC-ROC: 0.9145720087991329
Training Log Loss: 0.35681028146422245
Test Log Loss: 0.34835993670134646
Classification Report (Test):
               precision    recall  f1-score   support

           0       0.84      0.91      0.87      7245
           1       0.83      0.71      0.77      4427

    accuracy                           0.83     11672
   macro avg       0.83      0.81      0.82     11672
weighted avg       0.83      0.83      0.83     11672



# Model Hyperparameter Tuning

In [8]:
# Define objective function for Optuna
def objective(trial):
    C = trial.suggest_float('C', 1e-4, 1e2, log=True)  # Regularization strength
    solver = trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'saga'])
    model = LogisticRegression(C=C, solver=solver, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return f1_score(y_valid, y_pred)  # Optimize for F1-score

# Run Optuna optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # Run 20 trials
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train final model with best parameters
log_reg = LogisticRegression(**best_params, max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

# Predict and evaluate
y_pred_train = log_reg.predict(X_train)
y_pred_valid = log_reg.predict(X_valid)
y_prob_train = log_reg.predict_proba(X_train)[:, 1]  # Probability for positive class (cancellation)
y_prob_valid = log_reg.predict_proba(X_valid)[:, 1]

# Print results
print("Logistic Regression Results:")
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Validation Accuracy:", accuracy_score(y_valid, y_pred_valid))
print("Training F1-Score:", f1_score(y_train, y_pred_train))
print("Validation F1-Score:", f1_score(y_valid, y_pred_valid))
print("Training AUC-ROC:", roc_auc_score(y_train, y_prob_train))
print("Validation AUC-ROC:", roc_auc_score(y_valid, y_prob_valid))
print("Training Log Loss:", log_loss(y_train, log_reg.predict_proba(X_train)))
print("Validation Log Loss:", log_loss(y_valid, log_reg.predict_proba(X_valid)))
print("Classification Report (Validation):\n", classification_report(y_valid, y_pred_valid))

[I 2025-07-05 22:51:33,674] A new study created in memory with name: no-name-a9afecd1-1558-4631-9eb8-8f58cb07ef72
[I 2025-07-05 22:53:10,658] Trial 0 finished with value: 0.758780487804878 and parameters: {'C': 35.41661231071705, 'solver': 'saga'}. Best is trial 0 with value: 0.758780487804878.
[I 2025-07-05 22:53:11,887] Trial 1 finished with value: 0.7326732673267328 and parameters: {'C': 0.00021080659080759763, 'solver': 'liblinear'}. Best is trial 0 with value: 0.758780487804878.
[I 2025-07-05 22:54:49,216] Trial 2 finished with value: 0.7588730332967434 and parameters: {'C': 0.30803465861009216, 'solver': 'saga'}. Best is trial 2 with value: 0.7588730332967434.
[I 2025-07-05 22:54:53,651] Trial 3 finished with value: 0.7590831504511094 and parameters: {'C': 80.7968714279661, 'solver': 'lbfgs'}. Best is trial 3 with value: 0.7590831504511094.
[I 2025-07-05 22:54:56,523] Trial 4 finished with value: 0.7586797066014669 and parameters: {'C': 0.010915244902664567, 'solver': 'liblinear'

Best Hyperparameters: {'C': 5.746512745020719, 'solver': 'lbfgs'}
Logistic Regression Results:
Training Accuracy: 0.8301118697310639
Validation Accuracy: 0.83079033087605
Training F1-Score: 0.7539156812975918
Validation F1-Score: 0.75932699341624
Training AUC-ROC: 0.9075576389127726
Validation AUC-ROC: 0.911256137927355
Training Log Loss: 0.356804582445139
Validation Log Loss: 0.3537601295866459
Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.83      0.91      0.87      7224
           1       0.83      0.70      0.76      4442

    accuracy                           0.83     11666
   macro avg       0.83      0.81      0.81     11666
weighted avg       0.83      0.83      0.83     11666



In [9]:
# Train logistic regression
log_reg = LogisticRegression(max_iter=1000, random_state=42, C= 5.746512745020719, solver = 'lbfgs')
log_reg.fit(X_train, y_train)

# Predict and evaluate
y_pred_train = log_reg.predict(X_train)
y_pred_test = log_reg.predict(X_test)
y_prob_train = log_reg.predict_proba(X_train)[:, 1]  # Probability for positive class (cancellation)
y_prob_test = log_reg.predict_proba(X_test)[:, 1]

# Print results
print("Logistic Regression Results:")
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("Training F1-Score:", f1_score(y_train, y_pred_train))
print("Test F1-Score:", f1_score(y_test, y_pred_test))
print("Training AUC-ROC:", roc_auc_score(y_train, y_prob_train))
print("Test AUC-ROC:", roc_auc_score(y_test, y_prob_test))
print("Training Log Loss:", log_loss(y_train, log_reg.predict_proba(X_train)))
print("Test Log Loss:", log_loss(y_test, log_reg.predict_proba(X_test)))
print("Classification Report (Test):\n", classification_report(y_test, y_pred_test))

Logistic Regression Results:
Training Accuracy: 0.8301118697310639
Test Accuracy: 0.8346470185058259
Training F1-Score: 0.7539156812975918
Test F1-Score: 0.7656060238037407
Training AUC-ROC: 0.9075576389127726
Test AUC-ROC: 0.9145795695309057
Training Log Loss: 0.356804582445139
Test Log Loss: 0.34833466539064795
Classification Report (Test):
               precision    recall  f1-score   support

           0       0.84      0.91      0.87      7245
           1       0.83      0.71      0.77      4427

    accuracy                           0.83     11672
   macro avg       0.83      0.81      0.82     11672
weighted avg       0.83      0.83      0.83     11672



In [10]:
feature_importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': np.abs(log_reg.coef_[0])
})

feature_importances.sort_values(by = 'importance').tail(20)

Unnamed: 0,feature,importance
118,distribution_channel_TA/TO,0.177804
110,market_segment_Corporate,0.186477
112,market_segment_Groups,0.193612
6,booking_changes,0.230854
5,is_repeated_guest,0.235178
116,distribution_channel_Direct,0.251051
142,season_Spring,0.260086
7,adr,0.328917
114,market_segment_Online TA,0.36775
113,market_segment_Offline TA/TO,0.368812


In [None]:
# Save the model to a .pkl file
with open('log_reg_model.pkl', 'wb') as file:
    pickle.dump(log_reg, file)