In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [11]:
# Read in iris_raw.csv
df = pd.read_csv("../data/customer_raw.csv")

# Build X, y (target -> y), drop species if present
y = df["churn"]
X = df.drop(columns=["churn",'product_id','sale_id', 'customer_id', 'price', 'total_value','feedback_text'], errors="ignore")

In [12]:
# one hot encode categorical variables
categorical_cols = ['gender', 'region', 'segment', 'category', 'product_name', 'sentiment']
ohe = OneHotEncoder(sparse_output=False, drop='first')
X_categorical = ohe.fit_transform(X[categorical_cols])
X_numerical = X.drop(columns=categorical_cols, errors="ignore")
# Combine numerical and categorical features
X = pd.concat([X_numerical, pd.DataFrame(X_categorical, columns=ohe.get_feature_names_out(categorical_cols))], axis=1)
X.head()

Unnamed: 0,quantity,sale_date,age,tenure_months,last_purchase_date,log_price,log_total_value,gender_Male,region_North,region_South,...,product_name_Laptop,product_name_Monitor,product_name_Notebook,product_name_Office Chair,product_name_Pen Pack,product_name_Printer,product_name_Projector,product_name_Smartphone,sentiment_Neutral,sentiment_Positive
0,2,2020-01-07,40,37,2023-04-04,9.392745,10.085851,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,2023-11-10,50,44,2023-03-02,10.71444,11.407576,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,4,2022-07-04,33,25,2023-11-04,10.71444,12.100718,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1,2023-10-18,41,56,2023-08-18,9.392745,9.392745,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,2023-09-13,60,52,2023-05-14,9.392745,10.778977,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
# Convert date columns to datetime
# Convert to datetime
df["sale_date"] = pd.to_datetime(df["sale_date"], errors="coerce")
df["last_purchase_date"] = pd.to_datetime(df["last_purchase_date"], errors="coerce")

# Derive features (example)
df["days_since_last_purchase"] = (pd.Timestamp("today") - df["last_purchase_date"]).dt.days
df["sale_year"] = df["sale_date"].dt.year
df["sale_month"] = df["sale_date"].dt.month

In [14]:
# add the new features to X
X["days_since_last_purchase"] = df["days_since_last_purchase"]
X["sale_year"] = df["sale_year"]
X["sale_month"] = df["sale_month"]
# drop the original date columns
X = X.drop(columns=["sale_date", "last_purchase_date"], errors="ignore")

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
log_model = LogisticRegression(max_iter=500, class_weight="balanced")
param_grid = {"C": [0.01, 0.1, 1, 10, 100]}
search = GridSearchCV(log_model, param_grid, cv=5, scoring="roc_auc")
search.fit(X_train, y_train)
print("Best C:", search.best_params_)
print("Best CV AUC:", search.best_score_)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=500).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=500).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=500).
You might also want to sca

Best C: {'C': 0.1}
Best CV AUC: 0.5402363552251457


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=500).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
# Best model after tuning
best_model = search.best_estimator_

# Predictions
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

In [21]:
# Metrics
acc = accuracy_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_prob)

print("Best C:", search.best_params_)
print("Best CV AUC:", search.best_score_)
print("Test Accuracy:", acc)
print("Test ROC AUC:", roc)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Best C: {'C': 0.1}
Best CV AUC: 0.5402363552251457
Test Accuracy: 0.53
Test ROC AUC: 0.5458932102834542

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.53      0.64      1230
           1       0.25      0.52      0.34       370

    accuracy                           0.53      1600
   macro avg       0.52      0.53      0.49      1600
weighted avg       0.66      0.53      0.57      1600

