In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

import pandas as pd
import numpy as np
import itertools
import pickle

In [12]:
df = pd.read_csv("../data/processed/churn_enum.csv")
df.head()

Unnamed: 0,customerID,tenure,MonthlyCharges,TotalCharges,Churn,Partner,MultipleLines,StreamingMovies,DeviceProtection,SeniorCitizen,...,PaperlessBilling,OnlineSecurity,InternetService,TechSupport,PaymentMethod,PhoneService,Dependents,OnlineBackup,Contract,gender
0,7590-VHVEG,1,29.85,29.85,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5575-GNVDE,34,56.95,1889.5,0,1,1,0,1,0,...,1,1,0,0,1,1,0,1,1,1
2,3668-QPYBK,2,53.85,108.15,1,1,1,0,0,0,...,0,1,0,0,1,1,0,0,0,1
3,7795-CFOCW,45,42.3,1840.75,0,1,0,0,1,0,...,1,1,0,1,2,0,0,1,1,1
4,9237-HQITU,2,70.7,151.65,1,1,1,0,0,0,...,0,0,1,0,0,1,0,1,0,0


In [13]:
y = df["Churn"]
x = df[["tenure", "Contract", "InternetService", "MonthlyCharges"]]

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [15]:
parameters = {"penalty": ["l1", "l2", "elasticnet"],
              "C": [1/(1e-1), 1/(1e-2), 1/(1e-3), 1/(1e-4)],
              "tol": [1e-4, 1e-5],
              "l1_ratio": [0.4, 0.5, 0.6]}

In [16]:
grid_search = list(itertools.product(*parameters.values()))

In [17]:
roc_auc_scores = []
acc_scores = []

for params in grid_search:
    if params[0] == "elasticnet":
        lr = LogisticRegression(penalty=params[0], 
                            C=params[1],
                            tol=params[2],
                            solver="saga", l1_ratio=params[3],
                            max_iter=1000)
    else:
        lr = LogisticRegression(penalty=params[0], 
                            C=params[1],
                            tol=params[2],
                            solver="saga",
                            max_iter=1000)
        
    lr.fit(x_train, y_train)
    pred = lr.predict(x_test)
    roc_auc_scores.append(roc_auc_score(y_test, pred))
    acc_scores.append(accuracy_score(y_test, pred))

In [18]:
acc_scores[np.argmax(roc_auc_scores)]

0.7700170357751278

In [19]:
roc_auc_scores[np.argmax(roc_auc_scores)]

0.6644048001895703

In [20]:
best_params = grid_search[np.argmax(roc_auc_scores)]
if best_params[0] == "elasticnet":
    best_model = LogisticRegression(penalty=best_params[0], 
                        C=best_params[1],
                        tol=best_params[2],
                        solver="saga", l1_ratio=best_params[3],
                        max_iter=1000)
else:
    best_model = LogisticRegression(penalty=best_params[0], 
                        C=best_params[1],
                        tol=best_params[2],
                        solver="saga",
                        max_iter=1000)
best_model.fit(x_train, y_train)

LogisticRegression(C=10.0, max_iter=1000, penalty='l1', solver='saga')

In [21]:
filename = "2021_05_25_2128"

with open(f"../models/logistic_regression/{filename}.pkl", "wb") as f:
    pickle.dump(best_model, f)

In [22]:
best_model

LogisticRegression(C=10.0, max_iter=1000, penalty='l1', solver='saga')