# Regularised Logistic regression as the benchmark model

## Imports

In [130]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_validate, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, precision_recall_curve
from time import time

## Data
already preprocessed

In [110]:
client_attrition = pd.read_csv('../data/preprocessed/client_attrition_train.csv', sep=";")
X = client_attrition.drop("account_status",axis=1)
y = client_attrition["account_status"]
print(X.shape)

(10127, 37)


## Train /  test split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345,stratify=y)

## Logistic regression fit

In [21]:
# instantiate the model (using the default parameters)
logit = LogisticRegression(penalty='l1',solver='saga',max_iter=1000,random_state=12345,verbose=1)

# fit the model with data
logit.fit(X_train, y_train)

y_pred = logit.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 183 epochs took 1 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s finished


In [29]:
balanced_accuracy_score(y_true=y_test,y_pred=y_pred)

0.716503414281192

In [45]:
# Whic coefficients are 0
X_train.columns.to_series().loc[(abs(logit.coef_)==0).tolist()[0]]

customer_education_High School          customer_education_High School
customer_civil_status_Divorced          customer_civil_status_Divorced
credit_card_classification_Silver    credit_card_classification_Silver
customer_salary_range_below 40K        customer_salary_range_below 40K
dtype: object

## Grid Search
Using Repeated Stratified KFold Cross Validation

In [149]:
# Grid search results to df
def gs_to_df(grid_search):
    df = pd.DataFrame()
    for param in param_grid[0].keys():
        df[param] = 0
    df["mean_train_score"] = df["std_train_score"] = df["mean_test_score"] = df["std_test_score"] = 0
    for i in range(len(grid_search.cv_results_['params'])):
        new_row = grid_search.cv_results_['params'][i] | {"mean_train_score": grid_search.cv_results_['mean_train_score'][i],
                                                            "std_train_score": grid_search.cv_results_['std_train_score'][i],
                                                            "mean_test_score": grid_search.cv_results_['mean_test_score'][i],
                                                            "std_test_score": grid_search.cv_results_['std_test_score'][i]}
        df.loc[len(df)] = new_row

    return df
    

In [146]:
grid_search.cv_results_['params'][0],grid_search.cv_results_['std_test_score']

({'penalty': 'l2', 'solver': 'lbfgs'},
 array([0.01684924, 0.01679872, 0.01681834, 0.01684924, 0.01684924,
        0.01679872, 0.01684924, 0.0168373 , 0.01684924, 0.01679872,
               nan, 0.01681834, 0.01684924, 0.01679872]))

In [132]:
start_time = time()
# Grid 
param_grid = [{'penalty': ['l2',None], 'solver': ["lbfgs"]}, {'penalty': ['l1', 'l2'], 'solver': ["liblinear"]},
                {'penalty': ['l2',None], 'solver': ["newton-cg"]}, {'penalty': ['l2',None], 'solver': ["newton-cholesky"]},
                {'penalty': ['l2',None], 'solver': ["sag"]}, {'penalty': ['elasticnet', 'l1', 'l2', None], 'solver': ["saga"]}]
# Cross Validation
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2,random_state=12345)
# Grid Search
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, scoring="balanced_accuracy", 
                            n_jobs=12, cv=rskf, verbose=3, return_train_score=True)
grid_search.fit(X,y)
print("--- %s seconds ---" % (time() - start_time))

Fitting 20 folds for each of 14 candidates, totalling 280 fits
--- 8.515522241592407 seconds ---


20 fits failed out of a total of 280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\jakub\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jakub\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\linear_model\_logistic.py", line 1291, in fit
    fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)(
  File "C:\Users\jakub\AppData\Lo

In [150]:
df = gs_to_df(grid_search=grid_search)
df.sort_values(ascending=False,by="mean_test_score").to_csv('./results/logit_metrics_2_10_cv.csv', index=False,sep=';')
df.sort_values(ascending=False,by="mean_test_score")

Unnamed: 0,penalty,solver,mean_train_score,std_train_score,mean_test_score,std_test_score
1,,lbfgs,0.7400043606,0.0024195763,0.7357383661,0.0167987152
5,,newton-cg,0.7400043606,0.0024195763,0.7357383661,0.0167987152
7,,newton-cholesky,0.7399393109,0.0024338617,0.7357383661,0.0168372954
9,,sag,0.7400010926,0.0024129954,0.7357383661,0.0167987152
13,,saga,0.73999382,0.0024525187,0.7357383661,0.0167987152
0,l2,lbfgs,0.7396087128,0.002543948,0.7356722804,0.0168492376
3,l2,liblinear,0.7396679515,0.0025837998,0.7356722804,0.0168492376
4,l2,newton-cg,0.7396087128,0.002543948,0.7356722804,0.0168492376
6,l2,newton-cholesky,0.7396087128,0.002543948,0.7356722804,0.0168492376
8,l2,sag,0.7396290573,0.0025422555,0.7356722804,0.0168492376


## Kfold Cross Validation
Although there are no hyperparameters to be tuned other than penalty and solvers we want to get more accurate approximation of true balanced accuracy.

In [120]:
def cv_rskf(model ,n_splits, n_repeats, seed):
    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats,random_state=seed)
    cv_results = cross_validate(estimator=model, X=X, y=y, scoring="balanced_accuracy", cv=rskf, verbose=2,
                                n_jobs=12, return_train_score=True)
    return cv_results

In [121]:
summary_df2 = pd.DataFrame(columns=['penalty', 'solver','average train score', 'std of train score','average val score', 'std of val score'])

In [127]:
summary_df2

Unnamed: 0,penalty,solver,average train score,std of train score,average val score,std of val score
0,l1,saga,0.7398893308,0.0026416632,0.7365539231,0.0168197751


In [126]:
start_time = time()

rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=10,random_state=12345)
cv_results = cross_validate(estimator=logit, X=X, y=y, scoring="balanced_accuracy", cv=rskf, verbose=2,
                                n_jobs=12, return_train_score=True, )

print("--- %s seconds ---" % (time() - start_time))     
print("average score: ", cv_results['test_score'].mean(), "std of score: ", cv_results['test_score'].std()) 
# insert to df
new_row = {'penalty': logit.get_params()['penalty'], 'solver': logit.get_params()['solver'],
                'average train score': cv_results['train_score'].mean(),"std of train score": cv_results['train_score'].std(),
                'average val score': cv_results['test_score'].mean(),"std of val score": cv_results['test_score'].std()}
summary_df2.loc[len(summary_df2)] = new_row      


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    6.9s


--- 20.125411987304688 seconds ---
average score:  0.7365539231279878 std of score:  0.01681977513915903


[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:   19.9s finished


Test another solver and penalty

In [95]:
start_time = time()

logit1 = LogisticRegression(penalty='l1',solver='liblinear',max_iter=1000,random_state=12345,verbose=1)
cv_results = cv_rskf(logit1,n_splits=10,n_repeats=10,seed=12345)

print("--- %s seconds ---" % (time() - start_time))     
print("average score: ", cv_results['test_score'].mean(), "std of score: ", cv_results['test_score'].std()) 

# insert to df
new_row = {'penalty': logit1.get_params()['penalty'], 'solver': logit1.get_params()['solver'], 
                'average score': cv_results['test_score'].mean(),"std of score": cv_results['test_score'].std()}
summary_df.loc[len(summary_df)] = new_row   

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.6s


--- 2.228346586227417 seconds ---
average score:  0.7365480407750467 std of score:  0.016820266758700964


[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    2.0s finished


In [96]:
start_time = time()

logit2 = LogisticRegression(penalty='l2',solver='liblinear',max_iter=1000,random_state=12345,verbose=1)
cv_results = cv_rskf(logit2,n_splits=10,n_repeats=10,seed=12345)

print("--- %s seconds ---" % (time() - start_time))     
print("average score: ", cv_results['test_score'].mean(), "std of score: ", cv_results['test_score'].std()) 
# insert to df
new_row = {'penalty': logit2.get_params()['penalty'], 'solver': logit2.get_params()['solver'], 
                'average score': cv_results['test_score'].mean(),"std of score": cv_results['test_score'].std()}
summary_df.loc[len(summary_df)] = new_row   

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.3s


--- 1.8189518451690674 seconds ---
average score:  0.736549809535266 std of score:  0.01679068582016181


[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    1.6s finished


In [97]:
start_time = time()

logit3 = LogisticRegression(penalty=None,solver='newton-cholesky',max_iter=1000,random_state=12345,verbose=1)
cv_results = cv_rskf(logit3,n_splits=10,n_repeats=10,seed=12345)

print("--- %s seconds ---" % (time() - start_time))     
print("average score: ", cv_results['test_score'].mean(), "std of score: ", cv_results['test_score'].std()) 
# insert to df
new_row = {'penalty': logit3.get_params()['penalty'], 'solver': logit3.get_params()['solver'], 
                'average score': cv_results['test_score'].mean(),"std of score": cv_results['test_score'].std()}
summary_df.loc[len(summary_df)] = new_row   

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.2s


--- 0.8427543640136719 seconds ---
average score:  0.7366449282916985 std of score:  0.016845227273181097


[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.7s finished


In [98]:
start_time = time()

logit4 = LogisticRegression(penalty='l2',solver='newton-cholesky',max_iter=1000,random_state=12345,verbose=1)
cv_results = cv_rskf(logit4,n_splits=10,n_repeats=10,seed=12345)

print("--- %s seconds ---" % (time() - start_time))     
print("average score: ", cv_results['test_score'].mean(), "std of score: ", cv_results['test_score'].std())
# insert to df
new_row = {'penalty': logit4.get_params()['penalty'], 'solver': logit4.get_params()['solver'], 
                'average score': cv_results['test_score'].mean(),"std of score": cv_results['test_score'].std()}
summary_df.loc[len(summary_df)] = new_row   

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.1s


--- 0.9306094646453857 seconds ---
average score:  0.7364076319553041 std of score:  0.016803309646989683


[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.7s finished


In [99]:
start_time = time()

logit5 = LogisticRegression(penalty='l2',solver='saga',max_iter=1000,random_state=12345,verbose=1)
cv_results = cv_rskf(logit5,n_splits=10,n_repeats=10,seed=12345)

print("--- %s seconds ---" % (time() - start_time))     
print("average score: ", cv_results['test_score'].mean(), "std of score: ", cv_results['test_score'].std()) 
# insert to df
new_row = {'penalty': logit5.get_params()['penalty'], 'solver': logit5.get_params()['solver'], 
                'average score': cv_results['test_score'].mean(),"std of score": cv_results['test_score'].std()}
summary_df.loc[len(summary_df)] = new_row   

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.6s


--- 2.2714762687683105 seconds ---
average score:  0.736438496152835 std of score:  0.016775969535535683


[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    2.2s finished


In [100]:
start_time = time()

logit6 = LogisticRegression(penalty='elasticnet',solver='saga',max_iter=1000,random_state=12345,verbose=1)
cv_results = cv_rskf(logit5,n_splits=10,n_repeats=10,seed=12345)

print("--- %s seconds ---" % (time() - start_time))     
print("average score: ", cv_results['test_score'].mean(), "std of score: ", cv_results['test_score'].std()) 
# insert to df
new_row = {'penalty': logit6.get_params()['penalty'], 'solver': logit6.get_params()['solver'], 
                'average score': cv_results['test_score'].mean(),"std of score": cv_results['test_score'].std()}
summary_df.loc[len(summary_df)] = new_row   

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.5s


--- 2.1578879356384277 seconds ---
average score:  0.736438496152835 std of score:  0.016775969535535683


[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    1.9s finished


In [101]:
start_time = time()

logit7 = LogisticRegression(penalty=None,solver='saga',max_iter=1000,random_state=12345,verbose=1)
cv_results = cv_rskf(logit5,n_splits=10,n_repeats=10,seed=12345)

print("--- %s seconds ---" % (time() - start_time))     
print("average score: ", cv_results['test_score'].mean(), "std of score: ", cv_results['test_score'].std()) 
# insert to df
new_row = {'penalty': logit7.get_params()['penalty'], 'solver': logit7.get_params()['solver'], 
                'average score': cv_results['test_score'].mean(),"std of score": cv_results['test_score'].std()}
summary_df.loc[len(summary_df)] = new_row   

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:    0.4s


--- 2.0087006092071533 seconds ---
average score:  0.736438496152835 std of score:  0.016775969535535683


[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    1.8s finished


In [109]:
pd.set_option("display.precision", 10)
summary_df.sort_values(ascending=False,by="average score")

Unnamed: 0,penalty,solver,average score,std of score
3,,newton-cholesky,0.7366449283,0.0168452273
0,l1,saga,0.7365539231,0.0168197751
2,l2,liblinear,0.7365498095,0.0167906858
1,l1,liblinear,0.7365480408,0.0168202668
5,l2,saga,0.7364384962,0.0167759695
6,elasticnet,saga,0.7364384962,0.0167759695
7,,saga,0.7364384962,0.0167759695
4,l2,newton-cholesky,0.736407632,0.0168033096


In [119]:
summary_df.sort_values(ascending=False,by="average score").to_csv('./results/logit_metrics.csv', index=False,sep=';')