## Logistic regression with GridSearchCV

In [1]:
# read the dataset
import pandas as pd 
dataset = pd.read_csv("Social_Network_Ads.csv")
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [2]:
# convert categorical into numerical 
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
dataset = dataset.drop(["User ID"],axis=1)
dataset.head()

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1


In [3]:
# split input and output 
independent = dataset[[ "Age",	"EstimatedSalary", "Gender_Male"]]
dependent = dataset[[ "Purchased" ]]

In [4]:
# split train and test
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(independent,dependent, test_size=0.20,random_state=0)
print(x_train,x_test,y_train,y_test )

     Age  EstimatedSalary  Gender_Male
336   58           144000            1
64    59            83000            0
55    24            55000            0
106   26            35000            0
300   58            38000            0
..   ...              ...          ...
323   48            30000            0
192   29            43000            1
117   36            52000            1
47    27            54000            0
172   26           118000            0

[320 rows x 3 columns]      Age  EstimatedSalary  Gender_Male
132   30            87000            1
309   38            50000            0
341   35            75000            1
196   30            79000            0
246   35            50000            0
..   ...              ...          ...
14    18            82000            1
363   42            79000            0
304   40            60000            0
361   53            34000            0
329   47           107000            0

[80 rows x 3 columns]      Purchased
33

In [5]:
# standardization 
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [6]:
y_train = y_train.values.ravel()

In [7]:
# model creation + grid cv
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression 

# param_grid ={
#     "penalty": ["l1", "l2", "elasticnet",None],
#     "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"]
# }

param_grid = [
    # L2 penalties
    {
        "solver": ["lbfgs", "newton-cg", "newton-cholesky", "sag"],
        "penalty": ["l2", None]
    },
    # L1 penalties
    {
        "solver": ["liblinear", "saga"],
        "penalty": ["l1"]
    },
    # ElasticNet
    {
        "solver": ["saga"],
        "penalty": ["elasticnet"],
        "l1_ratio": [0.1, 0.5, 0.9]
    }
]
model = LogisticRegression()

grid_model = GridSearchCV(model, param_grid, refit=True, verbose=3  , n_jobs =-1, scoring='f1_weighted') 
grid_model.fit(x_train,y_train)


Fitting 5 folds for each of 13 candidates, totalling 65 fits


In [8]:
# model prediction + grid cv
grid_prediction = grid_model.predict(x_test)
grid_prediction

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1])

In [9]:
# model evaluation --- Performance on the unseen TEST dataset
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test,grid_prediction)
clf_report = classification_report(y_test, grid_prediction)
print(cm)
print(clf_report)

[[56  2]
 [ 4 18]]
              precision    recall  f1-score   support

           0       0.93      0.97      0.95        58
           1       0.90      0.82      0.86        22

    accuracy                           0.93        80
   macro avg       0.92      0.89      0.90        80
weighted avg       0.92      0.93      0.92        80



## ROC AUC 

In [18]:
print(grid_model.predict_proba(x_test)[:,1])

[0.12098332 0.14986437 0.20891039 0.07982086 0.08535763 0.00739655
 0.01199936 0.77359319 0.00466399 0.53084587 0.0363908  0.02446923
 0.16964643 0.39761379 0.01571587 0.32573193 0.30165491 0.01233039
 0.98995916 0.0416782  0.0777169  0.96679876 0.25623787 0.90322023
 0.00345504 0.97571518 0.06936149 0.08066234 0.17663902 0.14398891
 0.02069049 0.31618378 0.94386087 0.16027286 0.01417335 0.00327438
 0.02016587 0.05615142 0.02266028 0.52995213 0.07010111 0.30107012
 0.05163003 0.03904287 0.80757865 0.02197833 0.2835928  0.93045514
 0.00840071 0.88194002 0.99136071 0.03123475 0.10588708 0.45456834
 0.98218397 0.33741742 0.07442238 0.03701283 0.5092668  0.0031811
 0.01969784 0.9450901  0.00910694 0.42489018 0.00138517 0.98580965
 0.03303805 0.02643954 0.2271677  0.44843404 0.59760725 0.18956832
 0.00914515 0.26597416 0.07235227 0.0090367  0.52480182 0.27461498
 0.71093144 0.89208492]


In [21]:
from sklearn.metrics import roc_auc_score     # key metric for binary classification 
roc_auc_score = roc_auc_score(y_test,grid_model.predict_proba(x_test)[:,1] )
roc_auc_score

np.float64(0.9780564263322884)

## f1 score 

In [19]:
from sklearn.metrics import f1_score
f1_score = f1_score(y_test,grid_prediction, average='weighted' )
f1_score

0.9238498789346247

## GridSearchCV results 

In [13]:
grid_results = grid_model.cv_results_
print(grid_results)

{'mean_fit_time': array([1.0293416 , 0.02113385, 0.15006466, 0.0445013 , 0.00962172,
       0.01420012, 0.01130333, 0.0083168 , 0.02251401, 0.0084281 ,
       0.00816622, 0.00812411, 0.00845046]), 'std_fit_time': array([1.27252948e+00, 1.26021164e-02, 1.71316088e-01, 4.33910529e-02,
       1.10941603e-03, 2.66943180e-03, 2.70378921e-03, 2.65963907e-03,
       1.83948637e-02, 1.78342349e-03, 2.19896561e-03, 2.30591530e-03,
       1.97176579e-03]), 'mean_score_time': array([0.01457877, 0.00941539, 0.01042681, 0.0080997 , 0.00851221,
       0.00859118, 0.00909972, 0.00894666, 0.01378212, 0.01294465,
       0.01377277, 0.01085324, 0.01091523]), 'std_score_time': array([0.00732101, 0.00182618, 0.00286603, 0.00089125, 0.00100992,
       0.00087243, 0.00176057, 0.00114756, 0.00278772, 0.00419374,
       0.00310005, 0.00173189, 0.00263791]), 'param_penalty': masked_array(data=['l2', 'l2', 'l2', 'l2', None, None, None, None, 'l1',
                   'l1', 'elasticnet', 'elasticnet', 'elasticnet

In [14]:
# table
table = pd.DataFrame.from_dict(grid_results)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,param_solver,param_l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.029342,1.272529,0.014579,0.007321,l2,lbfgs,,"{'penalty': 'l2', 'solver': 'lbfgs'}",0.828694,0.79104,0.740864,0.83804,0.920683,0.823864,0.05927,6
1,0.021134,0.012602,0.009415,0.001826,l2,newton-cg,,"{'penalty': 'l2', 'solver': 'newton-cg'}",0.828694,0.79104,0.740864,0.83804,0.920683,0.823864,0.05927,6
2,0.150065,0.171316,0.010427,0.002866,l2,newton-cholesky,,"{'penalty': 'l2', 'solver': 'newton-cholesky'}",0.828694,0.79104,0.740864,0.83804,0.920683,0.823864,0.05927,6
3,0.044501,0.043391,0.0081,0.000891,l2,sag,,"{'penalty': 'l2', 'solver': 'sag'}",0.828694,0.79104,0.740864,0.83804,0.920683,0.823864,0.05927,6
4,0.009622,0.001109,0.008512,0.00101,,lbfgs,,"{'penalty': None, 'solver': 'lbfgs'}",0.828694,0.808442,0.726744,0.83804,0.920683,0.82452,0.062074,1
5,0.0142,0.002669,0.008591,0.000872,,newton-cg,,"{'penalty': None, 'solver': 'newton-cg'}",0.828694,0.808442,0.726744,0.83804,0.920683,0.82452,0.062074,1
6,0.011303,0.002704,0.0091,0.001761,,newton-cholesky,,"{'penalty': None, 'solver': 'newton-cholesky'}",0.828694,0.808442,0.726744,0.83804,0.920683,0.82452,0.062074,1
7,0.008317,0.00266,0.008947,0.001148,,sag,,"{'penalty': None, 'solver': 'sag'}",0.828694,0.808442,0.726744,0.83804,0.920683,0.82452,0.062074,1
8,0.022514,0.018395,0.013782,0.002788,l1,liblinear,,"{'penalty': 'l1', 'solver': 'liblinear'}",0.828694,0.808442,0.726744,0.83804,0.920683,0.82452,0.062074,1
9,0.008428,0.001783,0.012945,0.004194,l1,saga,,"{'penalty': 'l1', 'solver': 'saga'}",0.828694,0.79104,0.726744,0.83804,0.920683,0.82104,0.063353,11


In [15]:
grid_model.best_params_

{'penalty': None, 'solver': 'lbfgs'}

In [16]:
grid_model.best_estimator_

In [17]:
""" Mean cross-validation score on the TRAINING data """
grid_model.best_score_

np.float64(0.8245204640224871)