In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()
pd.set_option('display.max_columns', None)

  from pandas.core import (


In [2]:
from sklearn.datasets import make_classification

## Step 1 : Create the dataset using make_classification

In [3]:
X,y = make_classification(n_samples=1000,n_features=10,n_classes=2,random_state=42)

In [4]:
X.shape, y.shape

((1000, 10), (1000,))

## Step 2 : Divide into Training and Testing 

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=42)

## Step 3 : Model Training

In [6]:
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()

In [7]:
logistic.fit(X_train, y_train)

In [8]:
y_pred = logistic.predict(X_test)
print(y_pred)
#logistic.predict_proba(X_test) ## If you want to see the probabilities.

[0 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0 1 1 1 1 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 1
 1 1 1 0 1 1 0 0 0 1 1 1 1 0 1 0 0 1 0 1 0 1 0 1 0 0 1 1 1 0 0 1 1 1 1 1 0
 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 1 1 1 1 1 1 1 0 0 1 0 1 0 1 0 0 1 0 1 1 1 1
 1 1 1 1 0 0 1 0 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 0 1 1 0 0 0 0 0 1 0
 0 0 1 0 0 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 1 1 1 1 1
 0 0 0 0 1 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 1 1 1 0 0 0 0 0 1 0 1 0 1 1 0
 0 1 1 1 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 0 0 1 1 1 1
 0 1 0 1 1 0 0 0 1 1 0 1 1 0 0 1 0 0 0 0 1 1 0 1 0 1 1 1 0 0 1 0 1 1 0 1 1
 1 1 1 0]


## Step 4 : Performance Metrics

In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [10]:
score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(score)
print(cm)
print(classification_report(y_test, y_pred))

0.8466666666666667
[[118  17]
 [ 29 136]]
              precision    recall  f1-score   support

           0       0.80      0.87      0.84       135
           1       0.89      0.82      0.86       165

    accuracy                           0.85       300
   macro avg       0.85      0.85      0.85       300
weighted avg       0.85      0.85      0.85       300



## Step 5 : Hyperparameter Tuning and Cross Validation

In [11]:
model = LogisticRegression()

In [12]:
penalty = ['l1','l2','elasticnet']
c_values = [100, 10, 1.0, 0.1, 0.01 ]
solver = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']

In [13]:
params = dict(penalty=penalty, C=c_values, solver=solver)

### Step 5.1 : Using GridSearchCV

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold()

grid = GridSearchCV(estimator=model,
                   param_grid=params,
                   scoring='accuracy',
                   cv=cv)

In [15]:
print(grid)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(),
             param_grid={'C': [100, 10, 1.0, 0.1, 0.01],
                         'penalty': ['l1', 'l2', 'elasticnet'],
                         'solver': ['lbfgs', 'liblinear', 'newton-cg',
                                    'newton-cholesky', 'sag', 'saga']},
             scoring='accuracy')


In [16]:
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)



{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
0.8785714285714287


250 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/hardiksharma/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/hardiksharma/anaconda3/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/hardiksharma/anaconda3/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/hardiksharma/anaconda3/lib/python3.10/si

In [17]:
y_pred_grid = grid.predict(X_test)

In [18]:
grid_score = accuracy_score(y_test, y_pred_grid)
grid_cm = confusion_matrix(y_test, y_pred_grid)

print(grid_score)
print(grid_cm)
print(classification_report(y_test, y_pred_grid))

0.8533333333333334
[[124  11]
 [ 33 132]]
              precision    recall  f1-score   support

           0       0.79      0.92      0.85       135
           1       0.92      0.80      0.86       165

    accuracy                           0.85       300
   macro avg       0.86      0.86      0.85       300
weighted avg       0.86      0.85      0.85       300



### Step 5.2 : Using RandomizedSearchCV

In [19]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold()

randomcv = RandomizedSearchCV(estimator=model,
                   param_distributions=params,
                   scoring='accuracy',
                   cv=cv)

In [20]:
print(randomcv)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
                   estimator=LogisticRegression(),
                   param_distributions={'C': [100, 10, 1.0, 0.1, 0.01],
                                        'penalty': ['l1', 'l2', 'elasticnet'],
                                        'solver': ['lbfgs', 'liblinear',
                                                   'newton-cg',
                                                   'newton-cholesky', 'sag',
                                                   'saga']},
                   scoring='accuracy')


In [21]:
randomcv.fit(X_train, y_train)
print(randomcv.best_params_)
print(randomcv.best_score_)

{'solver': 'newton-cg', 'penalty': 'l2', 'C': 0.01}
0.8785714285714287


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/hardiksharma/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/hardiksharma/anaconda3/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/hardiksharma/anaconda3/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/hardiksharma/anaconda3/lib/python3.10/site-

In [22]:
y_pred_rscv = randomcv.predict(X_test)

In [23]:
rscv_score = accuracy_score(y_test, y_pred_rscv)
rscv_cm = confusion_matrix(y_test, y_pred_rscv)

print(rscv_score)
print(rscv_cm)
print(classification_report(y_test, y_pred_rscv))

0.8533333333333334
[[124  11]
 [ 33 132]]
              precision    recall  f1-score   support

           0       0.79      0.92      0.85       135
           1       0.92      0.80      0.86       165

    accuracy                           0.85       300
   macro avg       0.86      0.86      0.85       300
weighted avg       0.86      0.85      0.85       300

