In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import numpy as np
from collections import Counter

# Load the diabetes dataset from the repository's dataset folder.
df = pd.read_csv('Notebooks/Datasets/diabetes.csv')

# Predictor columns fed to the classifier.
feature_cols = ['Pregnancies', 'Insulin', 'BMI', 'Age']

# X holds the selected predictors, y the binary 'Outcome' target column.
X, y = df[feature_cols], df['Outcome']

In [2]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Class balance of the held-out labels.
y_test.value_counts()

0    130
1     62
Name: Outcome, dtype: int64

In [3]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier with default settings on the
# training split (fit() returns the estimator itself, so it can be chained).
logreg = LogisticRegression().fit(X_train, y_train)

# Hard 0/1 class predictions for every row of the held-out split.
y_pred = logreg.predict(X_test)



In [4]:
# Fitted intercept (bias) term of the logistic-regression model.
logreg.intercept_

array([-4.32142123])

In [5]:
# Tally how many test rows the model assigned to each class.
Counter(y_pred)

Counter({0: 165, 1: 27})

In [6]:
# Tally the true class labels in the test split, for comparison with the
# tally of predicted labels.
Counter(y_test)

Counter({1: 62, 0: 130})

In [7]:
# Accuracy: the fraction of test rows whose predicted label equals the true
# label (mean of the element-wise equality mask).
np.mean(y_test == y_pred)

0.6927083333333334

In [8]:
# Confusion matrix of true labels (first argument) against predicted labels
# (second argument): rows are the true class, columns the predicted class.
confusion_matrix(y_test, y_pred)

array([[118,  12],
       [ 47,  15]])

In [9]:
# Sanity check: 25% of the dataset (the test_size passed to train_test_split)
# should equal the number of rows in the test split.
len(X)*0.25

192.0

In [10]:
def comp_yt_yp(y_test, y_predict):
    """Build a 2x2 confusion matrix for binary labels by direct counting.

    Rows are indexed by the true label and columns by the predicted label,
    so the cells are TN (0, 0), FP (0, 1), FN (1, 0) and TP (1, 1).

    Parameters
    ----------
    y_test : iterable
        True labels. Only values equal to 0 or 1 are counted.
    y_predict : iterable
        Predicted labels, paired positionally with ``y_test``.

    Returns
    -------
    numpy.ndarray
        Float array of shape (2, 2) holding the count for each cell.

    Notes
    -----
    The inputs are walked exactly once, so one-shot iterators (generators,
    ``iter(...)``) give the same result as lists or Series. Pairs in which
    either label is not equal to 0 or 1 are skipped, and if the inputs differ
    in length the extra items of the longer one are ignored (``zip``
    semantics).
    """
    labels = (0, 1)
    # np.zeros defaults to float, so the returned dtype stays float64.
    conf_matrix = np.zeros((2, 2))
    for actual, predicted in zip(y_test, y_predict):
        # `in` compares with ==, so 1, 1.0, True and numpy integers all match.
        if actual in labels and predicted in labels:
            conf_matrix[int(actual), int(predicted)] += 1
    return conf_matrix

# Print the hand-rolled confusion matrix for the test-set predictions.
print(comp_yt_yp(y_test, y_pred))

[[118.  12.]
 [ 47.  15.]]


In [11]:
from sklearn import metrics

# Compute the confusion matrix through the `metrics` module and keep it in
# `confusion`; print() shows it as plain text.
confusion = metrics.confusion_matrix(y_test, y_pred)
print(confusion)

[[118  12]
 [ 47  15]]


In [12]:
# Predicted class probabilities for the test split: one row per test sample,
# one column per class (columns follow the order of logreg.classes_).
logreg.predict_proba(X_test)

array([[0.63247571, 0.36752429],
       [0.71643656, 0.28356344],
       [0.71104114, 0.28895886],
       [0.5858938 , 0.4141062 ],
       [0.84103973, 0.15896027],
       [0.82934844, 0.17065156],
       [0.50110974, 0.49889026],
       [0.48658459, 0.51341541],
       [0.72321388, 0.27678612],
       [0.32810562, 0.67189438],
       [0.64244443, 0.35755557],
       [0.25912035, 0.74087965],
       [0.63949765, 0.36050235],
       [0.76987637, 0.23012363],
       [0.57345769, 0.42654231],
       [0.80896485, 0.19103515],
       [0.54236399, 0.45763601],
       [0.8809859 , 0.1190141 ],
       [0.56071047, 0.43928953],
       [0.63038849, 0.36961151],
       [0.55812011, 0.44187989],
       [0.62388338, 0.37611662],
       [0.80183978, 0.19816022],
       [0.58322696, 0.41677304],
       [0.84451719, 0.15548281],
       [0.7468329 , 0.2531671 ],
       [0.90256923, 0.09743077],
       [0.30366288, 0.69633712],
       [0.84641691, 0.15358309],
       [0.7802164 , 0.2197836 ],
       [0.