In [1]:
import numpy as np
import pandas as pd

In [2]:
confusion = np.array([[118, 12], [47, 15]])

In [3]:
confusion

array([[118,  12],
       [ 47,  15]])

In [4]:
def true_positive(c):
    return c[0][0]

def false_positive(c):
    return c[0][1]

def false_negative(c):
    return c[1][0]

def true_negative(c):
    return c[1][1]

In [5]:
def accuracy(c):
    return (true_positive(c) + true_negative(c))/(true_positive(c) + true_negative(c) + false_positive(c) + false_negative(c))

In [6]:
accuracy(confusion)

0.6927083333333334

In [7]:
def error(c):
    return 1 - accuracy(c)

In [8]:
error(confusion)

0.30729166666666663

In [9]:
def recall(c):
    return true_positive(c)/(true_positive(c) + false_negative(c))

In [10]:
recall(confusion)

0.7151515151515152

In [11]:
def precision(c):
    return true_positive(c)/(true_positive(c) + false_positive(c))

In [12]:
precision(confusion)

0.9076923076923077

In [13]:
def specificity(c):
    return true_negative(c)/(true_negative(c) + false_positive(c))

In [14]:
specificity(confusion)

0.5555555555555556

In [15]:
def f1_score(c):
    return 2 * precision(c) * recall(c) / (precision(c) + recall(c))

In [16]:
f1_score(confusion)

0.8

In [45]:
def argmax(classes, threshold=.5):
    if classes[1] >= threshold:
        return 1
    
    return 0

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
from sklearn.metrics import confusion_matrix

In [22]:
lr = LogisticRegression()

In [23]:
df = pd.read_csv('data/diabetes.csv')

data = df[['Pregnancies', 'Insulin', 'BMI', 'Age']]

targets = df['Outcome']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(data, targets, test_size=0.25, random_state=0)

In [25]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
y_pred = lr.predict_proba(X_test)

In [27]:
threshold = y_train.value_counts()[1]/len(y_train)

In [28]:
threshold

0.3576388888888889

In [41]:
np.mean(lr.predict(X_test) == y_test)

0.6770833333333334

In [42]:
original = confusion_matrix(lr.predict(X_test), y_test)

In [43]:
original

array([[114,  46],
       [ 16,  16]])

In [48]:
new_threshold = confusion_matrix(np.array([argmax(x, 0.357) for x in y_pred]), y_test)

In [49]:
new_threshold

array([[89, 26],
       [41, 36]])

In [50]:
recall(new_threshold)

0.6846153846153846

In [35]:
recall(original)

0.8769230769230769

In [36]:
precision(new_threshold)

0.7142857142857143

In [37]:
precision(original)

0.7125

In [38]:
f1_score(new_threshold)

0.819672131147541

In [39]:
f1_score(original)

0.7862068965517242

In [None]:
from sklearn import svm, 