In [46]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
import matplotlib.pyplot as plt
# Switch off pandas' chained-assignment check (SettingWithCopyWarning) for this notebook.
# NOTE(review): none of the cells visible here perform chained assignment — confirm this is still needed.
pd.set_option('mode.chained_assignment', None)

# Load data: read the raw diabetes table from a CSV path relative to the working directory
data = pd.read_csv('data/diabetes.csv')

In [47]:
# Clean data
# Work on a copy so the raw `data` frame is never modified by the cleaning steps.
clean_data = data.copy()

# Drop all na values
# `rows_to_remove` keeps the 0-based positions of the rows that contain at least
# one missing value (same contents as before, but computed without a Python loop).
rows_to_remove = clean_data.isna().any(axis=1).to_numpy().nonzero()[0].tolist()
# dropna() removes every row with any missing value. Unlike dropping by the
# positions above via drop(index=...), it does not rely on the index labels
# being the default 0..n-1 range.
clean_data = clean_data.dropna()

# Convert categorical data to dummy variables: N/A

In [48]:
# Generate training and testing set: 25% of the rows are held out for testing
# Target: the 'Outcome' column, copied so it is independent of `clean_data`.
y = clean_data['Outcome'].copy()
# Predictors: every remaining column (drop() returns a new frame).
X = clean_data.drop(columns=['Outcome'])
# Fixed random_state so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [49]:
# Fit a logistic regression with built-in cross-validation (cv=5) on the training split
log_reg_cv = LogisticRegressionCV(cv=5, random_state=0, max_iter=200)
log_reg_cv.fit(X_train, y_train)

# Predict labels for the held-out test split
predictions = log_reg_cv.predict(X_test)

# Show Cs hyperparameter
# NOTE(review): `Cs` is the constructor argument (not set above, so it is the
# library default), not a value learned during fitting. The regularization
# strength chosen by cross-validation should be in `log_reg_cv.C_` — confirm
# which of the two was meant to be reported.
print(f'Cs hyperparameter: {log_reg_cv.Cs}')

Cs hyperparameter: 10


In [50]:
# Build the confusion matrix once and unpack its four cells.
# The 4-way unpack assumes a binary target (2x2 matrix flattened row by row).
conf_mat = confusion_matrix(y_test, predictions)
tn, fp, fn, tp = conf_mat.ravel()

# Sensitivity = true-positive rate, specificity = true-negative rate
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
# Matthews correlation coefficient of the test-set predictions
mcc = matthews_corrcoef(y_test, predictions)

print(f'Sensitivity: {sensitivity}')
print(f'Specificity: {specificity}')
print(f'MCC: {mcc}')

Sensitivity: 0.5806451612903226
Specificity: 0.9076923076923077
MCC: 0.5273339157746447
