In [50]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV


**Part 1**

*Import the raw data set into a Pandas DataFrame*

In [51]:
# Load the raw data set from the CSV file in the notebook's working directory.
diabetes = pd.read_csv(filepath_or_buffer="diabetes.csv")

**Part 2**

*Clean the data and remove missing values. Drop any column that is not categorical or numeric. Separate the independent variables from the dependent variables.*

In [52]:
# Preview the raw frame; pandas elides the middle rows in the rendered output.
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [53]:
# Inspect the column dtypes to decide whether any non-numeric column must be dropped.
diabetes.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [54]:
# Count missing (NaN) entries per column.
# NOTE(review): this only detects NaN. The preview above shows 0 in
# SkinThickness and Insulin, which may be placeholder missing values --
# confirm before treating the data as complete.
diabetes.isnull().sum(axis=0)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [55]:
# Y: dependent variable -- the Outcome column.
Y = diabetes.loc[:, 'Outcome'].copy()

# X: independent variables -- every other column, kept in its original order.
keep_columns = [column for column in diabetes.columns if column != 'Outcome']
X = diabetes.loc[:, keep_columns].copy()

**Part 3**

*Generate dummy variables for the categorical features.*

In [56]:
# No categorical features: every column is numeric, so no dummy variables are needed.

**Part 4**

*Create a training set that's 75% of your data set and a complementary test set with the remaining 25%. Specify random_state=0.*

In [57]:
# 75% of the rows go to training and the rest to testing; the fixed
# random_state makes the split repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.75,
    random_state=0,
)

**Part 5**

*Train the model using the LogisticRegressionCV class, which has cross-validation capability built in. Use this to tune your Cs hyperparameter. What value of this hyperparameter performs best in cross-validation?*

In [58]:
# Build the cross-validated logistic regression. Cs is not passed, so the
# estimator searches its default grid of regularisation strengths.
log_regr = LogisticRegressionCV(max_iter=200)
log_regr.fit(X_train, Y_train)

# C_ is reported as an array holding the selected C value.
print("Best Hyperparameter:", log_regr.C_)

Best Hyperparameter: [0.00599484]


**Part 6**

*After cross-validation, use your model to generate predictions on the test set, then create a confusion matrix from those results. Print the Matthews correlation coefficient.*

In [59]:
# Predict the held-out labels with the tuned model, then compare them
# against the true labels in a confusion matrix.
predictions = log_regr.predict(X_test)
print(confusion_matrix(y_true=Y_test, y_pred=predictions))

[[116  14]
 [ 26  36]]


In [60]:
# Summarise test-set performance as a single score using the Matthews
# correlation coefficient.
print(
    "Matthews correlation coefficient",
    matthews_corrcoef(y_true=Y_test, y_pred=predictions),
)

Matthews correlation coefficient 0.5039137358988678
