In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

### Step 1: Import the raw data set into a pandas dataframe

In [2]:
diabetes_df = pd.read_csv('data/diabetes.csv')
diabetes_df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


### Step 2:
####      2a: Clean the data and remove missing values. 

In [3]:
diabetes_df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

> No missing Values Found
#### 2b: Drop any column that is not categorical or numeric.

In [4]:
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


> All Columns are good for the regression
#### 2c: Separate independent variables from dependent variables

In [5]:
dependent_vars = diabetes_df['Outcome'].copy()
independent_vars = diabetes_df.drop(columns = ['Outcome']).copy()

### Step 3: Generate Dummy Variables for the categorical features.
> No Categorical Features

### Step 4: Create a Training Set that's 75% of your data set and a complentary test set with the remaining 25%.
> Set Random State = 0

In [6]:
X = independent_vars
y = dependent_vars

X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.25, random_state=0)

### Step 5: Train the model using the LogisticRegressionCV class
> Required Imports:
>> **from sklearn.linear_model import LogisticRegressionCV**
#### What value of Cs hyper parameter performs best in cross-validation?

In [7]:
log_reg_cv = LogisticRegressionCV(max_iter = 500)

log_reg_cv.fit(X_train, y_train)
log_coef = log_reg_cv.coef_
log_intercept = log_reg_cv.intercept_

best_c = log_reg_cv.C_[0]
log_reg_cv.Cs_

print(f'Best C: {best_c}')

Best C: 21.54434690031882


### Step 6: 
#### 6a: Use your model to generate predictions on the test set.

In [8]:
predictions = log_reg_cv.predict(X_test)

#
**I have moved my predictions and actuals to a dataframe**

In [12]:
prediction_df = pd.DataFrame(y_test.copy(), columns = ['Outcome'])

prediction_df['predictions'] = predictions

#### 6b: Create a confusion matrix
> Required Imports
>> **from sklearn.metrics import confusion_matrix**

In [13]:
prediction_df

Unnamed: 0,Outcome,predictions
762,0,0
127,0,0
564,0,0
375,1,1
663,1,1
...,...,...
763,0,0
192,1,1
629,0,0
559,0,0


In [14]:
matrix = confusion_matrix(prediction_df['Outcome'], prediction_df['predictions'], labels = [0,1])
pd.DataFrame(matrix, columns = ['Negative','Positive'], index = ['Negative','Positive'])

Unnamed: 0,Negative,Positive
Negative,329,41
Positive,92,114


#### 6c: Print the Matthews correlation coeficient.
> Required Imports:
>> **from sklearn.metrics import matthews_corrcoef**

In [15]:
mccoef = matthews_corrcoef(prediction_df['Outcome'], prediction_df['predictions'])
print(mccoef)

0.4783287350221806
