 # Predicting Diabetes

In [12]:
from pathlib import Path
import pandas as pd

In [13]:
# Load the diabetes dataset from the Resources folder next to this notebook's directory.
data = Path('../Resources/diabetes.csv')
df = pd.read_csv(data)
# A notebook cell only renders its final expression, so a bare `df.head()`
# placed before this line was evaluated and silently discarded. It has been
# removed; the tail preview below is the output this cell actually shows.
df.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


 ## Separate the Features (X) from the Target (y)

In [14]:
# Everything except "Outcome" is a feature; "Outcome" itself is the target.
X = df.drop(columns=["Outcome"])
y = df["Outcome"]

In [15]:
# Inspect the feature matrix to confirm the "Outcome" column is no longer present.
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


 ## Split our data into training and testing sets

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed random_state makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)
# Show the dimensions of the training partition as a quick sanity check.
X_train.shape

(614, 8)

 ## Create a Logistic Regression Model

In [17]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier: L-BFGS solver, at most 200 optimisation
# iterations, and a fixed random_state.
classifier = LogisticRegression(
    random_state=1,
    max_iter=200,
    solver="lbfgs",
)
classifier

 ## Fit (train) our model using the training data

In [18]:
# Fit the classifier on the training partition only; the test partition is not used here.
classifier.fit(X_train, y_train)

 ## Score the model using the test data

In [19]:
# Score the fitted classifier on each partition, then report both values.
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.7850162866449512
Testing Data Score: 0.7857142857142857


In [20]:
# Predict labels for the held-out features and pair them with the true labels.
predictions = classifier.predict(X_test)
comparison = {"Predictions": predictions, "Actual": y_test}
# The frame picks up y_test's row labels; replace them with a fresh 0..n-1 index.
results = pd.DataFrame(comparison).reset_index(drop=True)
results.head()

Unnamed: 0,Predictions,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


### Confusion Matrix

In [21]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


# Create a confusion matrix from the true test labels and the model's predictions.
# Rows correspond to the actual class and columns to the predicted class,
# in label order (0 first, then 1).
confusion_matrix(y_test, predictions)

array([[92,  8],
       [25, 29]], dtype=int64)

In [22]:
# Build the per-class precision / recall / F1 report, using readable names
# for labels 0 and 1, then print it.
categories_names = ["Non Diabetes", "Diabetes"]
report = classification_report(
    y_test,
    predictions,
    target_names=categories_names,
)
print(report)

              precision    recall  f1-score   support

Non Diabetes       0.79      0.92      0.85       100
    Diabetes       0.78      0.54      0.64        54

    accuracy                           0.79       154
   macro avg       0.79      0.73      0.74       154
weighted avg       0.79      0.79      0.77       154

