In [43]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [44]:
df = pd.read_csv("./Resources/diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [45]:
df.shape

(768, 9)

# Part I
### Prepare the data

In [46]:
# Split the data into X and y
X = df.drop(['Outcome'], axis=1)
y = df['Outcome']

In [47]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaler = scaler.fit(X)
X_scale = X_scaler.transform(X)

In [48]:
# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scale, y, random_state=1)

# Part II (Logistic Regression)
### Create a Logistic Regression classifier, then fit and score the classifier

In [49]:
# Import the LogisticRegression classifier from sklearn.linear_model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [50]:
# Fit the classifier
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [51]:
# Calculate the score for the test data
y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score
lg_accuracy_score = accuracy_score(y_test, y_pred)
print(lg_accuracy_score)

0.7760416666666666


In [52]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

In [53]:
print("Confusion Matrix")
display(cm_df)
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,109,14
Actual 1,29,40


Classification Report
              precision    recall  f1-score   support

           0       0.79      0.89      0.84       123
           1       0.74      0.58      0.65        69

    accuracy                           0.78       192
   macro avg       0.77      0.73      0.74       192
weighted avg       0.77      0.78      0.77       192



# Part III (Decision Tree)
### Create a Decision Tree classifier, then fit and score the classifier

In [54]:
# Import the DecisionTreeClassifier from sklearn.tree
from sklearn.tree import DecisionTreeClassifier
tree_classifier = DecisionTreeClassifier()

In [55]:
# Fit the classifier
tree_classifier = tree_classifier.fit(X_train, y_train)
tree_pred = tree_classifier.predict(X_test)

In [56]:
# Calculate the score for the test data
tree_cm = confusion_matrix(y_test, tree_pred)
tree_cm_df = pd.DataFrame(
    tree_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

In [57]:
print("Confusion Matrix")
display(tree_cm_df)
print("Classification Report")
print(classification_report(y_test, tree_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,98,25
Actual 1,35,34


Classification Report
              precision    recall  f1-score   support

           0       0.74      0.80      0.77       123
           1       0.58      0.49      0.53        69

    accuracy                           0.69       192
   macro avg       0.66      0.64      0.65       192
weighted avg       0.68      0.69      0.68       192



# Part IV (Random Forest)
### Create a Random Forest classifier, then fit and score the classifier
### Print the feature importances

In [58]:
# Import the RandomForestClassifier from sklearn.ensemble
from sklearn.ensemble import RandomForestClassifier
forest_classifier = RandomForestClassifier(n_estimators=500, random_state=1)

In [59]:
# Fit the classifier
forest_classifier = forest_classifier.fit(X_train, y_train)

In [60]:
# Calculate the score for the test data
forest_pred = forest_classifier.predict(X_test)

In [61]:
forest_cm = confusion_matrix(y_test, forest_pred)
forest_cm_df = pd.DataFrame(
    forest_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

In [62]:
print("Confusion Matrix")
display(forest_cm_df)
print("Classification Report")
print(classification_report(y_test, forest_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,109,14
Actual 1,22,47


Classification Report
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       123
           1       0.77      0.68      0.72        69

    accuracy                           0.81       192
   macro avg       0.80      0.78      0.79       192
weighted avg       0.81      0.81      0.81       192



In [63]:
importances = forest_classifier.feature_importances_
importances

array([0.08520399, 0.24651609, 0.09444064, 0.06877378, 0.0757266 ,
       0.1720237 , 0.12201139, 0.1353038 ])

In [64]:
# Print the feature importances from the Random Forest classifier
sorted(zip(forest_classifier.feature_importances_, X.columns), reverse=True)

[(0.24651609185606513, 'Glucose'),
 (0.17202370330097824, 'BMI'),
 (0.13530380135048406, 'Age'),
 (0.12201139090268344, 'DiabetesPedigreeFunction'),
 (0.09444063931967663, 'BloodPressure'),
 (0.08520398921965537, 'Pregnancies'),
 (0.07572660432470443, 'Insulin'),
 (0.06877377972575278, 'SkinThickness')]

In [65]:
# BONUS: Try to improve the score by changing the number of estimators in the Random Forest classifier
bonus_forest_classifier = RandomForestClassifier(n_estimators=50000, random_state=1)
bonus_forest_classifier = bonus_forest_classifier.fit(X_train, y_train)
bonus_forest_pred = bonus_forest_classifier.predict(X_test)
bonus_forest_cm = confusion_matrix(y_test, bonus_forest_pred)
bonus_forest_cm_df = pd.DataFrame(
    bonus_forest_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
print("Confusion Matrix")
display(bonus_forest_cm_df)
print("Classification Report")
print(classification_report(y_test, bonus_forest_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,109,14
Actual 1,23,46


Classification Report
              precision    recall  f1-score   support

           0       0.83      0.89      0.85       123
           1       0.77      0.67      0.71        69

    accuracy                           0.81       192
   macro avg       0.80      0.78      0.78       192
weighted avg       0.80      0.81      0.80       192



# Part V (Support Vector Machine)
### Create a Support Vector Machine classifier, then fit and score the classifier

In [66]:
# Import the SVC classifier from sklearn.svm
from sklearn.svm import SVC

In [67]:
# Fit the classifier, using a linear kernel
svm_classifier = SVC(kernel = 'linear')
svm_classifier.fit(X_train, y_train)

SVC(kernel='linear')

In [68]:
# Calculate the score for the test data
# BONUS: Try to improve the score by changing the number of estimators in the Random Forest classifier
svm_pred = svm_classifier.predict(X_test)
svm_cm = confusion_matrix(y_test, svm_pred)
svm_cm_df = pd.DataFrame(
    svm_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)
print("Confusion Matrix")
display(svm_cm_df)
print("Classification Report")
print(classification_report(y_test, svm_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,108,15
Actual 1,27,42


Classification Report
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       123
           1       0.74      0.61      0.67        69

    accuracy                           0.78       192
   macro avg       0.77      0.74      0.75       192
weighted avg       0.78      0.78      0.78       192



In [69]:
# BONUS: Try to improve the score by changing the `C` and `gamma` parameters
