In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
df = pd.read_csv("../Resources/diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df.count()

Pregnancies                 768
Glucose                     768
BloodPressure               768
SkinThickness               768
Insulin                     768
BMI                         768
DiabetesPedigreeFunction    768
Age                         768
Outcome                     768
dtype: int64

# Part I
### Prepare the data

In [34]:
# Split the data into X and y
X = df.drop(columns="Outcome")
y = df["Outcome"]
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [37]:
# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1, 
                                                    stratify=y)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(576, 8)
(576,)
(192, 8)
(192,)


# Part II (Logistic Regression)
### Create a Logistic Regression classifier, then fit and score the classifier

In [38]:
# Import the LogisticRegression classifier from sklearn.linear_model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver="liblinear", max_iter=100, random_state=1)
classifier

LogisticRegression(random_state=1, solver='liblinear')

In [39]:
# Fit the classifier
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

In [40]:
# Calculate the score for the test data
from sklearn.metrics import accuracy_score
y_pred = classifier.predict(X_test)
accuracy_score(y_pred, y_test)

0.7552083333333334

# Part III (Decision Tree)
### Create a Decision Tree classifier, then fit and score the classifier

In [44]:
# Import the DecisionTreeClassifier from sklearn.tree
from sklearn.tree import tree
model = tree.DecisionTreeClassifier()



In [45]:
# Fit the classifier
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [46]:
# Calculate the score for the test data
y_pred = model.predict(X_test)
accuracy_score(y_pred, y_test)

0.734375

# Part IV (Random Forest)
### Create a Random Forest classifier, then fit and score the classifier
### Print the feature importances

In [49]:
# Import the RandomForestClassifier from sklearn.ensemble
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
rf_model

RandomForestClassifier(n_estimators=500, random_state=78)

In [50]:
# Fit the classifier
rf_model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=500, random_state=78)

In [51]:
# Calculate the score for the test data
y_pred = rf_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.75

In [52]:
# Print the feature importances from the Random Forest classifier
feature_names = X.columns
importances = rf_model.feature_importances_
sorted(zip(importances, feature_names), reverse=True)

[(0.25775545895685975, 'Glucose'),
 (0.1564009588563633, 'BMI'),
 (0.13902625926751333, 'Age'),
 (0.13112929494944797, 'DiabetesPedigreeFunction'),
 (0.08743852050203478, 'BloodPressure'),
 (0.08342159105028388, 'Pregnancies'),
 (0.07843735482904993, 'Insulin'),
 (0.06639056158844708, 'SkinThickness')]

In [54]:
# BONUS: Try to improve the score by changing the number of estimators in the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=1000, random_state=78)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy_score(y_pred, y_test)

0.7552083333333334

# Part V (Support Vector Machine)
### Create a Support Vector Machine classifier, then fit and score the classifier

In [55]:
# Create a support vector machine linear classifer, fit the classifier to the data, and calculate the score for the test data

In [56]:
# Import the SVC classifier from sklearn.svm
from sklearn.svm import SVC
model = SVC(kernel='linear')

In [57]:
# Fit the classifier, using a linear kernel
model.fit(X_train,y_train)

SVC(kernel='linear')

In [61]:
# Calculate the score for the test data
y_pred = model.predict(X_test)
accuracy_score(Y_pred, y_test)
y_pred

array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0])

In [62]:
# BONUS: Try to improve the score by changing the `C` and `gamma` parameters
model = SVC(kernel='linear', C=10, gamma=0.001, random_state=42)
model.fit(X_train,y_train)
model.score(X_test, y_test)

0.7760416666666666