<h1>Ensemble classifiers on Diabetes dataset</h1>

<h2>Load data</h2>

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Load the raw table; the last column is the class label, the rest are features.
df = pd.read_csv("../data/diabetes.csv")
np_data = df.values  # kept for any later cell that still reads the raw array

# Split data into X and y.
# df.values becomes an object-dtype array when the label column holds strings,
# so the feature matrix is built from the feature columns only and cast to
# float explicitly. A non-numeric feature value then fails here, at load time,
# instead of inside an estimator.
X = df.iloc[:, :-1].values.astype(float)

# Convert class label strings to integers
y_raw = np_data[:, -1]
encoder = LabelEncoder()
y = encoder.fit_transform(y_raw)

# Shuffle features and labels together (fixed seed, so the order is repeatable)
X, y = shuffle(X, y, random_state=0)

# Seed passed as random_state to the split and the models in later cells
seed = 42

# Ignore deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

<h2>Train/test split</h2>

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

# Report the dimensions of both partitions.
for label, part in (("Training set shape:", X_train), ("Test set shape:", X_test)):
    print(label, part.shape)

Training set shape: (614, 8)
Test set shape: (154, 8)


<h2>Train and evaluate bagging classifier</h2>

In [3]:
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Bagging ensemble of 125 linear SGD classifiers, each fit on a bootstrap
# resample of the training set. Both the ensemble and its base estimator are
# seeded: without random_state the resampling and the SGD updates differ on
# every run, so the scores below would not be reproducible.
# oob_score=True also stores an out-of-bag accuracy estimate on
# bag_clf.oob_score_ after fitting.
bag_clf = BaggingClassifier(
        SGDClassifier(max_iter=1000, random_state=seed), n_estimators=125,
        bootstrap=True, oob_score=True, n_jobs=-1, random_state=seed)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

# Calculate accuracy
print("Accuracy: {0:.2f}%".format(accuracy_score(y_test, y_pred) * 100.0))

# Confusion matrix
print("\nConfusion matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 74.03%

Confusion matrix:
[[77 15]
 [25 37]]


<h2>Train and evaluate RandomForest classifier</h2>

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 125 trees, built on all available cores with a fixed seed.
rnd_clf = RandomForestClassifier(
    n_estimators=125,
    n_jobs=-1,
    random_state=seed,
)
rnd_clf.fit(X_train, y_train)

# Predict the held-out samples and score them against the true labels.
y_pred = rnd_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {0:.2f}%".format(test_accuracy * 100.0))

# Confusion matrix computed from the same predictions.
print("\nConfusion matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 75.97%

Confusion matrix:
[[81 11]
 [26 36]]


<h2>Train and evaluate ExtraTrees classifier</h2>

In [11]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble with the same size, parallelism and seed as the
# random forest cell.
extra_trees_params = {"n_estimators": 125, "n_jobs": -1, "random_state": seed}
ext_clf = ExtraTreesClassifier(**extra_trees_params)
ext_clf.fit(X_train, y_train)
y_pred = ext_clf.predict(X_test)

# Accuracy on the held-out set, shown as a percentage.
accuracy_pct = accuracy_score(y_test, y_pred) * 100.0
print("Accuracy: {0:.2f}%".format(accuracy_pct))

# Confusion matrix for the same predictions.
print("\nConfusion matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 73.38%

Confusion matrix:
[[78 14]
 [27 35]]


<h2>Check feature importance</h2>

In [12]:
# Print the random forest's importance score next to each column name.
# zip() stops at the shorter sequence, so a column without a matching
# importance value is not printed. Names are cut to six characters by the
# format spec.
row_format = "{0:.6}:\t{1:.2f}"
feature_rows = zip(df.columns, rnd_clf.feature_importances_)
for column_name, importance in feature_rows:
    print(row_format.format(column_name, importance))

NoTime:	0.08
Plasma:	0.25
Diasto:	0.09
Tricep:	0.07
SerumI:	0.07
BMI:	0.18
Diabet:	0.12
Age:	0.13


<h2>Train and evaluate XGBoost classifier</h2>

In [15]:
from xgboost import XGBClassifier

# Fit a gradient-boosted tree classifier with library defaults and a fixed
# seed, then predict the held-out samples.
xgb_clf = XGBClassifier(random_state=seed)
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

# Score the predictions once and report both metrics.
xgb_accuracy = accuracy_score(y_test, y_pred)
xgb_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy: {0:.2f}%".format(xgb_accuracy * 100.0))
print("\nConfusion matrix:")
print(xgb_confusion)

Accuracy: 75.97%

Confusion matrix:
[[77 15]
 [22 40]]
