<h1>Ensemble classifier on Diabetes dataset</h1>

<h2>Load data</h2>

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

# Load data: read the CSV into a DataFrame and keep a raw NumPy view of it.
df = pd.read_csv("../data/diabetes.csv")
np_data = df.values

# Split data into X and y: every column except the last is a feature.
# np_data is presumably an object array because the label column holds
# strings (see the encoding step below), so cast the features to float
# explicitly instead of relying on each estimator to coerce them.
X = np_data[:, :-1].astype(float)

# Convert class label strings to integers (fit and transform in one step)
y_raw = np_data[:, -1]
encoder = LabelEncoder()
y = encoder.fit_transform(y_raw)

# Shuffle data; the fixed random_state keeps the row order reproducible
X, y = shuffle(X, y, random_state=0)

# Seed value for randomized steps that should be reproducible
seed = 7

# Ignore deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

<h2>Train/test split</h2>

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed random_state makes
# the partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition as (n_samples, n_features).
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Training set shape: (614, 8)
Test set shape: (154, 8)


<h2>Train and evaluate bagging classifier</h2>

In [18]:
from sklearn.ensemble import BaggingClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Bag 125 shallow decision trees, each fitted on a bootstrap sample of the
# training set. random_state=seed fixes the bootstrap sampling (and the seeds
# handed to the individual trees) so that re-running this cell reproduces the
# same model and metrics; without it every run gives different numbers.
# oob_score=True additionally stores an out-of-bag accuracy estimate in
# bag_clf.oob_score_ after fitting.
bag_clf = BaggingClassifier(
        DecisionTreeClassifier(max_depth=4), n_estimators=125,
        bootstrap=True, oob_score=True, n_jobs=-1, random_state=seed)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

# Calculate accuracy on the held-out test set
print("Accuracy: {0:.2f}%".format(accuracy_score(y_test, y_pred) * 100.0))

# Confusion matrix (rows: true class, columns: predicted class)
print("\nConfusion matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 77.27%

Confusion matrix:
[[79 13]
 [22 40]]
