1. Prepare dataset

In [1]:
# import libs

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedStratifiedKFold

In [2]:
# load dataset
def load_dataset(filename):
    """Read a header-less CSV file and split it into features and target.

    The last column of the file is returned as the target; all preceding
    columns are returned as features, cast to strings.

    Parameters
    ----------
    filename : path (or anything else ``pandas.read_csv`` accepts)

    Returns
    -------
    (X, y) : tuple of numpy arrays
        ``X`` holds every column except the last, as ``str``;
        ``y`` holds the last column, left exactly as it was read.
    """
    # header=None: the first row of the file is data, not column names.
    # The whole frame is converted at once, then sliced column-wise.
    table = pd.read_csv(filename, header=None).values
    features, target = table[:, :-1], table[:, -1]
    return features.astype(str), target

In [3]:
# prepare input func
def prepare_inputs(X_train, X_test):
    """Ordinal-encode categorical features, fitting on the training split only.

    Parameters
    ----------
    X_train : 2-D array-like of categorical features used to fit the encoder.
    X_test : 2-D array-like with the same columns, encoded with the
        categories learned from ``X_train``.

    Returns
    -------
    (X_train_enc, X_test_enc) : tuple of encoded arrays.
        A category that occurs in ``X_test`` but was never seen in
        ``X_train`` is encoded as -1 instead of raising ``ValueError``.
    """
    # With the default handle_unknown='error', transform() raises as soon as
    # the test split contains a category absent from the training split,
    # which can easily happen on a small dataset with a different seed.
    # -1 cannot collide with a learned code (learned codes start at 0).
    oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    X_train_enc = oe.fit_transform(X_train)
    X_test_enc = oe.transform(X_test)

    return X_train_enc, X_test_enc

In [4]:
# prepare output func

def prepare_output(y_train, y_test):
    """Label-encode the target, learning the classes from the training split.

    Parameters
    ----------
    y_train : 1-D array-like of training labels; used to fit the encoder.
    y_test : 1-D array-like of test labels; encoded with the same mapping.

    Returns
    -------
    (y_train_enc, y_test_enc) : tuple of integer-coded label arrays.
    """
    encoder = LabelEncoder()
    # Fit and encode the training labels in one step, then reuse the
    # fitted mapping for the test labels.
    train_codes = encoder.fit_transform(y_train)
    test_codes = encoder.transform(y_test)
    return train_codes, test_codes

In [8]:
# Load the breast-cancer data: X = string-typed features, y = last column
X, y = load_dataset(filename='breast-cancer.csv')

In [6]:
# Sanity check: show the dimensions (rows, columns) of the feature array X
print(X.shape)

(286, 9)


In [9]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Encode features and target using the helpers defined above
# (encoders are fitted on the training split and applied to both splits)
X_train_enc, X_test_enc = prepare_inputs(X_train=X_train, X_test=X_test)
y_train_enc, y_test_enc = prepare_output(y_train=y_train, y_test=y_test)

2. Modelling
2.1 KNN Classifier

In [10]:
# Build a k-nearest-neighbours classifier that votes over the 5 closest points
knn_ml = KNeighborsClassifier(n_neighbors=5)

# Train it on the encoded training split
knn_ml.fit(X=X_train_enc, y=y_train_enc)

In [12]:
# 2.1.2 Evaluate model
# Predict labels for the held-out test data.
# (A bare `y_pred` expression used to follow this line; it displayed nothing
# because it was not the last expression of the cell, so it was removed.)
y_pred = knn_ml.predict(X_test_enc)

# Accuracy: fraction of test rows whose predicted label matches the true label
acc = metrics.accuracy_score(y_test_enc, y_pred)
print(f"Model accuracy: {acc}")

# Precision for the class encoded as 1 (scikit-learn's default pos_label)
pres = metrics.precision_score(y_test_enc, y_pred)
print(f"Model precision: {pres}")

# Recall for the class encoded as 1 (scikit-learn's default pos_label)
rec = metrics.recall_score(y_test_enc, y_pred)
print(f"Model recall score: {rec}")



Model accuracy: 0.6627906976744186
Model precision: 0.6363636363636364
Model recall score: 0.21875


In [13]:
# K-fold cross-validation of the KNN classifier

# Encode the complete feature matrix and target once, before splitting
# NOTE(review): the encoders see all rows, including those later used as
# validation folds — confirm this is acceptable for the comparison intended.
oe = OrdinalEncoder()
X_enc = oe.fit_transform(X)
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Fresh, unfitted KNN model with 5 neighbours
knn_ml = KNeighborsClassifier(n_neighbors=5)

# Evaluation procedure: 3 shuffled folds with a fixed seed
cv = KFold(n_splits=3, shuffle=True, random_state=1)

# Score each fold by accuracy and report the mean
result = cross_val_score(
    estimator=knn_ml, X=X_enc, y=y_enc, scoring="accuracy", cv=cv
)

print(f"Accuracy: {result.mean()}")

Accuracy: 0.6678362573099416


In [14]:
# Repeated stratified k-fold cross-validation of the KNN classifier

# Encode the complete feature matrix and target once, before splitting
oe = OrdinalEncoder()
X_enc = oe.fit_transform(X)
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Fresh, unfitted KNN model with 5 neighbours
knn_ml = KNeighborsClassifier(n_neighbors=5)

# Evaluation procedure: 3 stratified folds, repeated 3 times, fixed seed
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)

# Score every fold of every repeat by accuracy, using all available cores
scores = cross_val_score(
    estimator=knn_ml, X=X_enc, y=y_enc, cv=cv, scoring='accuracy', n_jobs=-1
)

print(f"Cross validation accuracy: {scores.mean()}")


Cross validation accuracy: 0.6736720272904484


2.2 DecisionTreeClassifier Model

In [15]:
# Load the Pima Indians diabetes data.
# The file is read with header=None, so column names are supplied explicitly.
col_names = [
    'pregnant', 'glucose', 'bp', 'skin', 'insulin',
    'bmi', 'pedigree', 'age', 'label',
]
pima = pd.read_csv("pima-indians-diabetes.csv", names=col_names, header=None)

In [16]:
# Preview the first five rows of the loaded frame
pima.head(n=5)

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [17]:
# Separate the predictor columns from the target column.
# Note that 'skin' is not part of the selected features.
feature_cols = [
    'pregnant', 'insulin', 'bmi', 'age',
    'glucose', 'bp', 'pedigree',
]
X = pima.loc[:, feature_cols]  # features
y = pima['label']              # target variable

In [18]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [28]:
# Create the decision tree classifier.
# max_features=0.6 makes the tree consider a random subset of the features
# at every split, so random_state is fixed to make the fitted tree (and the
# accuracy reported below) reproducible across runs.
clf_ml = DecisionTreeClassifier(
    max_depth=4,
    criterion='entropy',
    max_features=0.6,
    splitter='best',
    random_state=1,
)

# Train the decision tree on the training split
clf_ml = clf_ml.fit(X_train, y_train)

# Predict the response for the test split
y_pred = clf_ml.predict(X_test)

In [29]:
# Accuracy: the share of test rows the classifier labelled correctly
tree_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", tree_accuracy)

Accuracy: 0.7619047619047619


In [30]:
# K-fold cross-validation of the decision tree

# Unfitted tree with the same hyper-parameters as above.
# random_state is fixed because max_features=0.6 samples features randomly
# at each split; without it the reported score changes from run to run.
clf_ml = DecisionTreeClassifier(
    max_depth=4,
    criterion='entropy',
    max_features=0.6,
    splitter='best',
    random_state=1,
)

# Evaluation procedure: 3 shuffled folds with a fixed seed
cv = KFold(n_splits=3, shuffle=True, random_state=1)

# Score each fold by accuracy and report the mean
result = cross_val_score(clf_ml, X, y, cv=cv, scoring="accuracy")

print(f"Crosss validation Accuracy: {result.mean()}")

Crosss validation Accuracy: 0.7096354166666666


In [31]:
# Repeated stratified k-fold cross-validation of the decision tree

# Unfitted tree with the same hyper-parameters as above.
# random_state is fixed because max_features=0.6 samples features randomly
# at each split; without it the reported score changes from run to run.
clf_ml = DecisionTreeClassifier(
    max_depth=4,
    criterion='entropy',
    max_features=0.6,
    splitter='best',
    random_state=1,
)

# Evaluation procedure: 3 stratified folds, repeated 3 times, fixed seed
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)

# Score every fold of every repeat by accuracy, using all available cores
scores = cross_val_score(clf_ml, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

print(f"Repeat Cross validation accuracy: {scores.mean()}")


Repeat Cross validation accuracy: 0.7252604166666666
