<h2>Load data</h2>

In [11]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Read the banknote CSV and keep only the underlying NumPy array
np_data = pd.read_csv("../data/banknote.csv").values

# The last column holds the class label; every column before it is a feature.
# Labels are taken as-is from np_data (the recorded output prints them as 1.0).
y_raw = np_data[:, -1]
X_raw = np_data[:, :-1].astype(float)

# Shuffle features and labels together; the fixed seed keeps the order
# reproducible between runs
X, y = shuffle(X_raw, y_raw, random_state=0)

# Feature scaling is intentionally disabled.
# NOTE: if it is re-enabled, scale the shuffled X, not X_raw. X_raw is still in
# file order, so assigning its transform to X would break the row alignment
# with the shuffled labels in y.
#scaler = StandardScaler()
#scaler.fit(X)
#X = scaler.transform(X)

# Show one (features -> label) example and the overall feature-matrix shape
print("Example:")
print(X[0], "->", y[0])
print("")
print("Data shape:", X.shape)

Example:
[ -1.7713 -10.7665  10.2184  -1.0043] -> 1.0

Data shape: (1372, 4)


<h2>Functions for evaluating model accuracy</h2>

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

def evaluate_test(model):
    """Fit ``model`` on a hold-out split and print its test-set results.

    Works on the notebook-level ``X`` and ``y`` arrays. 20% of the rows are
    held out (``test_size=0.2``), stratified on ``y``, with a fixed
    ``random_state`` so every call evaluates on the same split.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
            in place on the training part of the split.

    Returns:
        None. The accuracy (as a percentage) and the confusion matrix of the
        predictions on the held-out rows are printed.
    """
    print("\n-- Test set --")
    features_fit, features_holdout, labels_fit, labels_holdout = train_test_split(
        X, y, test_size=0.2, random_state=111, stratify=y
    )
    # Fit on the training rows only
    model.fit(features_fit, labels_fit)
    # Predict the rows that were kept out of fitting
    predictions = model.predict(features_holdout)
    # Share of held-out rows predicted correctly
    holdout_accuracy = accuracy_score(labels_holdout, predictions)
    print("Accuracy: %.2f%%" % (holdout_accuracy * 100.0))
    # True labels vs. predicted labels on the held-out rows
    print("Confusion Matrix:")
    holdout_confusion = confusion_matrix(labels_holdout, predictions)
    print(holdout_confusion)
       
def evaluate_cv(model, cv=5):
    """Print cross-validated accuracy and confusion matrix for ``model``.

    Works on the notebook-level ``X`` and ``y`` arrays. ``cross_val_predict``
    produces one out-of-fold prediction per row, and both metrics are then
    computed once over all of those predictions. The figure printed as
    "Average accuracy" is therefore the pooled accuracy over every row
    (equivalently, the fold-size-weighted mean of the per-fold accuracies).

    Args:
        model: Estimator accepted by ``cross_val_predict``.
        cv: Number of folds, as an int. Defaults to 5. The same value is used
            for the printed banner and for the actual split, so the two
            cannot disagree.

    Returns:
        None. Results are printed.
    """
    print("\n-- %d-fold CV --" % cv)
    # Out-of-fold predictions for every row, using `cv` folds
    y_pred = cross_val_predict(model, X, y, cv=cv)
    # Accuracy over the pooled out-of-fold predictions
    accuracy = accuracy_score(y, y_pred)
    print("Average accuracy: %.2f%%" % (accuracy * 100.0))
    # Confusion matrix over the same pooled predictions
    print("Confusion Matrix:")
    conf_mx = confusion_matrix(y, y_pred)
    print(conf_mx)

<h2>Naive Bayes</h2>

In [13]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings
model = GaussianNB()

# Run the hold-out evaluation first, then cross-validation, on the same model
for report in (evaluate_test, evaluate_cv):
    report(model)


-- Test set --
Accuracy: 85.45%
Confusion Matrix:
[[137  16]
 [ 24  98]]

-- 5-fold CV --
Average accuracy: 83.82%
Confusion Matrix:
[[668  94]
 [128 482]]
