In [10]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np

# Load the milk dataset from CSV into a DataFrame; every later cell reads `dataset`.
# NOTE(review): this is an absolute, machine-specific path, so the notebook will not
# run on another machine as-is — consider a path relative to a configurable data dir.
dataset = pd.read_csv('/Users/farrelmanazilin/Downloads/data/milk.csv')

# Hold-out Method (70%-30%)

In [19]:
# Hold-out evaluation of a distance-weighted 3-NN classifier on the raw features.

# Replace the 'Grade' column of `dataset` with integer class codes (mutates `dataset`).
label_encoder = LabelEncoder().fit(dataset['Grade'])
dataset['Grade'] = label_encoder.transform(dataset['Grade'])

# Target is kept as a one-column frame; features are every column except the last.
datalabel = dataset[['Grade']]
data = dataset.iloc[:, 0:-1]

# 70% train / 30% test, fixed seed so the split is reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    data,
    datalabel,
    test_size=0.30,
    random_state=100,
)

# fit() returns the estimator itself, so construction and training can be chained.
kNN = KNeighborsClassifier(n_neighbors=3, weights='distance').fit(
    xtrain, ytrain.to_numpy().ravel()
)
y_pred = kNN.predict(xtest)

accuracy = accuracy_score(ytest, y_pred)
error_ratio_holdout = 1 - accuracy

print(
    "Akurasi untuk metode Hold-out (70-30%): {:.10f}".format(accuracy),
    "Error ratio untuk metode Hold-out (70-30%): {:.10f}".format(error_ratio_holdout),
    sep="\n",
)

Akurasi untuk metode Hold-out (70-30%): 0.9937106918
Error ratio untuk metode Hold-out (70-30%): 0.0062893082


In [20]:
# Hold-out evaluation of Gaussian Naive Bayes on the same train/test split
# (xtrain, xtest, ytrain, ytest) produced by the preceding hold-out cell.
classifier = GaussianNB().fit(xtrain, ytrain.to_numpy().ravel())
ypred = classifier.predict(xtest)

accuracy = accuracy_score(ytest, ypred)
error_ratio_holdout = 1 - accuracy

print(
    "Akurasi untuk metode Hold-out (70-30%): {:.10f}".format(accuracy),
    "Error ratio untuk metode Hold-out (70-30%): {:.10f}".format(error_ratio_holdout),
    sep="\n",
)

Akurasi untuk metode Hold-out (70-30%): 0.9182389937
Error ratio untuk metode Hold-out (70-30%): 0.0817610063


In [21]:
# Hold-out evaluation of k-NN on min-max normalised features.
# The scaling statistics come from the training split only and are then applied to
# the test split, so nothing from the test data enters the preprocessing.
min_train = xtrain.min()
max_train = xtrain.max()
# NOTE(review): a feature that is constant in the training split has range 0 and
# would produce NaN/inf here — confirm no such column exists in the data.
range_train = max_train - min_train

xtrain_normalized = (xtrain - min_train) / range_train
xtest_normalized = (xtest - min_train) / range_train

kNN = KNeighborsClassifier(n_neighbors=3, weights='distance')
kNN.fit(xtrain_normalized, ytrain.values.ravel())
y_pred = kNN.predict(xtest_normalized)

accuracy = accuracy_score(ytest, y_pred)
error_ratio_holdout = 1 - accuracy

# Fixed labels: the first line reports accuracy (it was mislabelled "Error ratio"),
# and the split is 70% train / 30% test (it was written as "30-70%").
print("Akurasi untuk metode Hold-out (70-30%): {:.10f}".format(accuracy))
print("Error ratio untuk metode Hold-out (70-30%): {:.10f}".format(error_ratio_holdout))

Error ratio untuk metode Hold-out (30-70%): 0.9937106918
Error ratio untuk metode Hold-out (30-70%): 0.0062893082


# K-Fold Cross-Validation (k=10)

In [28]:
# 10-fold cross-validation comparing k-NN (3 neighbours, distance-weighted) with
# Gaussian Naive Bayes on the raw (un-normalised) features.
kf = KFold(n_splits=10, random_state=0, shuffle=True)
error_ratios_fold = []      # per-fold error ratio of k-NN
error_ratios_bayesian = []  # per-fold error ratio of Gaussian Naive Bayes

# Features are every column except the last; the target is the 'Grade' column.
# NOTE(review): assumes 'Grade' is the last column of `dataset` — confirm.
features = dataset.iloc[:, :-1]
labels = dataset['Grade']

for train_index, test_index in kf.split(dataset):
    # KFold.split yields positional indices, so rows must be selected with .iloc;
    # .loc would only be correct while the frame has a default RangeIndex.
    xtrain, xtest = features.iloc[train_index], features.iloc[test_index]
    ytrain, ytest = labels.iloc[train_index], labels.iloc[test_index]

    # K-Nearest Neighbors within K-Fold loop
    kNN_fold = KNeighborsClassifier(n_neighbors=3, weights='distance')
    kNN_fold.fit(xtrain, ytrain.values.ravel())
    y_pred_fold_knn = kNN_fold.predict(xtest)

    accuracy_fold_knn = accuracy_score(ytest, y_pred_fold_knn)
    error_ratio_fold_knn = 1 - accuracy_fold_knn
    error_ratios_fold.append(error_ratio_fold_knn)

    # Gaussian Naïve Bayes within K-Fold loop
    classifier = GaussianNB()
    classifier.fit(xtrain, ytrain.values.ravel())
    ypred_bayesian = classifier.predict(xtest)

    accuracy_bayesian = accuracy_score(ytest, ypred_bayesian)
    error_ratio_bayesian = 1 - accuracy_bayesian
    error_ratios_bayesian.append(error_ratio_bayesian)

avg_error_ratio_fold = np.mean(error_ratios_fold)
avg_error_ratio_bayesian = np.mean(error_ratios_bayesian)

# The accuracy lines were previously labelled "Error akurasi"; they now read
# "Akurasi", matching the wording used for the leave-one-out results.
print("Rata-rata Error ratio untuk metode K-Nearest Neighbors (k=10): {:.10f}".format(avg_error_ratio_fold))
print("Rata-rata Akurasi untuk metode K-Nearest Neighbors (k=10): {:.10f}".format(1-avg_error_ratio_fold))
print("Hasil error Kfold tanpa bayesian\n", error_ratios_fold)
print("\nRata-rata Error ratio untuk metode Bayesian Naïve Bayes (k=10): {:.10f}".format(avg_error_ratio_bayesian))
print("Rata-rata Akurasi untuk metode Bayesian Naïve Bayes (k=10): {:.10f}".format(1-avg_error_ratio_bayesian))
print("Hasil error Kfold dengan bayesian\n", error_ratios_bayesian)


Rata-rata Error ratio untuk metode K-Nearest Neighbors (k=10): 0.0047169811
Rata-rata Error akurasi untuk metode K-Fold Cross-Validation (k=10): 0.9952830189
Hasil error Kfold tanpa bayesian
 [0.009433962264150941, 0.0, 0.0, 0.009433962264150941, 0.009433962264150941, 0.009433962264150941, 0.0, 0.0, 0.009433962264150941, 0.0]

Rata-rata Error ratio untuk metode Bayesian Naïve Bayes (k=10): 0.0811680144
Rata-rata Error akurasi untuk metode Bayesian Naïve Bayes (k=10): 0.9188319856
Hasil error Kfold dengan bayesian
 [0.23584905660377353, 0.09433962264150941, 0.07547169811320753, 0.08490566037735847, 0.09433962264150941, 0.028301886792452824, 0.09433962264150941, 0.037735849056603765, 0.028301886792452824, 0.03809523809523807]


# Leave-One-Out Cross-Validation (LOO)

In [30]:
# Leave-one-out cross-validation comparing k-NN (3 neighbours, distance-weighted)
# with Gaussian Naive Bayes on min-max normalised features.
loo = LeaveOneOut()
error_ratios_knn = []       # per-sample error (0 or 1) of k-NN
error_ratios_bayesian = []  # per-sample error (0 or 1) of Gaussian Naive Bayes

# Target range for the min-max scaling. These two names were not defined in any
# visible cell, so the cell raised NameError on a fresh kernel.
# NOTE(review): [0, 1] is assumed as the intended range — confirm.
newmin, newmax = 0, 1

# NOTE(review): min/max are taken over the whole dataset, so each held-out sample
# contributes to its own scaling statistics (unlike the hold-out cell, which uses
# training-split statistics only). Left unchanged here; consider per-fold scaling.
mindata_LOO = dataset.iloc[:, 0:-1].min()
maxdata_LOO = dataset.iloc[:, 0:-1].max()
dataset_normalized = ((dataset.iloc[:, 0:-1] - mindata_LOO) * (newmax - newmin) / (maxdata_LOO - mindata_LOO)) + newmin

# `datalabel` is the one-column 'Grade' frame built in the hold-out cell.
for train_index, test_index in loo.split(dataset_normalized):
    xtrain = dataset_normalized.iloc[train_index]
    xtest = dataset_normalized.iloc[test_index]
    ytrain = datalabel.iloc[train_index]
    ytest = datalabel.iloc[test_index]

    # K-Nearest Neighbors within LOO loop
    kNN_LOO = KNeighborsClassifier(n_neighbors=3, weights='distance')
    kNN_LOO.fit(xtrain, ytrain.values.ravel())
    y_pred_LOO_knn = kNN_LOO.predict(xtest)

    # With a single test sample the accuracy is either 0.0 or 1.0.
    accuracy_LOO_knn = accuracy_score(ytest, y_pred_LOO_knn)
    error_ratio_LOO_knn = 1 - accuracy_LOO_knn
    error_ratios_knn.append(error_ratio_LOO_knn)

    # Gaussian Naïve Bayes within LOO loop
    classifier = GaussianNB()
    classifier.fit(xtrain, ytrain.values.ravel())
    ypred_bayesian = classifier.predict(xtest)

    accuracy_bayesian = accuracy_score(ytest, ypred_bayesian)
    error_ratio_bayesian = 1 - accuracy_bayesian
    error_ratios_bayesian.append(error_ratio_bayesian)

# Mean of the per-sample 0/1 errors = overall leave-one-out error ratio.
avg_error_ratio_LOO_knn = np.mean(error_ratios_knn)
avg_error_ratio_LOO_bayesian = np.mean(error_ratios_bayesian)

print("Rata-rata Error ratio untuk metode K-Nearest Neighbors (LOO): {:.10f}".format(avg_error_ratio_LOO_knn))
print("Rata-rata Akurasi untuk metode K-Nearest Neighbors (LOO): {:.10f}".format(1 - avg_error_ratio_LOO_knn))
print("\nRata-rata Error ratio untuk metode Bayesian Naïve Bayes (LOO): {:.10f}".format(avg_error_ratio_LOO_bayesian))
print("Rata-rata Akurasi untuk metode Bayesian Naïve Bayes (LOO): {:.10f}".format(1 - avg_error_ratio_LOO_bayesian))

Rata-rata Error ratio untuk metode K-Nearest Neighbors (LOO): 0.0018885741
Rata-rata Akurasi untuk metode K-Nearest Neighbors (LOO): 0.9981114259

Rata-rata Error ratio untuk metode Bayesian Naïve Bayes (LOO): 0.0689329556
Rata-rata Akurasi untuk metode Bayesian Naïve Bayes (LOO): 0.9310670444
