In [32]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np

dataset = pd.read_csv('/Users/farrelmanazilin/Downloads/data/milk.csv')

['high' 'low' 'medium']


# Hold-out Method (70%-30%)

In [36]:
label_encoder = LabelEncoder()
dataset['Grade'] = label_encoder.fit_transform(dataset['Grade'])

datalabel = np.array(dataset)[:,-1]
data = np.array(dataset)[:,:-1]

xtrain, xtest, ytrain, ytest = train_test_split(data, datalabel, test_size=0.30, random_state=100)

classifier = GaussianNB()
classifier.fit(xtrain, ytrain)
ypred = classifier.predict(xtest)

accuracy = accuracy_score(ytest, ypred)
error_ratio_holdout = 1 - accuracy

print("Akurasi tanpa menggunakan normalisasi metode Hold-out (70-30%): {:.10f}".format(accuracy))
print("Error ratio tanpa menggunakan normalisasi metode Hold-out (70-30%): {:.10f}".format(error_ratio_holdout))

Akurasi tanpa menggunakan normalisasi metode Hold-out (70-30%): 0.9182389937
Error ratio tanpa menggunakan normalisasi metode Hold-out (70-30%): 0.0817610063


In [4]:
min_train = xtrain.min()
max_train = xtrain.max()
newmin = 0
newmax = 1

xtrain_normalized = ((xtrain - min_train)*(newmax-newmin) / (max_train - min_train))+newmin
xtest_normalized = ((xtest - min_train)*(newmax-newmin) / (max_train - min_train))+newmin

classifier = GaussianNB()
classifier.fit(xtrain_normalized, ytrain)
ypred = classifier.predict(xtest_normalized)

accuracy = accuracy_score(ytest, ypred)
error_ratio_holdout = 1 - accuracy

print("Akurasi untuk metode Hold-out (70-30%): {:.10f}".format(accuracy))
print("Error ratio untuk metode Hold-out (70-30%): {:.10f}".format(error_ratio_holdout))

Akurasi untuk metode Hold-out (70-30%): 0.9182389937
Error ratio untuk metode Hold-out (70-30%): 0.0817610063


# kfold 10

In [24]:
kf = KFold(n_splits=10, random_state=0, shuffle=True)
p = 0
error_ratios = []
error_ratios_normalized = []

for train_index, test_index in kf.split(dataset):
    p = p + 1
    xtrain = dataset.iloc[train_index, :-1]  
    xtest = dataset.iloc[test_index, :-1]    
    ytrain = dataset.iloc[train_index, -1]    
    ytest = dataset.iloc[test_index, -1]      

    # Gaussian Naïve Bayes within K-Fold loop (Non-normalized data)
    classifier = GaussianNB()
    classifier.fit(xtrain, ytrain)
    ypred = classifier.predict(xtest)

    accuracy = accuracy_score(ytest, ypred)
    error_ratio = 1 - accuracy
    error_ratios.append(error_ratio)

    # Min-max scaling for K-Nearest Neighbors within K-Fold loop
    scaler = MinMaxScaler()
    xtrain_normalized = scaler.fit_transform(xtrain)
    xtest_normalized = scaler.transform(xtest)

    # K-Nearest Neighbors within K-Fold loop (Normalized data)
    classifier_normalized = GaussianNB()
    classifier_normalized.fit(xtrain_normalized, ytrain)
    ypred_normalized = classifier_normalized.predict(xtest_normalized)

    accuracy_normalized = accuracy_score(ytest, ypred_normalized)
    error_ratio_normalized = 1 - accuracy_normalized
    error_ratios_normalized.append(error_ratio_normalized)
    

avg_error_ratio = np.mean(error_ratios)
avg_error_ratio_normalized = np.mean(error_ratios_normalized)

print("Rata-rata Error ratio tanpa menggunakan normalisasi metode Gaussian Naïve Bayes (KFOLD, k=10): {:.10f}".format(avg_error_ratio))
print("Rata-rata Error akurasi tanpa menggunakan normalisasi metode Gaussian Naïve Bayes (KFOLD, k=10): {:.10f}".format(1 - avg_error_ratio))
print("Hasil error Kfold tanpa menggunakan normalisasi Gaussian Naïve Bayes\n", error_ratios)

print("\nRata-rata Error ratio untuk metode Gaussian Naïve Bayes (KFOLD, k=10) dengan Normalisasi: {:.10f}".format(avg_error_ratio_normalized))
print("Rata-rata Error akurasi untuk metode Gaussian Naïve Bayes (KFOLD, k=10) dengan Normalisasi: {:.10f}".format(1 - avg_error_ratio_normalized))
print("Hasil error Kfold untuk K-Nearest Neighbors dengan Normalisasi\n", error_ratios_normalized)

Rata-rata Error ratio tanpa menggunakan normalisasi metode Gaussian Naïve Bayes (KFOLD, k=10): 0.0811680144
Rata-rata Error akurasi tanpa menggunakan normalisasi metode Gaussian Naïve Bayes (KFOLD, k=10): 0.9188319856
Hasil error Kfold tanpa menggunakan normalisasi Gaussian Naïve Bayes
 [0.23584905660377353, 0.09433962264150941, 0.07547169811320753, 0.08490566037735847, 0.09433962264150941, 0.028301886792452824, 0.09433962264150941, 0.037735849056603765, 0.028301886792452824, 0.03809523809523807]

Rata-rata Error ratio untuk metode Gaussian Naïve Bayes (KFOLD, k=10) dengan Normalisasi: 0.0811680144
Rata-rata Error akurasi untuk metode Gaussian Naïve Bayes (KFOLD, k=10) dengan Normalisasi: 0.9188319856
Hasil error Kfold untuk K-Nearest Neighbors dengan Normalisasi
 [0.23584905660377353, 0.09433962264150941, 0.07547169811320753, 0.08490566037735847, 0.09433962264150941, 0.028301886792452824, 0.09433962264150941, 0.037735849056603765, 0.028301886792452824, 0.03809523809523807]


# LOO

In [43]:
loo = LeaveOneOut()
error_ratios_knn = []
error_ratios_bayesian = []


for train_index, test_index in loo.split(dataset):
    xtrain = dataset.iloc[train_index]
    xtest = dataset.iloc[test_index]
    ytrain = datalabel[train_index]
    ytest = datalabel[test_index]

    # Gaussian Naïve Bayes within LOO loop
    classifier = GaussianNB()
    classifier.fit(xtrain, ytrain.ravel())
    ypred = classifier.predict(xtest)

    accuracy = accuracy_score(ytest, ypred)
    error_ratio = 1 - accuracy
    error_ratios.append(error_ratio)
    
    # Min-max scaling for K-Nearest Neighbors within K-Fold loop
    scaler = MinMaxScaler()
    xtrain_normalized = scaler.fit_transform(xtrain)
    xtest_normalized = scaler.transform(xtest)

    # K-Nearest Neighbors within K-Fold loop (Normalized data)
    classifier_normalized = GaussianNB()
    classifier_normalized.fit(xtrain_normalized, ytrain)
    ypred_normalized = classifier_normalized.predict(xtest_normalized)

    accuracy_normalized = accuracy_score(ytest, ypred_normalized)
    error_ratio_normalized = 1 - accuracy_normalized
    error_ratios_bayesian.append(error_ratio_normalized)

avg_error_ratio_LOO = np.mean(error_ratios)
avg_error_ratio_LOO_bayesian = np.mean(error_ratios_bayesian)

print("Rata-rata Error ratio untuk metode Bayesian Naïve Bayes (LOO): {:.10f}".format(avg_error_ratio_LOO_bayesian))
print("Rata-rata Akurasi untuk metode Bayesian Naïve Bayes (LOO): {:.10f}".format(1 - avg_error_ratio_LOO_bayesian))

print("\nRata-rata Error ratio tanpa normalisasi menggunakan metode Bayesian Naïve Bayes (LOO): {:.10f}".format(avg_error_ratio_LOO))
print("Rata-rata Akurasi tanpa menggunakan normalisasi metode Bayesian Naïve Bayes (LOO): {:.10f}".format(1 - avg_error_ratio_LOO))

Rata-rata Error ratio untuk metode Bayesian Naïve Bayes (LOO): 0.0009442871
Rata-rata Akurasi untuk metode Bayesian Naïve Bayes (LOO): 0.9990557129

Rata-rata Error ratio tanpa normalisasi menggunakan metode Bayesian Naïve Bayes (LOO): 0.0394465418
Rata-rata Akurasi tanpa menggunakan normalisasi metode Bayesian Naïve Bayes (LOO): 0.9605534582


In [46]:
datalabel = dataset.loc[:, ['Grade']]

# Create a Leave-One-Out cross-validator
loo = LeaveOneOut()

# Initialize a list to store accuracy scores
accuracy_scores_normalisasi = []
accuracy_scores_tanpanormalisasi = []

for train_index, test_index in loo.split(dataset):
    xtrain = dataset.iloc[train_index, :-1]  
    xtest = dataset.iloc[test_index, :-1]    
    ytrain = datalabel.iloc[train_index]      
    ytest = datalabel.iloc[test_index]        
    
    classifier = GaussianNB()
    classifier.fit(xtrain, ytrain.values.ravel())
    ypred = classifier.predict(xtest)
    accuracytn = accuracy_score(ytest, ypred)
    accuracy_scores_tanpanormalisasi.append(accuracytn)
    
    train_data = xtrain
    test_data = xtest
    newmin = 0
    newmax = 1
    mindata = train_data.min()
    maxdata = train_data.max()
    train_data = ((train_data - mindata) * (newmax - newmin) / (maxdata - mindata)) + newmin
    test_data = ((test_data - mindata) * (newmax - newmin) / (maxdata - mindata)) + newmin

    classifier = GaussianNB()
    classifier.fit(train_data, ytrain.values.ravel()) 
    ypred = classifier.predict(test_data)

    accuracyn = accuracy_score(ytest, ypred)
    accuracy_scores_normalisasi.append(accuracyn)


average_accuracy_tn = sum(accuracy_scores_tn) / len(accuracy_scores_tn)
average_accuracy_n = sum(accuracy_scores_n) / len(accuracy_scores_n)

print("Rata-rata Error ratio untuk metode Bayesian Naïve Bayes (LOO): {:.10f}".format(average_accuracy_n))
print("Rata-rata Akurasi untuk metode Bayesian Naïve Bayes (LOO): {:.10f}".format(1 - average_accuracy_tn))

print("\nRata-rata Error ratio tanpa normalisasi menggunakan metode Bayesian Naïve Bayes (LOO): {:.10f}".format(average_accuracy_tn))
print("Rata-rata Akurasi tanpa menggunakan normalisasi metode Bayesian Naïve Bayes (LOO): {:.10f}".format(1 - average_accuracy_tn))

Rata-rata Error ratio untuk metode Bayesian Naïve Bayes (LOO): 0.9310670444
Rata-rata Akurasi untuk metode Bayesian Naïve Bayes (LOO): 0.0689329556

Rata-rata Error ratio tanpa normalisasi menggunakan metode Bayesian Naïve Bayes (LOO): 0.9310670444
Rata-rata Akurasi tanpa menggunakan normalisasi metode Bayesian Naïve Bayes (LOO): 0.0689329556
