In [42]:
# Implementasi dengan KNN

import numpy as np
import pandas as pd
import pickle

# Input files: the first is the training split, the second the validation
# split. Both are read in full into pandas DataFrames.
TRAIN_CSV, VALIDATION_CSV = 'data_train.csv', 'data_validation.csv'

dataset1 = pd.read_csv(TRAIN_CSV)
dataset2 = pd.read_csv(VALIDATION_CSV)

In [43]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    x1, x2 : array-like of numbers
        Coordinates of the two points. Lists, tuples and NumPy arrays are
        accepted; the two inputs must be broadcast-compatible.

    Returns
    -------
    numpy.float64
        Square root of the sum of squared coordinate differences.
    """
    # Cast to float first: plain Python sequences do not support "-", and
    # unsigned / narrow integer arrays would wrap around when subtracted or
    # squared, giving a silently wrong distance.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

class KNN:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Typical use: construct with a positive ``k``, provide training data via
    ``fit`` (or ``datasplitting``), then call ``predict``.
    """

    def __init__(self, k = 0):
        """Create a classifier that votes among the ``k`` nearest samples.

        ``k`` is stored as given (the default of 0 is kept for backward
        compatibility); ``predict`` rejects values below 1.
        """
        self.k = k

    def fit(self, x, y):
        """Remember the training samples ``x`` and their labels ``y``.

        Nothing is computed here; the data is only stored on the instance
        and used later by ``predict``.
        """
        self.x_train = x
        self.y_train = y

    def datasplitting(self, data1, data2):
        """Split two DataFrames into feature and target arrays.

        For both frames the last column is taken as the target and every
        other column as a feature. ``data1`` provides the training part and
        ``data2`` the test part. The four arrays are stored on the instance
        and also returned.

        Returns
        -------
        tuple
            ``(x_train, x_test, y_train, y_test)`` as NumPy arrays.
        """
        self.x_train = data1.iloc[:, :-1].values
        self.x_test = data2.iloc[:, :-1].values
        self.y_train = data1.iloc[:, -1].values
        self.y_test = data2.iloc[:, -1].values
        return self.x_train, self.x_test, self.y_train, self.y_test

    def save_model(self, filename):
        """Pickle this instance (including any stored data) to ``filename``."""
        with open(filename, 'wb') as file:
            pickle.dump(self, file)

    @classmethod
    def load_model(cls, filename):
        """Load and return an object previously written by ``save_model``.

        SECURITY: ``pickle.load`` can execute arbitrary code while
        unpickling. Only load files that come from a trusted source.
        """
        with open(filename, 'rb') as file:
            return pickle.load(file)

    def predict(self, X):
        """Predict a label for every sample in ``X``.

        Parameters
        ----------
        X : array-like
            Samples to classify, one per row. NumPy arrays, nested lists and
            DataFrames are accepted.

        Returns
        -------
        list
            One predicted label per sample, in input order. When several
            labels are tied among the ``k`` neighbours, the label whose
            first occurrence is closest to the sample wins.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or the training set is empty.
        """
        if self.k < 1:
            raise ValueError(
                "k must be a positive integer, got k=%r" % (self.k,)
            )

        # Normalise the inputs to arrays: iterating a DataFrame would yield
        # column names, and a Series would be indexed by label, not position.
        train_x = np.asarray(self.x_train, dtype=float)
        train_y = np.asarray(self.y_train)
        if train_x.shape[0] == 0:
            raise ValueError("cannot predict: the training set is empty")
        # One flat feature vector per training sample.
        train_x = train_x.reshape(train_x.shape[0], -1)

        predictions = []
        for x in np.asarray(X, dtype=float):
            # Distance from this sample to every training sample at once.
            distances = np.sqrt(np.sum((train_x - np.ravel(x)) ** 2, axis=1))

            # Indices of the k closest training samples; a stable sort keeps
            # equal distances in training order so results are reproducible.
            k_indices = np.argsort(distances, kind='stable')[:self.k]

            # Count occurrences without using Counter. Insertion order is
            # nearest-first, which is what breaks ties in max() below.
            label_counts = {}
            for label in train_y[k_indices]:
                label_counts[label] = label_counts.get(label, 0) + 1

            # Majority vote.
            predictions.append(max(label_counts, key=label_counts.get))

        return predictions

In [44]:
# Build a 19-nearest-neighbour classifier from the training frame and predict
# the labels of the validation frame.
knn = KNN(k = 19)
x_train, x_test, y_train, y_test = knn.datasplitting(dataset1, dataset2)
knn.fit(x_train, y_train)
predictions = knn.predict(x_test)

print("Kolom target data validasi:")
print(y_test)
print()
print("Hasil prediksi:")
print(predictions)
print()

# Number of validation rows whose predicted label equals the true label;
# computed once and reused for both the report and the accuracy.
n_correct = np.sum(y_test == predictions)

print("Jumlah kolom target data validasi yang sama dengan hasil prediksi:", n_correct)
print("Jumlah baris total data validasi:", len(y_test))
print()
# Accuracy = matching validation rows / total validation rows.
accuracy = n_correct / len(y_test)

# Print the confusion matrix (rows: true labels, columns: predicted labels).
confusion_matrix = pd.crosstab(y_test, predictions, rownames=['Validation'], colnames=['Predicted'])
print(confusion_matrix)
print()

print("Hasil akurasi dengan KNN sebesar", accuracy)
print()

# Round-trip the model through disk and check that the reloaded copy scores
# the same. The file holds a binary pickle despite its ".txt" extension.
print("Simpan dan load model KNN...")
knn.save_model('knn_model.txt')
# NOTE: the variable keeps its historical name "loaded_nb" so that any later
# cell referring to it keeps working; it holds a KNN model.
loaded_nb = KNN.load_model('knn_model.txt')
print()

predictions = loaded_nb.predict(x_test)
accuracy = np.sum(y_test == predictions) / len(y_test)

print("Hasil akurasi dengan KNN setelah load model sebesar", accuracy)

[1, 2, 3, 0, 2, 1, 2, 0, 3, 2, 3, 2, 3, 0, 3, 0, 2, 1, 0, 2, 3, 2, 0, 1, 2, 0, 3, 1, 0, 3, 1, 3, 3, 0, 2, 3, 1, 3, 2, 1, 1, 2, 0, 2, 2, 1, 1, 2, 2, 3, 1, 2, 3, 0, 1, 3, 2, 3, 3, 2, 2, 3, 3, 1, 3, 2, 3, 2, 3, 3, 2, 3, 1, 0, 1, 2, 0, 2, 1, 0, 3, 3, 0, 2, 3, 1, 3, 3, 0, 2, 1, 1, 1, 2, 2, 1, 3, 2, 0, 3, 3, 3, 1, 2, 3, 1, 3, 3, 3, 3, 3, 3, 2, 1, 3, 1, 0, 2, 1, 3, 1, 2, 3, 3, 2, 0, 2, 2, 1, 3, 3, 1, 0, 0, 3, 0, 0, 2, 3, 0, 1, 3, 3, 1, 2, 3, 1, 2, 1, 2, 3, 0, 0, 2, 1, 1, 2, 0, 1, 3, 3, 3, 0, 2, 3, 0, 0, 1, 1, 2, 2, 1, 0, 1, 0, 1, 3, 1, 2, 0, 3, 1, 1, 2, 0, 2, 0, 3, 2, 0, 3, 2, 1, 0, 2, 0, 1, 3, 3, 1, 1, 2, 2, 3, 2, 3, 3, 3, 0, 2, 1, 2, 3, 1, 1, 2, 2, 0, 0, 2, 2, 1, 2, 3, 1, 2, 0, 3, 0, 2, 2, 2, 2, 2, 3, 0, 1, 3, 3, 3, 0, 0, 0, 0, 3, 3, 1, 1, 0, 2, 2, 1, 3, 3, 0, 2, 2, 0, 0, 1, 0, 2, 2, 3, 3, 0, 1, 3, 3, 0, 0, 2, 1, 1, 1, 1, 3, 2, 2, 2, 0, 1, 0, 2, 1, 1, 0, 1, 3, 1, 3, 0, 1, 0, 1, 3, 3, 3, 0, 3, 1, 0, 0, 1, 2, 2, 0, 3, 1, 1, 0, 2, 3, 1, 3, 2, 3, 1, 2, 3, 0, 0, 3, 1, 2, 1, 0, 1, 1, 0, 3, 2, 3, 