## Data Cleaning

In [7]:
import pandas as pd

def clean_data(file_path):
    """Read a comma-delimited CSV and normalise it to the notebook's fixed schema.

    The columns are renamed positionally -- for every file, not only the
    testing file -- to ``'No.'``, ``'fitur1'`` ... ``'fitur7'``, ``'Kelas'``.
    In the seven feature columns decimal commas become dots, semicolons are
    removed, and the values are cast to ``float``. The same comma/semicolon
    clean-up is applied to the ``'Kelas'`` labels, which remain text; missing
    labels stay missing instead of turning into the string ``'nan'``.
    The ``'No.'`` column is left exactly as read.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the nine columns named above.

    Raises
    ------
    ValueError
        If the file does not contain exactly nine columns, or if a feature
        value cannot be converted to ``float`` after the clean-up.
    """
    column_names = ['No.', 'fitur1', 'fitur2', 'fitur3', 'fitur4', 'fitur5', 'fitur6', 'fitur7', 'Kelas']
    feature_columns = column_names[1:-1]
    label_column = column_names[-1]

    df = pd.read_csv(file_path, delimiter=",")

    # Fail with a readable message rather than pandas' generic length-mismatch error.
    if df.shape[1] != len(column_names):
        raise ValueError(
            f"{file_path!r}: expected {len(column_names)} columns, found {df.shape[1]}"
        )
    df.columns = column_names

    def _normalise_text(values):
        # regex=False: ',' and ';' are literal characters here, not patterns.
        return (
            values.astype(str)
            .str.replace(',', '.', regex=False)
            .str.replace(';', '', regex=False)
        )

    # Feature columns: decimal comma -> dot, then cast to float.
    for column in feature_columns:
        df[column] = _normalise_text(df[column]).astype(float)

    # Label column: same text clean-up, but keep missing labels as missing so
    # that a literal 'nan' string never shows up as a class.
    raw_labels = df[label_column]
    df[label_column] = _normalise_text(raw_labels).where(raw_labels.notna(), raw_labels)

    return df

# Locations of the training and testing CSV files
training_file_path = 'Data-AnalisisPenyakitDarah - modellingtraining.csv'
testing_file_path = 'Data-AnalisisPenyakitDarah - testing.csv'

# Run both files through the same cleaning routine
cleaned_training_data = clean_data(training_file_path)
cleaned_testing_data = clean_data(testing_file_path)

# Preview the first five cleaned rows of each set before moving on;
# each call prints the heading and the frame on separate lines.
print("Data Training yang telah dibersihkan:", cleaned_training_data.head(), sep="\n")
print("\nData Testing yang telah dibersihkan:", cleaned_testing_data.head(), sep="\n")

Data Training yang telah dibersihkan:
      No.  fitur1  fitur2  fitur3  fitur4  fitur5  fitur6  fitur7 Kelas
0    E541    3.78     9.0    29.4    77.7    23.8    30.7    21.1   BTT
1    D272    5.94    10.3    37.5    63.1    17.4    27.5    15.9   BTT
2    F728    5.93    10.8    34.4    58.0    18.2    31.3    16.0   BTT
3  N1,848    5.29    11.0    37.6    71.0    20.7    29.2    16.9   BTT
4   O2320    5.79    11.0    37.0    63.9    19.0    29.7    16.6   BTT

Data Testing yang telah dibersihkan:
      No.  fitur1  fitur2  fitur3  fitur4  fitur5  fitur6  fitur7 Kelas
0    D290    5.20     9.8    31.4    60.4    18.8    31.2    16.8   BTT
1  K1,164    5.26    10.9    36.4    69.2    20.8    30.1    15.5   BTT
2    F744    5.55    11.2    36.8    66.3    20.2    30.5    16.5   BTT
3    F678    6.61    11.7    38.1    57.7    17.7    30.7    16.9   BTT
4  L1,341    4.92    11.9    37.5    76.1    24.1    31.7    16.9   BTT


In [8]:
# Split each cleaned frame into class labels ('Kelas') and the feature matrix;
# the 'No.' column is dropped together with the label and is not used as a feature.
y_train = cleaned_training_data['Kelas']
X_train = cleaned_training_data.drop(['No.', 'Kelas'], axis=1)

y_test = cleaned_testing_data['Kelas']
X_test = cleaned_testing_data.drop(['No.', 'Kelas'], axis=1)

print(X_test)

    fitur1  fitur2  fitur3  fitur4  fitur5  fitur6  fitur7
0     5.20     9.8    31.4    60.4    18.8    31.2    16.8
1     5.26    10.9    36.4    69.2    20.8    30.1    15.5
2     5.55    11.2    36.8    66.3    20.2    30.5    16.5
3     6.61    11.7    38.1    57.7    17.7    30.7    16.9
4     4.92    11.9    37.5    76.1    24.1    31.7    16.9
..     ...     ...     ...     ...     ...     ...     ...
58    5.36    11.3    37.5    70.0    21.1    30.2    17.5
59    4.99    11.6    38.9    77.8    23.3    29.9    17.4
60    4.76    12.3    38.6    81.2    25.9    31.9    14.4
61    5.38    12.9    39.6    73.7    23.9    32.5    15.1
62    5.24    13.9    41.9    80.0    26.5    33.1    14.9

[63 rows x 7 columns]


## Model Training

In [13]:
import numpy as np
import pandas as pd

def minkowski(a, b, p):
    """Return the Minkowski distance of order ``p`` between two points.

    Computes ``(sum_i |a_i - b_i| ** p) ** (1 / p)``. ``p=1`` is the Manhattan
    distance and ``p=2`` the Euclidean distance. ``p=np.inf`` returns the
    limiting case ``max_i |a_i - b_i|`` (Chebyshev distance), which the plain
    formula cannot evaluate.

    Parameters
    ----------
    a, b : array-like
        Coordinates of the two points. They are converted with ``np.asarray``
        and compared element by element in positional order.
    p : int or float
        Order of the distance; must be greater than zero.

    Returns
    -------
    numpy scalar
        The distance between ``a`` and ``b``.

    Raises
    ------
    ValueError
        If ``p`` is not greater than zero (this also rejects NaN).
    """
    if not p > 0:
        raise ValueError(f"p must be greater than 0, got {p!r}")

    diff = np.abs(np.asarray(a) - np.asarray(b))
    if np.isinf(p):
        # initial=0 keeps the result defined (0) for empty inputs, matching
        # what the summation branch returns for them.
        return np.max(diff, initial=0)
    return np.power(np.sum(np.power(diff, p)), 1 / p)

class KNN:
    """k-nearest-neighbours classifier based on the Minkowski distance.

    Parameters
    ----------
    p : int or float, default 2
        Order of the Minkowski distance (1 = Manhattan, 2 = Euclidean,
        ``np.inf`` = Chebyshev). Must be greater than zero.
    k : int, default 11
        Number of nearest training rows that vote on each prediction.
        Must be at least 1.

    Raises
    ------
    ValueError
        If ``p`` is not greater than zero or ``k`` is smaller than 1.
    """

    def __init__(self, p=2, k=11):
        if not p > 0:
            raise ValueError(f"p must be greater than 0, got {p!r}")
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k!r}")
        self.p = p
        self.k = k
        # Both stay None until fit() is called.
        self.X_train = None
        self.y_train = None

    def fit(self, X_train, y_train):
        """Store the training data; no computation happens here.

        Parameters
        ----------
        X_train : pandas.DataFrame
            Numeric feature columns, one row per training sample.
        y_train : pandas.Series
            Class labels. Neighbours are looked up by index label, so it must
            carry the same index labels as ``X_train``.
        """
        self.X_train = X_train
        self.y_train = y_train

    @staticmethod
    def _distances_to_all(train_matrix, test_point, p):
        """Minkowski distance of order ``p`` from ``test_point`` to every row of ``train_matrix``."""
        diff = np.abs(train_matrix - test_point)
        if np.isinf(p):
            return diff.max(axis=1)
        return np.power(np.sum(np.power(diff, p), axis=1), 1 / p)

    def predict(self, X_test):
        """Predict a class for every row of ``X_test`` by majority vote.

        For each test row the ``k`` training rows with the smallest distance
        are selected with ``Series.nsmallest`` and the predicted class is
        ``value_counts().idxmax()`` over their labels.

        Parameters
        ----------
        X_test : pandas.DataFrame
            Rows to classify; columns must be in the same order as in the
            training features.

        Returns
        -------
        list
            One predicted label per row of ``X_test``, in row order.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``X_test`` does not have the same number of columns as the
            training features.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNN.predict() called before fit()")

        # Convert once, outside the loop; row-contiguous so each per-row
        # reduction runs over adjacent memory.
        train_matrix = np.ascontiguousarray(self.X_train.to_numpy(dtype=float))
        if X_test.shape[1] != train_matrix.shape[1]:
            # Without this check a one-column X_test would silently broadcast.
            raise ValueError(
                f"X_test has {X_test.shape[1]} columns but the training data has "
                f"{train_matrix.shape[1]}"
            )
        train_index = self.X_train.index

        y_predicted = []
        for test_row in X_test.itertuples(index=False, name=None):
            # One vectorised computation per test row instead of a Python
            # call per training row.
            distances = pd.Series(
                self._distances_to_all(train_matrix, np.array(test_row, dtype=float), self.p),
                index=train_index,
            )
            nearest_labels = distances.nsmallest(self.k).index
            # Explicit label-based lookup of the neighbours' classes.
            classes = self.y_train.loc[nearest_labels]
            y_predicted.append(classes.value_counts().idxmax())
        return y_predicted

In [10]:
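# NOTE: early prototype kept for reference only; the same steps now live in the KNN class defined above.
# # Distance function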
# import numpy as np

# def minkowski(a, b, p) :
#     return np.power(np.sum(np.power(np.abs(a-b), p)), 1/p)

# # Reference row (first row)
# test_rows = [X_test.iloc[i] for i in range(len(X_test))] 

# # Minkowski distance parameter
# p = 2  # Example for Euclidean distance
# k = 11
# y_predicted = []
# for test_row in test_rows : 
#     distances = X_train.apply(lambda row: minkowski(row.values, test_row.values, p), axis=1)
#     smallest_distances  = distances.nsmallest(k)
#     indices_distances = smallest_distances.index
#     classes = y_train[indices_distances]
    
#     majority_class = classes.value_counts().idxmax()
#     # print(classes)
#     # print(majority_class)
#     y_predicted.append(majority_class)
#     # print(hasil)

# print(y_predicted)

# # print("Distances from the reference row:\n", distances)
# # print(reference_rows)
# # class KNN :
#     #  self.__init(dist)



## KNN dengan Manhattan Distances
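Diperoleh dengan menetapkan $p = 1$ pada jarak Minkowski $d(a, b) = \left(\sum_i |a_i - b_i|^p\right)^{1/p}$, sehingga $d(a, b) = \sum_i |a_i - b_i|$.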

In [11]:
# Manhattan-distance model: Minkowski order p=1 with k=11 voting neighbours.
# X_train / X_test are expected to be pandas DataFrames and y_train a pandas Series.
knn_mh = KNN(k=11, p=1)
knn_mh.fit(X_train, y_train)
y_predicted = knn_mh.predict(X_test)

print(y_predicted)

['DB', 'DB', 'BTT', 'BTT', 'DB', 'HbE', 'BTT', 'HbE', 'DB', 'HbE', 'HbE', 'HbE', 'DB', 'HbE', 'DB', 'HbE', 'HbE', 'HbE', 'HbE', 'HbE', 'HbE', 'HbE', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'HbE', 'DB', 'DB', 'DB', 'DB', 'HbE', 'HbE', 'HbE']


## KNN dengan Euclidean Distances 
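Diperoleh dengan menetapkan $p = 2$ pada jarak Minkowski $d(a, b) = \left(\sum_i |a_i - b_i|^p\right)^{1/p}$, sehingga $d(a, b) = \sqrt{\sum_i (a_i - b_i)^2}$.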

In [12]:
# Euclidean-distance model: Minkowski order p=2 with k=11 voting neighbours.
# X_train / X_test are expected to be pandas DataFrames and y_train a pandas Series.
# NOTE(review): the variable is still named `knn_mh` although p=2 is Euclidean, and this
# cell rebinds both `knn_mh` and `y_predicted`; the names are kept unchanged because
# other cells may rely on them -- consider a distinct name once that is confirmed.
knn_mh = KNN(k=11, p=2)
knn_mh.fit(X_train, y_train)
y_predicted = knn_mh.predict(X_test)

print(y_predicted)

['DB', 'DB', 'BTT', 'DB', 'DB', 'HbE', 'BTT', 'HbE', 'DB', 'HbE', 'HbE', 'HbE', 'HbE', 'HbE', 'DB', 'HbE', 'HbE', 'HbE', 'HbE', 'HbE', 'HbE', 'HbE', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'DB', 'HbE', 'DB', 'DB', 'DB', 'HbE', 'DB', 'DB', 'DB', 'DB', 'DB', 'HbE', 'HbE']
