In [None]:
import pandas as pd

# Load the dataset from disk; the "decision" column is the label to predict
# and every remaining column is used as a feature.
df = pd.read_csv("ml-dataset.csv")
y = df["decision"]
X = df.drop(columns="decision")

# Show how many rows fall into each class before any modelling.
class_counts = y.value_counts()
print(class_counts)

In [None]:
from sklearn.model_selection import train_test_split

# Single hold-out split. Stratifying on y keeps the class proportions the
# same in both parts, and the fixed seed makes the split repeatable.
holdout_parts = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = holdout_parts

# Sanity check: (rows, columns) of the train and test feature frames.
print(X_train.shape, X_test.shape)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, confusion_matrix

# Hold-out evaluation of a 3-nearest-neighbours classifier.
# fit() returns the estimator itself, so construction and fitting can be chained.
clf = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Macro averaging weights every class equally, regardless of its frequency.
macro_f1 = f1_score(y_test, y_pred, average="macro")
print(macro_f1)

# Left as the last expression so the notebook displays the matrix.
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hold-out evaluation of a decision tree.
# The tree is seeded explicitly: scikit-learn randomly permutes the candidate
# features at every split, so when several splits tie on the criterion an
# unseeded tree can differ from run to run, changing the score and the
# confusion matrix below. A fixed random_state makes this cell reproducible
# under Restart & Run All.
clf = DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Macro-averaged F1 on the hold-out set (every class weighted equally).
print(f1_score(y_test, y_pred, average="macro"))

# Left as the last expression so the notebook displays the matrix.
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Hold-out evaluation of a Gaussian naive Bayes classifier.
# fit() returns the estimator itself, so construction and fitting can be chained.
clf = GaussianNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Macro averaging weights every class equally, regardless of its frequency.
macro_f1 = f1_score(y_test, y_pred, average="macro")
print(macro_f1)

# Left as the last expression so the notebook displays the matrix.
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Compare the three classifiers with 5-fold stratified cross-validation,
# scoring each fold by macro-averaged F1.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Factories rather than instances, so every fold trains a fresh, unfitted model.
# The decision tree is seeded: scikit-learn permutes features at each split,
# so an unseeded tree can give different scores on every run.
model_factories = {
    "KNN": lambda: KNeighborsClassifier(n_neighbors=3),
    "DT": lambda: DecisionTreeClassifier(random_state=42),
    "NB": lambda: GaussianNB(),
}
fold_scores = {label: [] for label in model_factories}

for train_index, test_index in skf.split(X, y):
    # Fold-local names on purpose: reusing X_train / X_test / y_train / y_test
    # (or clf / y_pred) here would overwrite the hold-out split and results
    # with last-fold data, so any earlier cell re-run afterwards would
    # silently evaluate on a CV fold instead of the hold-out set.
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    for label, make_model in model_factories.items():
        fold_model = make_model().fit(X_fold_train, y_fold_train)
        fold_pred = fold_model.predict(X_fold_test)
        fold_scores[label].append(
            f1_score(y_fold_test, fold_pred, average="macro")
        )

# Keep the per-model score lists available under their original names.
knn, dt, nb = fold_scores["KNN"], fold_scores["DT"], fold_scores["NB"]

# One line per model: the five fold scores followed by their mean.
for label, scores in fold_scores.items():
    print(f"{label}:", " ".join(map(str, scores)), sum(scores) / len(scores))