In [None]:
# Cell 1: Importera alla bibliotek vi behöver

import numpy as np
import pandas as pd

from IPython.display import display

from sklearn.model_selection import train_test_split      # Dela upp data i train/test
from sklearn.preprocessing import MinMaxScaler            # Normalisering till [0,1]
from sklearn.neighbors import KNeighborsClassifier        # k-Nearest Neighbors (k-NN)
from sklearn.metrics import confusion_matrix, accuracy_score  # Confusion matrix + accuracy



In [None]:
# Cell 2: Load student-mat and build X (features) and y (three grade classes)

# Read the CSV file (mathematics students); fields are separated by ";"
df = pd.read_csv("student-mat.csv", sep=";")

# Select every numeric column
numeric_cols = df.select_dtypes(include=[np.number]).columns

# X = all numeric columns EXCEPT G3 (the final grade, which is the target)
X = df[numeric_cols].drop(columns=["G3"])

# y = G3 converted into three classes (each bin includes its right edge):
# 0–9   -> "low"
# 10–14 -> "medium"
# 15–20 -> "high"
bins = [-1, 9, 14, 20]                # bin edges
labels = ["low", "medium", "high"]    # class names, one per bin
y = pd.cut(df["G3"], bins=bins, labels=labels)

# pd.cut silently yields NaN for missing grades and for grades outside the
# bins. Fail here with a clear message instead of letting NaN labels reach
# the stratified split and the metrics in the later cells.
assert y.notna().all(), "G3 innehåller saknade värden eller värden utanför 0–20"

print("Fördelning av klasser (y):")
print(y.value_counts())
print("\nForm på X (rader, kolumner):", X.shape)


In [None]:
# Cell 3: Choose which k values and which train/test splits to evaluate

# Three odd k values between 3 and 15 (required by the assignment)
k_values = [3, 7, 11]

# Three train/test splits, stored as (test_size, description) pairs.
# The first tuple holds the test fractions, the second the matching labels.
splits = list(zip(
    (0.10, 1/3, 0.50),
    ("train=90%, test=10%", "train=2/3, test=1/3", "train=50%, test=50%"),
))

# Every run is recorded here (k, data variant, split, accuracy)
results = list()



In [None]:
# Cell 4: Run all 18 experiments (3 k values * 3 splits * 2 data variants)


def _fit_and_evaluate(X_fit, X_eval, y_fit, y_eval, n_neighbors):
    """Train a k-NN classifier and score it on the evaluation set.

    Parameters:
        X_fit, y_fit: features and class labels used for training.
        X_eval, y_eval: features and true class labels used for evaluation.
        n_neighbors: the k passed to KNeighborsClassifier.

    Returns:
        A tuple (model, predictions, confusion_matrix, accuracy). Rows and
        columns of the confusion matrix follow the order of the global
        `labels` list.
    """
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    matrix = confusion_matrix(y_eval, predictions, labels=labels)
    accuracy = accuracy_score(y_eval, predictions)
    return model, predictions, matrix, accuracy


def _print_run(data_typ, n_neighbors, split_text, matrix, accuracy):
    """Print the confusion matrix and accuracy of a single run."""
    print("----------------------------------------------------")
    print(f"DATA: {data_typ} | k = {n_neighbors} | {split_text}")
    print("Confusion matrix (rader = sant, kolumner = prediktion):")
    print(matrix)
    print(f"Accuracy: {accuracy:.4f}")


# Start from an empty list so that re-running this cell replaces the previous
# runs instead of appending another 18 rows to them.
results = []

for k in k_values:
    for test_size, split_text in splits:
        # ------------------------------------------------
        # Step 1: Split the data into train and test (ONCE per run pair)
        # ------------------------------------------------
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=test_size,
            stratify=y,
            random_state=42   # fixed seed: same partition every time
        )

        # ============================
        # A) ORIGINAL DATA (unscaled)
        # ============================
        knn, y_pred, cm, acc = _fit_and_evaluate(
            X_train, X_test, y_train, y_test, k
        )
        _print_run("original", k, split_text, cm, acc)

        # Store the summary only; the matrix is recomputed later for the top 3
        results.append({
            "k": k,
            "data_typ": "original",
            "split_text": split_text,
            "test_size": test_size,
            "accuracy": acc
        })

        # =======================================
        # B) NORMALISED DATA [0,1] (MinMax)
        # =======================================

        # The scaler is fitted on the training part only and then applied
        # to the test part.
        scaler = MinMaxScaler(feature_range=(0, 1))
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        knn_scaled, y_pred_scaled, cm_scaled, acc_scaled = _fit_and_evaluate(
            X_train_scaled, X_test_scaled, y_train, y_test, k
        )
        _print_run("normaliserat [0,1]", k, split_text, cm_scaled, acc_scaled)

        results.append({
            "k": k,
            "data_typ": "normaliserat [0,1]",
            "split_text": split_text,
            "test_size": test_size,
            "accuracy": acc_scaled
        })


In [None]:
# Cell 5: Collect all 18 runs into one table (as in Figure A2)

# Each dict in `results` becomes one row; its keys become the column names.
res_df = pd.DataFrame(data=results)

print("Alla 18 körningar (k, data-typ, split, accuracy):")
display(res_df)


In [None]:
# Cell 6: Find the three best runs (highest accuracy)

# Several runs can end up with exactly the same accuracy. A stable sort keeps
# tied rows in their original run order, so which runs make the top 3 is well
# defined; the default sort algorithm gives no such guarantee.
res_sorted = res_df.sort_values(by="accuracy", ascending=False, kind="stable")
top3 = res_sorted.head(3)

print("Tre bästa körningar (sorterat efter accuracy):")
display(top3)



In [None]:
# Cell 7: For the three best runs, recompute the confusion matrix and accuracy

print("Confusion matrix + accuracy för de tre bästa körningarna:\n")

# itertuples() hands back each column value with its own type, whereas
# iterrows() packs a whole row into one Series with a single common dtype.
# k is additionally cast to int because n_neighbors must be an integer.
for i, row in enumerate(top3.itertuples(index=False)):
    k_best = int(row.k)
    data_typ_best = row.data_typ
    test_size_best = row.test_size
    split_text_best = row.split_text

    print("========================================")
    print(f"Körning {i+1}:")
    print(f"k = {k_best}")
    print(f"Data: {data_typ_best}")
    print(f"Split: {split_text_best}")

    # 1. Rebuild the same train/test split (same arguments and seed as the
    #    experiment loop)
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size_best,
        stratify=y,
        random_state=42
    )

    # 2. Normalise only if the winning run used normalised data; the scaler
    #    is fitted on the training part only.
    if data_typ_best == "normaliserat [0,1]":
        scaler = MinMaxScaler(feature_range=(0, 1))
        X_train_used = scaler.fit_transform(X_train)
        X_test_used = scaler.transform(X_test)
    else:
        X_train_used = X_train
        X_test_used = X_test

    # 3. Create and train k-NN with the winning k
    knn_best = KNeighborsClassifier(n_neighbors=k_best)
    knn_best.fit(X_train_used, y_train)

    # 4. Predict and evaluate; matrix rows/columns follow the order of `labels`
    y_pred_best = knn_best.predict(X_test_used)
    cm_best = confusion_matrix(y_test, y_pred_best, labels=labels)
    acc_best = accuracy_score(y_test, y_pred_best)

    print("Confusion matrix (rader = sant, kolumner = prediktion):")
    print(cm_best)
    print(f"Accuracy: {acc_best:.4f}")
