In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import scatter_matrix
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline                                      


In [None]:
# Load the dataset; the file uses ';' as its field delimiter.
df = pd.read_csv("student-por.csv", delimiter=";")

In [None]:
# Summary statistics; by default describe() reports on the numeric columns only.
df.describe()

In [None]:
# Select all numeric columns. The string selector "number" is used instead of
# np.number so this cell does not depend on numpy being imported.
numeric_cols = df.select_dtypes(include="number").columns
numeric_cols


In [None]:
# Feature matrix: every numeric column except the target grade G3.
X = df.loc[:, numeric_cols].drop(columns="G3")

In [None]:
# Grade bands for the final grade G3. pd.cut builds right-closed intervals by
# default, so these edges give (-1, 9] -> low, (9, 14] -> medium, (14, 20] -> high.
labels = ["low", "medium", "high"]
bins = [-1, 9, 14, 20]

y = pd.cut(df["G3"], bins, labels=labels)

# Class distribution of the target.
y.value_counts()


In [None]:
def kör_knn(X, y, k, test_size, normalisera=False, random_state=42):
    """
    Run ONE k-nearest-neighbours experiment and score it on a held-out test set.

    Steps: stratified train/test split, optional min-max scaling to [0, 1],
    fit a kNN classifier on the training part, predict the test part.

    Parameters
    ----------
    X : features, passed unchanged to ``train_test_split``.
    y : class labels, also used as the stratification key. May be a pandas
        categorical (its category order is used for the confusion matrix) or
        any other label container (sorted unique labels are used instead).
    k : number of neighbours for ``KNeighborsClassifier``.
    test_size : forwarded to ``train_test_split``.
    normalisera : if True, scale the features with ``MinMaxScaler``.
    random_state : seed forwarded to ``train_test_split``.

    Returns
    -------
    (cm, acc) : the confusion matrix and the accuracy on the test split.
    """
    # Split the data; stratify=y keeps the class distribution in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y,
        random_state=random_state,
    )

    # Optional normalisation to [0, 1]. The scaler is fitted on the training
    # part only and then applied to the test part.
    if normalisera:
        scaler = MinMaxScaler(feature_range=(0, 1))
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # Build and train the kNN model, then predict the test part.
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    # Class order for the confusion matrix is taken from the FULL label set so
    # the matrix has the same layout for every split. Reading the categories
    # from the dtype (instead of y.cat) also works when y is not a Series, and
    # non-categorical labels fall back to their sorted unique values.
    y_dtype = getattr(y, "dtype", None)
    if isinstance(y_dtype, pd.CategoricalDtype):
        class_labels = y_dtype.categories
    else:
        class_labels = np.unique(y)

    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    acc = accuracy_score(y_test, y_pred)

    return cm, acc


In [None]:
# Experiment grid: every k is combined with every split and with both the raw
# and the min-max normalised features.
k_values = [3, 7, 11]

# (test_size, human-readable description)
splits = [
    (0.10, "train=90%, test=10%"),
    (1/3, "train=2/3, test=1/3"),
    (0.50, "train=50%, test=50%"),
]

# (normalise flag, human-readable description)
norm_options = [
    (False, "originaldata"),
    (True, "normaliserat [0,1]"),
]

# One record per run; reset here so re-running the cell does not duplicate rows.
resultat = []

for k in k_values:
    for test_size, split_text in splits:
        for norm_flag, norm_text in norm_options:
            cm, acc = kör_knn(
                X, y,
                k=k,
                test_size=test_size,
                normalisera=norm_flag,
            )

            # Report this run: settings, confusion matrix and accuracy.
            print(
                "-------------------------------------------------",
                f"k = {k}, data = {norm_text}, split = {split_text}",
                "Confusion matrix (klasser: low, medium, high):",
                cm,
                f"Accuracy: {acc:.4f}",
                sep="\n",
            )

            resultat.append(dict(
                k=k,
                data_typ=norm_text,
                split=split_text,
                test_size=test_size,
                accuracy=acc,
                confusion_matrix=cm,
            ))


In [None]:
# Turn the run records into a table, leaving out the per-run confusion
# matrices so that only scalar settings and the accuracy remain.
res_df = pd.DataFrame(resultat).drop(columns="confusion_matrix")

# Show a simpler table. A bare last expression gets the notebook's rich
# DataFrame rendering, which print() would flatten to plain text.
res_df[["k", "data_typ", "split", "accuracy"]]


In [None]:
# The three runs with the highest accuracy, best first.
ranked = res_df.sort_values("accuracy", ascending=False)
top3 = ranked.iloc[:3]
top3
