In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix 

In [45]:
# Load the pre-processed letter-recognition table; the path is relative to the notebook's working directory.
DATA_PATH = "data/letter-recognition-processed.csv"
data = pd.read_csv(DATA_PATH)

In [46]:


# Feature variables: every column except the label column "lettr".
x = data.drop(labels="lettr", axis="columns")
# Target variable: the "lettr" column on its own.
y = data.loc[:, "lettr"]


In [None]:
# Repeat the split/fit/score cycle many times (100 by default) to get a mean accuracy.
def processing_loop(x, y, size, n_runs=100, random_state=None):
    """Fit and score a decision tree on repeated random train/test splits.

    Parameters
    ----------
    x : array-like
        Feature matrix, forwarded to ``train_test_split``.
    y : array-like
        Target labels, forwarded to ``train_test_split``.
    size : float or int
        Forwarded unchanged as ``test_size`` to ``train_test_split``.
    n_runs : int, default=100
        Number of split/fit/score repetitions.
    random_state : int, numpy.random.RandomState or None, default=None
        ``None`` keeps the original behaviour (every call draws fresh,
        unseeded splits and trees). An int or ``RandomState`` makes the
        whole sequence of runs repeatable while each run still differs.

    Returns
    -------
    tuple
        ``(mean_accuracy, accuracies, matrices)`` where ``accuracies`` is the
        list of per-run accuracy scores and ``matrices`` the list of per-run
        confusion matrices, in run order.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1 (the mean would be undefined).
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    # One shared generator drives both the splits and the trees, so a single
    # seed reproduces the full experiment; None is passed through untouched.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Fix the label order once so every confusion matrix has the same shape,
    # even if some class happens to be absent from a particular test split.
    labels = np.unique(y)

    accuracies = []
    matrices = []

    for _ in range(n_runs):
        # Random train/test split for this run
        X_train, X_test, y_train, y_test = train_test_split(
            x, y, test_size=size, shuffle=True, random_state=rng
        )

        # A fresh tree per run; tie-breaking randomness comes from rng
        model = DecisionTreeClassifier(random_state=rng)
        model.fit(X_train, y_train)

        # Predict on the held-out part
        y_pred = model.predict(X_test)

        # Quality measures for this run
        accuracies.append(accuracy_score(y_test, y_pred))
        matrices.append(confusion_matrix(y_test, y_pred, labels=labels))

    return np.mean(accuracies), accuracies, matrices

    

In [48]:
# Baseline: original (non-normalised) features, evaluated at 90/10 and 70/30 train/test ratios.
result_orig_90 = processing_loop(x, y, 0.10)
mean_orig_90, acc_orig_90, mats_orig_90 = result_orig_90

result_orig_70 = processing_loop(x, y, 0.30)
mean_orig_70, acc_orig_70, mats_orig_70 = result_orig_70

In [49]:
# Same experiment on features rescaled with MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows of x before any train/test
# split happens inside processing_loop, so test rows influence the scaling.
scaler = MinMaxScaler()
scaler.fit(x)
x_norm = scaler.transform(x)

result_norm_90 = processing_loop(x_norm, y, 0.10)
mean_norm_90, acc_norm_90, mats_norm_90 = result_norm_90

result_norm_70 = processing_loop(x_norm, y, 0.30)
mean_norm_70, acc_norm_70, mats_norm_70 = result_norm_70

In [50]:
# Summary: mean accuracy of each configuration, four decimals.
print("---- RESULT ----")
summary_rows = [
    ("Original 90/10 → ", mean_orig_90),
    ("Original 70/30 → ", mean_orig_70),
    ("Norm 90/10 →     ", mean_norm_90),
    ("Norm 70/30 →     ", mean_norm_70),
]
for prefix, mean_accuracy in summary_rows:
    print(f"{prefix}{mean_accuracy:.4f}")

---- RESULT ----
Original 90/10 → 0.9843
Original 70/30 → 0.9793
Norm 90/10 →     0.9841
Norm 70/30 →     0.9800


In [51]:
# Every configuration as (label, mean accuracy, per-run accuracies, per-run confusion matrices).
alla = [
    ("Original 90/10", mean_orig_90, acc_orig_90, mats_orig_90),
    ("Original 70/30", mean_orig_70, acc_orig_70, mats_orig_70),
    ("Norm 90/10", mean_norm_90, acc_norm_90, mats_norm_90),
    ("Norm 70/30", mean_norm_70, acc_norm_70, mats_norm_70),
]

# Rank by mean accuracy (highest first) and keep the three best configurations.
ranked = sorted(alla, key=lambda run: run[1], reverse=True)
topp3 = ranked[:3]

for namn, medel, accs, mats in topp3:
    print(f"\nKörning: {namn}")
    print("Medelaccuracy:", round(medel, 4))

    # Show the confusion matrix of the single most accurate run in this configuration.
    best = np.argmax(accs)
    print("Bästa confusion matrix:")
    print(mats[best])
    print("------------------------")



Körning: Original 90/10
Medelaccuracy: 0.9843
Bästa confusion matrix:
[[77  0  0]
 [ 0 87  0]
 [ 0  0 68]]
------------------------

Körning: Norm 90/10
Medelaccuracy: 0.9841
Bästa confusion matrix:
[[81  0  0]
 [ 0 68  0]
 [ 0  0 83]]
------------------------

Körning: Norm 70/30
Medelaccuracy: 0.98
Bästa confusion matrix:
[[204   0   0]
 [  1 240   1]
 [  0   1 247]]
------------------------
