In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler


df = pd.read_csv('data/letter-recognition.csv')

In [11]:
df.head()

Unnamed: 0,lettr,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [14]:
df.describe()

Unnamed: 0,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,4.02355,7.0355,5.12185,5.37245,3.50585,6.8976,7.50045,4.6286,5.17865,8.28205,6.454,7.929,3.0461,8.33885,3.69175,7.8012
std,1.913212,3.304555,2.014573,2.26139,2.190458,2.026035,2.325354,2.699968,2.380823,2.488475,2.63107,2.080619,2.332541,1.546722,2.567073,1.61747
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,5.0,4.0,4.0,2.0,6.0,6.0,3.0,4.0,7.0,5.0,7.0,1.0,8.0,2.0,7.0
50%,4.0,7.0,5.0,6.0,3.0,7.0,7.0,4.0,5.0,8.0,6.0,8.0,3.0,8.0,3.0,8.0
75%,5.0,9.0,6.0,7.0,5.0,8.0,9.0,6.0,7.0,10.0,8.0,9.0,4.0,9.0,5.0,9.0
max,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0


In [18]:
# Förbered data
# Vi väljer relevanta kolumner. 'car_or_bus' är vår target (det vi vill klassificera).
# Vi tar bort 'id', 'id_android' och 'linha' (för många saknade värden).
chosen_letters = ['A','B','C','Z']
df_filtred = df[df['lettr'].isin(chosen_letters)]

X = df_filtred.drop(columns=['lettr'])
y = df_filtred['lettr']

In [19]:
def run_svm_experiment(experiment_name, X_data, y_data, test_size, kernel, scale_data=False):
    print(f"\n{'='*10} {experiment_name} {'='*10}")
    
    # Split data random_state=42
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=test_size,)
    
    # Normalisering (skalning)
    if scale_data:
        scaler = MinMaxScaler() # Skalar till intervallet [0, 1]
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        print("Data: Normaliserat (MinMax)")
    else:
        print("Data: Icke-normaliserat (Original)")
        
    print(f"Split: Train={(1-test_size)*100:.0f}%, Test={test_size*100:.0f}%")
    print(f"Kernel: {kernel}")

    # Träna modell
    svc = SVC(kernel=kernel)
    svc.fit(X_train, y_train)
    
    # Prediktion och utvärdering
    y_pred = svc.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    print(f"Accuracy: {acc:.4f}")
    print("Confusion Matrix:")
    print(cm)
    return acc, cm

# --- EXPERIMENT ---
# a.i: Icke-normaliserat, 75% train / 25% test, RBF (default)
run_svm_experiment("Exp 1: Oskalat Data", X, y, 0.25, 'rbf', scale_data=False)

# a.ii: Normaliserat, 75% train / 25% test, RBF
run_svm_experiment("Exp 2: Normaliserat Data", X, y, 0.25, 'rbf', scale_data=True)

# b.ii: Normaliserat, 50% train / 50% test, RBF (Annan split)
run_svm_experiment("Exp 3: Annan Split (50/50)", X, y, 0.75, 'rbf', scale_data=True)

# c: Normaliserat, 75% train / 25% test, Linear (Annan kernel)
run_svm_experiment("Exp 4: Linjär Kernel", X, y, 0.25, 'linear', scale_data=True)


Data: Icke-normaliserat (Original)
Split: Train=75%, Test=25%
Kernel: rbf
Accuracy: 0.9960
Confusion Matrix:
[[200   0   1   0]
 [  1 182   0   0]
 [  0   1 200   0]
 [  0   0   0 172]]

Data: Normaliserat (MinMax)
Split: Train=75%, Test=25%
Kernel: rbf
Accuracy: 0.9987
Confusion Matrix:
[[191   0   1   0]
 [  0 223   0   0]
 [  0   0 187   0]
 [  0   0   0 155]]

Data: Normaliserat (MinMax)
Split: Train=25%, Test=75%
Kernel: rbf
Accuracy: 0.9912
Confusion Matrix:
[[596   0   6   0]
 [  5 569   0   0]
 [  2   1 536   0]
 [  1   5   0 548]]

Data: Normaliserat (MinMax)
Split: Train=75%, Test=25%
Kernel: linear
Accuracy: 0.9881
Confusion Matrix:
[[205   2   3   1]
 [  1 197   0   0]
 [  1   0 192   0]
 [  0   1   0 154]]


(0.988110964332893,
 array([[205,   2,   3,   1],
        [  1, 197,   0,   0],
        [  1,   0, 192,   0],
        [  0,   1,   0, 154]]))