In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler


df = pd.read_csv('data/letter-recognition-processed.csv')

In [2]:
df.describe()

Unnamed: 0,ID,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
count,2311.0,2311.0,2311.0,2311.0,2311.0,2311.0,2311.0,2311.0,2311.0,2311.0,2311.0,2311.0,2311.0,2311.0,2311.0,2311.0,2311.0
mean,9865.106447,3.845521,6.912159,5.033752,5.405885,2.945045,6.942882,6.898745,3.712678,4.591952,8.144093,6.772393,9.115967,2.305495,8.594115,3.225011,7.436175
std,5776.650492,1.838955,3.284699,1.622003,2.389124,2.012087,2.298923,2.92432,2.092785,2.79782,2.127232,3.829724,2.489712,1.600909,2.059849,2.089242,1.814604
min,8.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,4.0,0.0,1.0,0.0,1.0
25%,4824.0,3.0,5.0,4.0,4.0,2.0,5.0,4.0,2.0,2.0,7.0,2.0,8.0,1.0,7.0,2.0,7.0
50%,9750.0,4.0,7.0,5.0,6.0,3.0,7.0,8.0,3.0,4.0,8.0,7.0,8.0,2.0,9.0,3.0,8.0
75%,14811.5,5.0,9.0,6.0,7.0,4.0,8.0,9.0,6.0,7.0,10.0,10.0,11.0,3.0,10.0,4.0,8.0
max,20000.0,10.0,15.0,11.0,15.0,12.0,14.0,13.0,10.0,12.0,14.0,13.0,15.0,10.0,13.0,12.0,13.0


In [3]:

chosen_letters = ['A','Y','C']
df_filtred = df[df['lettr'].isin(chosen_letters)]

X = df_filtred.drop(columns=['lettr'])
y = df_filtred['lettr']

In [None]:
def run_svm_experiment(experiment_name, X_data, y_data, test_size, kernel, scale_data=False):
    print(f"\n{'='*10} {experiment_name} {'='*10}")
    
  
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=test_size, random_state=42)
    
    
    if scale_data:
        scaler = MinMaxScaler() 
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        print("Data: Normaliserat (MinMax)")
    else:
        print("Data: Icke-normaliserat (Original)")
        
    print(f"Split: Train={(1-test_size)*100:.0f}%, Test={test_size*100:.0f}%")
    print(f"Kernel: {kernel}")

    svc = SVC(kernel=kernel)
    svc.fit(X_train, y_train)
    
    
    y_pred = svc.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    print(f"Accuracy: {acc:.4f}")
    print("Confusion Matrix:")
    print(cm)
    return acc, cm


run_svm_experiment("Exp 1: Oskalat Data", X, y, 0.25, 'rbf', scale_data=False)


run_svm_experiment("Exp 2: Normaliserat Data", X, y, 0.25, 'rbf', scale_data=True)


run_svm_experiment("Exp 3: Annan Split (25/75)", X, y, 0.75, 'rbf', scale_data=True)


run_svm_experiment("Exp 4: Linjär Kernel", X, y, 0.25, 'linear', scale_data=True)


Data: Icke-normaliserat (Original)
Split: Train=75%, Test=25%
Kernel: rbf
Accuracy: 0.3460
Confusion Matrix:
[[69 42 94]
 [53 36 91]
 [62 36 95]]

Data: Normaliserat (MinMax)
Split: Train=75%, Test=25%
Kernel: rbf
Accuracy: 0.9896
Confusion Matrix:
[[190   2   2]
 [  1 198   0]
 [  1   0 184]]

Data: Normaliserat (MinMax)
Split: Train=25%, Test=75%
Kernel: rbf
Accuracy: 0.9833
Confusion Matrix:
[[573   1  18]
 [  7 536   1]
 [  1   1 596]]

Data: Normaliserat (MinMax)
Split: Train=75%, Test=25%
Kernel: linear
Accuracy: 0.9775
Confusion Matrix:
[[205   2   3]
 [  5 165   0]
 [  3   0 195]]


(0.9775086505190311,
 array([[205,   2,   3],
        [  5, 165,   0],
        [  3,   0, 195]]))