In [104]:
import pandas as pd

In [108]:
df = pd.read_csv('features_all_letters.csv' , encoding='latin-1')

In [106]:
df.head

<bound method NDFrame.head of         lm_0_x    lm_0_y    lm_0_z    lm_1_x    lm_1_y    lm_1_z    lm_2_x  \
0     0.736526  1.281574 -0.608224  1.464792  0.525249  0.585787  1.692046   
1     0.465805  1.111084  0.909697  0.806079  1.069665 -0.530141  1.190799   
2     0.096985  0.457263  0.000578  0.108307  0.213536  0.729824  0.378038   
3     1.532381  0.870818 -1.148211  1.045526  0.586852 -1.315713  0.369849   
4     0.342329 -0.568508 -0.990914  0.176960 -0.240414  0.754830  0.118993   
...        ...       ...       ...       ...       ...       ...       ...   
1020  0.463537 -1.365550  1.626476  0.140207 -2.174672 -0.141225 -0.721992   
1021  0.455000  1.600938  1.284518  0.754075  1.288526 -0.863178  0.548296   
1022 -1.504145 -1.417760 -1.761779 -1.441923 -0.733056  1.681158 -1.142818   
1023 -0.609674 -1.176379 -0.707310 -0.282042 -0.815642  0.971023  0.219577   
1024  1.248605 -0.044114  0.974907  1.332861  0.395719 -1.083079  1.571464   

        lm_2_y    lm_2_z    lm_3_

In [None]:
X = df.drop(columns=["instance_id", "label"]).values
y = df["label"].values

In [None]:
from sklearn.preprocessing import LabelEncoder

#Convert text data into numbers as most machine learning algorithms work on numeric labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [50]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, random_state=42, test_size=0.2)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

dt = DecisionTreeClassifier(random_state=42)

In [58]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [94]:
param_grid = {
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10]
}

grid_search = GridSearchCV(dt, param_grid, cv=5, scoring="accuracy")

grid_search.fit(X_train_scaled, y_train)

In [95]:
best_dt = grid_search.best_estimator_
best_dt.fit(X_train_scaled, y_train)

In [103]:
from sklearn.metrics import accuracy_score, confusion_matrix

y_pred_dt = best_dt.predict(X_test_scaled)

dt_test_acc = accuracy_score(y_test, y_pred_dt)
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Decision Tree CV accuracy:", grid_search.best_score_)
print("Decision Tree Test Accuracy:", dt_test_acc)

print(confusion_matrix(y_test, y_pred_dt))

Best Hyperparameters: {'max_depth': None, 'min_samples_split': 5}
Best Decision Tree CV accuracy: 0.9023289869023656
Decision Tree Test Accuracy: 0.8867667121418826
[[76  2  0  1  4  0  0  0  4  2]
 [ 0 78  3  0  0  1  0  0  0  0]
 [ 0  6 53  0  1  2  0  0  1  1]
 [ 0  0  2 65  1  0  2  1  0  0]
 [ 7  0  0  0 70  0  0  1  1  2]
 [ 0  3  1  0  2 77  0  0  2  0]
 [ 0  0  0  1  0  1 61  0  0  0]
 [ 0  0  1  3  0  0  1 66  0  0]
 [ 1  0  0  0  0  0  0  0 53  6]
 [ 1  0  0  0  2  0  3  1  9 51]]


In [None]:
import math
from collections import Counter


#Euclidian distance to measure the similarity between target and training data points
def euclidean_distance(p1, p2):
    total = 0
    #Iterates index by index through the vectors
    for i in range(len(p1)):
        #Subtract corresponding feature values
        difference = p1[i] - p2[i]
        #Sums all squared differences
        total += difference * difference
        #Return square root
    return math.sqrt(total)

#Predict class label for each test instance using kNN
def knn_predict(X_train, y_train, X_test, k):
    predictions = []

    #Loop over test points
    for test_point in X_test:
        distances = []

        #Compute distance to every training sample
        for i in range(len(X_train)):
            train_point = X_train[i]
            label = y_train[i]

            dist = euclidean_distance(test_point, train_point)
            distances.append((dist, label))

        #Sort by distance so nearest neighbours appear first
        distances.sort(key=lambda x: x[0])
        #Take the k nearest neighbours
        k_nearest = distances[:k]

        neighbour_labels = []
        for item in k_nearest:
            neighbour_labels.append(item[1])

        #Choose the most frequent label
        most_common_label = Counter(neighbour_labels).most_common(1)[0][0]
        predictions.append(most_common_label)

    return predictions

In [None]:
#Perform 5-fold cross-validation to estimate model performance
def knn_cross_val(X, y, k_neighbours, folds=5):
    #Split data into folds
    fold_size = math.floor(len(X)/folds)
    accuracies = []

    #Loop through folds
    for i in range(folds):
        #Each fold becomes the validation set once
        start = i * fold_size
        end = start + fold_size

        X_val = X[start:end]
        y_val = y[start:end]

        X_train = X[:start] + X[end:]
        y_train = y[:start] + y[end:]

        #Predict validation labels
        predict_val_label = knn_predict(X_train, y_train, X_val, k_neighbours)

        #Compute accuracy
        #Accuracy = correct predictions ÷ total predictions
        correct = 0
        for j in range(len(y_val)):
            if predict_val_label[j] == y_val[j]:
                correct += 1

        acc = correct / len(y_val)
        accuracies.append(acc)

    #Returns cross-validation accuracy for a given k.
    return sum(accuracies) / len(accuracies)


In [None]:
#Convert NumPy arrays to Python lists using tolist function
X_train_list = X_train_scaled.tolist()
y_train_list = y_train.tolist()

#Define k values to test
#Use odd numbers to prevent ties in majority voting
k_values = [1, 3, 5, 7, 9]
#Dictionary to store cross-validation results
knn_results = {}

#Loop to run 5-fold cross-validation for each k
for k in k_values:
    stored_accuracy = knn_cross_val(X_train_list, y_train_list, k)
    knn_results[k] = stored_accuracy
    #Print CV accuracy
    print(f"k={k}, CV accuracy={stored_accuracy:.3f}")

#Looks at all k values and select the one with highest CV accuracy
best_k = max(knn_results, key=knn_results.get)
print("Best KNN CV accuracy:", knn_results[best_k])


k=1, CV accuracy=0.929
k=3, CV accuracy=0.923
k=5, CV accuracy=0.925
k=7, CV accuracy=0.925
k=9, CV accuracy=0.926
0.9292307692307693


In [None]:
#Predicting on the test set using best k
y_pred_knn = knn_predict(
    ##Convert NumPy arrays to Python lists using tolist function
    X_train_scaled.tolist(),
    y_train.tolist(),
    X_test_scaled.tolist(),
    best_k
)

#Count how many predictions are correct
correct_prediction = 0

for i in range(len(y_test)):
    #Compare prediction with true label, prediction correct if they match
    if y_pred_knn[i] == y_test[i]:
        #Count correct predictions
        correct_prediction += 1

#Compute accuracy
knn_test_acc = correct_prediction / len(y_test)

# final kNN performance
print("kNN Test Accuracy:", knn_test_acc)


kNN Test Accuracy: 0.9454297407912687


In [None]:
from sklearn.svm import SVC

svm = SVC()


#Define hyperparameter search space
param_grid = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"]
}

#Set up GridSearch with 5-fold cross-validation
grid_svm = GridSearchCV(svm, param_grid, cv=5, scoring="accuracy"
)

#Run the hyperparameter search
grid_svm.fit(X_train_scaled, y_train)

#Get the best SVM model with the highest mean CV accuracy
best_svm = grid_svm.best_estimator_

#Make predictions on the test set
#Evaluates models ability to perform on unseen data
y_pred_svm = best_svm.predict(X_test_scaled)

print("Best SVM hyperparameters:", grid_svm.best_params_)

print("Best SVM CV accuracy:", grid_svm.best_score_)


Best SVM hyperparameters: {'C': 10, 'kernel': 'rbf'}
Best SVM CV accuracy: 0.9624351681689566


In [None]:
from sklearn.metrics import accuracy_score

#Calculate percentage of correct predictions on test set
svm_test_acc = accuracy_score(y_test, y_pred_svm)

print("SVM Test Accuracy:", svm_test_acc)

SVM Test Accuracy: 0.9672578444747613
