# Imports

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import euclidean
from sklearn.neighbors import KNeighborsClassifier

# Baseline model

In [13]:
#Load the dataset from the given file
df = pd.read_csv('PetersonBarney/verified_pb.data', delim_whitespace=True, header=None)

#From the HEADER file, give the column names
df.columns = ['Gender', 'Speaker', 'PhonemeNumber', 'PhonemeAscii', 'F0', 'F1', 'F2', 'F3']

#We have to remove those ones, which contain * in them
df = df[~df['PhonemeAscii'].str.contains('\*')]

#Features and labels, I will leave F0 in it
X = df[['F0', 'F1', 'F2', 'F3']]
y = df['PhonemeAscii']

#Split the data into first 80% train, 20%test, then split 20% test into 10% validation and 10% test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=23)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=23)

#Calculate centroids without normalization
centroids_no_norm = X_train.groupby(y_train).mean()

#Classify based on the nearest centroid
def classify(instance, centroids):
    distances = centroids.apply(lambda x: euclidean(x, instance), axis=1)
    return distances.idxmin()

#Classify the validation set without normalization
y_val_pred_no_norm = X_val.apply(lambda x: classify(x, centroids_no_norm), axis=1)
val_accuracy_no_norm = accuracy_score(y_val, y_val_pred_no_norm)
print(f'Validation Accuracy without Normalization: {val_accuracy_no_norm * 100:.2f}%')

#Now scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

#Choose only the columns which we need in X_train_scaled
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=['F0', 'F1', 'F2', 'F3'])

#Calculate the scaled centroids
centroids_scaled = X_train_scaled_df.groupby(y_train).mean()

#Now classify the validation set with normalization
y_val_pred_scaled = pd.DataFrame(X_val_scaled, columns=['F0', 'F1', 'F2', 'F3']).apply(lambda x: classify(x, centroids_scaled), axis=1)
val_accuracy_scaled = accuracy_score(y_val, y_val_pred_scaled)
print(f'Validation Accuracy with Normalization: {val_accuracy_scaled * 100:.2f}%')

#Let's choose which did better based on the validation test results
if val_accuracy_scaled > val_accuracy_no_norm:
    print("Normalized model did better, hence using that")
    centroids_final = centroids_scaled
    X_test_final = X_test_scaled
else:
    print("Non-normalized is better, so we are using that")
    centroids_final = centroids_no_norm
    X_test_final = X_test

#Classify the test set with the better model
y_test_pred = pd.DataFrame(X_test_final, columns=['F0', 'F1', 'F2', 'F3']).apply(lambda x: classify(x, centroids_final), axis=1)

#Use the Accuracy as evaluation
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')


  df = pd.read_csv('PetersonBarney/verified_pb.data', delim_whitespace=True, header=None)


Validation Accuracy without Normalization: 50.00%
Validation Accuracy with Normalization: 11.67%
Non-normalized is better, so we are using that
Test Accuracy: 53.33%


# Enhanced model - KNN

In [15]:
#Let's try KNN as our advanced model, also use the scaled data

#Do the model for different k values, evaluate the validation set on different k values and choose the best one
best_k = None
best_val_accuracy = 0

#Don't let k = 1, to prevent overfitting
for k in range(2, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    
    #Predict the validation set with the kiven k value
    y_val_pred = knn.predict(X_val_scaled)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    
    #If the current k is better, update our best_k value
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_k = k

#Print out the best k value, which we will use in the model
print(f'Best k: {best_k} with Validation Accuracy: {best_val_accuracy * 100:.2f}%')

#Train the model on the training set with the best k
knn_final = KNeighborsClassifier(n_neighbors=best_k)
knn_final.fit(X_train_scaled, y_train)

#Evaluate the model on the test set
y_test_pred = knn_final.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)

#Print out the result
print(f'Test Accuracy with KNN (k={best_k}): {test_accuracy * 100:.2f}%')

Best k: 3 with Validation Accuracy: 90.00%
Test Accuracy with KNN (k=3): 87.50%
