In [112]:
import pandas as pd
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import euclidean
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

## Baseline Model

In [88]:
#Reading in the data and renaming the columns according to the HEADER file
data_df = pd.read_csv('PetersonBarney/verified_pb.data', delim_whitespace=True, header=None)
data_df.columns = ['gender', 'speaker_id', 'phoneme_number', 'phoneme_ascii', 'f0', 'f1', 'f2', 'f3']

#Removing the phonemes that have * in their ascii
mask = data_df['phoneme_ascii'].str.contains('\*')
data_df = data_df[~mask]

#Defining the independent and the target variables
X = data_df[['f0', 'f1', 'f2', 'f3']]
Y = data_df['phoneme_ascii']

#Performing the splitting
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size = 0.2, shuffle = True, random_state = 6)
X_test, X_val, Y_test, Y_val = train_test_split(X_temp, Y_temp, test_size = 0.5, shuffle = True, random_state = 7)

#Calculating centers without normalization
centers = X_train.groupby(Y_train).mean()
def classify_point(centers, point):
    distances = centers.apply(lambda center: euclidean(center, point), axis=1)
    return distances.idxmin()
#Calculate the validation set's classes

def classifier(X, Y, centers, type_of_test):
    classified_validation = X.apply(lambda point: classify_point(centers, point), axis = 1)
    accuracy_without_normalization = accuracy_score(Y, classified_validation)
    print('The ' + type_of_test + ': ' + str(accuracy_without_normalization))
classifier(X_val, Y_val, centers, 'validation score without normalization')
classifier(X_test, Y_test, centers, 'test score without normalization')

#With scaling
scaler = MinMaxScaler()

X_train_minmax = scaler.fit_transform(X_train)
X_train_minmax_df = pd.DataFrame(X_train_minmax, columns=['f0', 'f1', 'f2', 'f3'])
X_val_minmax = scaler.transform(X_val)
X_val_minmax_df = pd.DataFrame(X_val_minmax, columns = ['f0', 'f1', 'f2', 'f3'])
X_test_minmax = scaler.transform(X_test)
X_test_minmax_df = pd.DataFrame(X_test_minmax, columns = ['f0', 'f1', 'f2', 'f3'])
centers_minmax = X_train_minmax_df.groupby(Y_train).mean()
classifier(X_val_minmax_df, Y_val, centers_minmax, 'validation score with minmax normalization')
classifier(X_test_minmax_df, Y_test, centers_minmax, 'test score with minmax normalization')

#With standard scaling

scaler2 = StandardScaler()

X_train_standard = scaler2.fit_transform(X_train)
X_train_standard_df = pd.DataFrame(X_train_standard, columns=['f0', 'f1', 'f2', 'f3'])
X_val_standard = scaler2.transform(X_val)
X_val_standard_df = pd.DataFrame(X_val_standard, columns = ['f0', 'f1', 'f2', 'f3'])
X_test_standard = scaler2.transform(X_test)
X_test_standard_df = pd.DataFrame(X_test_standard, columns = ['f0', 'f1', 'f2', 'f3'])
centers_standard = X_train_standard_df.groupby(Y_train).mean()
classifier(X_val_standard_df, Y_val, centers_standard, 'validation score with standard normalization')
classifier(X_test_standard_df, Y_test, centers_standard, 'test score with standard normalization')


The validation score without normalization: 0.6083333333333333
The test score without normalization: 0.525
The validation score with minmax normalization: 0.10833333333333334
The test score with minmax normalization: 0.13333333333333333
The validation score with standard normalization: 0.10833333333333334
The test score with standard normalization: 0.13333333333333333


## Alternative model
Let's make the previous one a bit stochastic

In [109]:
import random
def stochastic_classify_point(centers, point):
    distances = centers.apply(lambda center: euclidean(center, point), axis=1)
    return random.choices(centers.index.tolist(), weights = 1/distances, k=1)
def stochastic_classifier(X, Y, centers, type_of_test):
    classified_validation = X.apply(lambda point: stochastic_classify_point(centers, point)[0], axis = 1)
    accuracy_without_normalization = accuracy_score(Y, classified_validation)
    print('The ' + type_of_test + ': ' + str(accuracy_without_normalization))

In [110]:
stochastic_classifier(X_val, Y_val, centers, 'validation score without normalization')
stochastic_classifier(X_test, Y_test, centers, 'test score without normalization')

stochastic_classifier(X_val_minmax_df, Y_val, centers_minmax, 'validation score with minmax normalization')
stochastic_classifier(X_test_minmax_df, Y_test, centers_minmax, 'test score with minmax normalization')

stochastic_classifier(X_val_standard_df, Y_val, centers_standard, 'validation score with standard normalization')
stochastic_classifier(X_test_standard_df, Y_test, centers_standard, 'test score with standard normalization')

The validation score without normalization: 0.19166666666666668
The test score without normalization: 0.20833333333333334
The validation score with minmax normalization: 0.1
The test score with minmax normalization: 0.10833333333333334
The validation score with standard normalization: 0.125
The test score with standard normalization: 0.1


## Alternative Model2 - KNN

In [115]:
def KNN_model(k, X_train, X_val, X_test, Y_train, Y_val, Y_test):
    #Define and train the model
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    #Make predictions
    prediction_validation = knn.predict(X_val)
    val_accuracy = accuracy_score(Y_val, prediction_validation)
    prediction_test = knn.predict(X_test)
    test_accuracy = accuracy_score(Y_test, prediction_test)
    print('The KNN model with k = ' + str(k) + ' during validation scored: ' + str(val_accuracy))
    print('The KNN model with k = ' + str(k) + ' during testing scored: ' + str(test_accuracy))
    return val_accuracy, test_accuracy
validation_accuracies_without_normalization = []
test_accuracies_without_normalization = []
validation_accuracies_with_standard_normalization = []
test_accuracies_with_standard_normalization = []
for k in range(2, 30):
    val, test = KNN_model(k, X_train, X_val, X_test, Y_train, Y_val, Y_test)
    validation_accuracies_without_normalization.append(val)
    test_accuracies_without_normalization.append(test)
    val1, test1 = KNN_model(k, X_train_standard_df, X_val_standard_df, X_test_standard_df, Y_train, Y_val, Y_test)
    validation_accuracies_with_standard_normalization.append(val1)
    test_accuracies_with_standard_normalization.append(test1)
print('The best validation accuracy without normalization was achieved by k = ' + str(2+validation_accuracies_without_normalization.index(max(validation_accuracies_without_normalization))))
print('The best test accuracy without normalization was achieved by k = ' + str(2+test_accuracies_without_normalization.index(max(test_accuracies_without_normalization))))
print('The best validation accuracy with normalization was achieved by k = ' + str(2+validation_accuracies_with_standard_normalization.index(max(validation_accuracies_with_standard_normalization))))
print('The best test accuracy with normalization was achieved by k = ' + str(2+test_accuracies_with_standard_normalization.index(max(test_accuracies_with_standard_normalization))))

The KNN model with k = 2 during validation scored: 0.9
The KNN model with k = 2 during testing scored: 0.85
The KNN model with k = 2 during validation scored: 0.8666666666666667
The KNN model with k = 2 during testing scored: 0.8416666666666667
The KNN model with k = 3 during validation scored: 0.9083333333333333
The KNN model with k = 3 during testing scored: 0.8333333333333334
The KNN model with k = 3 during validation scored: 0.875
The KNN model with k = 3 during testing scored: 0.8666666666666667
The KNN model with k = 4 during validation scored: 0.9
The KNN model with k = 4 during testing scored: 0.8416666666666667
The KNN model with k = 4 during validation scored: 0.8666666666666667
The KNN model with k = 4 during testing scored: 0.875
The KNN model with k = 5 during validation scored: 0.9
The KNN model with k = 5 during testing scored: 0.8583333333333333
The KNN model with k = 5 during validation scored: 0.875
The KNN model with k = 5 during testing scored: 0.8583333333333333
Th