In [135]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Loading the data

In [136]:
# Location of the whitespace-separated data file (it has no header row).
# NOTE(review): this is an absolute, machine-specific path; point it at your
# local copy of the file (ideally via a configurable data directory).
file_path = r'C:\Users\szjan\Downloads\PetersonBarney\PetersonBarney\verified_pb.data'

# Column names are supplied here because the file itself carries none.
columns = ['M/F/C', 'SPKR', 'Phoneme-Number', 'Phoneme-Ascii', 'F0', 'F1', 'F2', 'F3']

# `sep=r'\s+'` is the supported replacement for `delim_whitespace=True`,
# which pandas deprecated in 2.2; both split fields on runs of whitespace.
df = pd.read_csv(file_path, sep=r'\s+', header=None, names=columns)

# Model inputs are the F1..F3 columns; the target is the phoneme label column.
X = df[['F1', 'F2', 'F3']]
y = df['Phoneme-Ascii']

# Splitting

In [137]:
# First hold out 20% of the samples, then cut that hold-out in half to get
# validation and test sets of equal size (80/10/10 overall). The fixed
# random_state makes both partitions reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

# Finding the centers of the sounds

In [138]:
# Per-phoneme mean of F1..F3, computed over the full dataset (all splits).
# The columns are selected *before* aggregating so that the remaining columns
# are never averaged: that work was discarded anyway, and it would raise on
# pandas >= 2.0 if any of those columns were non-numeric.
centers_of_gravity = df.groupby('Phoneme-Ascii')[['F1', 'F2', 'F3']].mean()

print(centers_of_gravity)

                       F1           F2           F3
Phoneme-Ascii                                      
*AA            816.907692  1190.230769  2765.076923
*AE            670.480000  2070.000000  2791.120000
*AH            692.354167  1277.270833  2617.041667
*AO            636.772727   975.340909  2768.272727
*EH            586.559322  2116.796610  2869.830508
*ER            491.500000  1727.500000  2168.750000
*IH            466.921053  2358.105263  3081.842105
*IY            325.000000  2712.500000  3225.000000
*UH            484.320000  1151.040000  2637.760000
*UW            401.111111  1050.333333  2658.888889
AA             846.057471  1206.137931  2677.379310
AE             833.503937  1944.811024  2752.141732
AH             735.644231  1384.576923  2751.490385
AO             584.805556   883.574074  2665.777778
EH             593.333333  2222.064516  2895.774194
ER             510.141892  1546.581081  1896.101351
IH             428.622807  2305.254386  2912.763158
IY          

# Baseline model

In [139]:
def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        point1: Coordinates of the first point. Must support elementwise
            subtraction with ``point2`` (e.g. NumPy arrays of equal shape).
        point2: Coordinates of the second point.

    Returns:
        The square root of the sum of squared coordinate differences.
    """
    offset = point1 - point2
    squared_total = np.sum(offset ** 2)
    return np.sqrt(squared_total)

def nearest_neighbor_classifier(test_point, centers):
    """Return the label of the center closest to ``test_point``.

    The distance to every center is computed in a single vectorised NumPy
    operation instead of a Python-level loop over ``centers.iterrows()``.

    Args:
        test_point: Array-like of three values ordered as (F1, F2, F3).
        centers: DataFrame with columns ``'F1'``, ``'F2'`` and ``'F3'``; its
            index holds the label associated with each center.

    Returns:
        The index label of the nearest center by Euclidean distance. When
        several centers are equally close, the first one in ``centers`` wins.
        Returns ``None`` if ``centers`` is empty or no center has a finite
        distance to ``test_point``.
    """
    if len(centers) == 0:
        return None

    center_points = centers[['F1', 'F2', 'F3']].to_numpy(dtype=float)
    query = np.asarray(test_point, dtype=float)

    # One Euclidean distance per center (row-wise reduction).
    distances = np.sqrt(np.sum((query - center_points) ** 2, axis=1))

    # A NaN distance must never be selected, so treat it as infinitely far.
    distances = np.where(np.isnan(distances), np.inf, distances)

    # argmin returns the first minimum, i.e. ties go to the earliest center.
    best = int(np.argmin(distances))
    if not np.isfinite(distances[best]):
        return None
    return centers.index[best]

In [140]:
# Per-phoneme mean of F1..F3 using the training split only. Grouping the
# feature frame directly by the (index-aligned) label Series yields a result
# indexed by 'Phoneme-Ascii' without first gluing the two together.
centers_of_gravity_train = X_train.groupby(y_train)[['F1', 'F2', 'F3']].mean()

In [141]:
# Label every validation sample with the phoneme of its closest training center.
y_val_pred = X_val.apply(
    lambda sample: nearest_neighbor_classifier(
        sample[['F1', 'F2', 'F3']].to_numpy(),
        centers_of_gravity_train,
    ),
    axis=1,
)

# Fraction of validation samples whose predicted label equals the true one.
matches = y_val_pred == y_val
val_accuracy = matches.mean()

val_accuracy

0.4276315789473684

# Logistic regression

In [142]:
# Standardize the features with StandardScaler. The scaler is fitted on the
# training split only; the same fitted transform is then applied to all three
# splits so that validation and test data never influence the statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [143]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression model.
# The explicit `multi_class='multinomial'` argument is omitted on purpose: it
# is deprecated since scikit-learn 1.5, and with the lbfgs solver the default
# behaviour already fits a multinomial model for a multiclass target.
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000)

# Train the model on the (standardized) training set
log_reg.fit(X_train_scaled, y_train)


In [144]:
# Run the fitted logistic-regression model on the standardized validation set.
y_val_pred = log_reg.predict(X_val_scaled)

# Share of validation samples that were labelled correctly.
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"Validation accuracy: {val_accuracy}")


Validation accuracy: 0.7171052631578947
