In [25]:
#We need to create several functions in order to classificate our data 

#first we import the libraries needed
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from scipy.spatial import distance
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

#this function help us to load the data with the .data format
def load_data(filename):
    columns = ['gender', 'speaker', 'phoneme', 'phoneme_ascii', 'F0', 'F1', 'F2', 'F3']
    data = pd.read_csv(filename, delim_whitespace=True, names=columns)
    return data

#function to divide data in training, validation and test (80/10/10)
def split_data(data):
    train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)
    val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
    return train_data, val_data, test_data

# function to calculate the centroid of each phoneme
def calculate_centroid(train_data):
    centroids = train_data.groupby('phoneme')[['F0', 'F1', 'F2', 'F3']].mean()
    return centroids

# base classifier using distance to the centroids calculated
def baseline_classifier(test_data, centroids):
    predictions = []
    for _, row in test_data.iterrows():
        distances = centroids.apply(lambda x: distance.euclidean([row['F0'], row['F1'], row['F2'], row['F3']], x), axis=1)
        predicted_phoneme = distances.idxmin()  # Fonema más cercano
        predictions.append(predicted_phoneme)
    return predictions

#function to evaluate the model we are using
def evaluate_model(test_data, predictions):
    accuracy = accuracy_score(test_data['phoneme'], predictions)
    print(f"Accuracy: {accuracy}")
    print(classification_report(test_data['phoneme'], predictions))
    
#same functions with no F0 in case we need to take it out

def calculate_centroid_noF0(train_data):
    centroids = train_data.groupby('phoneme')[['F1', 'F2', 'F3']].mean()
    return centroids

# Clasificador base usando la distancia al centroide
def baseline_classifier_noF0(test_data, centroids):
    predictions = []
    for _, row in test_data.iterrows():
        distances = centroids.apply(lambda x: distance.euclidean([row['F1'], row['F2'], row['F3']], x), axis=1)
        predicted_phoneme = distances.idxmin()
        predictions.append(predicted_phoneme)
    return predictions

In [27]:
#load the data copying the directory of the file and ending with //verified_pb.data
data = load_data('C:/Users/otelo/OneDrive - Universidad Politécnica de Madrid/Documentos/MATES UPM/4º/AML/PetersonBarney//verified_pb.data')

In [28]:
data

Unnamed: 0,gender,speaker,phoneme,phoneme_ascii,F0,F1,F2,F3
0,1,1,1,IY,160.0,240.0,2280.0,2850.0
1,1,1,1,IY,186.0,280.0,2400.0,2790.0
2,1,1,2,IH,203.0,390.0,2030.0,2640.0
3,1,1,2,IH,192.0,310.0,1980.0,2550.0
4,1,1,3,EH,161.0,490.0,1870.0,2420.0
...,...,...,...,...,...,...,...,...
1515,3,76,8,UH,322.0,610.0,1550.0,3400.0
1516,3,76,9,UW,345.0,520.0,1250.0,3460.0
1517,3,76,9,UW,334.0,500.0,1140.0,3380.0
1518,3,76,10,ER,308.0,740.0,1850.0,2160.0


In [30]:
#split the data 80/10/10
train_data, val_data, test_data = split_data(data)

In [31]:
#get the centroids
centroids = calculate_centroid(train_data)

In [32]:
centroids

Unnamed: 0_level_0,F0,F1,F2,F3
phoneme,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,197.330508,299.974576,2636.610169,3238.983051
2,199.424,438.064,2336.936,2972.76
3,187.772358,594.707317,2176.186992,2876.154472
4,177.122951,795.901639,1959.229508,2723.713115
5,184.081967,717.016393,1339.54918,2690.106557
6,187.201681,844.218487,1207.865546,2728.378151
7,185.177419,601.564516,916.185484,2699.33871
8,199.778689,474.155738,1151.196721,2620.270492
9,200.292683,355.99187,961.918699,2597.365854
10,192.059322,513.898305,1558.915254,1915.305085


In [33]:
predictions = baseline_classifier(test_data, centroids)

In [34]:
predictions
#we consider {1: 'a', 2: 'e', 3: 'i', 4: 'o', 5: 'u', 6: 'ae', 7: 'oe', 8: 'ii', 9: 'uu', 10: 'ou'}

[4,
 9,
 6,
 1,
 10,
 4,
 7,
 4,
 2,
 7,
 8,
 8,
 1,
 8,
 10,
 1,
 8,
 8,
 10,
 1,
 10,
 1,
 6,
 8,
 9,
 2,
 9,
 6,
 9,
 6,
 4,
 9,
 9,
 1,
 1,
 8,
 5,
 8,
 2,
 3,
 10,
 8,
 1,
 7,
 6,
 1,
 4,
 10,
 4,
 5,
 4,
 7,
 10,
 5,
 10,
 9,
 2,
 10,
 9,
 7,
 2,
 1,
 6,
 7,
 8,
 2,
 10,
 4,
 4,
 7,
 8,
 4,
 10,
 10,
 7,
 6,
 5,
 1,
 4,
 8,
 9,
 7,
 1,
 2,
 4,
 1,
 7,
 9,
 1,
 9,
 9,
 6,
 5,
 9,
 8,
 7,
 7,
 4,
 5,
 7,
 5,
 6,
 10,
 7,
 9,
 5,
 9,
 1,
 1,
 7,
 9,
 10,
 5,
 6,
 3,
 4,
 5,
 4,
 2,
 4,
 1,
 9,
 6,
 9,
 1,
 7,
 4,
 3,
 10,
 8,
 8,
 10,
 4,
 2,
 9,
 4,
 1,
 10,
 9,
 3,
 1,
 5,
 9,
 10,
 7,
 7,
 8,
 2,
 5,
 1,
 3,
 1]

In [35]:
#now we are able to see how correctly the model predicts the results
evaluate_model(test_data, predictions)

Accuracy: 0.5263157894736842
              precision    recall  f1-score   support

           1       0.64      0.82      0.72        17
           2       0.30      0.27      0.29        11
           3       0.20      0.10      0.13        10
           4       0.53      0.71      0.61        14
           5       0.50      0.46      0.48        13
           6       0.91      0.45      0.61        22
           7       0.61      0.61      0.61        18
           8       0.19      0.23      0.21        13
           9       0.33      0.44      0.38        16
          10       0.83      0.83      0.83        18

    accuracy                           0.53       152
   macro avg       0.50      0.49      0.49       152
weighted avg       0.55      0.53      0.52       152



In [37]:
#we get 0.526 (52.6%), which means that, on average, the model correctly predicts about half of the time. Since we have 10 classes, this value is above a random classifier (which would have an expected accuracy of 10%),

In [38]:
#For CLASSES
#Classes 1, 4, 6, 7, 10 have a good performance with F1-scores around 0.61-0.83, indicating that the model works reasonably well for these phonemes.
#classes 2, 3, 8, 9 have problems. Especially classes 2, 3 and 8 show rather low F1-scores (0.13-0.38), which means that the model has difficulty correctly classifying these classes.
#Clase 6: It has a high accuracy (0.91), but a low recall (0.45), suggesting that when it predicts class 6, it is usually correct, but often fails to recognize examples of that class.


In [39]:
#macro avg indicates model performance in all classes equally
#weighted avg indicates the performance of the model weighted by the number of samples of each class


In [40]:
#Problemas identified:
#1) Uneven performance between classes
#2) Class 6: Difference between accuracy and recall, as we said before, indicates that the model correctly predicts when it tags examples such as Class 6 but does not correctly detect examples of this class



In [14]:
#soluciones:
#1) Normalization
#2) exclude irrelevant features like F0
#3)#exclude unreliable data like the ones with asterisks

In [71]:
#1) Normalization

#we need to define the baseline classifier in another way
from scipy.spatial.distance import cdist

def baseline_classifier_4(X_train, y_train, X_test, y_test):
    # Calculamos los centroides de cada fonema en el conjunto de entrenamiento
    unique_phonemes = np.unique(y_train)
    centroids = {}

    for phoneme in unique_phonemes:
        # Seleccionamos las filas de X_train que corresponden a cada fonema
        centroids[phoneme] = X_train[y_train == phoneme].mean(axis=0)

    # Convertimos el diccionario a una matriz de centroides para más eficiencia
    centroid_matrix = np.array([centroids[phoneme] for phoneme in unique_phonemes])

    # Predecimos el fonema más cercano a cada muestra en el conjunto de prueba
    predictions = []
    for sample in X_test:
        # Calculamos la distancia de la muestra a cada centroide
        distances = cdist([sample], centroid_matrix, metric='euclidean')
        
        # Seleccionamos el fonema con la distancia más corta (más cercano)
        predicted_phoneme = unique_phonemes[np.argmin(distances)]
        predictions.append(predicted_phoneme)

    return predictions


# separate the characteristics (F0, F1, F2, F3) and tags (phonemes) in training and test
X_train = train_data[['F0', 'F1', 'F2', 'F3']]  # characteristics
y_train = train_data['phoneme']  # phonemes
X_test = test_data[['F0', 'F1', 'F2', 'F3']]  #characteristics
y_test = test_data['phoneme']  #phonemes

#create the scaler, which normalizes
scaler = StandardScaler()

# normalize the characteristics that belong to the training section
X_train_scaled = scaler.fit_transform(X_train)

# normalize the characteristics that belong to the test section
X_test_scaled = scaler.transform(X_test)

# Paso 1: Obtener predicciones con el baseline classifier normalizado
predictions_scaled = baseline_classifier_4(X_train_scaled, y_train, X_test_scaled, y_test)

#we again evaluate the model
accuracy_scaled = accuracy_score(y_test, predictions_scaled)
print(f"Exactitud con datos normalizados: {accuracy_scaled}")

#and print the report
print(classification_report(y_test, predictions_scaled))


Exactitud con datos normalizados: 0.675
              precision    recall  f1-score   support

           1       0.75      0.60      0.67        20
          10       0.82      0.93      0.87        15
           2       0.28      0.56      0.37         9
           3       0.78      0.54      0.64        13
           4       0.94      0.88      0.91        17
           5       0.50      0.57      0.53         7
           6       0.83      0.71      0.77         7
           7       0.62      0.73      0.67        11
           8       0.33      0.25      0.29         8
           9       0.82      0.69      0.75        13

    accuracy                           0.68       120
   macro avg       0.67      0.65      0.65       120
weighted avg       0.71      0.68      0.68       120



In [53]:
#RESULTS: we see that normlizing the data has improved the results considerably in all aspects so we save it as the best solution until we find a better one

In [49]:
#2) exclude F0: we will use the functions we define at the beginning for data with no F0 column

In [50]:
centroids_noF0 = calculate_centroid_noF0(train_data)

In [51]:
predictions_noF0 = baseline_classifier_noF0(test_data, centroids_noF0)

In [52]:
evaluate_model(test_data, predictions_noF0)

Accuracy: 0.5328947368421053
              precision    recall  f1-score   support

           1       0.64      0.82      0.72        17
           2       0.30      0.27      0.29        11
           3       0.20      0.10      0.13        10
           4       0.53      0.71      0.61        14
           5       0.58      0.54      0.56        13
           6       1.00      0.50      0.67        22
           7       0.58      0.61      0.59        18
           8       0.13      0.15      0.14        13
           9       0.33      0.44      0.38        16
          10       0.83      0.83      0.83        18

    accuracy                           0.53       152
   macro avg       0.51      0.50      0.49       152
weighted avg       0.56      0.53      0.53       152



In [54]:
#RESULTS: we observe that taking out FO does not really improve much the results so next we will try our last solution, excluding unreliable data

In [55]:
#3)EXCLUDE DATA WITH ASTERISK

In [62]:
# Convertir la columna 'ascii_phoneme' a string por si tiene algún valor no string
data['phoneme_ascii'] = data['phoneme_ascii'].astype(str)

# Eliminar filas donde la columna 'ascii_phoneme' contiene un asterisco al inicio
data_cleaned = data[~data['phoneme_ascii'].str.startswith('*', na=False)]

#again, we divide the data and repeat the process
train_data, val_data, test_data = split_data(data_cleaned)

centroids = calculate_centroid(train_data)

predictions = baseline_classifier(test_data, centroids)

evaluate_model(test_data, predictions)


Accuracy: 0.5083333333333333
              precision    recall  f1-score   support

           1       0.58      0.55      0.56        20
          10       0.74      0.93      0.82        15
           2       0.08      0.11      0.09         9
           3       0.40      0.31      0.35        13
           4       0.86      0.71      0.77        17
           5       0.30      0.43      0.35         7
           6       0.56      0.71      0.63         7
           7       0.75      0.55      0.63        11
           8       0.12      0.12      0.12         8
           9       0.40      0.31      0.35        13

    accuracy                           0.51       120
   macro avg       0.48      0.47      0.47       120
weighted avg       0.53      0.51      0.51       120



In [63]:
data_cleaned

Unnamed: 0,gender,speaker,phoneme,phoneme_ascii,F0,F1,F2,F3
0,1,1,1,IY,160.0,240.0,2280.0,2850.0
1,1,1,1,IY,186.0,280.0,2400.0,2790.0
2,1,1,2,IH,203.0,390.0,2030.0,2640.0
3,1,1,2,IH,192.0,310.0,1980.0,2550.0
4,1,1,3,EH,161.0,490.0,1870.0,2420.0
...,...,...,...,...,...,...,...,...
1515,3,76,8,UH,322.0,610.0,1550.0,3400.0
1516,3,76,9,UW,345.0,520.0,1250.0,3460.0
1517,3,76,9,UW,334.0,500.0,1140.0,3380.0
1518,3,76,10,ER,308.0,740.0,1850.0,2160.0


In [64]:
#RESULT: we observe that cleaned data has 321 columns less and that the results for these data are a bit worse 
#therefore, we do not consider this a good solution to our problems

In [68]:
#final conclussion
#the best solution to obtain better results is normalizing the data and excluding F0 or unreliable data is nor effective 