In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.metrics import accuracy_score
import h2o
from h2o.automl import H2OAutoML

In [2]:
# Load the provided Peterson-Barney data file.
# Passing `names=` supplies the column labels, so pandas does not read a header row from the file.
header_text = [
    'gender', 'speaker', 'phoneme', 'phoneme in ascii',
    'F0', 'F1', 'F2', 'F3',
]

data = pd.read_csv(
    "./PetersonBarney/verified_pb.data",
    sep='\t',
    names=header_text,
)

data

Unnamed: 0,gender,speaker,phoneme,phoneme in ascii,F0,F1,F2,F3
0,1,1,1,IY,160.0,240.0,2280.0,2850.0
1,1,1,1,IY,186.0,280.0,2400.0,2790.0
2,1,1,2,IH,203.0,390.0,2030.0,2640.0
3,1,1,2,IH,192.0,310.0,1980.0,2550.0
4,1,1,3,EH,161.0,490.0,1870.0,2420.0
...,...,...,...,...,...,...,...,...
1515,3,76,8,UH,322.0,610.0,1550.0,3400.0
1516,3,76,9,UW,345.0,520.0,1250.0,3460.0
1517,3,76,9,UW,334.0,500.0,1140.0,3380.0
1518,3,76,10,ER,308.0,740.0,1850.0,2160.0


In [3]:
# Two-step train/validation/test split:
#   1) hold out 20% of the rows, keeping 80% for training;
#   2) cut the held-out part in half, giving 10% validation and 10% test.
# Both steps use a fixed random_state so the split is reproducible.
train_df, temp_df = train_test_split(data, test_size=0.2, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

for split_label, split_frame in (
    ("Original", data),
    ("Train", train_df),
    ("Validation", valid_df),
    ("Test", test_df),
):
    print(f"{split_label} size: {split_frame.shape}")

# Sanity check: the three splits together account for every row of the original data
assert sum(len(part) for part in (test_df, valid_df, train_df)) == len(data)

Original size: (1520, 8)
Train size: (1216, 8)
Validation size: (152, 8)
Test size: (152, 8)


In [4]:
# Columns used by the classification task:
#   feature_cols - the numeric F0..F3 measurements used as model inputs
#   target_col   - the numeric phoneme label to predict
feature_cols = ['F0', 'F1', 'F2', 'F3']
target_col = 'phoneme'

In [5]:
# Finding the center of each class/cluster (only using the training data):
# one row per phoneme holding the mean of every feature column.
# `feature_cols` is reused here (instead of repeating the column names) so the centroids
# always cover exactly the columns that the distance computation reads later.
centroid_df = train_df.groupby(target_col, as_index=False)[feature_cols].mean()
centroid_df

Unnamed: 0,phoneme,F0,F1,F2,F3
0,1,197.330508,299.974576,2636.610169,3238.983051
1,2,199.424,438.064,2336.936,2972.76
2,3,187.772358,594.707317,2176.186992,2876.154472
3,4,177.122951,795.901639,1959.229508,2723.713115
4,5,184.081967,717.016393,1339.54918,2690.106557
5,6,187.201681,844.218487,1207.865546,2728.378151
6,7,185.177419,601.564516,916.185484,2699.33871
7,8,199.778689,474.155738,1151.196721,2620.270492
8,9,200.292683,355.99187,961.918699,2597.365854
9,10,192.059322,513.898305,1558.915254,1915.305085


In [6]:
def performing_pred(df):
    """Return the accuracy of the nearest-centroid benchmark on ``df``.

    Every row of ``df`` is assigned the phoneme of the closest class centroid,
    using Euclidean distance over ``feature_cols`` against the module-level
    ``centroid_df``. The input frame is not modified: predictions are kept in a
    local array instead of being written back as a new column, so the split
    frames stay unchanged for the cells that reuse them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``feature_cols`` columns and the ``target_col`` column.

    Returns
    -------
    float
        Fraction of rows whose predicted phoneme equals ``df[target_col]``.
    """
    # Distance matrix: one row per sample in df, one column per centroid
    distances = cdist(df[feature_cols], centroid_df[feature_cols], metric='euclidean')
    closest_centroid_indices = np.argmin(distances, axis=1)

    # Translate the positional index of the winning centroid into its phoneme label
    predicted_phonemes = centroid_df[target_col].to_numpy()[closest_centroid_indices]

    return accuracy_score(df[target_col], predicted_phonemes)

In [7]:
# Report the nearest-centroid benchmark accuracy on each split, in train/validation/test order
print('-------------BENCHMARK RESULTS-------------')
for split_name, split_df in (
    ('training', train_df),
    ('validation', valid_df),
    ('test', test_df),
):
    print(f'Accuracy on {split_name} set: {performing_pred(split_df) * 100:.2f}%')
print('-------------------------------------------')

-------------BENCHMARK RESULTS-------------
Accuracy on training set: 49.92%
Accuracy on validation set: 51.32%
Accuracy on test set: 52.63%
-------------------------------------------


To build a better classifier, I used the H2O library, which provides automatic tools for model training and gives good results with minimal work.

In [8]:
# Starting the H2O cluster: connects to an H2O instance if one is already running locally,
# otherwise starts a local H2O server (see the log output of this cell)
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 21.0.1+12-LTS-29, mixed mode, sharing)
  Starting server from C:\Users\mate.gedeon\AppData\Local\anaconda3\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\MATE~1.GED\AppData\Local\Temp\tmpaugpsn8o
  JVM stdout: C:\Users\MATE~1.GED\AppData\Local\Temp\tmpaugpsn8o\h2o_mate_gedeon_started_from_python.out
  JVM stderr: C:\Users\MATE~1.GED\AppData\Local\Temp\tmpaugpsn8o\h2o_mate_gedeon_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Budapest
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.2
H2O_cluster_version_age:,10 months and 5 days
H2O_cluster_name:,H2O_from_python_mate_gedeon_0ol8xf
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.896 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [9]:
# Convert the three pandas splits to H2O frames
h2o_train, h2o_val, h2o_test = (
    h2o.H2OFrame(split_df) for split_df in (train_df, valid_df, test_df)
)

# Mark the target column as a factor (categorical) in every frame
for h2o_frame in (h2o_train, h2o_val, h2o_test):
    h2o_frame[target_col] = h2o_frame[target_col].asfactor()

# By default H2O uses cross-validation on all available data; nfolds=0 switches that off
# so that the pre-defined validation set passed below is used instead
aml = H2OAutoML(max_models=50, seed=1, nfolds=0)
aml.train(
    x=feature_cols,
    y=target_col,
    training_frame=h2o_train,
    validation_frame=h2o_val,
)

# H2O creates a leaderboard with the trained models' info; show every row of it
lb = aml.leaderboard
lb.head(rows=lb.nrows)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
20:49:36.540: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%


model_id,mean_per_class_error,logloss,rmse,mse
DeepLearning_grid_1_AutoML_1_20240913_204936_model_6,0.0716099,0.425008,0.326726,0.10675
DeepLearning_grid_2_AutoML_1_20240913_204936_model_1,0.0909791,0.40481,0.327358,0.107163
DeepLearning_grid_1_AutoML_1_20240913_204936_model_1,0.0955522,0.382132,0.309168,0.0955847
DeepLearning_grid_3_AutoML_1_20240913_204936_model_6,0.097433,0.380351,0.338521,0.114596
DeepLearning_grid_2_AutoML_1_20240913_204936_model_6,0.0974771,0.444286,0.352231,0.124066
GBM_grid_1_AutoML_1_20240913_204936_model_2,0.0987933,0.396096,0.309366,0.0957072
DeepLearning_grid_2_AutoML_1_20240913_204936_model_3,0.0996583,0.437584,0.355116,0.126107
GLM_1_AutoML_1_20240913_204936,0.103025,0.372175,0.316282,0.100034
GBM_grid_1_AutoML_1_20240913_204936_model_3,0.107884,0.453678,0.332699,0.110689
XRT_1_AutoML_1_20240913_204936,0.108252,0.584118,0.333346,0.111119


In [10]:
# Extracting the best model: `aml.leader` is the model ranked first on the AutoML leaderboard above.
# It is used for all evaluations in the following cells.
best_model = aml.leader

In [11]:
# Creating a function for evaluation
def h2o_eval(frame):
    """Return the accuracy of the module-level ``best_model`` on an H2O frame.

    Parameters
    ----------
    frame : H2OFrame
        Frame to score; it must contain the ``target_col`` column with the true labels.

    Returns
    -------
    float
        Accuracy computed by ``accuracy_score`` between the true labels and the
        ``predict`` column of the model's predictions.
    """
    # Predicted class per row, pulled back into pandas
    predicted_labels = best_model.predict(frame).as_data_frame()['predict']
    # True labels as a single-column pandas DataFrame
    true_labels = frame[target_col].as_data_frame()

    return accuracy_score(true_labels, predicted_labels)

In [12]:
# Evaluate the leading AutoML model on each split, in train/validation/test order
print('-------------H2O RESULTS-------------')
for split_name, split_frame in (
    ('training', h2o_train),
    ('validation', h2o_val),
    ('test', h2o_test),
):
    print(f'Accuracy on {split_name} set: {h2o_eval(split_frame) * 100:.2f}%')
print('-------------------------------------------')

-------------H2O RESULTS-------------
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%
Accuracy on training set: 88.65%
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%
Accuracy on validation set: 92.11%
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%
Accuracy on test set: 87.50%
-------------------------------------------


In [13]:
# Shut down the H2O cluster now that training and evaluation are finished.
# Any later H2O call needs h2o.init() to be run again first.
h2o.cluster().shutdown()

H2O session _sid_a4d1 closed.


**Conclusion:** The benchmark approach, with roughly 50% accuracy, is still respectable compared to random guessing (about 10% with 10 classes), but H2O performs much better. The best performance came from a Deep Learning model, with a couple of GBM models following it on the leaderboard; on some metrics (e.g., logloss and RMSE) these GBM models are better than Deep Learning models ranked above them. Most likely the GBM models would still far outperform the benchmark.