In [None]:
import pandas as pd
import io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm, linear_model, datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import linear_model
from sklearn.metrics import (confusion_matrix, precision_score, recall_score,
                             accuracy_score, roc_auc_score, RocCurveDisplay)
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from google.colab import files

# Load the pb52 vowel dataset into a DataFrame.
# The path is relative to the notebook's working directory, so pb52.csv has to be
# placed there first (on Google Colab: upload it to the session); otherwise
# read_csv raises FileNotFoundError. Load the .csv however is easiest for your
# environment.
df = pd.read_csv('pb52.csv')

FileNotFoundError: ignored

Okay, so here are the main hyperparameters/design decisions for our modeling:

Output/what we're predicting:
*   sex/gender (m/f)
*   vowel sound produced (1 of 10 vowels)

Input Features:
*   f0 (pitch of speaker)
*   f1-f3 (first three formant frequencies of the vowel sound)
*   vowel (one-hot encoded representation)
*   sex (represented by 0 for m and 1 for f)

Other Design Decisions:
*   train/validation/test split sizes for model evaluation
*   which model to use
*   how many vowels to include (more or less specificity)
*   individual model hyperparameters

In [None]:
# these X-SAMPA symbols roughly correspond to:
# "u" is the "oo" in "boot"
# "i" is the "ee" in "meet"
# "E" is the "e" in "bet"
# "A" is the "a" in "father"
# "{" is the "a" in "trap"
# "3'" is the "ir" in "bird" (r-coloured vowel)
# "U" is the "oo" in "foot"
# "O" is the "o" in "off"
# "V" is the "u" in "gut"
# "I" is the "i" in "kit"
# vowels are described here: https://en.wikipedia.org/wiki/X-SAMPA
# features described here: https://rdrr.io/cran/phonTools/man/pb52.html

# Encode sex as 0 (m) / 1 (f).
# The guard keeps this cell idempotent: calling .map() a second time on the
# already-encoded column would silently replace every value with NaN.
if not pd.api.types.is_numeric_dtype(df['sex']):
  df['sex'] = df['sex'].map({'m': 0, 'f': 1})

vowel_symbols = list(df['vowel'].unique())
print(f'Vowels Measured in the Dataset: {vowel_symbols}')
print(f'Feature Names: {list(df.columns)}')
df

Vowels Measured in the Dataset: ['i', 'I', 'E', '{', 'V', 'A', 'O', 'U', 'u', "3'"]
Feature Names: ['type', 'sex', 'speaker', 'vowel', 'repetition', 'f0', 'f1', 'f2', 'f3']


Unnamed: 0,type,sex,speaker,vowel,repetition,f0,f1,f2,f3
0,m,0,1,i,1,160,240,2280,2850
1,m,0,1,i,2,186,280,2400,2790
2,m,0,1,I,1,203,390,2030,2640
3,m,0,1,I,2,192,310,1980,2550
4,m,0,1,E,1,161,490,1870,2420
...,...,...,...,...,...,...,...,...,...
1515,c,1,76,U,2,322,610,1550,3400
1516,c,1,76,u,1,345,520,1250,3460
1517,c,1,76,u,2,334,500,1140,3380
1518,c,1,76,3',1,308,740,1850,2160


In [None]:
# SELECT PREDICTION TARGET:
main_feat = "vowel" #choose 'sex' or 'vowel'

# SETTINGS USED IN EVERY MODEL

# we can either test on the dataset with every vowel (df) or with a subset of the vowels (filtered_df)
filtered_df = df[df['vowel'].isin(["u", "i", "E", "A", "3'"])] # this selects which vowels to include
main_df = df # this sets which dataframe is being used for modeling

if main_feat == 'sex':
  y = main_df.sex
  main_df = pd.get_dummies(main_df, columns=['vowel'])
  # Pick the one-hot columns by their 'vowel_' prefix instead of "the last 10
  # columns": with filtered_df there are fewer than 10 vowels, and a positional
  # slice would drag in unrelated columns (and f0-f3 a second time).
  vowel_columns = [col for col in main_df.columns if col.startswith('vowel_')]
  input_features = ['f0', 'f1', 'f2', 'f3'] + vowel_columns
elif main_feat == 'vowel':
  y = main_df.vowel
  input_features = ['sex', 'f0', 'f1', 'f2', 'f3']
else:
  raise ValueError(f"main_feat must be 'sex' or 'vowel', got {main_feat!r}")
X = main_df[input_features]

# 80% in training 10% in validation 10% in testing
# Split the raw features first so the scaler below only ever sees training rows.
X_train_raw, X_tmp_raw, y_train, y_tmp = train_test_split(
    X, y, test_size=.2, random_state=0, stratify=y)
X_valid_raw, X_test_raw, y_valid, y_test = train_test_split(
    X_tmp_raw, y_tmp, test_size=.5, random_state=0, stratify=y_tmp)

# scaling the features so that they're all on the same scale
# this solves for the issue of logistic regression not converging
# uses standardization (subtract the mean, divide by the standard deviation),
# NOT min-max normalization
# The scaler is fit on the training split only; fitting it on the full dataset
# would leak validation/test statistics into training.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_valid = scaler.transform(X_valid_raw)
X_test = scaler.transform(X_test_raw)

# Whole dataset on the training scale, kept for inspection in later cells.
X_scaled = scaler.transform(X)

In [None]:
# Sanity check: show the scaled feature matrix (numpy abbreviates the middle rows with "...").
print(X_scaled)

[[-0.9486833  -0.51863015 -1.60695075  1.02955256  0.2738117 ]
 [-0.9486833  -0.08775577 -1.40813181  1.21799383  0.15826811]
 [-0.9486833   0.19396979 -0.86137975  0.6369666  -0.13059086]
 ...
 [ 1.05409255  2.3649138  -0.31462769 -0.76063944  1.29444673]
 [ 1.05409255  1.93403941  0.8782859   0.3543047  -1.05493957]
 [ 1.05409255  2.26548125  0.48064804  0.32289782 -0.97791051]]


In [None]:
# model 1: K-Nearest Neighbors
# Fit a 1-nearest-neighbour classifier (Euclidean distance) on the training
# split, then score its predictions on the validation split.
knn_m = KNeighborsClassifier(metric='euclidean', n_neighbors=1).fit(X_train, y_train)
pred = knn_m.predict(X_valid)
acc = accuracy_score(y_true=y_valid, y_pred=pred)
print(f'Accuracy on validation data is {round(acc,3)}')

Best Results for KNN on Validation Data (includes analysis on which features are important for each prediction):

Our number of features is small enough that we can find feature importance manually by trying models with and without different features.

For predicting gender: for 1 nearest neighbor the model got 0.928 validation accuracy. I thought that including the one-hot encoded vowels would improve accuracy but it did not. Removing the f1, f2, and f3 features had little effect on model results. This indicates that pitch is the most important feature for determining gender.

For predicting vowels: the best validation accuracy was 0.842, obtained with 3 nearest neighbors. Removing either f1 or f2 dropped accuracy sharply (below 0.4), while removing f0 made accuracy only slightly worse. Removing the f3 frequency feature did not affect performance, and excluding sex as a feature did not change model performance either.


In [None]:
# model 2: Logistic Regression
# Baseline linear classifier with scikit-learn's default hyperparameters,
# trained on the training split and scored on the validation split.
log_m = linear_model.LogisticRegression().fit(X_train, y_train)
pred = log_m.predict(X_valid)
acc = accuracy_score(y_true=y_valid, y_pred=pred)
print(f'Accuracy on validation data is {round(acc,3)}')

For predicting gender: default hyperparameters give a validation accuracy of 0.868

For predicting vowels: default hyperparameters give a validation accuracy of 0.868


In [None]:
# model 3: Support Vector Machine
# Support vector classifier with scikit-learn's default hyperparameters,
# trained on the training split and scored on the validation split.
svm_m = svm.SVC().fit(X_train, y_train)
pred = svm_m.predict(X_valid)
acc = accuracy_score(y_true=y_valid, y_pred=pred)
print(f'Accuracy on validation data is {round(acc,3)}')

For predicting gender: default hyperparameters give a validation accuracy of 0.901

For predicting vowels: default hyperparameters give a validation accuracy of 0.895

In [None]:
# model 4: Decision Tree
# Stored under its own name (tree_m) so it no longer overwrites the SVM held in
# svm_m. random_state pins the tree's tie-breaking so the reported validation
# accuracy is the same on every run.
tree_m = DecisionTreeClassifier(random_state=0)
tree_m.fit(X_train, y_train)
pred = tree_m.predict(X_valid)
acc = accuracy_score(y_valid, pred)
print(f'Accuracy on validation data is {round(acc,3)}')

For predicting gender: default hyperparameters give a validation accuracy of 0.895

For predicting vowels: default hyperparameters give a validation accuracy of 0.783

In [None]:
# testing all of them and looping through various hyperparameters
# classifiers[k] is tuned with the grid in parameter_dicts[k]; GridSearchCV
# cross-validates on the training split, and the refit winner of each family is
# then compared on the validation split.
classifiers = [svm.SVC(),
               linear_model.LogisticRegression(),
               DecisionTreeClassifier(random_state=0),  # seeded so model selection is reproducible
               KNeighborsClassifier()]
parameter_dicts = [{'kernel':('linear', 'rbf'), 'C':[0.01,0.1,1,100]},
                   {'max_iter':[1000, 4000, 5000, 6000], 'C':[0.01,0.1,1,100]},
                   {'criterion':['entropy'], 'max_depth':[1,3,5,7,15]},
                   {'metric':['euclidean'], 'n_neighbors':[1,2,3,4,5,10]}]
assert len(classifiers) == len(parameter_dicts), 'every classifier needs a parameter grid'

best_index=-1
best_acc=-np.inf
best_classifier=None
best_name = None

# Iterate over the pairs directly instead of a hard-coded range(4), so adding or
# removing a model cannot silently skip an entry or run off the end of a list.
for i, (classifier, parameters) in enumerate(zip(classifiers, parameter_dicts)):
    print(f'[{i}] grid-searching {type(classifier).__name__}')
    m = GridSearchCV(classifier, parameters)
    m.fit(X_train, y_train)
    acc = m.score(X_valid, y_valid)
    # strict '>' keeps the earliest model when two families tie
    if acc>best_acc:
        best_acc = acc
        best_index = i
        best_classifier = m
        best_name = classifier

print(f'Best Index: {best_index} \nBest Validation Accuracy: {round(best_acc, 5)}' +
      f'\nBest Model: {best_name}' +
      f'\nWith Parameters: \n{best_classifier.best_params_}')

In [None]:
# model 3 (tuned): Support Vector Machine
# Same model family as before, but with the kernel and C set explicitly
# (linear kernel, C=1) instead of the defaults; scored on the validation split.
svm_m = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
pred = svm_m.predict(X_valid)
acc = accuracy_score(y_true=y_valid, y_pred=pred)
print(f'Accuracy on validation data is {round(acc,3)}')

Accuracy on validation data is 0.882


In [None]:
# Evaluate the tuned SVM (svm_m) on our own recordings.
# The data lives in its own variables (self_df, X_self, ...) so the pb52 frame,
# feature matrix and scaler defined in earlier cells are not overwritten.
# Only the first 10 rows of the file are used; .copy() makes the slice an
# independent frame so the column assignment below is unambiguous.
self_df = pd.read_csv('self_data.csv', on_bad_lines='skip').iloc[0:10].copy()

self_df['sex'] = self_df['sex'].map({'m': 0, 'f': 1})

# Reuse main_feat / input_features from the modeling cell so the columns fed to
# the model are exactly the ones it was trained on.
if main_feat == 'sex':
  y_self = self_df.sex
  self_df = pd.get_dummies(self_df, columns=['vowel'])
  # A vowel that is missing from our recordings has no one-hot column yet;
  # add it as all zeros so the feature layout matches training.
  for col in input_features:
    if col.startswith('vowel_') and col not in self_df.columns:
      self_df[col] = 0
elif main_feat == 'vowel':
  y_self = self_df.vowel
else:
  raise ValueError(f"main_feat must be 'sex' or 'vowel', got {main_feat!r}")
X_self = self_df[input_features]

# Apply the scaler that was fit for model training. Fitting a new StandardScaler
# on these 10 rows would put the features on a different scale than the one the
# model learned from, making its predictions meaningless.
X_self_scaled = scaler.transform(X_self)

pred = svm_m.predict(X_self_scaled)
acc = accuracy_score(y_self, pred)
print(pred)
print(acc)
X_self

['u' 'i' '{' 'A' 'I' '{' 'U' 'U' 'O' 'V']
0.8


Unnamed: 0,sex,f0,f1,f2,f3
0,0,142,269,1002,2278
1,0,109,274,2433,3288
2,0,102,619,1579,2553
3,0,111,754,1184,2333
4,0,137,416,1918,2588
5,0,102,734,1552,2222
6,0,125,568,1180,2366
7,0,98,514,1080,3102
8,0,122,642,965,2840
9,0,100,609,1217,2851


For predicting gender: KNN with 5 nearest neighbors was the model and hyperparameter combination that gave the highest validation accuracy, 0.914.

For predicting vowels: SVM with a linear kernel and C=1 gave the highest validation accuracy, 0.882.