In [None]:
import librosa
import os
import numpy as np
import re

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

### Import Speaker Metadata

In [None]:
meta = pd.read_csv('drive/MyDrive/speakers_all.csv')
display(meta)

Unnamed: 0,age,age_onset,birthplace,filename,native_language,sex,speakerid,country,file_missing?,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,24.0,12.0,"koussi, senegal",balanta,balanta,male,788,senegal,True,,,
1,18.0,10.0,"buea, cameroon",cameroon,cameroon,male,1953,cameroon,True,,,
2,48.0,8.0,"hong, adamawa, nigeria",fulfulde,fulfulde,male,1037,nigeria,True,,,
3,42.0,42.0,"port-au-prince, haiti",haitian,haitian,male,1165,haiti,True,,,
4,40.0,35.0,"port-au-prince, haiti",haitian,haitian,male,1166,haiti,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2167,46.0,5.0,"lagos, nigeria",yoruba3,yoruba,female,766,nigeria,False,,,
2168,46.0,12.0,"lagos, nigeria",yoruba4,yoruba,male,851,nigeria,False,,,
2169,47.0,2.0,"ibadan, nigeria",yoruba5,yoruba,female,2023,nigeria,False,,,
2170,31.0,1.0,"bethel, alaska, usa",yupik1,yupik,female,571,usa,False,,,


In [None]:
lang_list = list(meta['native_language'].value_counts().keys()[:6])
print(lang_list)

['english', 'spanish', 'arabic', 'mandarin', 'french', 'korean']


### Import Audio Data

In [None]:
! unzip "/content/drive/MyDrive/recordings2.zip" -d "/content"

Archive:  /content/drive/MyDrive/recordings2.zip
   creating: /content/output/
  inflating: /content/output/russian12.wav  
  inflating: /content/output/nepali2.wav  
  inflating: /content/output/armenian4.wav  
  inflating: /content/output/amharic15.wav  
  inflating: /content/output/english101.wav  
  inflating: /content/output/romanian6.wav  
  inflating: /content/output/german19.wav  
  inflating: /content/output/french14.wav  
  inflating: /content/output/mandarin3.wav  
  inflating: /content/output/english7.wav  
  inflating: /content/output/english115.wav  
  inflating: /content/output/miskito8.wav  
  inflating: /content/output/german25.wav  
  inflating: /content/output/french28.wav  
  inflating: /content/output/german31.wav  
  inflating: /content/output/vietnamese17.wav  
  inflating: /content/output/english129.wav  
  inflating: /content/output/cantonese9.wav  
  inflating: /content/output/arabic96.wav  
  inflating: /content/output/arabic82.wav  
  inflating: /content/out

### Generate MFCC Embeddings

In [None]:
#MFCCs

directory = 'output'
mfccss = []
labels = []
max_dim = 0
min_dim = np.float("inf")

# iterate over files in directory
for filename in os.listdir(directory):

    print(filename)

    #only include files with speakers from lang_list
    lang = meta[meta['filename'] + ".wav" == filename]['native_language'].values[0]

    if lang in lang_list:
        labels.append(lang)

        f = os.path.join(directory, filename)
        x, sr = librosa.load(f)

        #compute mfcss
        mfccs = librosa.feature.mfcc(y = x, sr =sr , n_mfcc = 40)

        max_dim = max(max_dim, mfccs.shape[1])
        min_dim = min(min_dim, mfccs.shape[1])
        mfccss.append(mfccs)



Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  min_dim = np.float("inf")


italian17.wav
english153.wav
spanish3.wav
romanian11.wav
estonian2.wav
arabic25.wav
english502.wav
xiang3.wav
armenian8.wav
english438.wav
arabic29.wav
french47.wav
malayalam2.wav
spanish80.wav
romanian16.wav
georgian1.wav
serbian18.wav
english104.wav
polish18.wav
mandarin43.wav
greek3.wav
arabic49.wav
arabic58.wav
english103.wav
urdu9.wav
korean3.wav
english249.wav
malayalam1.wav
english97.wav
greek7.wav
turkish16.wav
english54.wav
spanish93.wav
arabic54.wav
english578.wav
portuguese38.wav
polish16.wav
macedonian1.wav
azerbaijani2.wav
dari5.wav
bosnian5.wav
english19.wav
korean14.wav
punjabi4.wav
serbian15.wav
english106.wav
russian33.wav
spanish52.wav
spanish48.wav
gujarati1.wav
italian13.wav
korean42.wav
italian25.wav
ife1.wav
vlaams3.wav
english534.wav
russian12.wav
english32.wav
hausa6.wav
greek11.wav
english94.wav
english126.wav
russian34.wav
russian29.wav
ga5.wav
english571.wav
english123.wav
dutch41.wav
ewe1.wav
english495.wav
french30.wav
mandarin47.wav
english472.wav
arabic31

In [None]:
print(labels)

['english', 'spanish', 'arabic', 'english', 'english', 'arabic', 'french', 'spanish', 'english', 'mandarin', 'arabic', 'arabic', 'english', 'korean', 'english', 'english', 'english', 'spanish', 'arabic', 'english', 'english', 'korean', 'english', 'spanish', 'spanish', 'korean', 'english', 'english', 'english', 'english', 'english', 'english', 'english', 'french', 'mandarin', 'english', 'arabic', 'english', 'english', 'arabic', 'english', 'arabic', 'english', 'spanish', 'spanish', 'mandarin', 'english', 'korean', 'spanish', 'arabic', 'english', 'mandarin', 'english', 'english', 'english', 'spanish', 'korean', 'english', 'english', 'mandarin', 'arabic', 'english', 'english', 'english', 'french', 'english', 'english', 'spanish', 'english', 'spanish', 'english', 'english', 'english', 'english', 'english', 'english', 'spanish', 'spanish', 'spanish', 'english', 'english', 'english', 'english', 'spanish', 'mandarin', 'english', 'french', 'english', 'spanish', 'english', 'english', 'english', 

## Pad MFCCS

In [None]:
#resize mfccs to all be the same size
resized_mfccs = []
for mfcc in mfccss:
  rs = librosa.util.fix_length(data=mfcc, size=max_dim, axis=1)
  resized_mfccs.append(rs.flatten())

resized_mfccs = np.array(resized_mfccs)
print(resized_mfccs.shape)

(1023, 146640)


#Clustering

## PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
pca = PCA(n_components=3)
transformed = pca.fit_transform(resized_mfccs)

In [None]:
print(transformed.shape)

(1023, 3)


In [None]:
pca.explained_variance_ratio_

array([0.2639865 , 0.06861802, 0.04019303], dtype=float32)

In [None]:
#Plot results
fig = px.scatter_3d(transformed, x=0, y=1, z= 2, color = labels, title = "PCA of MFCC")
fig.show()

### PCA Clustering Metrics

In [None]:
def clustering_metrics(labels, transformed):
  lang_to_comps = {}
  for lang, comp in zip(labels, transformed):
    if lang not in lang_to_comps:
        lang_to_comps[lang] = [comp]
    else:
        lang_to_comps[lang].append(comp)

  #calculate cluster centroids
  lang_to_centroid = {}
  for lang in lang_to_comps:
    centroid = np.average(np.asarray(lang_to_comps[lang]), axis = 0)
    lang_to_centroid[lang] = centroid


  #standard deviations
  for lang in lang_to_comps:
    std = np.std(np.asarray(lang_to_comps[lang]))
    print(lang.upper(), "standard deviation: ", std)

  print("\n")

  #pairwise cluster distances
  for i in range(len(lang_list)):
    centroid = lang_to_centroid[lang_list[i]]
    #english_centroid = lang_to_centroid['english']
    for j in range(i+1, len(lang_list)):
      #if lang != l:
      distance = np.linalg.norm(lang_to_centroid[lang_list[j]] - centroid)
      print(lang_list[i].upper() + "-" + lang_list[j].upper() + ' distance : ', distance)




In [None]:
clustering_metrics(labels, transformed)

FRENCH standard deviation:  2003.4473
MANDARIN standard deviation:  2747.9807
SPANISH standard deviation:  2773.0388
ENGLISH standard deviation:  1764.3357
KOREAN standard deviation:  2701.9292
ARABIC standard deviation:  3111.815


ENGLISH-SPANISH distance :  3282.56
ENGLISH-ARABIC distance :  4952.707
ENGLISH-MANDARIN distance :  3931.9167
ENGLISH-FRENCH distance :  2076.2822
ENGLISH-KOREAN distance :  3649.1873
SPANISH-ARABIC distance :  1708.4023
SPANISH-MANDARIN distance :  679.2193
SPANISH-FRENCH distance :  1445.0953
SPANISH-KOREAN distance :  463.0187
ARABIC-MANDARIN distance :  1201.8511
ARABIC-FRENCH distance :  3090.0422
ARABIC-KOREAN distance :  1471.4558
MANDARIN-FRENCH distance :  2039.7697
MANDARIN-KOREAN distance :  373.4303
FRENCH-KOREAN distance :  1699.0162


## tSNE

In [None]:
from sklearn.manifold import TSNE

tSNE Hyperparameter Tuning

In [None]:
#experiment with different hyperparameters

perplexities = [2, 5, 30, 50, 100]
iterations = [500, 1000, 2000, 5000]
lr = [10, 50, 100]

for p in perplexities:
  for i in iterations:
    for l in lr:
        tsne = TSNE(n_components = 3, perplexity=p, learning_rate = l, n_iter = i).fit_transform(resized_mfccs)
        print("Hyperparameters: perplexity = " + str(p) + " iterations = " + str(i) + " lr = " + str(l))

        fig = px.scatter_3d(tsne, x=0, y=1, z=2, color = labels)
        fig.show()


KeyboardInterrupt: ignored

### Final tSNE

In [43]:
tsne = TSNE(n_components = 3, perplexity=50, learning_rate = 10, n_iter = 5000).fit_transform(resized_mfccs)

fig = px.scatter_3d(tsne, x=0, y=1, z=2, color = labels, title = "tSNE of MFCC")
fig.show()

clustering_metrics(labels, tsne)

ENGLISH standard deviation:  7.6257024
SPANISH standard deviation:  9.678843
ARABIC standard deviation:  10.232881
FRENCH standard deviation:  8.239849
MANDARIN standard deviation:  10.0298395
KOREAN standard deviation:  9.3293085


ENGLISH-SPANISH distance :  13.727588
ENGLISH-ARABIC distance :  20.079487
ENGLISH-MANDARIN distance :  17.25433
ENGLISH-FRENCH distance :  10.48883
ENGLISH-KOREAN distance :  16.983809
SPANISH-ARABIC distance :  6.383329
SPANISH-MANDARIN distance :  3.6084385
SPANISH-FRENCH distance :  3.3008423
SPANISH-KOREAN distance :  3.3153098
ARABIC-MANDARIN distance :  2.955766
ARABIC-FRENCH distance :  9.6063795
ARABIC-KOREAN distance :  3.1257854
MANDARIN-FRENCH distance :  6.8574386
MANDARIN-KOREAN distance :  0.48825985
FRENCH-KOREAN distance :  6.5457463


In [44]:
#numerically encode labels
nums = []
for l in labels:
  if l == "english":
    nums.append(1)
  elif l == "spanish":
    nums.append(2)
  elif l == "arabic":
    nums.append(3)
  elif l == "french":
    nums.append(4)
  elif l == "madarin":
    nums.append(5)
  else:
    nums.append(6)


In [45]:
from sklearn.metrics import r2_score
from sklearn.cross_decomposition import PLSRegression

#Variance explained by tSNE
tSNE_matrix = pd.DataFrame(tsne[:, 0])
pls1 = PLSRegression(n_components = 1)
pls1.fit(tSNE_matrix, nums)
y_pred1 = pls1.predict(tSNE_matrix)
print(r2_score(nums, y_pred1, multioutput = 'variance_weighted'))

tSNE_matrix = pd.DataFrame(tsne[:, 1])
pls2 = PLSRegression(n_components = 1)
pls2.fit(tSNE_matrix, nums)
y_pred2 = pls2.predict(tSNE_matrix)
print(r2_score(nums, y_pred2, multioutput = 'variance_weighted'))

tSNE_matrix = pd.DataFrame(tsne[:, 2])
pls3 = PLSRegression(n_components = 1)
pls3.fit(tSNE_matrix, nums)
y_pred3 = pls3.predict(tSNE_matrix)
print(r2_score(nums, y_pred3, multioutput = 'variance_weighted'))


0.18382952034969083
0.001220270683492397
0.01583628977613627


### UMAP

In [None]:
!pip install umap-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82816 sha256=f30100c8fdefdcf9778a66e50ffcf97801b1201d04d0034d15697c875b3b0281
  Stored in directory: /root/.cache/pip/wheels/a0/e8/c6/a37ea663620bd5200ea1ba0907ab3c217042c1d

In [None]:
import umap.umap_ as umap

### UMAP Hyperparamter Tuning

In [None]:
distances = [0, .001, .01, .1, .5]
neighbors = [5, 15, 50, 100, 150]
metric = ["euclidean"]

for n in neighbors:
  for d in distances:
    for m in metric:
      u = umap.UMAP(n_components = 3, n_neighbors = n, min_dist = d, metric = m).fit_transform(resized_mfccs)
      print("Hyperparameters: neighbors = " + str(n) + " distances = " + str(d) + "metric = " + str(m))
      fig = px.scatter_3d(u, x=0, y=1, z=2, color = labels)
      fig.show()


Hyperparameters: neighbors = 5 distances = 0metric = euclidean


Hyperparameters: neighbors = 5 distances = 0.001metric = euclidean


Hyperparameters: neighbors = 5 distances = 0.01metric = euclidean


Hyperparameters: neighbors = 5 distances = 0.1metric = euclidean


Hyperparameters: neighbors = 5 distances = 0.5metric = euclidean


Hyperparameters: neighbors = 15 distances = 0metric = euclidean


Hyperparameters: neighbors = 15 distances = 0.001metric = euclidean


Hyperparameters: neighbors = 15 distances = 0.01metric = euclidean


Hyperparameters: neighbors = 15 distances = 0.1metric = euclidean


Hyperparameters: neighbors = 15 distances = 0.5metric = euclidean


Hyperparameters: neighbors = 50 distances = 0metric = euclidean


Hyperparameters: neighbors = 50 distances = 0.001metric = euclidean


Hyperparameters: neighbors = 50 distances = 0.01metric = euclidean


Hyperparameters: neighbors = 50 distances = 0.1metric = euclidean


Hyperparameters: neighbors = 50 distances = 0.5metric = euclidean


Hyperparameters: neighbors = 100 distances = 0metric = euclidean


Hyperparameters: neighbors = 100 distances = 0.001metric = euclidean


Hyperparameters: neighbors = 100 distances = 0.01metric = euclidean


Hyperparameters: neighbors = 100 distances = 0.1metric = euclidean


Hyperparameters: neighbors = 100 distances = 0.5metric = euclidean


Hyperparameters: neighbors = 150 distances = 0metric = euclidean


Hyperparameters: neighbors = 150 distances = 0.001metric = euclidean


Hyperparameters: neighbors = 150 distances = 0.01metric = euclidean


Hyperparameters: neighbors = 150 distances = 0.1metric = euclidean


Hyperparameters: neighbors = 150 distances = 0.5metric = euclidean


Final UMAP

In [40]:
u = umap.UMAP(n_components = 3, n_neighbors = 150, min_dist = 0.1, metric = "euclidean").fit_transform(resized_mfccs)
fig = px.scatter_3d(u, x=0, y=1, z=2, color = labels, title = "UMAP of MFCC")
fig.show()

clustering_metrics(labels, u)


ENGLISH standard deviation:  2.7565863
SPANISH standard deviation:  2.558641
ARABIC standard deviation:  2.3593345
FRENCH standard deviation:  2.2744977
MANDARIN standard deviation:  2.50709
KOREAN standard deviation:  2.2801757


ENGLISH-SPANISH distance :  4.123945
ENGLISH-ARABIC distance :  6.0379486
ENGLISH-MANDARIN distance :  5.178733
ENGLISH-FRENCH distance :  3.1853957
ENGLISH-KOREAN distance :  5.0676985
SPANISH-ARABIC distance :  1.9144416
SPANISH-MANDARIN distance :  1.0563364
SPANISH-FRENCH distance :  0.95102334
SPANISH-KOREAN distance :  0.9447511
ARABIC-MANDARIN distance :  0.8641001
ARABIC-FRENCH distance :  2.8569415
ARABIC-KOREAN distance :  0.97037995
MANDARIN-FRENCH distance :  2.0047035
MANDARIN-KOREAN distance :  0.1356728
FRENCH-KOREAN distance :  1.8883094


In [41]:
#Variance explained by UMAP1
UMAP_matrix = pd.DataFrame(u[:, 0])
pls = PLSRegression(n_components = 1)
pls.fit(UMAP_matrix, nums)
y_pred = pls.predict(UMAP_matrix)
print(r2_score(nums, y_pred, multioutput = 'variance_weighted'))

UMAP_matrix = pd.DataFrame(u[:, 1])
pls = PLSRegression(n_components = 1)
pls.fit(UMAP_matrix, nums)
y_pred = pls.predict(UMAP_matrix)
print(r2_score(nums, y_pred, multioutput = 'variance_weighted'))

UMAP_matrix = pd.DataFrame(u[:, 2])
pls = PLSRegression(n_components = 1)
pls.fit(UMAP_matrix, nums)
y_pred = pls.predict(UMAP_matrix)
print(r2_score(nums, y_pred, multioutput = 'variance_weighted'))





0.18551283126563578
0.09671967954784101
0.012140221554196582
