In [None]:
import fix_notebook_imports

from src import util

%load_ext autoreload
%autoreload 2

import os
import pickle
from collections import defaultdict
import itertools

import numpy as np
import pandas as pd
import scipy
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
# Set the default figure size through the 'figure.figsize' rc parameter.
sns.set(rc={'figure.figsize':(11.7,8.27)})
# Ten-colour "bright" seaborn palette (not referenced by the visible cells below).
palette = sns.color_palette("bright", 10)
%matplotlib notebook


In [None]:
# Input files, both resolved relative to util.DATA_RAW_DIR:
#   - a pickled DataFrame with the IPA transcriptions
#   - a CSV with the annotated phoneme feature table
df_ipa_pickle_path = os.path.join(util.DATA_RAW_DIR, "IPA_DF.pkl")
df_annotated_csv_path = os.path.join(util.DATA_RAW_DIR, "annotated_feature_DF.csv")
    

In [None]:
# Load the pickled IPA DataFrame.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so this file
# must only ever come from a trusted source.
with open(df_ipa_pickle_path, 'rb') as f:
    df_ipa = pickle.load(f)
    
df_ipa


In [None]:
# Load the annotated phoneme feature table; later cells read its "phoneme" and "name"
# columns and treat every column from position 3 onwards as a phonetic feature.
df_annotated = pd.read_csv(df_annotated_csv_path)
df_annotated


In [None]:
# Drop non-binary columns.
drop_columns = ["cont"]

# This cell overwrites df_annotated, so running it a second time would otherwise raise
# KeyError because "cont" is already gone. errors="ignore" makes the cell idempotent.
df_annotated = df_annotated.drop(columns=drop_columns, errors="ignore")
df_annotated


In [None]:
# Length (in characters) of the longest entry in the "phoneme" column.
max_length = max(len(phoneme) for phoneme in df_annotated["phoneme"])

def get_phonemes(s, phonemes_set, max_phoneme_length=None):
    """Split a transcription into known phonemes using greedy longest-match.

    Spaces are removed first. At every position the longest prefix that is a member of
    ``phonemes_set`` is taken as the next phoneme and the cursor moves past it. If no
    prefix matches, the single character at the cursor is skipped.

    Args:
        s: Transcription string.
        phonemes_set: Collection of known phoneme strings (a ``set`` gives O(1) lookups).
        max_phoneme_length: Longest candidate length to try. Defaults to the length of
            the longest entry in ``phonemes_set``.

    Returns:
        List of matched phonemes in order of appearance; empty if nothing matches.
    """
    if max_phoneme_length is None:
        max_phoneme_length = max((len(p) for p in phonemes_set), default=0)

    s = s.replace(" ", "")
    phonemes = []
    pos = 0
    while pos < len(s):
        longest_candidate = min(max_phoneme_length, len(s) - pos)
        for length in range(longest_candidate, 0, -1):
            candidate = s[pos:pos + length]
            if candidate in phonemes_set:
                phonemes.append(candidate)
                pos += length
                # Stop at the longest match so shorter overlapping phonemes
                # (e.g. the parts of an affricate) are not emitted as well.
                break
        else:
            # No known phoneme starts here: skip one character and continue.
            pos += 1

    return phonemes



In [None]:
# Tokenise every transcription in df_ipa into the phonemes listed in df_annotated.
phonemes_set = set(df_annotated["phoneme"])

# The language and the transcription are taken by column POSITION (1 and 3).
# .iloc makes that explicit; indexing a row Series with a bare integer relies on a
# positional fallback that pandas has deprecated for label-indexed Series.
# NOTE(review): assumes positions 1 and 3 are the language / IPA transcription columns
# of IPA_DF.pkl - confirm against the file's schema.
language_phonemes_list = [
    (language, get_phonemes(transcription, phonemes_set))
    for language, transcription in zip(df_ipa.iloc[:, 1], df_ipa.iloc[:, 3])
]


In [None]:
# One row per transcription: its language and its list of phonemes.
df_phonemes = pd.DataFrame(
    language_phonemes_list,
    columns=["language", "transcription_ipa_phonemes"],
)
df_phonemes


In [None]:
# Lookup from phoneme string to the value in the "name" column of the annotated table.
phoneme_to_name_dict = dict(zip(df_annotated["phoneme"], df_annotated["name"]))

# Translate every phoneme list into the corresponding list of names.
df_phonemes["transcription_ipa_names"] = df_phonemes["transcription_ipa_phonemes"].map(
    lambda phoneme_list: [phoneme_to_name_dict[phoneme] for phoneme in phoneme_list]
)
df_phonemes


In [None]:
# Print the number of transcriptions available for each language.
for language in np.unique(df_phonemes["language"]):
    count = int((df_phonemes["language"] == language).sum())
    print(f"{language}: {count}")

In [None]:
# Strip the "-tok" marker from the language labels.
language_rename_mapper = {
    language: language.replace("-tok", "")
    for language in df_phonemes["language"].unique()
}

# Every label is a key of the mapper, so a plain lookup renames the whole column.
df_phonemes["language"] = df_phonemes["language"].map(language_rename_mapper)
df_phonemes


In [None]:
# Sorted so that everything indexed by this list (feature-matrix rows, embedding labels)
# has the same order on every kernel restart; the iteration order of a set of strings
# changes between interpreter sessions because of hash randomisation.
unique_languages = sorted(set(df_phonemes["language"]))
unique_languages


In [None]:
# Flatten all per-transcription phoneme lists into one list.
phonemes = list(itertools.chain.from_iterable(df_phonemes["transcription_ipa_phonemes"]))

# Sorted for a reproducible order across kernel restarts (set iteration order of strings
# is not stable between interpreter sessions).
unique_phonemes = sorted(set(phonemes))

len(unique_phonemes)


In [None]:
# Collect the distinct phonemes observed in any transcription of each language.
phonemes_by_language = defaultdict(set)

for language, language_phonemes in zip(
    df_phonemes["language"], df_phonemes["transcription_ipa_phonemes"]
):
    phonemes_by_language[language].update(language_phonemes)

# Store each inventory as a sorted list of unique phonemes.
language_to_phonemes_dict = {
    language: sorted(phoneme_set)
    for language, phoneme_set in phonemes_by_language.items()
}

del phonemes_by_language


In [None]:
# Every column from position 3 onwards is treated as a phonetic feature.
unique_phonetic_features = np.array(df_annotated.columns[3:])

DIM = len(unique_phonetic_features)

def get_phoneme_to_phonetic_features_dict():
    """Map each phoneme in ``unique_phonemes`` to the names of its active features.

    For every phoneme, the first matching row of ``df_annotated`` is read, its feature
    columns (position 3 onwards) are cast to bool, and the resulting mask selects the
    feature names from ``unique_phonetic_features``.

    Returns:
        ``defaultdict(list)`` whose values are NumPy arrays of feature names.
    """
    features_by_phoneme = defaultdict(list)
    for phoneme in unique_phonemes:
        matching_rows = df_annotated[df_annotated["phoneme"] == phoneme]
        first_match = matching_rows.iloc[0, 3:]
        # Feature values are binary, so casting to bool yields a selection mask.
        is_active = np.array(first_match).astype(bool)
        features_by_phoneme[phoneme] = unique_phonetic_features[is_active]

    return features_by_phoneme

phoneme_to_phonetic_features_dict = get_phoneme_to_phonetic_features_dict()
phoneme_to_phonetic_features_dict


In [None]:
def get_n_grams(n):
    """Return an iterator over all ``n``-element combinations of ``unique_phonetic_features``.

    Each combination is a tuple of feature names. The returned iterator can be consumed
    only once.
    """
    return itertools.combinations(unique_phonetic_features, n)

def get_phoneme_to_phonetic_feature_n_grams_dict(max_n):
    """Map each phoneme to every feature n-gram (1 <= n <= ``max_n``) it exhibits.

    An n-gram is a tuple of ``n`` feature names that are all active for the phoneme.
    The n-grams are built directly from the phoneme's own active features rather than by
    filtering every combination of all features. Because the active-feature arrays keep
    the column order of ``unique_phonetic_features``, this gives the same tuples in the
    same order while enumerating far fewer combinations.

    Args:
        max_n: Largest n-gram size to generate; must be an int >= 1.

    Returns:
        ``defaultdict(list)`` mapping phoneme -> list of n-gram tuples. Phonemes without
        any active feature get no entry.
    """
    assert isinstance(max_n, int)
    assert max_n >= 1

    phoneme_to_phonetic_feature_n_grams_dict = defaultdict(list)

    for phoneme in unique_phonemes:
        phoneme_features = phoneme_to_phonetic_features_dict[phoneme]
        for n in range(1, max_n + 1):
            for n_gram in itertools.combinations(phoneme_features, n):
                phoneme_to_phonetic_feature_n_grams_dict[phoneme].append(n_gram)

    return phoneme_to_phonetic_feature_n_grams_dict

phoneme_to_phonetic_feature_n_grams_dict = get_phoneme_to_phonetic_feature_n_grams_dict(max_n=5)


In [None]:
# Set of every n-gram exhibited by at least one phoneme.
n_grams_pool = {
    n_gram
    for phoneme_n_grams in phoneme_to_phonetic_feature_n_grams_dict.values()
    for n_gram in phoneme_n_grams
}
len(n_grams_pool)


In [None]:
def get_all_n_grams_in_range_in_pool(max_n, min_n=1, features=None, pool=None):
    """Return all feature n-grams with ``min_n <= n <= max_n`` that occur in the pool.

    The result is always a 1-D object array whose elements are tuples. Calling
    ``np.array`` directly on the tuples is unsafe: with mixed tuple lengths it raises on
    current NumPy, and with equal lengths it builds a 2-D string array whose rows are no
    longer tuples.

    Args:
        max_n: Largest n-gram size (int, >= ``min_n``).
        min_n: Smallest n-gram size.
        features: Ordered feature names to combine. Defaults to the notebook-level
            ``unique_phonetic_features``.
        pool: Container of admissible n-gram tuples. Defaults to the notebook-level
            ``n_grams_pool``.

    Returns:
        1-D ``numpy.ndarray`` of dtype ``object`` holding the selected tuples, ordered by
        n-gram size and then by combination order.
    """
    assert isinstance(max_n, int)
    assert max_n >= min_n

    if features is None:
        features = unique_phonetic_features
    if pool is None:
        pool = n_grams_pool

    selected = [
        n_gram
        for n in range(min_n, max_n + 1)
        for n_gram in itertools.combinations(features, n)
        if n_gram in pool
    ]

    # Fill element by element so NumPy never tries to interpret the tuples as rows.
    n_grams = np.empty(len(selected), dtype=object)
    for i, n_gram in enumerate(selected):
        n_grams[i] = n_gram

    return n_grams


In [None]:
def get_features_for_phoneme(phoneme, n_grams):
    """Return a 0/1 indicator vector saying which of ``n_grams`` the phoneme exhibits.

    Args:
        phoneme: Key into ``phoneme_to_phonetic_feature_n_grams_dict``.
        n_grams: Sequence of n-grams (tuples of feature names).

    Returns:
        NumPy array with one entry per element of ``n_grams``: 1 if present, else 0.
    """
    # Build the lookup set once instead of scanning the phoneme's list for every n-gram.
    phoneme_n_grams = set(phoneme_to_phonetic_feature_n_grams_dict[phoneme])
    # tuple(...) leaves tuples unchanged and also accepts n-grams that arrive as array rows.
    return np.array([1 if tuple(n_gram) in phoneme_n_grams else 0 for n_gram in n_grams])


In [None]:

def get_features_for_language(language, n_grams):
    """Return, for each n-gram, the fraction of the language's phonemes exhibiting it.

    The indicator vectors of all distinct phonemes of ``language`` are summed and divided
    by the number of those phonemes. A language with no phonemes therefore divides by
    zero.
    """
    distinct_phonemes = list(set(language_to_phonemes_dict[language]))

    totals = np.zeros((len(n_grams),))
    for phoneme in distinct_phonemes:
        totals = totals + get_features_for_phoneme(phoneme, n_grams)

    # Normalise so that inventories of different sizes are comparable.
    return totals / len(distinct_phonemes)
    

def get_feature_matrix(n_grams):
    """Build the standardised (languages x n-grams) feature matrix.

    Row ``i`` holds the n-gram frequencies of ``unique_languages[i]``. Each column is then
    z-scored using the column mean and the sample standard deviation (pandas default,
    ``ddof=1``).

    Columns with zero standard deviation are only centred (their std is treated as 1),
    so they become all zeros. Dividing by zero there would put NaN/inf into the matrix.

    Args:
        n_grams: Sequence of n-grams defining the columns.

    Returns:
        ``numpy.ndarray`` of shape ``(len(unique_languages), len(n_grams))``.
    """
    m = len(unique_languages)
    n = len(n_grams)
    print(m, n)

    feature_matrix = np.zeros((m, n))
    for i, language in enumerate(unique_languages):
        feature_matrix[i, :] = get_features_for_language(language, n_grams)

    df = pd.DataFrame(feature_matrix)
    feat_avg = df.mean(axis=0).to_numpy()
    feat_std = df.std(axis=0).to_numpy()
    # Constant columns: avoid 0/0 by dividing by 1 instead.
    feat_std = np.where(feat_std == 0, 1.0, feat_std)

    # Broadcasting standardises all rows at once.
    return (feature_matrix - feat_avg) / feat_std

# Use the 2- and 3-feature n-grams as the columns of the feature matrix.
n_grams = get_all_n_grams_in_range_in_pool(min_n=2, max_n=3)
feature_matrix = get_feature_matrix(n_grams=n_grams)

feature_matrix

In [None]:
def plot_embeddings_2D(df_embedded, title, method, figsize=(10,6)):
    """Scatter-plot a 2-D embedding with one labelled point per row.

    Args:
        df_embedded: DataFrame with columns ``dim_1``, ``dim_2`` and ``language``.
        title: Leading part of the figure title.
        method: Name of the embedding method, appended to the title.
        figsize: Figure size passed to matplotlib.
    """
    # Explicit figure/axes so that all drawing targets the same axes object.
    fig, ax = plt.subplots(figsize=figsize)
    sns.scatterplot(x="dim_1", y="dim_2", data=df_embedded, ax=ax)

    # The original title ended with an unmatched ")".
    ax.set_title(f'{title}, Method: {method}')

    # Write each language name slightly to the right of its point.
    for x, y, language in zip(
        df_embedded["dim_1"], df_embedded["dim_2"], df_embedded["language"]
    ):
        ax.text(x + .02, y, str(language))
    

In [None]:
def plot_embeddings_3D(df_embedded, title, method, figsize=(10,6)):
    """Scatter-plot a 3-D embedding with one labelled point per row.

    Args:
        df_embedded: DataFrame with columns ``dim_1``, ``dim_2``, ``dim_3`` and ``language``.
        title: Leading part of the figure title.
        method: Name of the embedding method, appended to the title.
        figsize: Figure size passed to matplotlib.
    """
    fig = plt.figure(figsize=figsize)
    # Create the 3-D axes through the figure. Axes3D(fig) is no longer added to the
    # figure automatically in current Matplotlib, which left the plot empty and made
    # plt.gca() return a separate 2-D axes.
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(
        df_embedded["dim_1"],
        df_embedded["dim_2"],
        df_embedded["dim_3"],
    )

    # The original title ended with an unmatched ")".
    ax.set_title(f'{title}, Method: {method}')

    # Write each language name slightly offset along x from its point.
    for x, y, z, language in zip(
        df_embedded["dim_1"],
        df_embedded["dim_2"],
        df_embedded["dim_3"],
        df_embedded["language"],
    ):
        ax.text(x + .02, y, z, str(language))
    

In [None]:
# Run PCA on the feature matrix and keep the first NUM_PC components per language.

NUM_PC = 3

pca = PCA()
X_embedded = pca.fit_transform(feature_matrix)

df_pca = pd.DataFrame(
    X_embedded[:, :NUM_PC],
    columns=[f"dim_{component}" for component in range(1, NUM_PC + 1)],
).assign(language=unique_languages)



In [None]:
# Plot 2D PCA: dim_1 against dim_2 of df_pca, one labelled point per language.

plot_embeddings_2D(df_pca, title="Phonology Embeddings", method="PCA", figsize=(5,4))


In [None]:
# Plot 3D PCA: dim_1, dim_2 and dim_3 of df_pca, one labelled point per language.

plot_embeddings_3D(df_pca, title="Phonology Embeddings", method="PCA", figsize=(6,6))


In [None]:
# Plot t-SNE

# t-SNE is stochastic; a fixed random_state makes the layout reproducible across runs.
tsne_2D = TSNE(n_components=2, perplexity=10, learning_rate=100, random_state=0)
X_embedded_2D = tsne_2D.fit_transform(feature_matrix)
df_tsne_2D = pd.DataFrame(X_embedded_2D, columns=["dim_1", "dim_2"])
df_tsne_2D["language"] = unique_languages
plot_embeddings_2D(df_tsne_2D, title="Phonology Embeddings", method="t-SNE", figsize=(5,4))
