In [None]:
import sys

sys.path.append("..")
from src.utils import load_dictionary, clean_text, save_dict
from src.embeddings import load_embeddings, save_embedding
from src.visualization import plot_embeddings
from sklearn.model_selection import train_test_split
import os
import numpy as np

In [2]:
# Load the English/Twi dictionary through the project helper and preview the first rows.
# NOTE(review): the preview below still shows a `pos` column although pos=False is
# passed -- confirm what the flag controls in src.utils.load_dictionary.
df = load_dictionary(pos=False)
df.head()

Unnamed: 0,english,pos,twi
0,hopeless,ADJECTIVE,anidasoɔ nni mu; deɛ anidasoɔ nni mu
1,horizon,NOUN,ewiem ne asase ahyiaeɛ
2,horn,NOUN,abɛn; abebɛn
3,horror,NOUN,ehu
4,horrible,ADJECTIVE,ɛyɛ hu; deɛ ɛyɛ hu


In [3]:
# List the distinct part-of-speech tags present in the `pos` column.
df["pos"].unique()

array(['ADJECTIVE', 'NOUN', 'VERB', 'ADJECTIVE | NOUN', 'EXCLAMATION',
       'ADVERB', 'NOUN | ADJECTIVE', '3RD PERSON PLURAL SUBJECT PRONOUN',
       '2ND PERSON SINGULAR POSSESSIVE ADJECTIVE',
       '1ST PERSON PLURAL SUBJECT PRONOUN',
       '1ST PERSON PLURAL OBJECT PRONOUN',
       '1ST PERSON SINGULAR POSSESSIVE ADJECTIVE',
       '3RD PERSON POSSESSIVE ADJECTIVE (male)',
       '3RD PERSON POSSESSIVE PRONOUN (female)',
       '1ST PERSON PLURAL POSSESSIVE ADJECTIVE',
       '3RD PERSON PLURAL POSSESSIVE ADJECTIVE',
       '3RD PERSON PLURAL POSSESSIVE PRONOUN',
       '2ND PERSON PLURAL POSSESSIVE PRONOUN',
       '3RD PERSON NEUTRAL SUBJECT PRONOUN', 'SUBJECT PRONOUN',
       '1ST PERSON SINGULAR SUBJECT PRONOUN',
       '3RD PERSON SINGULAR SUBJECT PRONOUN (male)',
       '3RD PERSON SINGULAR SUBJECT PRONOUN',
       '3RD PERSON SINGULAR OBJECT PRONOUN (male)',
       '3RD PERSON SINGULAR OBJECT PRONOUN (female)', 'NOUN | ADVERB',
       'DEMONSTRATIVE PRONOUN | DETERMINER',

In [4]:
# The remaining cells of this notebook only read the english/twi columns,
# so discard the part-of-speech tags here.
df = df.drop(columns="pos")

In [5]:
# Show which columns remain after dropping `pos`.
df.columns

Index(['english', 'twi'], dtype='object')

In [6]:
# Number of dictionary entries (rows) in the english column.
df["english"].shape[0]

7250

In [7]:
# Inspect the raw Twi column; a single entry may hold several ';'-separated alternatives.
df["twi"]

0       anidasoɔ nni mu; deɛ anidasoɔ nni mu 
1                      ewiem ne asase ahyiaeɛ
2                               abɛn; abebɛn 
3                                         ehu
4                          ɛyɛ hu; deɛ ɛyɛ hu
                        ...                  
7245                                 di nsesa
7246                                      emu
7247                                    aniku
7248                                 ntwaremu
7249                           twitwa anan mu
Name: twi, Length: 7250, dtype: object

In [8]:
# Keep only the first Twi alternative of each entry.
# - n=1 splits at the first ';' only, since everything after it is discarded anyway.
# - .str.strip() removes the stray leading/trailing whitespace that the raw column
#   carries (visible in the preview above), including on entries with no ';' at all.
# Missing values pass through the .str accessor unchanged as NaN.
df["twi"] = df["twi"].str.split(";", n=1).str[0].str.strip()

In [9]:
# Re-inspect the Twi column after keeping only the first alternative per entry.
df["twi"]

0              anidasoɔ nni mu
1       ewiem ne asase ahyiaeɛ
2                         abɛn
3                          ehu
4                       ɛyɛ hu
                 ...          
7245                  di nsesa
7246                       emu
7247                     aniku
7248                  ntwaremu
7249            twitwa anan mu
Name: twi, Length: 7250, dtype: object

In [None]:
# Run every Twi entry through the project's clean_text helper, using its default
# arguments (no language is passed explicitly for this column).
df["twi"] = df["twi"].map(clean_text)

In [None]:
def _clean_english(entry):
    """Apply clean_text to one English entry with lang="eng"."""
    return clean_text(text=entry, lang="eng")


# Clean the English column element by element.
df["english"] = df["english"].apply(_clean_english)

In [15]:
# Make sure the output directory exists; exist_ok=True makes re-running this cell safe.
os.makedirs("../data/processed", exist_ok=True)

In [16]:
# Persist the cleaned english/twi pairs; index=False leaves the row index out of the file.
df.to_csv("../data/processed/twi_dict_processed.csv", index=False)

In [None]:
# Build the English -> Twi lookup row by row. If an English key occurs more than
# once, the later row overwrites the earlier one.
dictionary = {english: twi for english, twi in zip(df["english"], df["twi"])}

In [None]:
# Hold out 20% of the English keys as a test set; the fixed random_state keeps the
# split the same on every run.
keys = list(dictionary)
train_keys, test_keys = train_test_split(keys, test_size=0.2, random_state=42)

# Rebuild one English -> Twi mapping per split from the selected keys.
train_dict = {key: dictionary[key] for key in train_keys}
test_dict = {key: dictionary[key] for key in test_keys}

In [None]:
# Write both splits with the project helper, train first and then test.
# NOTE(review): where and in what format save_dict writes is decided in src.utils -- confirm.
for split_name, split_dict in (("train_dict", train_dict), ("test_dict", test_dict)):
    save_dict(split_dict, split_name)

In [None]:
# Load the pretrained embedding tables for both languages.
# The Twi file is loaded with fasttext=True; the English GloVe file uses the helper's default.
# NOTE(review): presumably the flag selects the fastText .vec parsing path -- confirm in
# src.embeddings.load_embeddings.
en_embeddings = load_embeddings("../data/embeddings/glove.6B.100d.txt")
twi_embeddings = load_embeddings(
    "../data/embeddings/twi_fasttext_4M_akan_dim_100.vec", fasttext=True
)

In [None]:
# Split the English embedding table into parallel sequences for plotting.
# keys() and values() of the same mapping iterate in matching order, so
# en_labels[i] is the word for row i of en_vectors.
# NOTE(review): assumes load_embeddings returns a word -> vector mapping with
# equal-length vectors, so np.array yields a 2-D array -- confirm.
en_labels = list(en_embeddings.keys())
en_vectors = np.array(list(en_embeddings.values()))
# Title typo fixed ("ENglish" -> "English") so the figure caption reads correctly.
plot_embeddings(en_vectors, en_labels, "English Embedding Space")


In [None]:
# Words first, then their vectors looked up in the same order, so row i of
# twi_vectors belongs to twi_labels[i].
twi_labels = list(twi_embeddings)
twi_vectors = np.array([twi_embeddings[word] for word in twi_labels])
plot_embeddings(twi_vectors, twi_labels, "Twi Embedding Space")


In [None]:
# Persist both embedding tables through the project helper.
# NOTE(review): output location and format are decided inside
# src.embeddings.save_embedding -- confirm before relying on them.
save_embedding(en_embeddings, twi_embeddings)