# Generate Word2Vec embeddings for X-SAMPA syllables

In [1]:
%load_ext dotenv
%dotenv
import os

base_dir = os.getenv("WORKING_DIR")
os.chdir(base_dir)

In [2]:
from src.data_loader_and_saver import JSONDataLoaderAndSaver

# Loader used below to read the tokenised training data.
data_loader = JSONDataLoaderAndSaver(
    base_dir,
    input_data_dir="src/data",
)

In [3]:
from src.bi_lstm_crf.embeddings_saver import EmbeddingsSaver

# Saver used in the last cell of each section to write the trained embeddings.
embeddings_saver = EmbeddingsSaver(
    "sampa_syllables",
    base_dir,
    "src/bi_lstm_crf/embeddings",
)

In [4]:
from src.kveta.sampa_syllable_parser import SampaSyllableParser

# Parser used below to split each X-SAMPA token into its syllables.
sampa_parser = SampaSyllableParser()

In [5]:
from gensim.models import Word2Vec

## Poems with exactly one metre each, no unknown metres

In [6]:
# Suffix identifying the dataset variant; it is appended to the input dataset
# name and passed to the embeddings saver at the end of this section.
extension = "_one_metre_all_metres_recognized"

In [7]:
# Dataset name = fixed prefix + the variant suffix chosen above.
dataset_name = f"train_sampa_tokens{extension}"
sampa_tokens = data_loader.load_data(dataset_name)
# Preview the first record: one poem, given as a list of tokenised lines.
sampa_tokens[0]

train_sampa_tokens_one_metre_all_metres_recognized.json: loaded 40137 records.


[['tEc', 't_so', 'sE', 'h\\roznI', 'zlatEm', 'zaskvi:vaji:'],
 ['na', 'za:h\\onEx', 'sE', 'astrI', 'usmi:vaji:'],
 ['pP\\I', 'pl=nE:', 't_Si:SI', 'v', 'dobrE:m', 'rozmaru'],
 ['sI', 'do', 'kola', 'drux', 'z', 'druh\\Em', 'zaspi:vaji:'],
 ['o', 'kra:sni:x', 'J\\i:fka:x', 'ktErE:', 'kokEtJE'],
 ['vjEji:P\\Em', 'JadEr', 'vl=nu', 'ukri:vaji:'],
 ['o', 'zlati:x', 'slatki:x', 't_SasEx', 'mladoscI'],
 ['jES', 'nEli:tostJE', 's', 'la:sko_u', 'upli:vaji:'],
 ['o', 'polIpt_si:x', 'jIxS', 'sta:lE', 'mi:J', 'a', 'mi:J'],
 ['jES', 'jako', 'pr=vJi:', 'slatkE:', 'nEbi:vaji:'],
 ['o', 'rukavIt_Ska:x', 'stuSka:x', 'kadEP\\i:x'],
 ['jES', 'f', 'skP\\i:nka:x', 's', 'lIstI', 'la:skI', 'sEtli:vaji:'],
 ['o', 'vlasEx', 'proP\\i:dli:x', 'gdIs', 'tmavi:x', 'tEc'],
 ['jES', 'stP\\i:br=no_u', 'sE', 'vl=no_u', 'zaxvi:vaji:'],
 ['o', 'pi:sJi:x', 'ktErE:', 'f', 'sklonu', 'mladoscI'],
 ['tak', 'vEsElE:', 'juS', 'notI', 'nEmi:vaji:'],
 ['o', 'fSEm', 't_so', 'vadnE', 'lItski:x', 'losu:', 'h\\ro_u'],
 ['nat', 't_Si:m'

In [8]:
# Flatten poems -> lines, so that every verse line becomes one item.
# This cell rebinds `sampa_tokens`, so re-running it unguarded would flatten a
# second time (lines -> individual tokens) and silently corrupt the corpus.
# Only flatten while the data is still nested as poems (lists of token lists).
if any(isinstance(line, list) for poem in sampa_tokens for line in poem):
    sampa_tokens = [line for poem in sampa_tokens for line in poem]
sampa_tokens[0]

['tEc', 't_so', 'sE', 'h\\roznI', 'zlatEm', 'zaskvi:vaji:']

In [9]:
# Number of verse lines after flattening.
len(sampa_tokens)

1323119

In [10]:
# Turn every line into one flat list of syllables: each token is split by the
# parser and the resulting syllables are concatenated in order.
sentences = []
for line in sampa_tokens:
    line_syllables = []
    for sampa_token in line:
        line_syllables.extend(sampa_parser.get_syllables(sampa_token))
    sentences.append(line_syllables)
# Preview the syllables of the first line.
sentences[0]

['tEc', 't_so', 'sE', 'h\\ro', 'znI', 'zla', 'tEm', 'za', 'skvi:', 'va', 'ji:']

In [11]:
# One sentence per line, so this should equal the line count above.
len(sentences)

1323119

In [12]:
# Train Word2Vec on the syllabified lines (one line = one sentence).
# min_count=1 keeps every syllable in the vocabulary, including ones seen once.
# NOTE(review): gensim documents that multi-threaded training (workers > 1)
# is not exactly reproducible between runs — confirm this is acceptable.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [13]:
# Sanity check: the ten syllables closest to "sJi:k" in the embedding space.
model.wv.most_similar("sJi:k", topn=10)

[('sJi:P\\', 0.8218221068382263),
 ('sJEx', 0.6744515895843506),
 ('sJIt', 0.6575213074684143),
 ('zlIf', 0.6558831930160522),
 ('sJi:', 0.5904481410980225),
 ('sEJ', 0.5803896188735962),
 ('SES', 0.5532275438308716),
 ('smEn', 0.5508992671966553),
 ('smEx', 0.5405819416046143),
 ('smEm', 0.533706545829773)]

In [14]:
# Vocabulary in the model's index order; preview the first ten entries.
vocab = model.wv.index_to_key
vocab[:10]

['a', 'sE', 'jE', 'nE', 'o', 'na', 'po', 'lI', 'h\\o', 'lE']

In [15]:
# Number of distinct syllables in the trained vocabulary.
len(vocab)

17197

In [16]:
# Write the embeddings for this dataset variant to disk (file name is printed below).
embeddings_saver.save_embeddings(model, vocab, extension)

Embeddings saved to embeddings_sampa_syllables_one_metre_all_metres_recognized.gz


## Lines with exactly one metre each, no unknown metres

In [31]:
# Suffix identifying the dataset variant; it is appended to the input dataset
# name and passed to the embeddings saver at the end of this section.
extension = "_one_metre_line_all_metres_recognized"

In [32]:
# Dataset name = fixed prefix + the variant suffix chosen above.
dataset_name = f"train_sampa_tokens{extension}"
sampa_tokens = data_loader.load_data(dataset_name)
# Preview the first record: one poem, given as a list of tokenised lines.
sampa_tokens[0]

train_sampa_tokens_one_metre_line_all_metres_recognized.json: loaded 41762 records.


[['mu:j', 'koJi:t_SEk', 'vrani:'],
 ['jako', 'malovani:'],
 ['h\\opsa', 'h\\Ejsa', 'h\\Ej'],
 ['noSkI', 'pjEknE:', 'zdvi:h\\a:'],
 ['uSIma', 'sI', 'stP\\i:h\\a:'],
 ['h\\opsa', 'h\\Ejsa', 'h\\Ej'],
 ['dlo_uh\\a:', 't_SErna:', 'h\\P\\i:va'],
 ['po', 'vjEtru', 'mu', 'spli:va:'],
 ['h\\opsa', 'h\\Ejsa', 'h\\Ej'],
 ['potkofkI', 'mu', 'zvoJi:'],
 ['jIskra', 'jIskru', 'h\\oJi:'],
 ['h\\opsa', 'h\\Ejsa', 'h\\Ej'],
 ['mu:j', 'koJi:t_Sku', 'vrani:'],
 ['jako', 'malovani:'],
 ['h\\opsa', 'h\\Ejsa', 'h\\Ej'],
 ['aS', 'cE', 'osEdla:mE'],
 ['kam', 'sE', 'poJ\\i:va:mE'],
 ['h\\opsa', 'h\\Ejsa', 'h\\Ej'],
 ['rozjEdEm', 'sE', 'f', 'polI'],
 ['pP\\Ez', 'h\\orI', 'a', 'dolI'],
 ['h\\opsa', 'h\\Ejsa', 'h\\Ej'],
 ['rozjEdEm', 'sE', 'lEtEm'],
 ['SIroSIri:m', 'svjEtEm'],
 ['h\\opsa', 'h\\Ejsa', 'h\\Ej'],
 ['ZEc', 'sE', 'rozjEdEmE'],
 ['kam', 'sE', 'rozjEt', 'xt_sEmE'],
 ['h\\opsa', 'h\\Ejsa', 'h\\Ej'],
 ['tak', 'mu:j', 'vrani:', 'koJI'],
 ['ac', 'sI', 'na:s', 'gdo', 'h\\oJi:'],
 ['h\\opsa', 'h\\Ejsa', 'h\\E

In [33]:
# Flatten poems -> lines, so that every verse line becomes one item.
# This cell rebinds `sampa_tokens`, so re-running it unguarded would flatten a
# second time (lines -> individual tokens) and silently corrupt the corpus.
# Only flatten while the data is still nested as poems (lists of token lists).
if any(isinstance(line, list) for poem in sampa_tokens for line in poem):
    sampa_tokens = [line for poem in sampa_tokens for line in poem]
sampa_tokens[0]

['mu:j', 'koJi:t_SEk', 'vrani:']

In [34]:
# Number of verse lines after flattening.
len(sampa_tokens)

1434187

In [35]:
# Turn every line into one flat list of syllables: each token is split by the
# parser and the resulting syllables are concatenated in order.
sentences = []
for line in sampa_tokens:
    line_syllables = []
    for sampa_token in line:
        line_syllables.extend(sampa_parser.get_syllables(sampa_token))
    sentences.append(line_syllables)
# Preview the syllables of the first line.
sentences[0]

['mu:j', 'ko', 'Ji:', 't_SEk', 'vra', 'ni:']

In [36]:
# One sentence per line, so this should equal the line count above.
len(sentences)

1434187

In [37]:
# Train Word2Vec on the syllabified lines (one line = one sentence).
# min_count=1 keeps every syllable in the vocabulary, including ones seen once.
# NOTE(review): gensim documents that multi-threaded training (workers > 1)
# is not exactly reproducible between runs — confirm this is acceptable.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [38]:
# Inspect the raw embedding vector learned for the sample syllable "sJi:k".
model.wv["sJi:k"]

array([-1.2578778 ,  0.7765036 , -1.719597  , -0.8721979 ,  1.4351071 ,
        0.4534164 , -1.2066792 , -1.435556  ,  2.0679584 ,  0.11561677,
       -0.39317933,  0.57331127, -1.1125889 ,  0.7042801 ,  1.8864619 ,
       -1.6707833 ,  0.24268824,  0.7571237 ,  0.60766494,  0.23893344,
        1.282776  , -0.4417474 ,  0.7752723 , -2.246198  , -0.9891198 ,
       -2.548064  , -1.8823848 , -0.8477542 , -1.4762535 ,  0.5680849 ,
       -1.3101867 , -0.3506459 ,  0.43943116, -1.7477374 , -0.50576264,
       -0.7006718 , -1.7652946 , -2.8245993 ,  0.4906584 , -2.2424204 ,
        0.06671168, -1.2228371 ,  1.4229977 , -0.7655375 ,  0.22899467,
        1.1292756 , -0.19747517, -1.2078893 ,  2.1079013 ,  1.7278663 ,
       -0.07951596, -1.7949287 , -0.5174524 ,  1.0209321 , -0.54930687,
       -2.8948164 , -1.9857851 ,  3.2667851 , -0.38583663,  0.8674617 ,
       -2.2349408 ,  2.192388  ,  0.23176646, -1.437929  ,  0.69532174,
        1.1353384 ,  0.83849496,  3.7022793 , -0.9401905 , -1.22

In [39]:
# Sanity check: the ten syllables closest to "sJi:k" in the embedding space.
model.wv.most_similar("sJi:k", topn=10)

[('sJi:P\\', 0.8067282438278198),
 ('zlIf', 0.7727611064910889),
 ('sJIt', 0.7053767442703247),
 ('sJEx', 0.6722691059112549),
 ('SES', 0.6415119171142578),
 ('smEx', 0.6143134832382202),
 ('sEJ', 0.6060691475868225),
 ('sJi:', 0.5881050229072571),
 ('sma:k', 0.5839481949806213),
 ('smEn', 0.5685071349143982)]

In [40]:
# Vocabulary in the model's index order; preview the first ten entries.
vocab = model.wv.index_to_key
vocab[:10]

['a', 'sE', 'jE', 'nE', 'o', 'na', 'po', 'lI', 'h\\o', 'lE']

In [41]:
# Number of distinct syllables in the trained vocabulary.
len(vocab)

17589

In [42]:
# Write the embeddings for this dataset variant to disk (file name is printed below).
embeddings_saver.save_embeddings(model, vocab, extension)

Embeddings saved to embeddings_sampa_syllables_one_metre_line_all_metres_recognized.gz
