# Models for learning word embeddings


In [54]:
import re
from collections import defaultdict  # For word frequency
from time import time  # To time our operations

import pandas as pd  # For data handling

load_model = '' # model_name
corpus = 'wiki'
#corpus = 'speeches'
model_name = 'fasttext'
#model = 'word2vec'
preprocessing_type = 'nltk'
#preprocessing = 'spacy'

In [55]:
import os
cwd = os.getcwd()
print(cwd)

/home/jonatan/school/nlp-project/notebooks


In [56]:
if corpus == 'speeches':
    df = pd.read_csv("../output/speeches-1.csv", delimiter="|", lineterminator="\n")
    df = df.dropna().reset_index(drop=True)
    df.head()
elif corpus == 'wiki':
    data = open("../output/wikipedia2008_fi_lemmatized.txt").read()

## Cleaning:

In [57]:
if corpus == 'speeches':
    if preprocessing_type == 'spacy':
        from preprocessing import spacy_preprocess
        df_clean = spacy_preprocess(df)
    elif preprocessing_type == 'nltk':
        from preprocessing import nltk_preprocess
        df_clean = nltk_preprocess(df)
    else:
        raise Exception("Non allowed parameter for preprocessing")
    print(f"Cleaned shape: {df_clean.shape}")

## Bigrams:

In [58]:
from gensim.models.phrases import Phrases, Phraser

As `Phrases()` takes a list of list of words as input:

In [59]:
if corpus == 'speeches':
    speeches = [re.split(r"[.!?]", row) for row in df_clean['clean']]
    sentences = [sent.strip().split() for speech in speeches for sent in speech if sent != ""]
elif corpus == 'wiki':
    sentences = re.split(r"[.!?]", data) 
    sentences = [sent.strip().split() for sent in sentences if sent != ""]

In [60]:
sentences

[['redundanssi', 'olla', 'yli|määrä', ',', 'erityisesti', 'tieto', 'liittyä'],
 ['myös',
  'systeemi',
  'voida',
  'olla',
  'redundanssi',
  ',',
  'yli|määrä',
  'esimerkiksi',
  'vara|osa',
  'tai',
  'vaihto|ehtoisa',
  'toiminta|tapa',
  'tai',
  'rinnakkainen',
  'järjestelmä'],
 ['sotilaallisesti', 'reservi', 'olla', 'redundanssi'],
 ['redundanssi',
  'maksaa',
  ',',
  'mutta',
  'varmistaa',
  'systeemi',
  'toiminta',
  'esimerkiksi',
  'kestää',
  'virhe',
  'tai',
  'osa|systeemi',
  'tuhoutua'],
 ['jos',
  'tieto',
  'olla',
  'redundanssi',
  ',',
  'tieto',
  'voida',
  'poistaa',
  'osa',
  ',',
  'ja',
  'tieto|sisältö',
  'säilyä',
  'silti',
  'ennallaan'],
 ['vastaavasti',
  'järjestelmä',
  'toimia',
  ',',
  'vaikka',
  'kaikki',
  'muu',
  'paitsi',
  'yksi',
  'rinnakkainen',
  'järjestelmä',
  'ei',
  'toimia'],
 ['redundanssi',
  'voida',
  'myös',
  'tarkoituksellisesti',
  'lisätä',
  'viesti',
  'tai',
  'järjestelmä'],
 ['tekninen',
  'tapa',
  'lisätä',


In [30]:
phrases = Phrases(sentences, min_count=30, progress_per=10000)

In [31]:
bigram = Phraser(phrases)

Transform the corpus based on the bigrams detected:

In [32]:
sentences = bigram[sentences]

## Most Frequent Words:

In [33]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

26854

In [34]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

[',', 'olla', 'ja', ')', '(', 'hän', ',_joka', 'se', 'vuosi', '"']

# Training the model

In [35]:
import multiprocessing

if model_name == 'fasttext':
    from gensim.models import FastText as Model
elif model_name == 'word2vec':
    from gensim.models import word2vec as Model
elif model_name == 'word2vec':
    raise Exception("Non allowed parameter for model.")

In [36]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [37]:
min_count=20
window=2
size=300
sample=6e-5
alpha=0.03
min_alpha=0.0007
negative=20
workers=cores-1

if len(load_model) > 0:
    model = Word2Vec.load(load_model)
else:
    model = Model(min_count=min_count,
                  window=window,
                  size=size,
                  sample=sample,
                  alpha=alpha,
                  min_alpha=min_alpha,
                  negative=negative,
                  workers=workers)

## Build vocabulary table:

In [38]:
t = time()

model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.5 mins


## Training:

In [39]:
t = time()
model.train(sentences, total_examples=model.corpus_count, epochs=30, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 0.27 mins


In [40]:
model.save(f"{model_name}_{corpus}_{preprocessing_type}_mincount{min_count}_window{window}_size{size}_alpha{alpha}_minalpha{min_alpha}_negative{negative}_workers{workers}.model")