In [1]:
from voikko.libvoikko import Voikko
import os
from collections import Counter
import random
import urllib
import tarfile
import pickle as pkl
import numpy as np

## Download data file

The data file can be downloaded from the [WMT17 translation task page](http://statmt.org/wmt17/translation-task.html#download). 
Since only Finnish text is needed to train a word2vec model, it is enough to download the monolingual language-model training data.

**Note**: the link on that page resolves to the direct download URL http://statmt.org/wmt15/europarl-v8.fi.tgz.

In [2]:
def safe_mkdir(path):
    '''Create a directory (and any missing parents) if there isn't one already.

    path: directory path to create.

    An already existing directory is not an error. Any other failure
    (for example permission denied, or `path` existing as a regular file)
    is raised as OSError instead of being silently ignored, so callers do
    not continue with a directory that was never created.
    '''
    os.makedirs(path, exist_ok=True)

In [3]:
def unzip_and_remove(zipped_file, unzip_dir):
    '''Extract a tar archive into unzip_dir and then delete the archive.

    zipped_file: path of the tar archive (compression is auto-detected by mode 'r').
    unzip_dir: directory the archive members are extracted into.

    The archive is only removed after a successful extraction; if extraction
    raises, the exception propagates and the archive is left in place.
    '''
    print('unzipping file...')
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(zipped_file, 'r') as tar:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use this on archives from a trusted source.
        tar.extractall(path=unzip_dir)
    os.remove(zipped_file)

In [4]:
def download_data_file(download_url,
                       data_dir,
                       local_dest, 
                       expected_byte):
    """ 
    Download the archive at download_url into local_dest unless the
    unzipped file (or the archive itself) is already present, verify its
    size, unzip it into data_dir and remove the archive.

    download_url: URL of the archive.
    data_dir: directory that is created if needed and used as extraction target.
    local_dest: local path of the archive; its last 4 characters are taken
        to be the archive suffix (e.g. '.tgz').
    expected_byte: expected size of the downloaded archive in bytes.

    Returns the path of the unzipped file (local_dest without its 4-character
    suffix), or None when the downloaded file does not have expected_byte
    bytes. In that case the bad download is deleted so that a later call
    does not mistake it for a valid archive.
    """
    # `import urllib` alone does not guarantee that the `request` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    # NOTE(review): assumes the archive contains a single member named like
    # the archive minus its 4-character suffix - confirm for other archives.
    unzip_name = local_dest[:-4]

    if os.path.exists(unzip_name):
        print('file already exists')
        return unzip_name

    if os.path.exists(local_dest):
        print('file already exists but not unzipped')
        unzip_and_remove(local_dest, data_dir)
        return unzip_name

    safe_mkdir(data_dir)

    print('Downloading...')
    urllib.request.urlretrieve(download_url, local_dest)

    if os.stat(local_dest).st_size != expected_byte:
        print('The downloaded file has unexpected number of bytes')
        # Do not leave a truncated/corrupt archive behind: the next call
        # would otherwise take the "already downloaded" branch and unpack it.
        os.remove(local_dest)
        return None

    print('Successfully downloaded')
    unzip_and_remove(local_dest, data_dir)
    return unzip_name

In [5]:
# Direct URL of the Finnish Europarl v8 archive.
download_url = 'http://statmt.org/wmt15/europarl-v8.fi.tgz'

# '__file__' is passed as a string literal, so abspath() resolves it against
# the current working directory: script_dir is the working directory and
# project_dir is its parent. Data is stored in <project_dir>/data.
script_dir = os.path.dirname(os.path.abspath('__file__'))
project_dir = os.path.dirname(script_dir)
data_dir = os.path.join(project_dir, 'data')
data_file_name = 'europarl-v8.fi.tgz'
data_file_path = os.path.join(data_dir, data_file_name)

# Size in bytes that the downloaded archive is checked against.
expected_byte = 99540237

# data_file_path is rebound here: it held the archive path above and now holds
# the return value of download_data_file (the archive path without its
# 4-character suffix, or None if the size check failed - hence the assert).
data_file_path = download_data_file(download_url, data_dir, data_file_path, expected_byte)
assert data_file_path

Downloading...
Successfully downloaded
unzipping file...


## Read data file and tokenize into words

In [6]:
def read_data(file_path):
    ''' Read the corpus into a list of lower-cased tokens.

    file_path: path of the UTF-8 encoded text corpus.

    The token list is cached in file_path + '_words' (one token per line).
    If that cache file exists it is read and returned without touching the
    corpus or Voikko; otherwise the corpus is tokenized with Voikko, the
    cache file is written, and the tokens are returned.
    '''
    word_file_path = file_path + '_words'

    if os.path.exists(word_file_path):
        print('reading from word file...')
        with open(word_file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # An empty cache file means "no tokens", not one empty token.
        return content.split('\n') if content else []

    print('reading from data file...')
    v = Voikko("fi")
    try:
        with open(file_path, encoding='utf-8') as f:
            # Keep token types 1 and 2 only; the notebook output shows these
            # yield words and punctuation (presumably Voikko's WORD and
            # PUNCTUATION token types - confirm against libvoikko).
            words = [token.tokenText.lower() for token in v.tokens(f.read())
                     if token.tokenType in (1, 2)]
    finally:
        # Release the Voikko handle even if reading or tokenizing fails.
        v.terminate()

    with open(word_file_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(words))

    return words

In [7]:
# Tokenize the corpus (or load the cached '<data file>_words' file) and
# preview the first 100 tokens.
words = read_data(data_file_path)
print(words[:100])

reading from data file...
['istuntokauden', 'uudelleenavaaminen', 'julistan', 'perjantaina', 'joulukuun', '17', '.', 'päivänä', 'keskeytetyn', 'euroopan', 'parlamentin', 'istunnon', 'avatuksi', 'ja', 'esitän', 'vielä', 'kerran', 'vilpittömän', 'toiveeni', 'siitä', ',', 'että', 'teillä', 'olisi', 'ollut', 'oikein', 'mukava', 'joululoma', '.', 'kuten', 'olette', 'varmaan', 'saattaneet', 'huomata', ',', 'vuodenvaihteeseen', '2000', 'povattuja', 'suuria', 'tietokoneongelmia', 'ei', 'ilmennytkään', '.', 'sen', 'sijaan', 'todella', 'kauheat', 'luonnonkatastrofit', 'koettelivat', 'kansalaisia', 'joissakin', 'unionimme', 'maissa', '.', 'te', 'olette', 'esittäneet', 'toiveen', ',', 'että', 'tästä', 'asiasta', 'keskusteltaisiin', 'lähipäivinä', 'tämän', 'istuntojakson', 'aikana', '.', 'sillä', 'välin', 'toivoisin', ',', 'kuten', 'useampi', 'kollega', 'on', 'minulle', 'esittänytkin', ',', 'että', 'viettäisimme', 'minuutin', 'hiljaisuuden', 'kaikkien', 'niiden', 'uhrien', 'muistoksi', ',', 'jotka'

## Build a vocabulary for the corpus.

In [8]:
def build_vocab(words, vocab_size, vocab_dir, vocab_file_path):
    '''Build a word -> index dictionary, backed by a vocabulary file.

    words: a list of words, or None to load an existing vocabulary file.
    vocab_size: total number of entries to keep, including 'UNK'
        (ignored when words is None).
    vocab_dir: directory of the vocabulary file; created when writing.
    vocab_file_path: file with one word per line; the line number
        (starting at 0) is the word's index.

    When words is given, index 0 is 'UNK' followed by the vocab_size - 1
    most frequent words, and the vocabulary is written to vocab_file_path.
    When words is None, the dictionary is rebuilt from vocab_file_path.
    Returns the dictionary.
    '''
    print("building vocabulary...")

    if words is None:
        with open(vocab_file_path, 'r', encoding='utf-8') as f:
            vocab = f.read().split('\n')
        # Every word is written with a trailing newline, so the split ends
        # with an empty string that is not a vocabulary entry.
        if vocab and vocab[-1] == '':
            vocab.pop()
    else:
        safe_mkdir(vocab_dir)
        vocab = ['UNK']
        vocab.extend(word for word, _ in Counter(words).most_common(vocab_size - 1))
        with open(vocab_file_path, 'w', encoding='utf-8') as f:
            for word in vocab:
                f.write(word + '\n')

    return {word: index for index, word in enumerate(vocab)}

In [9]:
# The vocabulary lives in <project_dir>/vocab/fin_vocab.tsv.
vocab_dir = os.path.join(project_dir, 'vocab')
vocab_file_name = 'fin_vocab.tsv'
vocab_file_path = os.path.join(vocab_dir, vocab_file_name)
vocab_size = 10000

# Reuse the vocabulary file when it exists; otherwise build it from `words`.
if os.path.exists(vocab_file_path):
    dictionary =  build_vocab(None, 0, vocab_dir, vocab_file_path)
else:
    dictionary = build_vocab(words, vocab_size, vocab_dir, vocab_file_path)
    # `words` is deleted only on this branch; when the vocabulary was loaded
    # from file the token list stays in the kernel.
    del words

# Spot checks; the trailing comments are the values seen in the recorded run.
print(dictionary['UNK']) # 0
print(dictionary['että']) # 5
print(dictionary['todellakin']) # 287

building vocabulary...
0
5
287


## Convert words to their corresponding indices in the dictionary

In [10]:
def sentence_to_index(index_file, file_path, dictionary):
    '''Convert every line of a text file into a list of word indices.

    index_file: path the resulting list of lists is pickled to.
    file_path: UTF-8 text file; each line is treated as one sentence.
    dictionary: mapping from lower-cased token to index; tokens that are
        not in it are mapped to 0.

    Returns the list of per-line index lists. The pickle is written only
    after the whole file was converted, so a failure part-way through does
    not leave an empty or partial index file behind.
    '''
    print("converting sentences to indices...")
    v = Voikko("fi")
    try:
        with open(file_path, encoding='utf-8') as f:
            # Token types 1 and 2 are kept (presumably Voikko's WORD and
            # PUNCTUATION token types - confirm against libvoikko).
            index_sentences = [
                [dictionary.get(token.tokenText.lower(), 0)
                 for token in v.tokens(sentence)
                 if token.tokenType in (1, 2)]
                for sentence in f
            ]
    finally:
        # Release the Voikko handle even if reading or tokenizing fails.
        v.terminate()

    # save sentence indices into index_file (-1 selects the highest pickle protocol)
    with open(index_file, 'wb') as index_f:
        pkl.dump(index_sentences, index_f, -1)

    return index_sentences

In [11]:
# Pickle cache for the per-sentence index lists.
# NOTE(review): the file name mentions 'imdb' although the corpus used in this
# notebook is Europarl - presumably a leftover name; confirm before renaming.
index_file = os.path.join(data_dir, 'sentence_idx_imdb.pkl')

# Load the cached indices when present, otherwise compute (and cache) them.
# pickle.load executes whatever the file contains; only load caches this
# notebook wrote itself.
if os.path.exists(index_file):
    with open(index_file, 'rb') as f:
        sentence_indices = pkl.load(f)
else:
    sentence_indices = sentence_to_index(index_file, data_file_path, dictionary)

converting sentences to indices...


In [12]:
# Preview the first 10 sentences as index lists (0 is the 'UNK' index).
print(sentence_indices[:10])

[[4199, 0], [5654, 5933, 4556, 1501, 2, 952, 0, 6, 26, 1616, 0, 3, 1368, 67, 246, 0, 0, 22, 1, 5, 1663, 33, 105, 380, 0, 0, 2], [42, 401, 5038, 8482, 3661, 1, 0, 381, 0, 667, 0, 7, 0, 2], [12, 370, 180, 0, 0, 0, 670, 1091, 9084, 392, 2], [323, 401, 1795, 8856, 1, 5, 74, 199, 0, 0, 25, 6211, 145, 2], [56, 3637, 3427, 1, 42, 0, 1055, 4, 515, 0, 1, 5, 0, 2847, 0, 149, 64, 1862, 0, 1, 21, 3563, 8269, 434, 6, 30, 907, 0, 0, 2], [871, 1, 5, 0, 0, 25, 2847, 0, 5053], [18, 86, 0, 0, 2847, 0, 2, 19], [11, 20, 1, 2870, 6703, 2], [401, 5038, 687, 0, 6015, 3, 0, 417, 1, 5, 5720, 0, 4, 0, 434, 4962, 0, 0, 2]]


In [13]:
# Same preview as the previous cell (duplicate).
print(sentence_indices[:10])

[[4199, 0], [5654, 5933, 4556, 1501, 2, 952, 0, 6, 26, 1616, 0, 3, 1368, 67, 246, 0, 0, 22, 1, 5, 1663, 33, 105, 380, 0, 0, 2], [42, 401, 5038, 8482, 3661, 1, 0, 381, 0, 667, 0, 7, 0, 2], [12, 370, 180, 0, 0, 0, 670, 1091, 9084, 392, 2], [323, 401, 1795, 8856, 1, 5, 74, 199, 0, 0, 25, 6211, 145, 2], [56, 3637, 3427, 1, 42, 0, 1055, 4, 515, 0, 1, 5, 0, 2847, 0, 149, 64, 1862, 0, 1, 21, 3563, 8269, 434, 6, 30, 907, 0, 0, 2], [871, 1, 5, 0, 0, 25, 2847, 0, 5053], [18, 86, 0, 0, 2847, 0, 2, 19], [11, 20, 1, 2870, 6703, 2], [401, 5038, 687, 0, 6015, 3, 0, 417, 1, 5, 5720, 0, 4, 0, 434, 4962, 0, 0, 2]]


## Create a generator to construct training pairs according to the skip-gram model

In [14]:
def generate_sample(index_words, context_window_size):
    '''Yield (center, target) training pairs following the skip-gram model.

    index_words: iterable of sentences, each a sliceable sequence of word indices.
    context_window_size: largest window radius; for every center word a
        radius is drawn uniformly from 1..context_window_size.

    For each center word the pairs with its preceding neighbours are yielded
    first, followed by the pairs with its following neighbours.
    '''
    for sentence in index_words:
        for position, center in enumerate(sentence):
            radius = random.randint(1, context_window_size)
            first_before = max(0, position - radius)
            end_after = position + radius + 1

            # neighbours to the left of the center word
            yield from ((center, neighbour)
                        for neighbour in sentence[first_before:position])

            # neighbours to the right of the center word
            yield from ((center, neighbour)
                        for neighbour in sentence[position + 1:end_after])

In [15]:
# Window radius is drawn from 1..skip_window for every center word.
skip_window = 2
# generate_sample returns a generator; each next() yields one (center, target) pair.
single_gen = generate_sample(sentence_indices, skip_window)
print(next(single_gen))

(4199, 0)


In [16]:
# Advance the same generator: next (center, target) pair.
print(next(single_gen))

(0, 4199)


In [17]:
# Advance the same generator again: next (center, target) pair.
print(next(single_gen))

(5654, 5933)


## Lemmatize a file

In [18]:
def lemmatize_file(filename):
    '''Write a lemmatized copy of a text file and return its path.

    filename: path of the UTF-8 text file to lemmatize.

    Every line is tokenized with Voikko. Tokens of type 1 are replaced by
    the 'BASEFORM' of their first analysis; tokens without an analysis (or
    without a base form) and all other tokens are copied unchanged. The
    pieces are joined without a separator and written to
    filename + '_lemmatized', whose path is returned.
    '''
    print('lemmatizing ' + filename)

    lemmatized_filename = filename + '_lemmatized'
    v = Voikko("fi")
    try:
        with open(filename, 'r', encoding='utf-8') as source, \
                open(lemmatized_filename, 'w', encoding='utf-8') as lemmatized_file:
            for sentence in source:
                words_baseform = []
                for token in v.tokens(sentence):
                    baseform = None
                    # Only type-1 tokens are analyzed (presumably Voikko's
                    # WORD token type - confirm against libvoikko).
                    if token.tokenType == 1:
                        analyses = v.analyze(token.tokenText)
                        if len(analyses) > 0:
                            baseform = analyses[0].get('BASEFORM')
                    # Fall back to the surface form so that a missing base
                    # form never puts None into the join below.
                    if baseform is None:
                        baseform = token.tokenText
                    words_baseform.append(baseform)

                lemmatized_file.write(''.join(words_baseform))
    finally:
        # Release the Voikko handle even if reading or analysis fails.
        v.terminate()

    return lemmatized_filename

## Putting all of the above into one function

In [19]:
def batch_gen(download_url, data_dir, data_file_path, expected_byte,
              vocab_dir, vocal_file_path, vocab_size, index_file,
              batch_size, skip_window, lemmatize):
    """Generate batches of skip-gram (center, target) index pairs.

    download_url, data_dir, data_file_path, expected_byte: forwarded to
        download_data_file when the corpus has to be fetched.
    vocab_dir, vocal_file_path, vocab_size: vocabulary location and size,
        forwarded to build_vocab. (The parameter name `vocal_file_path` is
        kept as-is for backward compatibility.)
    index_file: pickle cache of the per-sentence index lists.
    batch_size: number of pairs per batch.
    skip_window: maximum context window radius for generate_sample.
    lemmatize: if true, the corpus is lemmatized before the vocabulary is built.

    Yields (center_batch, target_batch) where center_batch is an int32 array
    of shape (batch_size,) and target_batch an array of shape (batch_size, 1).
    The generator ends once the sample generator is exhausted; a final
    incomplete batch is not yielded.

    Raises RuntimeError if the corpus download fails its size check.
    """
    if os.path.exists(index_file):
        # NOTE(review): pickle.load runs whatever the file contains; only
        # point index_file at caches written by sentence_to_index.
        with open(index_file, 'rb') as f:
            sentence_indices = pkl.load(f)

    else:
        # Use the vocabulary path that was passed in, not a notebook global.
        if os.path.exists(vocal_file_path):
            dictionary = build_vocab(None, 0, vocab_dir, vocal_file_path)
        else:
            data_file_path = download_data_file(download_url, data_dir, data_file_path, expected_byte)
            if data_file_path is None:
                raise RuntimeError('downloading ' + download_url + ' failed the size check')
            if lemmatize:
                data_file_path = lemmatize_file(data_file_path)
            words = read_data(data_file_path)
            dictionary = build_vocab(words, vocab_size, vocab_dir, vocal_file_path)
            del words

        # NOTE(review): when the vocabulary file already exists,
        # data_file_path is used exactly as passed in (no download,
        # no lemmatization) - confirm this is what callers expect.
        sentence_indices = sentence_to_index(index_file, data_file_path, dictionary)

    single_gen = generate_sample(sentence_indices, skip_window)

    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        # NOTE(review): target_batch uses numpy's default float dtype, so the
        # indices come out as floats; kept to preserve the returned dtype.
        target_batch = np.zeros([batch_size, 1])
        try:
            for index in range(batch_size):
                center_batch[index], target_batch[index] = next(single_gen)
        except StopIteration:
            # Letting StopIteration escape a generator body is turned into a
            # RuntimeError (PEP 479); finish cleanly instead.
            return
        yield center_batch, target_batch

In [20]:
# Demo of batch_gen. Note that skip_window is rebound to 1 here (it was 2 in
# the generate_sample demo) and that data_file_path is the value returned by
# download_data_file earlier in the notebook, not the archive path.
batch_size = 10
skip_window = 1
lemmatize = False
gen = batch_gen(download_url, data_dir, data_file_path, expected_byte,
               vocab_dir, vocab_file_path, vocab_size, index_file,
                batch_size, skip_window, lemmatize)
# First batch: (center indices, target indices).
print(next(gen))

(array([4199,    0, 5654, 5933, 5933, 4556, 4556, 1501, 1501,    2],
      dtype=int32), array([[0.000e+00],
       [4.199e+03],
       [5.933e+03],
       [5.654e+03],
       [4.556e+03],
       [5.933e+03],
       [1.501e+03],
       [4.556e+03],
       [2.000e+00],
       [1.501e+03]]))


## Read the most common words and write them to a new file to visualize their embeddings on TensorBoard

In [21]:
def most_common_words(vocab_file_path, visual_dir, num_visualize):
    """ Copy the first num_visualize lines of the vocabulary file to
    visual_dir/vocab_[num_visualize].tsv for visualization on TensorBoard.

    vocab_file_path: vocabulary file with one word per line.
    visual_dir: output directory; created if it does not exist.
    num_visualize: number of leading lines (words) to copy.
    """
    with open(vocab_file_path, 'r', encoding='utf-8') as vocab_file:
        words = vocab_file.readlines()[:num_visualize]

    safe_mkdir(visual_dir)
    visual_path = os.path.join(visual_dir, 'vocab_' + str(num_visualize) + '.tsv')
    with open(visual_path, 'w', encoding='utf-8') as visual_file:
        # Lines still carry their trailing newline, so they are written as-is.
        visual_file.writelines(words)

In [22]:
# Write the first 100 vocabulary entries to <project_dir>/visualization/vocab_100.tsv.
visual_dir = os.path.join(project_dir, 'visualization')
num_visualize = 100
most_common_words(vocab_file_path, visual_dir, num_visualize)