In [93]:
import ujson as json
import os
import pandas as pd

from tqdm import tqdm_notebook

%matplotlib inline

import matplotlib.pyplot as plt

from joblib import Parallel,delayed

from collections import Counter
import nltk

sent_tokenizer = nltk.data.load('tokenizers/punkt/finnish.pickle')

In [231]:
from spacy.lang.fi import Finnish
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc,Span,Token

import re
import string

nlp = Finnish()
sentencizer = nlp.create_pipe('sentencizer')
nlp.add_pipe(sentencizer)
tokenizer = Tokenizer(nlp.vocab)

def clean_token(token): 
    if len(re.sub(r'[^\w\s]', '', token.text).strip()) == 0:
        return ''
    token = token.text
    if token[-1] in string.punctuation:
        token = token[:-1]
    token = re.sub(r'[\"\”\'\`\(\)\[\]]', '', token)
    return token.strip()

Token.set_extension('processed', getter=clean_token, force=True)

class SentenceWriter(object):
    
    def __init__(self, input_filepath, output_filepath,
                 min_sentence_tokens=5):
        self.input_filepath = input_filepath
        self.output_filepath = output_filepath
        self.min_sentence_tokens = min_sentence_tokens
        
    def line_to_sents(self, line):
        line = json.loads(line)['content']
        sents = []
        for doc in nlp.pipe(line):
            for sent in tokenizer.pipe(s.string.strip() for s in doc.sents):
                sent_tokens = []
                for token in sent:
                    if len(token) > 0:
                        sent_tokens.append(token._.processed)
                
                if len(sent_tokens) > self.min_sentence_tokens:
                    sents.append(' '.join(sent_tokens))
        return sents
        
    def preprocess(self):
        with open(self.input_filepath, 'r', encoding='utf8') as fin:
            with open(self.output_filepath, 'a', encoding='utf8') as fout:
                for i,line in enumerate(fin):
                    sents = self.line_to_sents(line)
                    if len(sents) > 0:
                        fout.write('\n'.join(sents))
                        
                    if i % 100 == 0:
                        print('Read %s lines' % i)
                        
sw = SentenceWriter('./data/feed/iltalehti.jl', 
                    './data/preprocessed/test.csv').preprocess()

Read 0 lines
Read 100 lines
Read 200 lines
Read 300 lines
Read 400 lines
Read 500 lines
Read 600 lines
Read 700 lines
Read 800 lines
Read 900 lines
Read 1000 lines
Read 1100 lines
Read 1200 lines
Read 1300 lines
Read 1400 lines
Read 1500 lines
Read 1600 lines
Read 1700 lines
Read 1800 lines
Read 1900 lines
Read 2000 lines
Read 2100 lines
Read 2200 lines
Read 2300 lines
Read 2400 lines
Read 2500 lines
Read 2600 lines
Read 2700 lines
Read 2800 lines
Read 2900 lines
Read 3000 lines
Read 3100 lines
Read 3200 lines
Read 3300 lines
Read 3400 lines
Read 3500 lines
Read 3600 lines
Read 3700 lines
Read 3800 lines
Read 3900 lines
Read 4000 lines
Read 4100 lines
Read 4200 lines
Read 4300 lines
Read 4400 lines
Read 4500 lines
Read 4600 lines
Read 4700 lines
Read 4800 lines
Read 4900 lines
Read 5000 lines
Read 5100 lines
Read 5200 lines
Read 5300 lines
Read 5400 lines
Read 5500 lines
Read 5600 lines
Read 5700 lines
Read 5800 lines
Read 5900 lines
Read 6000 lines
Read 6100 lines
Read 6200 lines
Read

KeyboardInterrupt: 

In [215]:
#sents = Parallel(n_jobs=4)(delayed(process_content)(l)
#                           for l in tqdm_notebook(df['content'].values[:1000]))

In [232]:
from gensim.models import Word2Vec

In [233]:
from gensim.models.word2vec import LineSentence

In [234]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [235]:
sents = LineSentence('./data/preprocessed/test.csv')

In [236]:
w2v = Word2Vec(
    min_count=10,
    window=4,
    size=100,
    workers=4
)

In [237]:
w2v.build_vocab(sents, progress_per=1e6)

2019-06-24 22:57:54,609 : INFO : collecting all words and their counts
2019-06-24 22:57:54,609 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-06-24 22:58:02,621 : INFO : PROGRESS: at sentence #1000000, processed 12513689 words, keeping 193927 word types
2019-06-24 22:58:10,112 : INFO : collected 434928 word types from a corpus of 24232875 raw words and 1974149 sentences
2019-06-24 22:58:10,112 : INFO : Loading a fresh vocabulary
2019-06-24 22:58:10,954 : INFO : min_count=10 retains 79968 unique words (18% of original 434928, drops 354960)
2019-06-24 22:58:10,970 : INFO : min_count=10 leaves 23543299 word corpus (97% of original 24232875, drops 689576)
2019-06-24 22:58:11,204 : INFO : deleting the raw counts dictionary of 434928 items
2019-06-24 22:58:11,235 : INFO : sample=0.001 downsamples 19 most-common words
2019-06-24 22:58:11,235 : INFO : downsampling leaves estimated 21807383 word corpus (92.6% of prior 23543299)
2019-06-24 22:58:11,578 : INFO : e

In [238]:
w2v.train(
    sents,
    total_examples=w2v.corpus_count,
    epochs=w2v.epochs
)

2019-06-24 22:58:12,546 : INFO : training model with 4 workers on 79968 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=4
2019-06-24 22:58:13,560 : INFO : EPOCH 1 - PROGRESS: at 2.84% examples, 549253 words/s, in_qsize 0, out_qsize 0
2019-06-24 22:58:14,964 : INFO : EPOCH 1 - PROGRESS: at 5.22% examples, 419355 words/s, in_qsize 0, out_qsize 1
2019-06-24 22:58:15,968 : INFO : EPOCH 1 - PROGRESS: at 8.28% examples, 466199 words/s, in_qsize 0, out_qsize 0
2019-06-24 22:58:16,981 : INFO : EPOCH 1 - PROGRESS: at 10.72% examples, 488887 words/s, in_qsize 0, out_qsize 0
2019-06-24 22:58:17,995 : INFO : EPOCH 1 - PROGRESS: at 12.94% examples, 496742 words/s, in_qsize 0, out_qsize 0
2019-06-24 22:58:18,996 : INFO : EPOCH 1 - PROGRESS: at 15.08% examples, 499211 words/s, in_qsize 0, out_qsize 0
2019-06-24 22:58:20,010 : INFO : EPOCH 1 - PROGRESS: at 17.41% examples, 505189 words/s, in_qsize 0, out_qsize 0
2019-06-24 22:58:21,024 : INFO : EPOCH 1 - PROGRESS: at 19.79%

2019-06-24 22:59:23,087 : INFO : EPOCH 2 - PROGRESS: at 63.36% examples, 517104 words/s, in_qsize 0, out_qsize 0
2019-06-24 22:59:24,085 : INFO : EPOCH 2 - PROGRESS: at 65.85% examples, 518481 words/s, in_qsize 0, out_qsize 0
2019-06-24 22:59:25,109 : INFO : EPOCH 2 - PROGRESS: at 68.24% examples, 519593 words/s, in_qsize 0, out_qsize 0
2019-06-24 22:59:26,118 : INFO : EPOCH 2 - PROGRESS: at 70.57% examples, 520290 words/s, in_qsize 0, out_qsize 0
2019-06-24 22:59:27,117 : INFO : EPOCH 2 - PROGRESS: at 73.37% examples, 521091 words/s, in_qsize 0, out_qsize 0
2019-06-24 22:59:28,352 : INFO : EPOCH 2 - PROGRESS: at 75.54% examples, 514273 words/s, in_qsize 0, out_qsize 0
2019-06-24 22:59:29,345 : INFO : EPOCH 2 - PROGRESS: at 78.49% examples, 516194 words/s, in_qsize 0, out_qsize 0
2019-06-24 22:59:30,363 : INFO : EPOCH 2 - PROGRESS: at 81.52% examples, 517684 words/s, in_qsize 1, out_qsize 0
2019-06-24 22:59:31,361 : INFO : EPOCH 2 - PROGRESS: at 84.42% examples, 519443 words/s, in_qsiz

2019-06-24 23:00:29,606 : INFO : EPOCH 4 - PROGRESS: at 16.68% examples, 509013 words/s, in_qsize 0, out_qsize 0
2019-06-24 23:00:31,049 : INFO : EPOCH 4 - PROGRESS: at 18.89% examples, 484345 words/s, in_qsize 0, out_qsize 0
2019-06-24 23:00:32,063 : INFO : EPOCH 4 - PROGRESS: at 21.33% examples, 492710 words/s, in_qsize 0, out_qsize 0
2019-06-24 23:00:33,080 : INFO : EPOCH 4 - PROGRESS: at 23.79% examples, 499238 words/s, in_qsize 0, out_qsize 0
2019-06-24 23:00:34,094 : INFO : EPOCH 4 - PROGRESS: at 26.29% examples, 503967 words/s, in_qsize 0, out_qsize 0
2019-06-24 23:00:35,093 : INFO : EPOCH 4 - PROGRESS: at 28.87% examples, 507792 words/s, in_qsize 0, out_qsize 0
2019-06-24 23:00:36,091 : INFO : EPOCH 4 - PROGRESS: at 31.19% examples, 511894 words/s, in_qsize 0, out_qsize 0
2019-06-24 23:00:37,113 : INFO : EPOCH 4 - PROGRESS: at 33.55% examples, 513343 words/s, in_qsize 0, out_qsize 0
2019-06-24 23:00:38,127 : INFO : EPOCH 4 - PROGRESS: at 36.02% examples, 516008 words/s, in_qsiz

2019-06-24 23:01:40,969 : INFO : EPOCH 5 - PROGRESS: at 80.92% examples, 517909 words/s, in_qsize 0, out_qsize 1
2019-06-24 23:01:42,092 : INFO : EPOCH 5 - PROGRESS: at 82.79% examples, 512143 words/s, in_qsize 0, out_qsize 1
2019-06-24 23:01:43,105 : INFO : EPOCH 5 - PROGRESS: at 85.95% examples, 515313 words/s, in_qsize 0, out_qsize 0
2019-06-24 23:01:44,104 : INFO : EPOCH 5 - PROGRESS: at 88.87% examples, 517161 words/s, in_qsize 0, out_qsize 0
2019-06-24 23:01:45,103 : INFO : EPOCH 5 - PROGRESS: at 91.79% examples, 518819 words/s, in_qsize 1, out_qsize 0
2019-06-24 23:01:46,140 : INFO : EPOCH 5 - PROGRESS: at 94.60% examples, 519278 words/s, in_qsize 0, out_qsize 1
2019-06-24 23:01:47,155 : INFO : EPOCH 5 - PROGRESS: at 96.77% examples, 517359 words/s, in_qsize 0, out_qsize 0
2019-06-24 23:01:48,169 : INFO : EPOCH 5 - PROGRESS: at 97.68% examples, 509624 words/s, in_qsize 0, out_qsize 0
2019-06-24 23:01:49,174 : INFO : EPOCH 5 - PROGRESS: at 99.87% examples, 509039 words/s, in_qsiz

(109037293, 121164375)

In [253]:
w2v.wv.most_similar('homo')

[('referoiden', 0.518416166305542),
 ('pätevästi', 0.5164487361907959),
 ('deus', 0.5112777948379517),
 ('sapiens', 0.4446752667427063),
 ('uskovainen', 0.4147804379463196),
 ('transagendansa', 0.41277870535850525),
 ('Allahiin', 0.4052746295928955),
 ('joillain', 0.37194502353668213),
 ('otetuista', 0.3689946234226227),
 ('Hararin', 0.3660024404525757)]