## Preprocessing

In [None]:
import ujson as json
import glob
import os
import re

import numpy as np
import pandas as pd

from tqdm import tqdm as tqdm_notebook
import gc

%matplotlib inline
import time
import matplotlib.pyplot as plt

from joblib import Parallel,delayed
from itertools import islice

from collections import Counter
import nltk
from nltk.tokenize import word_tokenize

from itertools import zip_longest

sent_tokenizer = nltk.data.load('tokenizers/punkt/finnish.pickle')

import string

from nltk.tokenize import TweetTokenizer

import logging
# Notebook-level logger used by the preprocessing functions for progress output.
logger = logging.getLogger(__name__)
# Drop any handlers already attached before adding a new one below
# (presumably so that re-running this cell does not duplicate log output).
logger.handlers = []
sh = logging.StreamHandler()
# DEBUG level lets every message issued through this logger reach the handler.
logger.setLevel(logging.DEBUG)
logger.addHandler(sh)

In [None]:
def preprocess_lines(lines, tokenizer, min_sent_len=5):
    """Turn raw JSON lines into cleaned, space-joined sentences.

    Every line is parsed as a JSON object and the items of its ``'content'``
    value are treated as documents. Each document is split into sentences
    with the module-level ``sent_tokenizer``; URLs are replaced by ``<URL>``
    and the sentence is tokenized with ``tokenizer``. Tokens that contain no
    word or whitespace character at all (pure punctuation/symbol tokens) are
    dropped.

    Args:
        lines (list): JSON-encoded strings, one object per item.
        tokenizer (object): Object exposing ``tokenize(str)`` that returns an
            iterable of string tokens.
        min_sent_len (int): Minimum number of kept tokens for a sentence to
            be included. Defaults to 5.

    Returns:
        list: Cleaned sentences (tokens joined by a single space) in input
        order. Duplicates are not removed here.
    """
    # Parse all lines in one call by wrapping them into a single JSON array.
    records = json.loads('[' + ','.join(lines) + ']')

    filter_re = re.compile(r'[^\w\s]')
    url_re = re.compile(r'\w+:\/\/\S*')

    sents = []
    for record in records:
        for doc in record['content']:
            for sent in sent_tokenizer.tokenize(doc):
                sent = url_re.sub('<URL>', sent)

                # Keep a token only if something is left after stripping all
                # non-word, non-space characters.
                # NOTE: the previous implementation had a second rule for
                # tokens without such characters ("exactly one symbol, not
                # punctuation, token longer than one char"). It could never
                # fire: a token made only of symbols has as many symbol
                # matches as characters, so it was removed as dead code.
                sent_tokens = [token
                               for token in tokenizer.tokenize(sent)
                               if filter_re.sub('', token)]

                if len(sent_tokens) >= min_sent_len:
                    sents.append(' '.join(sent_tokens))
    return sents

def grouper(iterable, n, fillvalue=None):
    """Lazily split ``iterable`` into consecutive tuples of ``n`` items.

    The final tuple is padded with ``fillvalue`` when the number of items is
    not a multiple of ``n``.
    """
    # Every slot of the zip pulls from the very same iterator, so each
    # produced tuple consumes the next ``n`` items of the input.
    shared = iter(iterable)
    slots = tuple(shared for _ in range(n))
    return zip_longest(*slots, fillvalue=fillvalue)

def parallel_preprocess_lines(lines, tokenizer, n_jobs=3, min_sent_len=5):
    """Preprocess lines in parallel and return the unique sentences.

    The lines are split into roughly equal contiguous chunks, one per
    worker, and each chunk is handed to ``preprocess_lines`` through joblib.

    Args:
        lines (list): List of strings that are crawled lines and not in
            JSON format yet.
        tokenizer (object): Tokenizer forwarded unchanged to
            ``preprocess_lines``.
        n_jobs (int): Number of parallel workers, passed to
            ``joblib.Parallel``. Defaults to 3. For non-positive values
            (e.g. ``-1``) the number of chunks falls back to the CPU count.
        min_sent_len (int): Minimum number of tokens to be considered
            as a sentence. Defaults to 5.

    Returns:
        numpy.ndarray: Unique sentences in order of first appearance
        (object dtype). Empty when ``lines`` is empty.
    """
    # Without this guard the chunk size would be 0 and ``range`` would raise
    # ``ValueError`` because of the zero step.
    if len(lines) == 0:
        return np.array([], dtype=object)

    # A non-positive ``n_jobs`` used to yield a negative chunk size, which
    # silently produced no chunks at all. Use the CPU count for splitting.
    n_chunks = n_jobs if n_jobs > 0 else (os.cpu_count() or 1)
    chunk_size = int(np.ceil(len(lines) / n_chunks))
    job_lines = [lines[start:start + chunk_size]
                 for start in range(0, len(lines), chunk_size)]

    sent_lists = Parallel(n_jobs=n_jobs)(
        delayed(preprocess_lines)(
            lines=chunk,
            tokenizer=tokenizer,
            min_sent_len=min_sent_len
        )
        for chunk in job_lines
    )
    sents = [sent for sent_list in sent_lists for sent in sent_list]

    # Hand pandas an ndarray: newer pandas versions deprecate passing a plain
    # list to ``pd.unique``.
    return pd.unique(np.asarray(sents, dtype=object))
      
def process_file(filepath, tokenizer, out_filepath='../data/processed/test.sl',
                 mode='a', lines_per_chunk=30000, min_sent_len=5, n_jobs=3):
    """Preprocess one input file chunk by chunk and write the sentences out.

    Args:
        filepath (str): Path of the input file; every non-blank line is
            expected to be one JSON object.
        tokenizer (object): Tokenizer forwarded to
            ``parallel_preprocess_lines``.
        out_filepath (str): File the sentences are written to, one sentence
            per line.
        mode (str): Mode used to open ``out_filepath`` (``'a'`` appends,
            ``'w'`` overwrites).
        lines_per_chunk (int): Number of input lines processed per chunk.
        min_sent_len (int): Minimum number of tokens per kept sentence.
        n_jobs (int): Number of parallel workers per chunk.
    """
    with open(filepath, 'r', encoding='utf8') as f:
        logger.info('Reading number of lines in the file...')
        # First pass only counts the lines for progress reporting.
        n_total_lines = sum(1 for _ in f)
        f.seek(0)

        with open(out_filepath, mode=mode, encoding='utf8') as fout:
            for i, chunk in enumerate(grouper(f, lines_per_chunk)):
                start_time = time.perf_counter()
                end_line = min((i + 1) * lines_per_chunk, n_total_lines) / 1e3
                logger.info(f'\nLines {i * lines_per_chunk / 1e3:.0f}'
                            f' - {end_line:.0f}k'
                            f'/ {n_total_lines / 1e3:.0f}k')

                # ``grouper`` pads the last chunk with ``None``; blank lines
                # are skipped too because they would break the JSON parsing.
                chunk_lines = [l for l in chunk if l and l.strip()]
                if chunk_lines:
                    sents = parallel_preprocess_lines(
                        chunk_lines, n_jobs=n_jobs,
                        tokenizer=tokenizer, min_sent_len=min_sent_len)
                else:
                    sents = []

                logger.info(f'Writing {len(sents)} sentences...')
                # Terminate every sentence with a newline. Joining with
                # '\n' left the last sentence of a chunk unterminated, so it
                # was fused with the first sentence of the next chunk/file.
                fout.writelines(f'{sent}\n' for sent in sents)

                time_passed = time.perf_counter() - start_time
                logger.info(f'Chunk done in {time_passed:.0f} seconds!')
                gc.collect()
                
def process_all_files(in_filedir='../data/feed/',
                      out_filepath='../data/processed/all.sl',
                      lines_per_chunk=30000,
                      create_uncased=True,
                      min_sent_len=5,
                      tokenizer='tweet',
                      n_jobs=3):
    """Preprocess every ``*.jl`` file of a directory into one sentence file.

    Args:
        in_filedir (str): Directory that is searched for ``*.jl`` files.
        out_filepath (str): Output file; overwritten by the first input file
            and appended to by the following ones.
        lines_per_chunk (int): Number of input lines processed per chunk.
        create_uncased (bool): If True, additionally write a lower-cased copy
            next to ``out_filepath`` with the suffix ``_uncased.sl``.
        min_sent_len (int): Minimum number of tokens per kept sentence.
        tokenizer (str): Name of the tokenizer; only ``'tweet'`` is supported.
        n_jobs (int): Number of parallel workers per chunk.

    Raises:
        ValueError: If ``tokenizer`` is not ``'tweet'``.
    """
    start_time = time.perf_counter()

    # Validate first so that a bad argument has no filesystem side effects.
    if tokenizer != 'tweet':
        raise ValueError('Currently only "tweet" tokenizer is supported!')
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True,
                               preserve_case=True)

    # ``os.path.join`` also works when ``in_filedir`` has no trailing
    # separator; sorting makes the output order reproducible.
    pattern = os.path.join(in_filedir, '*.jl')
    filepaths = sorted(os.path.abspath(p) for p in glob.glob(pattern))
    if not filepaths:
        # Without input nothing is written, so the uncased step below would
        # fail on a missing (or stale) output file.
        logger.warning(f'No "*.jl" files found in "{in_filedir}"')
        return

    out_filepath = os.path.abspath(out_filepath)
    out_dir = os.path.dirname(out_filepath)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
        logger.warning(f'Created directory in {out_dir}')

    for i, path in enumerate(filepaths):
        logger.info(f'\n\nProcessing file "{path}" '
                    f'({i + 1} / {len(filepaths)})')
        # The first file truncates the output, later files append to it.
        process_file(path, mode='w' if i == 0 else 'a',
                     lines_per_chunk=lines_per_chunk,
                     out_filepath=out_filepath, min_sent_len=min_sent_len,
                     tokenizer=tokenizer,
                     n_jobs=n_jobs)

    if create_uncased:
        # ``out_filepath`` is absolute already, so only the extension changes.
        uncased_filepath = os.path.splitext(out_filepath)[0] + '_uncased.sl'
        logger.info(f'Creating uncased into "{uncased_filepath}"...')
        with open(out_filepath, 'r', encoding='utf8') as f, \
                open(uncased_filepath, 'w', encoding='utf8') as fout:
            for line in f:
                fout.write(line.lower())

    logger.info(f'All done in {time.perf_counter() - start_time:.0f} seconds!')
    
    
# Run the whole preprocessing pipeline with its default paths and settings.
process_all_files()

In [None]:
# Write a lower-cased copy of the sentence file, one sentence per line.
# Lines read from a file keep their trailing newline, so nothing is appended
# here; adding another '\n' inserted a blank line after every sentence.
with open('./fwe/data/preprocessed/sents2.csv', 'r', encoding='utf8') as f, \
        open('./fwe/data/preprocessed/sents2_lowercase.csv', 'w',
             encoding='utf8') as fout:
    for line in f:
        fout.write(line.lower())

## Word2Vec

In [None]:
from gensim.models import Word2Vec

In [None]:
from gensim.models.word2vec import LineSentence

In [None]:
import logging
# Configure root logging at INFO level with timestamped messages
# (presumably to surface gensim's training progress — confirm).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Corpus wrapper over the lower-cased sentence file; reused for both the
# Word2Vec and the FastText model below.
sents = LineSentence('./fwe/data/preprocessed/sents2_lowercase.csv')

In [None]:
# Untrained Word2Vec model; vocabulary and training follow in later cells.
# NOTE(review): `size` is the gensim < 4.0 keyword (renamed to `vector_size`
# in gensim 4) — confirm the installed gensim version.
w2v = Word2Vec(
    window=5,
    size=100,
    min_count=5,
    max_vocab_size=None,
    workers=4
)

In [None]:
# Build the Word2Vec vocabulary from the corpus.
w2v.build_vocab(sents, progress_per=1e6)

In [None]:
# Train on the same corpus, using the sentence count and epoch count stored
# on the model itself.
w2v.train(
    sents,
    total_examples=w2v.corpus_count,
    epochs=w2v.epochs,
    queue_factor=2
)

In [None]:
# Quick sanity check: nearest neighbours of a sample word.
w2v.wv.most_similar('vittu')

In [None]:
# Persist the full Word2Vec model via gensim's own `save`.
w2v.save('./fwe/data/preprocessed/w2v2_lowercase.pkl')

In [None]:
# Export only the word vectors in the textual word2vec format.
w2v.wv.save_word2vec_format('./fwe/data/preprocessed/w2v2_lowercase.vec', binary=False)

## FastText

In [None]:
from gensim.models import FastText

In [None]:
# Untrained FastText model with the same dimensionality, window and
# min_count as the Word2Vec model.
# NOTE(review): `size` is the gensim < 4.0 keyword (renamed to `vector_size`
# in gensim 4) — confirm the installed gensim version.
ft = FastText(
    size=100,
    window=5,
    min_count=5,
    workers=4
)

In [None]:
# Build the FastText vocabulary from the corpus.
ft.build_vocab(sents, progress_per=1e6)

In [None]:
# Train on the same corpus, using the sentence count and epoch count stored
# on the model itself.
ft.train(
    sents,
    total_examples=ft.corpus_count,
    epochs=ft.epochs,
    queue_factor=2
)

In [None]:
# Persist the full FastText model via gensim's own `save`.
ft.save('./fwe/data/preprocessed/ft2_lowercase.pkl')

In [None]:
# Export only the word vectors in the textual word2vec format.
ft.wv.save_word2vec_format('./fwe/data/preprocessed/ft2_lowercase.vec', binary=False)

## Random testing

In [None]:
from IPython.core.display import display

In [None]:
# Compare the nearest neighbours of one query word in both models.
word = 'leijona'
topn = 40

# `display` shows the FastText result explicitly; the Word2Vec result is the
# last expression of the cell and is therefore shown as the cell output.
display(ft.wv.most_similar(word, topn=topn))
w2v.wv.most_similar(word, topn=topn)

In [None]:
from gensim.models.keyedvectors import KeyedVectors
# Reload the exported Word2Vec vectors from the textual .vec file.
kv = KeyedVectors.load_word2vec_format('./fwe/data/preprocessed/w2v2_lowercase.vec')

In [None]:
# Number of words in the loaded vectors.
# NOTE(review): `vocab` is the gensim < 4.0 attribute; gensim 4 replaced it
# with `key_to_index` — confirm the installed gensim version.
len(kv.vocab)