In [2]:
import ujson as json
import os
import re

import numpy as np
import pandas as pd

from tqdm import tqdm_notebook
import gc

%matplotlib inline
import time
import matplotlib.pyplot as plt

from joblib import Parallel,delayed
from itertools import islice

from collections import Counter
import nltk
from nltk.tokenize import word_tokenize

# Sentence splitter loaded from the NLTK resource 'tokenizers/punkt/finnish.pickle';
# the preprocess_* functions in this notebook call its .tokenize() on each document.
sent_tokenizer = nltk.data.load('tokenizers/punkt/finnish.pickle')

In [189]:
# Load the JSON Lines feed: one JSON document per line, parsed line by line.
# Blank lines are skipped so a trailing or stray empty line cannot break
# parsing, and the file is never duplicated into one large joined string.
with open('./fwe/data/feed/demi.jl', 'r', encoding='utf8') as f:
    lines = [json.loads(raw_line) for raw_line in f if raw_line.strip()]

In [336]:
import string

from nltk.tokenize import TweetTokenizer

# Shared word-level tokenizer, configured to strip @handles, shorten runs of
# repeated characters (reduce_len) and lowercase tokens (preserve_case=False).
tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)

def preprocess_lines_fast(lines, min_sent_len=3):
    """Turn feed records into a flat list of cleaned, space-joined sentences.

    Each entry of every record's ``'content'`` is split into sentences with
    the module-level ``sent_tokenizer``. Within a sentence, URL-like spans
    (``scheme://...``) are replaced by ``<URL>`` and digit runs by ``<NUM>``
    before the sentence is tokenized with ``tweet_tokenizer``.

    A token is kept when it contains at least one word or whitespace
    character, or when it is a single symbol that is not listed in
    ``string.punctuation``. Sentences with fewer than ``min_sent_len`` kept
    tokens are discarded.

    Args:
        lines: Iterable of mappings with a ``'content'`` key; the value is
            iterated as documents (presumably a list of strings -- confirm
            against the feed schema).
        min_sent_len: Minimum number of kept tokens a sentence needs.

    Returns:
        List of cleaned sentences, tokens joined by a single space.

    Raises:
        KeyError: If a record has no ``'content'`` key.
    """
    symbol_re = re.compile(r'[^\w\s]')
    link_re = re.compile(r'\w+:\/\/\S*')
    digits_re = re.compile(r'[0-9]+')

    def is_kept(tok):
        # Anything left after removing symbols means the token carries
        # word/whitespace characters and is kept as-is.
        if symbol_re.sub('', tok):
            return True
        # Otherwise the token is made of symbols only: keep it only when it
        # is exactly one symbol outside ASCII punctuation.
        symbols = symbol_re.findall(tok)
        return len(symbols) == 1 and symbols[0] not in string.punctuation

    # Flatten all documents up front, in record order.
    documents = []
    for record in lines:
        documents.extend(record['content'])

    cleaned = []
    for document in documents:
        for raw_sentence in sent_tokenizer.tokenize(document):
            # Mask URLs first so their digits are not turned into <NUM>.
            masked = digits_re.sub('<NUM>', link_re.sub('<URL>', raw_sentence))
            kept = [tok for tok in tweet_tokenizer.tokenize(masked) if is_kept(tok)]
            if len(kept) < min_sent_len:
                continue
            cleaned.append(' '.join(kept))

    return cleaned
    
#preprocess_lines_spacy(lines[:10])
# Benchmark preprocess_lines_fast on the first 1000 records:
# -n 10 -> 10 loops per run, -r 2 -> 2 runs.
%timeit -n 10 -r 2 preprocess_lines_fast(lines[:1000])

736 ms ± 2.58 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


In [247]:
def preprocess_lines(lines, min_sent_len=5):  # TODO: Add features
    """Extract cleaned sentences from feed records using ``word_tokenize``.

    For every document in each record's ``'content'``: URL-like spans are
    deleted, soft hyphens (``'\\xad'``) are removed, the text is split into
    sentences with the module-level ``sent_tokenizer`` and each sentence is
    tokenized with ``word_tokenize``.

    A token is kept when it is longer than one character and still has
    content after all non-word, non-whitespace characters are removed.
    Sentences with fewer than ``min_sent_len`` kept tokens are dropped.

    Args:
        lines: Iterable of mappings with a ``'content'`` key; the value is
            iterated as documents (presumably a list of strings -- confirm
            against the feed schema).
        min_sent_len: Minimum number of kept tokens a sentence needs.

    Returns:
        List of sentences, tokens joined by a single space.

    Raises:
        KeyError: If a record has no ``'content'`` key.
    """
    record_contents = [record['content'] for record in lines]

    # NOTE(review): the URL character set below has no '-', '#', ':' etc., so
    # a URL containing one of them is only removed up to that character --
    # confirm whether that is intended.
    link_pattern = re.compile(r'(http\s|https\s|https|http|)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
                              flags=re.MULTILINE)
    non_word_re = re.compile(r'[^\w\s]')

    def is_kept(tok):
        # Needs some word/whitespace content and more than one character.
        return len(non_word_re.sub('', tok)) > 0 and len(tok) > 1

    collected = []
    for docs in record_contents:
        for text in docs:
            # Clean the raw text before sentence splitting.
            text = link_pattern.sub('', text).replace('\xad', '')
            for sentence in sent_tokenizer.tokenize(text):
                words = [tok for tok in word_tokenize(sentence) if is_kept(tok)]
                if len(words) >= min_sent_len:
                    collected.append(' '.join(words))
    return collected

def preprocess_lines_other(lines, min_sent_len=5):
    """Extract cleaned sentences from feed records using ``tweet_tokenizer``.

    For every document in each record's ``'content'``: URL-like spans are
    deleted, soft hyphens (``'\\xad'``) are removed, the text is split into
    sentences with the module-level ``sent_tokenizer`` and each sentence is
    tokenized with ``tweet_tokenizer``.

    Tokens longer than one character are kept (after ``str.strip()``);
    sentences with fewer than ``min_sent_len`` kept tokens are dropped.

    Args:
        lines: Iterable of mappings with a ``'content'`` key; the value is
            iterated as documents (presumably a list of strings -- confirm
            against the feed schema).
        min_sent_len: Minimum number of kept tokens a sentence needs.

    Returns:
        List of sentences, tokens joined by a single space.

    Raises:
        KeyError: If a record has no ``'content'`` key.
    """
    record_contents = [record['content'] for record in lines]
    link_pattern = re.compile(r'(http\s|https\s|https|http|)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
                              flags=re.MULTILINE)

    collected = []
    for docs in record_contents:
        for text in docs:
            # Clean the raw text before sentence splitting.
            text = link_pattern.sub('', text).replace('\xad', '')
            for sentence in sent_tokenizer.tokenize(text):
                # The length filter runs on the raw token; stripping happens after.
                words = [tok.strip()
                         for tok in tweet_tokenizer.tokenize(sentence)
                         if len(tok) > 1]
                if len(words) >= min_sent_len:
                    collected.append(' '.join(words))
    return collected

In [330]:
# Benchmark preprocess_lines_other on the first 1000 records:
# -n 10 -> 10 loops per run, -r 2 -> 2 runs.
%timeit -n 10 -r 2 preprocess_lines_other(lines[:1000])

672 ms ± 1.23 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)


In [331]:
# Benchmark preprocess_lines on the first 1000 records:
# -n 10 -> 10 loops per run, -r 2 -> 2 runs.
%timeit -n 10 -r 2 preprocess_lines(lines[:1000])

1.59 s ± 2.54 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)
