In [10]:
import pandas as pd
from pathlib import Path
from glob import glob
from multiprocessing import Pool
import tqdm
from utils import pdfunc as func
from ufal.udpipe import Model, Pipeline
import collections 

In [6]:
from_dir_prefix = '../../tmp/tw-super-tiny/archiveteam-twitter-stream-2013-01-0*'
to_dir = Path('../../tmp/tw-tiny-out/')
modelfile4udpipe = '/home/den/Documents/elmo/data_preparing/rutwitter/russian-syntagrus-ud-2.0-170801.udpipe'
files = list(glob(from_dir_prefix))[:5]
cpu_n = 5

In [7]:
model = Model.load(modelfile4udpipe)
udpipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'horizontal')

def worker(in_file):
    df = pd.read_fwf(in_file,  header=None).rename(columns={0:'text'})
    df['text'] = df['text'].apply(func.skip_empty)
    df = df.dropna()
    df['rec'] = df['text'].apply(func.get_rec_info)
    df['text'] = df['text'].apply(func.spec_tok_add)
    df['norm_text'] = df['text'].apply(func.normalization1)
    df['norm_text'] = df['text'].apply(func.udpipe_sent_and_tok, args = (udpipeline,))
    df['norm_text'] = df['norm_text'].apply(func.normalization2)
    df['rec_text'] = df.apply(func.recovery, axis=1)
    df['cleaned_text'] = df['norm_text'].apply(func.lower_case)
    df1 = func.split_df(df[['cleaned_text']],'cleaned_text', '\n')   
    df2 = func.split_df(df[['rec_text']],'rec_text', '\n')
    df = pd.concat([df1,df2], axis=1)
    with open('../../tmp/cl.txt', 'wt') as fd:
        df['cleaned_text'][:-1].apply(lambda line: fd.write('%s\n' % line.strip()))
        df['cleaned_text'][-1:].apply(lambda line: fd.write(line.strip()))
    with open('../../tmp/rec.txt', 'wt') as fd:
        df['rec_text'][:-1].apply(lambda line: fd.write('%s\n' % line.strip()))
        df['rec_text'][-1:].apply(lambda line: fd.write(line.strip()))
    return df

In [4]:
with Pool(cpu_n) as p:
        for process_lines in tqdm.tqdm(p.imap_unordered(worker, files), total=len(files)):
            pass

100%|██████████| 5/5 [00:00<00:00,  5.18it/s]


In [8]:
df = worker(files[0])

In [18]:
sum(df['cleaned_text'].apply(lambda line: collections.Counter(line.strip().split())).tolist(), collections.Counter())

Counter({'!': 26,
         '"': 10,
         '%': 1,
         '(': 4,
         ')': 42,
         '*': 2,
         ',': 45,
         '-': 8,
         '.': 76,
         '..весь': 1,
         '..сидит': 1,
         '.не': 1,
         '/': 8,
         ':': 13,
         ';': 2,
         '<hashtag>': 13,
         '<num>': 19,
         '<url>': 26,
         '<usr>': 40,
         '=': 1,
         '?': 7,
         '@alteravoce': 1,
         '___': 1,
         'answer': 1,
         'last': 1,
         'love': 1,
         'lumen': 1,
         'pepparkakor': 1,
         'phone': 1,
         'windows': 1,
         'wowan': 1,
         'x': 1,
         '©': 3,
         '«сарбанд»': 1,
         'ηkεу☆': 1,
         'а': 7,
         'ааааааа': 1,
         'автобусе': 1,
         'аву': 1,
         'ага': 1,
         'агрегат': 1,
         'адольф': 1,
         'адриано': 1,
         'аж': 1,
         'алан': 1,
         'анастасия': 1,
         'анимации': 1,
         'аннулировали': 1,
         'арды