In [53]:
import pandas as pd
import csv
import xml
from lxml.etree import ParserError
import os, path, sys
import glob

import re
import matplotlib.dates
import matplotlib
import matplotlib.pyplot
import datetime

import string
import pymorphy2

Text Preprocessing

In [59]:
# Load the raw corpus. The first CSV column is used as the index, the first row
# as the header, and records are split on '\n' only.
df = pd.read_csv(
    'users/df.csv',
    header=0,
    index_col=0,
    lineterminator='\n',
)

In [60]:
# Number of rows right after loading.
df.shape[0]

1829451

Drop duplicate texts (keep the first occurrence)

In [62]:
# Keep only the first row for each distinct value of the `text` column.
# `df` is rebound to the result instead of being mutated with `inplace=True`,
# so the cell reads as a plain "input -> output" step and chains cleanly.
df = df.drop_duplicates(subset='text', keep='first')

In [63]:
# Number of rows left after removing duplicate texts.
df.shape[0]

1779653

Text Preprocessing

In [64]:
from pynlple.processing.preprocessor import *

# Ordered list of preprocessing steps; it is handed to StackingPreprocessor in
# the next cell. All classes come from the wildcard import of
# pynlple.processing.preprocessor above.
# NOTE(review): the steps presumably run in list order - confirm against the
# StackingPreprocessor implementation before reordering anything.
stack = [
    HtmlEntitiesUnescaper(),
    BoldTagReplacer(),
    URLReplacer(''),
    HtmlTagReplacer(' '),
    EmailReplacer(''),
    PhoneNumberReplacer(''),
    AtReferenceReplacer(''),
    CommaReplacer(),
    QuotesReplacer(),
    DoubleQuotesReplacer(),
    SoftHyphenReplacer(),
    DashAndMinusReplacer(),
    TripledotReplacer(),
    ToLowerer(),
    DigitsReplacer(True, '0'),
    MultiPunctuationReplacer(),
    MultiNewLineReplacer(),
    MultiWhitespaceReplacer(),
    WordTokenizer(),
    Trimmer(),
]

In [66]:
%time
t = StackingPreprocessor(stack)
df['preproc_text_without_tags'] = df["text"].apply(lambda x: t.preprocess(str(x))).values

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.78 µs


In [67]:
# Row count after preprocessing (this step adds a column, it drops no rows).
df.shape[0]

1779653

Delete Empty Texts

In [68]:
# Discard rows whose preprocessed text is the empty string, then show how many
# rows remain.
non_empty = df['preproc_text_without_tags'] != ''
df = df[non_empty]
len(df)

1776913

Language Detection

In [69]:
from pyfasttext import FastText

# Load the language-identification model from disk.
model = FastText('models/lid.176.bin')

# Classify each preprocessed text on its own and store the label found at
# [0][0][0] of the result (the value counts further down show codes such as
# 'uk' and 'ru').
# NOTE(review): the second argument (1) presumably asks for the single best
# prediction - confirm against the pyfasttext documentation.
df['prep_ft'] = [
    model.predict_proba([text], 1)[0][0][0]
    for text in df['preproc_text_without_tags']
]

Check whether the text contains at least one Cyrillic letter

In [73]:
def has_cyrillic(text):
    """Return True if ``text`` has at least one character in U+0400-U+04FF.

    Parameters
    ----------
    text : str
        The string to scan.

    Returns
    -------
    bool
        True when a character from the range is found, False otherwise
        (including for the empty string).
    """
    match = re.search('[\u0400-\u04FF]', text)
    return match is not None

In [76]:
# Flag every row whose preprocessed text contains a Cyrillic character.
# Values are passed through str() before the check, as has_cyrillic expects a string.
df['has_cyr'] = (
    df['preproc_text_without_tags']
    .map(lambda value: has_cyrillic(str(value)))
    .tolist()
)

In [77]:
# Keep only the rows flagged as containing Cyrillic, then show the row count.
cyrillic_rows = df['has_cyr'] == True
df = df[cyrillic_rows]
len(df)

1724989

Delete messages with 'I'm at' (Swarm app texts)

In [80]:
# Remove texts containing the phrase "i'm at". The pattern is lowercase; the
# preprocessing stack defined earlier includes ToLowerer().
is_checkin = df['preproc_text_without_tags'].str.contains("i'm at")
df = df[~is_checkin]
len(df)

1723371

Delete rows that contain the letter "ы" when the fastText-detected language is not Ukrainian (uk)

In [84]:
# Drop a row only when BOTH conditions hold: the text contains "ы" and the
# stored language label is something other than 'uk'.
# NOTE(review): 'fasttext_lang' is not created anywhere in the visible cells -
# the language column built above is 'prep_ft'. Confirm that 'fasttext_lang'
# really exists in the loaded CSV, otherwise this cell fails on a fresh kernel.
contains_yery = df['preproc_text_without_tags'].str.contains('ы')
not_ukrainian = df['fasttext_lang'] != 'uk'
df = df[~(contains_yery & not_ukrainian)]

In [88]:
# Number of rows left after the "ы" filter.
df.shape[0]

1648655

Detected languages that are closely similar to Ukrainian

In [98]:
# Frequency of each detected language label, most common first.
language_counts = df['prep_ft'].value_counts()
language_counts

uk     1454392
ru      138978
bg       10547
be        7862
sr        7603
mk        4106
pl        3627
en        3365
mn        1824
ky        1612
sah       1485
es        1347
kk        1271
ar         751
ce         719
fr         709
lv         625
de         602
fy         555
mhr        482
tt         467
af         414
lt         412
eo         365
vi         306
tg         300
cv         229
mrj        204
nl         188
ba         183
        ...   
co           3
br           3
tl           3
diq          3
da           3
sk           3
ps           2
sq           2
fa           2
yue          2
bar          2
bpy          2
pfl          2
kn           2
nds          2
vec          1
id           1
mt           1
mzn          1
sw           1
is           1
te           1
km           1
xmf          1
pnb          1
pam          1
pa           1
ast          1
ms           1
kw           1
Name: prep_ft, Length: 140, dtype: int64

Delete Russian texts (you can drop any other language the same way, or keep only Ukrainian)

In [108]:
# Keep every row whose detected language label is anything other than 'ru',
# then show the remaining row count.
not_russian = df['prep_ft'] != 'ru'
df = df[not_russian]
len(df)

1509677

In [129]:
# Persist the filtered corpus (index included) to the working directory.
df.to_csv(path_or_buf='corpus.csv')

In [None]:
# Force a garbage-collection pass; the cell displays the value returned by
# gc.collect().
import gc

gc.collect()

In [41]:
# Number of whitespace-separated tokens in each preprocessed text.
df['text_length'] = df['preproc_text_without_tags'].map(
    lambda text: len(text.split())
)

In [19]:
def avg_word_len(text):
    """Return the mean length, in characters, of the words in ``text``.

    Words are the whitespace-separated tokens produced by ``str.split()``.

    Parameters
    ----------
    text : str
        The text to measure.

    Returns
    -------
    float
        Average number of characters per word, or ``0.0`` when ``text``
        contains no words (empty or whitespace-only input).
    """
    # Split once and reuse the result for both the sum and the count.
    words = text.split()
    if not words:
        # Without this guard the division below raises ZeroDivisionError.
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [20]:
# Quick sanity check: words of length 1 and 4 give (1 + 4) / 2 = 2.5.
avg_word_len('I love')

2.5

In [45]:
# Average word length of each preprocessed text; the function is passed
# directly instead of being wrapped in a lambda.
df['avg_word_length'] = df['preproc_text_without_tags'].apply(avg_word_len)

In [38]:
# Tokens to discard: every single ASCII punctuation character plus the
# two-character token '..'.
exclude = set(string.punctuation) | {'..'}


def text_prepare(text):
    """Remove stand-alone punctuation tokens from ``text``.

    The text is split on whitespace; any token that is exactly one of the
    entries in ``exclude`` is dropped and the rest are re-joined with single
    spaces. Punctuation attached to a word, or multi-character punctuation
    other than '..', is left untouched.

    Stop-word removal and lemmatization were present only as commented-out
    lines in the original cell and are not applied.

    Parameters
    ----------
    text : str
        Whitespace-tokenized text.

    Returns
    -------
    str
        The text without the excluded tokens.
    """
    kept_tokens = [token for token in text.split() if token not in exclude]
    return ' '.join(kept_tokens)

In [39]:
# Strip stand-alone punctuation tokens from every preprocessed text; `.values`
# assigns the result positionally.
df['preproc_text'] = df['preproc_text_without_tags'].apply(text_prepare).values