## Extraction of Notes from Downloaded Sites
Form lists of phrases for each text

In [1]:
from collections import defaultdict, Counter
import json
import jsonlines
import Levenshtein
import nltk
import numpy as np
import os
import re
from tqdm import tqdm

# Relative path of the directory that the cells below read their .jsonl inputs
# from and write their output files to (always combined via os.path.join).
data_dir = '../data'

In [2]:
def look_soup(dd, link):
    """Print the text fragments stored for ``link``, one per line.

    ``dd[link]`` is iterated as a sequence of entries; an entry is printed
    (its element 0) only when its element 1 equals ``'text'``.
    """
    text_fragments = (entry[0] for entry in dd[link] if entry[1] == 'text')
    print('\n'.join(text_fragments))
    
def look_listphrases(listphrases):
    """Print paragraphs (lists of phrases) separated by a line of asterisks.

    Phrases inside one paragraph are printed one per line.
    """
    paragraph_separator = '\n***********\n'
    rendered_paragraphs = map('\n'.join, listphrases)
    print(paragraph_separator.join(rendered_paragraphs))

#### prohiphop.org

In [3]:
# link -> concatenated soup entries of every record stored for that link
dd_prohiphop = defaultdict(list)

prohiphop_soup_path = os.path.join(data_dir,'site_soup_prohiphop.jsonl')
with jsonlines.open(prohiphop_soup_path) as fd:
    for obj in tqdm(fd):
        # Records sharing a link are appended to the same list, in file order.
        dd_prohiphop[obj['link']].extend(obj['soup'])

131it [00:00, 4429.90it/s]


The texts from this site did not download :(

#### webrap.info

In [4]:
# link -> concatenated soup entries of every record stored for that link
dd_webrap = defaultdict(list)

webrap_soup_path = os.path.join(data_dir,'site_soup_webrap.jsonl')
with jsonlines.open(webrap_soup_path) as fd:
    for obj in tqdm(fd):
        # Records sharing a link are appended to the same list, in file order.
        dd_webrap[obj['link']].extend(obj['soup'])

3152it [00:00, 5601.72it/s]


In [5]:
# Whitespace-free links containing '/text-uchastnikov-versusa' followed by at
# least one more character. The pattern is a raw string: '\S' in a normal
# literal is an invalid escape sequence (SyntaxWarning on Python 3.12+).
versus_link_re = re.compile(r'^\S*/text-uchastnikov-versusa\S+$')
list_text_uchastnikov_versusa = sorted(
    link for link in dd_webrap if versus_link_re.match(link) is not None
)
print(len(list_text_uchastnikov_versusa))

247


In [6]:
# Whitespace-free links containing '/text-pesen<something>/<something>'.
# The pattern is a raw string: '\S' in a normal literal is an invalid escape
# sequence (SyntaxWarning on Python 3.12+).
pesen_link_re = re.compile(r'^\S*/text-pesen\S+/\S+$')
list_text_pesen = sorted(
    link for link in dd_webrap if pesen_link_re.match(link) is not None
)
print(len(list_text_pesen))

1725


In [8]:
def extraction(soup):
    """Extract the text of one page as a list of paragraphs (lists of phrases).

    Collection is switched on by an entry whose text is ``Просмотров: <digits>``
    ("Views: N") and switched off by an entry whose block string contains
    ``'a:'``, but only once at least one paragraph has already been completed.
    An entry whose block string contains ``h<digit>:`` is never collected
    itself; it makes the next collected phrase open a new paragraph.

    NOTE: a later cell defines another function with this same name, so this
    version must be applied before that cell is executed.

    Args:
        soup: iterable of entries; ``entry[0]`` is the text and ``entry[2]``
            is the block descriptor string (``entry[1]`` is not used here).

    Returns:
        list[list[str]]: the paragraphs in page order; empty paragraphs are
        never emitted.
    """
    list_text = []           # completed paragraphs
    list_tmp = []            # paragraph currently being filled
    fl_israp = False         # True while inside the collected region
    fl_newparagraph = False  # True when the next phrase must open a paragraph
    for abstract in soup:
        text = abstract[0]
        block = abstract[2]

        # Marker entries only update the state and are never collected.
        if re.match(r'^Просмотров: \d+$', text) is not None:
            fl_israp = True
            continue
        if fl_israp and ('a:' in block) and list_text:
            fl_israp = False
            continue
        if re.match(r'^\S*h\d:\S*$', block) is not None:
            # Headings are remembered even outside the collected region.
            fl_newparagraph = True
            continue

        if not fl_israp:
            continue

        if fl_newparagraph:
            if list_tmp:
                list_text.append(list_tmp)
            list_tmp = [text]
            fl_newparagraph = False
        else:
            list_tmp.append(text)

    if list_tmp:
        list_text.append(list_tmp)
    return list_text

In [9]:
# link -> extracted paragraphs, using the `extraction` defined in the cell above
dd_text_uchastnikov_versusa = dict(
    (link, extraction(dd_webrap[link]))
    for link in tqdm(list_text_uchastnikov_versusa)
)

100%|██████████| 247/247 [00:00<00:00, 1512.97it/s]


In [10]:
def extraction(soup):
    """Extract the text of one page as a list of paragraphs (lists of phrases).

    Collection is switched on by an entry whose text is ``Просмотров: <digits>``
    ("Views: N") and switched off by any entry whose block string contains
    ``'a:'``. Short entries (at most 20 characters) containing ``припев``
    ("chorus") are skipped. A new paragraph is opened whenever the number
    after ``p:`` in the block string exceeds every number seen so far.

    NOTE: this redefines the ``extraction`` from an earlier cell; from here on
    the name refers to this version.

    Args:
        soup: iterable of entries; ``entry[0]`` is the text and ``entry[2]``
            is the block descriptor string (``entry[1]`` is not used here).

    Returns:
        list[list[str]]: the paragraphs in page order. When 10 or more
        paragraphs were found they are merged into one single paragraph.
    """
    list_text = []           # completed paragraphs
    list_tmp = []            # paragraph currently being filled
    fl_israp = False         # True while inside the collected region
    fl_newparagraph = False  # True when the next phrase must open a paragraph
    num_paragraph = -999     # highest 'p:<n>' number seen so far
    for abstract in soup:
        text = abstract[0]
        block = abstract[2]

        fl_isphrase = True
        if re.match(r'^Просмотров: \d+$', text) is not None:
            fl_israp = True
            fl_isphrase = False
        elif 'a:' in block:
            fl_israp = False
            fl_isphrase = False
        elif ('припев' in text.lower()) and (len(text) <= 20):
            fl_isphrase = False

        # Paragraph numbers are tracked for every entry, collected or not.
        if re.match(r'^\S*p:\S*$', block) is not None:
            # A 'p:' that is not followed by digits carries no paragraph
            # number; skip it instead of failing with IndexError.
            found = re.search(r'p:(\d+)', block)
            if found is not None:
                num_paragraph_curr = int(found.group(1))
                if num_paragraph_curr > num_paragraph:
                    num_paragraph = num_paragraph_curr
                    fl_newparagraph = True

        if fl_israp and fl_isphrase:
            if fl_newparagraph:
                if list_tmp:
                    list_text.append(list_tmp)
                list_tmp = [text]
                fl_newparagraph = False
            else:
                list_tmp.append(text)

    if list_tmp:
        list_text.append(list_tmp)

    # Presumably such pages wrap every line in its own paragraph, so the
    # split is not meaningful -- TODO confirm the choice of threshold.
    if len(list_text) >= 10:
        list_text = [[phrase for paragraph in list_text for phrase in paragraph]]

    return list_text

In [11]:
# link -> extracted paragraphs, using the `extraction` redefined in the cell above
dd_text_pesen = dict(
    (link, extraction(dd_webrap[link]))
    for link in tqdm(list_text_pesen)
)

100%|██████████| 1725/1725 [00:00<00:00, 2062.57it/s]


In [12]:
# Merge both link -> paragraphs mappings into a new dict; if a link occurs in
# both, the entry from dd_text_pesen replaces the earlier one.
dd_webrap_texts = dict(dd_text_uchastnikov_versusa)
dd_webrap_texts.update(dd_text_pesen)
print(len(dd_webrap_texts))

1972


In [16]:
# Save one JSON object per link; links without any extracted paragraph are left out.
with jsonlines.open(os.path.join(data_dir, 'texts_webrap.jsonl'), mode='w') as fd:
    for link, list_texts in dd_webrap_texts.items():
        if not list_texts:
            continue
        fd.write({'link': link, 'list_texts': list_texts})

## Repair Words with Star
(e.g. `ж*па` -> `жопа`)

In [3]:
# Frequency of lower-cased tokens containing exactly one '*' ...
dict_word_w_star = defaultdict(int)
# ... and of tokens containing no '*' at all.
dict_word_wo_star = defaultdict(int)

with jsonlines.open(os.path.join(data_dir, 'texts_webrap.jsonl')) as fd:
    for obj in tqdm(fd):
        for list_phrase in obj['list_texts']:
            for phrase in list_phrase:
                for tok in nltk.word_tokenize(phrase.lower()):
                    # Drop one-character tokens and tokens without any
                    # latin letter, а-я letter or star.
                    if len(tok) <= 1 or re.sub('[^a-zа-я*]', '', tok) == '':
                        continue
                    if '*' not in tok:
                        dict_word_wo_star[tok] += 1
                    elif len(re.sub('[*]', '', tok)) + 1 == len(tok):
                        # Exactly one star; tokens with several stars are
                        # counted in neither dictionary.
                        dict_word_w_star[tok] += 1

print('Number of words with star', len(dict_word_w_star))
print('Number of words without star', len(dict_word_wo_star))

1971it [00:21, 93.37it/s] 

Number of words with star 1325
Number of words without star 97895





In [4]:
# Ten most frequent single-star tokens, highest count first.
Counter(dict_word_w_star).most_common(10)

[('бл*ть', 400),
 ('х*й', 340),
 ('нах*й', 274),
 ('пох*й', 150),
 ('бл*дь', 96),
 ('на*уй', 81),
 ('*бал', 79),
 ('еб*ть', 78),
 ('п*здец', 76),
 ('с*ка', 65)]

In [5]:
# Count of the uncensored spelling, to compare with the starred variant listed
# above. (Indexing a defaultdict inserts the key with 0 if it is missing.)
dict_word_wo_star['блять']

326

In [6]:
# Words ordered by descending count; words with equal counts keep the order in
# which they were first inserted into the dictionary.
list_word_w_star = sorted(dict_word_w_star, key=dict_word_w_star.get, reverse=True)
list_word_wo_star = sorted(dict_word_wo_star, key=dict_word_wo_star.get, reverse=True)

def repair_word(word_w_s, list_wo_s=list_word_wo_star, dist=1):
    """Return the first same-length candidate at the given edit distance.

    Args:
        word_w_s: the word to repair (in this notebook: a token with one '*').
        list_wo_s: candidate words, scanned in order; the default is the
            ``list_word_wo_star`` object that existed when this function was
            defined.
        dist: required ``Levenshtein.distance`` between word and candidate.

    Returns:
        The first candidate that has the same length as ``word_w_s`` and lies
        exactly ``dist`` edits away, or ``None`` when there is no such word.
    """
    wanted_len = len(word_w_s)
    matches = (
        candidate
        for candidate in list_wo_s
        if len(candidate) == wanted_len
        and Levenshtein.distance(word_w_s, candidate) == dist
    )
    return next(matches, None)

In [7]:
# Spot-check the repair on the most frequent starred token.
repair_word('бл*ть')

'блять'

In [8]:
# starred word -> repaired word, and the starred words without any match
dict_word_repaired = {}
list_word_notrepaired = []
for word_w_s in tqdm(list_word_w_star):
    word_wo_s = repair_word(word_w_s)
    if word_wo_s is None:
        list_word_notrepaired.append(word_w_s)
        continue
    dict_word_repaired[word_w_s] = word_wo_s

# Show the sizes together with the first ten entries of each collection.
print('Number of repaired words', len(dict_word_repaired), list(dict_word_repaired.items())[:10])
print('Number of not repaired words', len(list_word_notrepaired), list_word_notrepaired[:10])

100%|██████████| 1325/1325 [00:13<00:00, 95.25it/s]

Number of repaired words 761 [('бл*ть', 'блять'), ('х*й', 'хуй'), ('нах*й', 'нахуй'), ('пох*й', 'похуй'), ('бл*дь', 'блядь'), ('на*уй', 'нахуй'), ('*бал', 'ебал'), ('еб*ть', 'ебать'), ('п*здец', 'пиздец'), ('с*ка', 'сука')]
Number of not repaired words 564 ['на*хуй', 'х*есос', 'х*еплет', 'ох*евали', 'долбо*ба', 'п*здобол', 'у*бал', 'п*дараса', 'раз*бал', 'по*балу']





In [9]:
# Persist the starred -> repaired mapping as a single JSON object.
with open(os.path.join(data_dir, 'dict_wws.json'), 'w') as fd:
    json.dump(dict_word_repaired, fd)

# To reload a previously saved mapping instead of recomputing it:
# with open(os.path.join(data_dir, 'dict_wws.json')) as fd:
#     dict_word_repaired = json.load(fd)

## Char Analysis

In [48]:
# Frequency of every character that survives the whitelist filter below.
dict_char_filt = defaultdict(int)

# Matches every character that is NOT whitelisted. The hyphen is escaped so it
# is an explicit literal: the unescaped ',-.' formed a character range that
# only happened to cover exactly ',', '-' and '.'. The resulting character set
# is unchanged.
not_allowed_char_re = re.compile(r'''[^!"()*,\-.:;? 0-9A-Za-zА-Яа-яЁё]''')

with jsonlines.open(os.path.join(data_dir, 'texts_webrap.jsonl')) as fd:
    for obj in fd:
        list_texts = obj['list_texts']
        for list_text in list_texts:
            for phrase in list_text:
                for char in not_allowed_char_re.sub('', phrase):
                    dict_char_filt[char] += 1

In [53]:
# Every character that occurs after filtering, in code-point order.
print(sorted(dict_char_filt))

[' ', '!', '"', '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ё', 'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П', 'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч', 'Ш', 'Щ', 'Ъ', 'Ы', 'Ь', 'Э', 'Ю', 'Я', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'ё']
