# 월간식당 빅데이터 분석

* 키워드별 단어 출현 빈도

In [1]:
import itertools
import json
import numpy as np
import pandas as pd
import random 
import re
import string
import warnings
warnings.simplefilter("ignore")

from collections import Counter
from gensim.models.word2vec import Word2Vec
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from konlpy.utils import pprint
from konlpy.tag import Hannanum
hannanum = Hannanum()
from konlpy.tag import Okt
okt = Okt()
from konlpy.tag import Kkma
kkma = Kkma()
import matplotlib.pyplot as plt
%matplotlib inline

from pandas import read_excel
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from tqdm import trange
from wordcloud import WordCloud

from msba import posts as p
from msba import stopwords as stopwords

In [2]:
def interested_words():
    # 핵심단어 읽어 오기
    my_sheet = '소비키워드'
    keywords_filename = 'deskresearch_.xlsx'
    df = read_excel(keywords_filename, sheet_name = my_sheet, header=1) # index_col='번호'
    keywords = df['핵심단어']
    subkeywords = df['대체어']
    interested_words = df['키워드']
    return keywords, subkeywords, interested_words

def oneDArray(x):
    return list(itertools.chain(*x))

def preprocessing(text):
    # 개행문자 제거
    text = re.sub('\\\\n', ' ', text)
#     text = re.sub('[A-z]', '', text)
#     text = re.sub('[0-9]', '', text)
    text = re.sub('[\'\"]', '', text)
    text = re.sub('[\'\n"]', '', text)
    res = ''.join([i for i in text if not i.isdigit()]) 
    return text

def preprocessing_2(rows):
    pattern = re.compile(r"[.,?!★~]")
    sentences = []
    for row in rows:
        sentences = sentences + pattern.split(row.replace("\xa0", "").replace("\t","").strip())
        sentences = list(set(sentences))        
    return sentences

def convert_list_to_string(org_list, seperator=' '):
    """ Convert list to string, by joining all item in list with given separator.
        Returns the concatenated string """
    return seperator.join(org_list)

# 실전 : 말뭉치 생성 및 전처리
def preprocessing_3(sentences):
    # Convert list of strings to string
#     full_str = convert_list_to_string(sentences[1])
#     # print(full_str)

    pattern = re.compile(r".*(광고정보).*")

    corpus = []
    for sentence in sentences:
        if len(sentence):
            sentence = sentence.strip().split(" ")
            sentence = convert_list_to_string(sentence).replace("  ", " ").strip()
            if pattern.match(sentence):
                continue
            else:
                corpus.append(sentence)
    return corpus

def preprocessing_4(docs):
    # docs = [
    #         w for w in hannanum.nouns(" ".join(cell)) for cell in cells
    #         if ((not w[0].isnumeric()) and (w[0] not in string.punctuation))
    # ]
    vect = CountVectorizer(stop_words=stopwords.stopwords_kr, min_df=5, max_df=200).fit(docs)
    count = vect.transform(docs).toarray().sum(axis=0)
    idx = np.argsort(-count)
    count = count[idx]
    feature_name = np.array(vect.get_feature_names())[idx]
    # plt.bar(range(len(count)), count)
    # plt.show()

    tf_list = list(zip(feature_name, count))[:100]

    tf_df = pd.DataFrame(tf_list,columns=['단어', '빈도'])
    return tf_df

def remove_stopwords_from_list(lst):
    result = []
    for w in lst:
        if ((len(w) > 1) and (not w.isdigit())):
            if w not in stopwords.stopwords_kr: 
                result.append(w)
    return result

def remove_stopwords(line):
    result = ""
    for w in line.split(" "):
        if ((len(w) > 1) and (not w.isdigit())):
            if w not in stopwords.stopwords_kr: 
                result = result + w + " "
    return result.strip()

def save_to_csv(keyword, output, tf_df):
    # save to csv    
    filename = "./output/" + output + "_" + keyword.replace(" ","") + ".csv"   
    # filename_list.append(filename)
    tf_df.to_csv(filename, date_format='%Y%m%d', encoding='utf-8-sig')

## 키워드/관심어 불러오기

In [3]:
keywords, subkeywords, interested_words = interested_words()

for keyword, subkeyword, interested_word in zip(keywords, subkeywords, interested_words):
    subkeyword = subkeyword.replace(" ", "").replace(",","|")
    interested_word = subkeyword + "|" + interested_word.replace(" ", "").replace(",","|")
#     print(keyword, " : " , subkeyword, interested_word)

# 자신의 관심사에 맞는 단어로 데이터 가져오기

In [14]:
# %%time
for keyword, subkeyword, interested_word in tqdm(zip(keywords, subkeywords, interested_words)):

    #     keyword = '1인 외식'
    # keyword = keyword.replace(" ","")
    df = p.readall(keyword.replace(" ",""))
    df = df[ (df['date'] >= '2019-07-01') & (df['date'] < '2020-07-01')]
    df = df.drop_duplicates()
    # print(df.shape)  
    rows = df['title'].apply(preprocessing) + df['content'].apply(preprocessing)
    # print(len(rows), type(rows))
    sentences = preprocessing_2(rows)
    # print(len(sentences), type(sentences))
    sentences = preprocessing_3(sentences)
    # print(len(corpus), type(corpus))

    # 4. 각 문장별로 형태소 구분하기
    sentences_tag = []
    for sentence in sentences:
        morph = okt.pos(sentence)
        sentences_tag.append(morph)
    #     print(morph)
    #     print('-'*30)

    # print(sentences_tag)
    # print(len(sentences_tag))
    # print('\n'*3)

#     sentences_tag

    # 5. 명사 혹은 형용사인 품사만 선별해 리스트에 담기
    noun_adj_list = []

    noun_adj_list = []
    for sentence1 in sentences_tag:
        words = ""
        for word, tag in sentence1:
            if tag in ['Noun','Adjective']:
                words = words + word + " "
    #     print(words)
    #     print("------------")
        words = remove_stopwords(words)
        noun_adj_list.append(words)

    #             line = remove_stopwords(line)
    #         print(line)
    #     noun_adj_list.append(line)

#     noun_adj_list

    #########################################################################
    # 최빈어
    #########################################################################
    # import itertools

    # def oneDArray(x):
    #     return list(itertools.chain(*x))

    tf_list = []
    for sentence1 in noun_adj_list:
        sentence1.split()
        tf_list.append(sentence1.split())

#     type(tf_list)

    tf_list = oneDArray(tf_list)
    
#     tf_list

    # 6. 선별된 품사별 빈도수 계산 & 상위 빈도 10위 까지 출력
    counts = Counter(tf_list)

    toplist = counts.most_common(200)

    # type(toplist)

    tf_df = pd.DataFrame (toplist,columns=['단어', '빈도수'])
    save_to_csv(keyword, "최빈어", tf_df)
    
   
    #########################################################################
    # 유사도
    #########################################################################
    # using remove() to 
    # perform removal 
    while("" in noun_adj_list) : 
        noun_adj_list.remove("") 

    dataset = pd.DataFrame(noun_adj_list, columns=['문장'])

    nan_value = float("NaN")
    dataset.replace("", nan_value, inplace=True)
    dataset.dropna(subset = ["문장"], inplace=True)
    # dataset.reindex

    # noun_adj_list[12]
    # dataset['문장'][13]

    tmp_corpus = dataset['문장'].map(lambda x: x.split('.'))
    # tmp_corpus

    # type(tmp_corpus)
    # tmp_corpus[11]
    # tmp_corpus[13]

    # corpus [[w1,w2,w3..],[..]]
    corpus = []
    for i in range(len(tmp_corpus)):
        for line in tmp_corpus[i]:
    #         print(i)
            words = [x for x in line.split()]
            corpus.append(words)

#     corpus

    num_of_sentences = len(corpus)
    num_of_words = 0
    for line in corpus:
        num_of_words += len(line)

    print('Num of sentences - %s'%(num_of_sentences))
    print('Num of words - %s'%(num_of_words))

    phrases = Phrases(sentences=corpus,min_count=25,threshold=50)
    bigram = Phraser(phrases)

    for index,sentence in enumerate(corpus):
        corpus[index] = bigram[sentence]

    # shuffle corpus
    def shuffle_corpus(sentences):
        shuffled = list(sentences)
        random.shuffle(shuffled)
        return shuffled

    # sg - skip gram |  window = size of the window | size = vector dimension
    size = 100
    window_size = 5 # sentences weren't too long, so
    epochs = 100
    min_count = 2
    workers = 4

    # train word2vec model using gensim
    # model = Word2Vec(corpus, sg=1,window=window_size,size=size,
    #                  min_count=min_count,workers=workers,iter=epochs,sample=0.01)

#     %%time
    model = Word2Vec(dataset,            # 불용어 처리한 후 
                    sg=2,                # skip-gram 적용: 중심 단어로 주변 단어를 예측
                    window=window_size,  # 중심 단어로부터 좌우 5개 단어까지 학습에 적용
                    iter=epochs,
                    #workers=workers,
                    #size=size,
                    sample=0.01,
                    min_count=1          # 전체문서에서 최소 1회 이상 출현단어로 학습 진행             
                    )
    model.init_sims(replace=True)

    model.build_vocab(sentences=shuffle_corpus(corpus),update=True)

    for i in trange(5):
        model.train(sentences=shuffle_corpus(corpus),epochs=50,total_examples=model.corpus_count)

#     corpus

    # save model
    modelname = "./model/" + "w2v_" + keyword.replace(" ","")  
    model.save(modelname)

0it [00:00, ?it/s]

Num of sentences - 35366
Num of words - 200682



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:24<01:39, 24.81s/it][A
 40%|████      | 2/5 [00:49<01:13, 24.64s/it][A
 60%|██████    | 3/5 [01:13<00:48, 24.47s/it][A
 80%|████████  | 4/5 [01:37<00:24, 24.50s/it][A
100%|██████████| 5/5 [02:00<00:00, 24.19s/it][A
1it [04:28, 268.75s/it]

Num of sentences - 41273
Num of words - 247511



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:32<02:11, 32.92s/it][A
 40%|████      | 2/5 [01:04<01:37, 32.37s/it][A
 60%|██████    | 3/5 [01:34<01:03, 31.81s/it][A
 80%|████████  | 4/5 [02:05<00:31, 31.56s/it][A
100%|██████████| 5/5 [02:35<00:00, 31.18s/it][A
2it [09:49, 284.38s/it]

Num of sentences - 14890
Num of words - 121135



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:18<01:12, 18.20s/it][A
 40%|████      | 2/5 [00:34<00:52, 17.65s/it][A
 60%|██████    | 3/5 [00:50<00:34, 17.06s/it][A
 80%|████████  | 4/5 [01:05<00:16, 16.61s/it][A
100%|██████████| 5/5 [01:21<00:00, 16.34s/it][A
3it [12:23, 245.11s/it]

Num of sentences - 30822
Num of words - 181760



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:23<01:35, 23.92s/it][A
 40%|████      | 2/5 [00:46<01:10, 23.46s/it][A
 60%|██████    | 3/5 [01:07<00:45, 22.83s/it][A
 80%|████████  | 4/5 [01:29<00:22, 22.42s/it][A
100%|██████████| 5/5 [01:51<00:00, 22.22s/it][A
4it [16:26, 244.49s/it]

Num of sentences - 36238
Num of words - 216080



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:28<01:54, 28.51s/it][A
 40%|████      | 2/5 [00:57<01:25, 28.56s/it][A
 60%|██████    | 3/5 [01:25<00:56, 28.38s/it][A
 80%|████████  | 4/5 [01:52<00:28, 28.09s/it][A
100%|██████████| 5/5 [02:20<00:00, 28.02s/it][A
5it [21:11, 256.76s/it]

Num of sentences - 23363
Num of words - 156434



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:19<01:18, 19.65s/it][A
 40%|████      | 2/5 [00:39<00:58, 19.59s/it][A
 60%|██████    | 3/5 [00:57<00:38, 19.30s/it][A
 80%|████████  | 4/5 [01:15<00:18, 18.77s/it][A
100%|██████████| 5/5 [01:33<00:00, 18.72s/it][A
6it [24:24, 237.65s/it]

Num of sentences - 37288
Num of words - 232770



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:30<02:01, 30.28s/it][A
 40%|████      | 2/5 [00:59<01:30, 30.06s/it][A
 60%|██████    | 3/5 [01:29<00:59, 29.88s/it][A
 80%|████████  | 4/5 [01:58<00:29, 29.59s/it][A
100%|██████████| 5/5 [02:26<00:00, 29.34s/it][A
7it [29:38, 260.52s/it]

Num of sentences - 42575
Num of words - 244789



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:29<01:58, 29.64s/it][A
 40%|████      | 2/5 [00:58<01:28, 29.47s/it][A
 60%|██████    | 3/5 [01:28<00:58, 29.48s/it][A
 80%|████████  | 4/5 [01:57<00:29, 29.42s/it][A
100%|██████████| 5/5 [02:25<00:00, 29.08s/it][A
8it [35:09, 281.76s/it]

Num of sentences - 28182
Num of words - 176676



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:21<01:27, 21.82s/it][A
 40%|████      | 2/5 [00:42<01:04, 21.58s/it][A
 60%|██████    | 3/5 [01:04<00:43, 21.61s/it][A
 80%|████████  | 4/5 [01:26<00:21, 21.67s/it][A
100%|██████████| 5/5 [01:47<00:00, 21.48s/it][A
9it [38:59, 266.16s/it]

Num of sentences - 15828
Num of words - 102422



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:13<00:52, 13.18s/it][A
 40%|████      | 2/5 [00:26<00:39, 13.14s/it][A
 60%|██████    | 3/5 [00:38<00:25, 12.73s/it][A
 80%|████████  | 4/5 [00:49<00:12, 12.43s/it][A
100%|██████████| 5/5 [01:01<00:00, 12.31s/it][A
10it [41:02, 223.19s/it]

Num of sentences - 36452
Num of words - 225356



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:28<01:54, 28.75s/it][A
 40%|████      | 2/5 [00:56<01:24, 28.31s/it][A
 60%|██████    | 3/5 [01:23<00:55, 27.99s/it][A
 80%|████████  | 4/5 [01:50<00:27, 27.64s/it][A
100%|██████████| 5/5 [02:18<00:00, 27.74s/it][A
11it [45:58, 244.91s/it]

Num of sentences - 30462
Num of words - 165123



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:20<01:22, 20.61s/it][A
 40%|████      | 2/5 [00:39<01:00, 20.15s/it][A
 60%|██████    | 3/5 [00:57<00:39, 19.55s/it][A
 80%|████████  | 4/5 [01:16<00:19, 19.39s/it][A
100%|██████████| 5/5 [01:34<00:00, 18.99s/it][A
12it [49:47, 240.38s/it]

Num of sentences - 10297
Num of words - 77239



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:11<00:45, 11.25s/it][A
 40%|████      | 2/5 [00:21<00:32, 10.86s/it][A
 60%|██████    | 3/5 [00:30<00:20, 10.43s/it][A
 80%|████████  | 4/5 [00:40<00:10, 10.13s/it][A
100%|██████████| 5/5 [00:49<00:00,  9.91s/it][A
13it [51:22, 196.57s/it]

Num of sentences - 3899
Num of words - 26135



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:03<00:14,  3.53s/it][A
 40%|████      | 2/5 [00:06<00:10,  3.50s/it][A
 60%|██████    | 3/5 [00:10<00:06,  3.47s/it][A
 80%|████████  | 4/5 [00:13<00:03,  3.51s/it][A
100%|██████████| 5/5 [00:17<00:00,  3.45s/it][A
14it [51:53, 147.06s/it]

Num of sentences - 20263
Num of words - 129208



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:17<01:08, 17.05s/it][A
 40%|████      | 2/5 [00:32<00:49, 16.66s/it][A
 60%|██████    | 3/5 [00:49<00:33, 16.75s/it][A
 80%|████████  | 4/5 [01:05<00:16, 16.58s/it][A
100%|██████████| 5/5 [01:21<00:00, 16.30s/it][A
15it [54:36, 151.70s/it]

Num of sentences - 25233
Num of words - 155293



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:19<01:16, 19.17s/it][A
 40%|████      | 2/5 [00:37<00:56, 18.99s/it][A
 60%|██████    | 3/5 [00:55<00:37, 18.68s/it][A
 80%|████████  | 4/5 [01:13<00:18, 18.47s/it][A
100%|██████████| 5/5 [01:33<00:00, 18.68s/it][A
16it [57:49, 164.15s/it]

Num of sentences - 17450
Num of words - 103628



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:14<00:56, 14.24s/it][A
 40%|████      | 2/5 [00:27<00:41, 13.90s/it][A
 60%|██████    | 3/5 [00:40<00:27, 13.55s/it][A
 80%|████████  | 4/5 [00:52<00:13, 13.10s/it][A
100%|██████████| 5/5 [01:03<00:00, 12.78s/it][A
17it [59:55, 152.64s/it]

Num of sentences - 38234
Num of words - 270564



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:37<02:28, 37.11s/it][A
 40%|████      | 2/5 [01:11<01:48, 36.30s/it][A
 60%|██████    | 3/5 [01:45<01:11, 35.70s/it][A
 80%|████████  | 4/5 [02:19<00:35, 35.18s/it][A
100%|██████████| 5/5 [02:52<00:00, 34.50s/it][A
18it [1:05:31, 207.79s/it]

Num of sentences - 2349
Num of words - 14484



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:02<00:11,  2.81s/it][A
 40%|████      | 2/5 [00:05<00:08,  2.80s/it][A
 60%|██████    | 3/5 [00:08<00:05,  2.70s/it][A
 80%|████████  | 4/5 [00:10<00:02,  2.63s/it][A
100%|██████████| 5/5 [00:12<00:00,  2.56s/it][A
19it [1:05:54, 152.26s/it]

Num of sentences - 32303
Num of words - 203307



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:26<01:46, 26.69s/it][A
 40%|████      | 2/5 [00:52<01:19, 26.51s/it][A
 60%|██████    | 3/5 [01:18<00:52, 26.31s/it][A
 80%|████████  | 4/5 [01:46<00:26, 26.69s/it][A
100%|██████████| 5/5 [02:12<00:00, 26.47s/it][A
20it [1:10:11, 183.58s/it]

Num of sentences - 33915
Num of words - 220404



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:26<01:46, 26.53s/it][A
 40%|████      | 2/5 [00:51<01:18, 26.18s/it][A
 60%|██████    | 3/5 [01:16<00:51, 25.81s/it][A
 80%|████████  | 4/5 [01:41<00:25, 25.48s/it][A
100%|██████████| 5/5 [02:06<00:00, 25.24s/it][A
21it [1:14:49, 212.02s/it]

Num of sentences - 54426
Num of words - 326067



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:41<02:47, 41.78s/it][A
 40%|████      | 2/5 [01:22<02:04, 41.52s/it][A
 60%|██████    | 3/5 [02:02<01:21, 41.00s/it][A
 80%|████████  | 4/5 [02:42<00:40, 40.70s/it][A
100%|██████████| 5/5 [03:21<00:00, 40.38s/it][A
22it [1:21:34, 269.93s/it]

Num of sentences - 43766
Num of words - 271472



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:32<02:11, 32.97s/it][A
 40%|████      | 2/5 [01:04<01:37, 32.63s/it][A
 60%|██████    | 3/5 [01:35<01:04, 32.16s/it][A
 80%|████████  | 4/5 [02:07<00:31, 31.91s/it][A
100%|██████████| 5/5 [02:37<00:00, 31.55s/it][A
23it [1:27:08, 289.09s/it]

Num of sentences - 27205
Num of words - 167685



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:19<01:18, 19.62s/it][A
 40%|████      | 2/5 [00:38<00:58, 19.49s/it][A
 60%|██████    | 3/5 [00:57<00:38, 19.26s/it][A
 80%|████████  | 4/5 [01:16<00:19, 19.23s/it][A
100%|██████████| 5/5 [01:35<00:00, 19.03s/it][A
24it [1:30:39, 265.63s/it]

Num of sentences - 713
Num of words - 4830



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:01<00:04,  1.19s/it][A
 40%|████      | 2/5 [00:02<00:03,  1.21s/it][A
 60%|██████    | 3/5 [00:03<00:02,  1.20s/it][A
 80%|████████  | 4/5 [00:04<00:01,  1.18s/it][A
100%|██████████| 5/5 [00:05<00:00,  1.17s/it][A
25it [1:30:48, 188.86s/it]

Num of sentences - 41488
Num of words - 225039



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:25<01:42, 25.67s/it][A
 40%|████      | 2/5 [00:50<01:16, 25.40s/it][A
 60%|██████    | 3/5 [01:14<00:50, 25.14s/it][A
 80%|████████  | 4/5 [01:39<00:24, 24.90s/it][A
100%|██████████| 5/5 [02:03<00:00, 24.65s/it][A
26it [1:35:26, 215.43s/it]

Num of sentences - 35122
Num of words - 214245



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:26<01:46, 26.54s/it][A
 40%|████      | 2/5 [00:51<01:18, 26.13s/it][A
 60%|██████    | 3/5 [01:16<00:51, 25.85s/it][A
 80%|████████  | 4/5 [01:41<00:25, 25.62s/it][A
100%|██████████| 5/5 [02:06<00:00, 25.34s/it][A
27it [1:40:02, 233.70s/it]

Num of sentences - 43820
Num of words - 281905



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:33<02:12, 33.18s/it][A
 40%|████      | 2/5 [01:05<01:39, 33.03s/it][A
 60%|██████    | 3/5 [01:37<01:05, 32.67s/it][A
 80%|████████  | 4/5 [02:09<00:32, 32.41s/it][A
100%|██████████| 5/5 [02:41<00:00, 32.25s/it][A
28it [1:45:26, 260.79s/it]

Num of sentences - 27761
Num of words - 175754



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:22<01:30, 22.54s/it][A
 40%|████      | 2/5 [00:44<01:06, 22.28s/it][A
 60%|██████    | 3/5 [01:05<00:43, 21.93s/it][A
 80%|████████  | 4/5 [01:25<00:21, 21.48s/it][A
100%|██████████| 5/5 [01:45<00:00, 21.19s/it][A
29it [1:48:55, 245.08s/it]

Num of sentences - 66905
Num of words - 456253



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:53<03:35, 53.99s/it][A
 40%|████      | 2/5 [01:46<02:40, 53.45s/it][A
 60%|██████    | 3/5 [02:39<01:46, 53.35s/it][A
 80%|████████  | 4/5 [03:31<00:53, 53.13s/it][A
100%|██████████| 5/5 [04:24<00:00, 52.89s/it][A
30it [1:57:12, 320.67s/it]

Num of sentences - 43213
Num of words - 263589



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:31<02:05, 31.46s/it][A
 40%|████      | 2/5 [01:01<01:33, 31.11s/it][A
 60%|██████    | 3/5 [01:31<01:01, 30.75s/it][A
 80%|████████  | 4/5 [02:01<00:30, 30.61s/it][A
100%|██████████| 5/5 [02:31<00:00, 30.39s/it][A
31it [2:02:19, 316.61s/it]

Num of sentences - 48566
Num of words - 315288



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:41<02:44, 41.19s/it][A
 40%|████      | 2/5 [01:21<02:02, 40.79s/it][A
 60%|██████    | 3/5 [02:00<01:20, 40.25s/it][A
 80%|████████  | 4/5 [02:38<00:39, 39.69s/it][A
100%|██████████| 5/5 [03:16<00:00, 39.27s/it][A
32it [2:13:27, 422.05s/it]

Num of sentences - 20861
Num of words - 137480



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:16<01:05, 16.34s/it][A
 40%|████      | 2/5 [00:32<00:48, 16.17s/it][A
 60%|██████    | 3/5 [00:47<00:31, 15.85s/it][A
 80%|████████  | 4/5 [01:02<00:15, 15.79s/it][A
100%|██████████| 5/5 [01:17<00:00, 15.59s/it][A
33it [2:16:11, 344.63s/it]

Num of sentences - 19630
Num of words - 105647



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:12<00:48, 12.18s/it][A
 40%|████      | 2/5 [00:23<00:35, 11.95s/it][A
 60%|██████    | 3/5 [00:34<00:23, 11.65s/it][A
 80%|████████  | 4/5 [00:45<00:11, 11.40s/it][A
100%|██████████| 5/5 [00:56<00:00, 11.20s/it][A
34it [2:18:16, 278.78s/it]

Num of sentences - 20445
Num of words - 123214



  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:14<00:59, 14.92s/it][A
 40%|████      | 2/5 [00:29<00:44, 14.68s/it][A
 60%|██████    | 3/5 [00:43<00:29, 14.58s/it][A
 80%|████████  | 4/5 [00:57<00:14, 14.31s/it][A
100%|██████████| 5/5 [01:10<00:00, 14.10s/it][A
35it [2:20:44, 241.28s/it]
