In [1]:
import pandas as pd
import numpy as np
from gensim import corpora
from gensim.models import VocabTransform

from common import *
import os
import glob
from joblib import Parallel, delayed
import itertools
import multiprocessing
import copy 

import logging

from gzip import GzipFile
import json
from glob import glob

# Number of CPUs reported by the OS; used below as n_jobs for the joblib worker pool.
cpu_count = multiprocessing.cpu_count()
# Base data folder (relative to the notebook); inputs, intermediate archives and
# dictionary/corpus files are all addressed relative to it.
DATA_FOLDER = '../data/'

# Emit timestamped INFO-level records so library progress messages show up in cell output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Using TensorFlow backend.


# Extract documents and tokenize

In [2]:
class DocumentExtractor(object):
    """Iterable that lazily reads and tokenizes a list of text files.

    Parameters
    ----------
    doc_list : iterable
        Paths of the text files to process; each item is passed to ``open``.

    Notes
    -----
    ``tqdm`` and ``tokenize`` are not defined in this notebook; presumably they
    are provided by ``from common import *`` -- confirm.
    """

    def __init__(self, doc_list):
        self.doc_list = doc_list

    def __iter__(self):
        """Yield one list of tokens per file, in ``doc_list`` order.

        Progress is reported through a ``tqdm`` bar wrapped around ``doc_list``.
        """
        for fn in tqdm(self.doc_list):
            # Use a context manager so the file handle is closed as soon as the
            # text has been read, instead of relying on garbage collection.
            with open(fn) as fin:
                text = fin.read()
            document = list(tokenize(text))
            yield document
            
def save_documents(doc_list, prefix):
    """Tokenize the files in ``doc_list`` and store them as one gzipped JSON file.

    Parameters
    ----------
    doc_list : iterable
        File paths handed to ``DocumentExtractor``.
    prefix : object
        Value formatted into the output file name
        ``<DATA_FOLDER>/documents/<prefix>.json.gz``.

    The JSON payload is a list of token lists, UTF-8 encoded before compression.
    Returns ``None``.
    """
    # Materialize every tokenized document of this block in memory first.
    tokenized = [tokens for tokens in DocumentExtractor(doc_list)]

    out_path = join(DATA_FOLDER, 'documents/%s' % prefix + '.json.gz')
    with GzipFile(out_path, 'w') as gz_out:
        gz_out.write(json.dumps(tokenized).encode('utf-8'))

In [2]:
# Collect the inputs for the whole run. The items are later split into blocks and
# opened as file paths by DocumentExtractor.
# NOTE(review): get_all_docs is not defined in this notebook (presumably from `common`) -- confirm what it returns.
all_docs = get_all_docs(DATA_FOLDER)

In [4]:
# One joblib worker per CPU.
parallelizer = Parallel(n_jobs=cpu_count)

# Aim for roughly 1000 output archives. Clamp the block size to at least 1 so that
# a collection with fewer than 1000 documents does not request zero-sized blocks.
block_size = max(1, len(all_docs) // 1000)

# this iterator returns the functions to execute for each task:
# one save_documents(list_block, i) call per block, where i becomes the file prefix.
# NOTE(review): grouper is not defined in this notebook (presumably from `common`);
# confirm it does not pad the last block with filler values, since every item is opened as a file.
tasks_iterator = (
    delayed(save_documents)(list_block, i)
    for i, list_block in enumerate(grouper(block_size, all_docs))
)
result = parallelizer(tasks_iterator)

100%|██████████| 1194/1194 [00:38<00:00, 30.95it/s]
100%|██████████| 1194/1194 [00:40<00:00, 29.74it/s]
100%|██████████| 1194/1194 [00:41<00:00, 28.96it/s]
100%|██████████| 1194/1194 [00:43<00:00, 27.72it/s]
100%|██████████| 1194/1194 [00:40<00:00, 13.58it/s]
100%|██████████| 1194/1194 [00:41<00:00, 28.94it/s]
100%|██████████| 1194/1194 [00:39<00:00, 30.25it/s]
100%|██████████| 1194/1194 [00:40<00:00, 33.16it/s]
100%|██████████| 1194/1194 [00:38<00:00, 30.85it/s]
100%|██████████| 1194/1194 [00:42<00:00, 27.82it/s]
100%|██████████| 1194/1194 [00:40<00:00, 29.53it/s]
100%|██████████| 1194/1194 [00:39<00:00, 29.95it/s]
100%|██████████| 1194/1194 [00:39<00:00, 38.45it/s]
100%|██████████| 1194/1194 [00:39<00:00, 37.92it/s]
100%|██████████| 1194/1194 [00:41<00:00, 28.83it/s]
100%|██████████| 1194/1194 [00:40<00:00, 29.62it/s]
100%|██████████| 1194/1194 [00:39<00:00, 30.48it/s]
100%|██████████| 1194/1194 [00:45<00:00, 26.37it/s]
100%|██████████| 1194/1194 [00:39<00:00, 30.07it/s]
100%|███████

100%|██████████| 1194/1194 [00:44<00:00, 27.01it/s]
100%|██████████| 1194/1194 [00:41<00:00, 28.63it/s]
100%|██████████| 1194/1194 [00:44<00:00, 26.98it/s]
100%|██████████| 1194/1194 [00:43<00:00, 27.23it/s]
100%|██████████| 1194/1194 [00:42<00:00, 29.08it/s]
100%|██████████| 1194/1194 [00:41<00:00, 22.64it/s]
 73%|███████▎  | 871/1194 [00:32<00:14, 22.89it/s]]
100%|██████████| 1194/1194 [00:43<00:00, 27.38it/s]
100%|██████████| 1194/1194 [00:39<00:00, 30.07it/s]
100%|██████████| 1194/1194 [00:40<00:00, 31.70it/s]
100%|██████████| 1194/1194 [00:43<00:00, 27.68it/s]
100%|██████████| 1194/1194 [00:42<00:00, 27.88it/s]
100%|██████████| 1194/1194 [00:40<00:00, 29.21it/s]
100%|██████████| 1194/1194 [00:42<00:00, 28.09it/s]
100%|██████████| 1194/1194 [00:43<00:00, 41.49it/s]
100%|██████████| 1194/1194 [00:43<00:00, 27.44it/s]
100%|██████████| 1194/1194 [00:42<00:00, 21.50it/s]
100%|██████████| 1194/1194 [00:43<00:00, 27.37it/s]
100%|██████████| 1194/1194 [00:41<00:00, 28.88it/s]
100%|███████

100%|██████████| 1194/1194 [00:07<00:00, 161.53it/s]
 78%|███████▊  | 935/1194 [00:05<00:01, 143.54it/s]]
100%|██████████| 1194/1194 [00:07<00:00, 159.14it/s]
100%|██████████| 1194/1194 [00:07<00:00, 158.73it/s]
100%|██████████| 1194/1194 [00:07<00:00, 144.00it/s]
100%|██████████| 1194/1194 [00:07<00:00, 157.31it/s]
100%|██████████| 1194/1194 [00:07<00:00, 165.48it/s]
100%|██████████| 1194/1194 [00:07<00:00, 159.24it/s]
100%|██████████| 1194/1194 [00:07<00:00, 151.95it/s]
100%|██████████| 1194/1194 [00:07<00:00, 159.59it/s]
100%|██████████| 1194/1194 [00:07<00:00, 164.23it/s]
100%|██████████| 1194/1194 [00:07<00:00, 165.10it/s]
100%|██████████| 1194/1194 [00:07<00:00, 165.66it/s]
100%|██████████| 1194/1194 [00:07<00:00, 151.02it/s]
100%|██████████| 1194/1194 [00:07<00:00, 151.60it/s]
100%|██████████| 1194/1194 [00:07<00:00, 158.41it/s]
100%|██████████| 1194/1194 [00:07<00:00, 154.59it/s]
 76%|███████▌  | 906/1194 [00:05<00:01, 151.41it/s]]
100%|██████████| 1194/1194 [00:06<00:00, 171.8

100%|██████████| 1194/1194 [00:03<00:00, 319.10it/s]

100%|██████████| 1194/1194 [00:03<00:00, 348.87it/s]
100%|██████████| 1194/1194 [00:04<00:00, 286.27it/s]
100%|██████████| 1194/1194 [00:03<00:00, 328.24it/s]
100%|██████████| 1194/1194 [00:03<00:00, 302.60it/s]
100%|██████████| 1194/1194 [00:03<00:00, 306.38it/s]
100%|██████████| 1194/1194 [00:04<00:00, 255.42it/s]
100%|██████████| 1194/1194 [00:04<00:00, 271.24it/s]
100%|██████████| 1194/1194 [00:04<00:00, 280.61it/s]
100%|██████████| 1194/1194 [00:04<00:00, 274.85it/s]
100%|██████████| 1194/1194 [00:04<00:00, 243.60it/s]
100%|██████████| 1194/1194 [00:04<00:00, 265.51it/s]
100%|██████████| 1194/1194 [00:05<00:00, 234.93it/s]
100%|██████████| 1194/1194 [00:04<00:00, 259.05it/s]
100%|██████████| 1194/1194 [00:04<00:00, 251.71it/s]
 97%|█████████▋| 1157/1194 [00:03<00:00, 274.13it/s]
100%|██████████| 1194/1194 [00:04<00:00, 291.35it/s]
100%|██████████| 1194/1194 [00:04<00:00, 280.62it/s]
100%|██████████| 1194/1194 [00:04<00:00, 267.

# Calculate the bag-of-words (BoW) corpus

In [5]:
# List the gzipped document archives under DATA_FOLDER/documents, ordered by natural_keys.
# NOTE(review): natural_keys is not defined in this notebook; presumably a numeric-aware
# sort key so that 2.json.gz precedes 10.json.gz -- confirm.
ziped_files = sorted(glob(DATA_FOLDER + '/documents/*.gz'), key=natural_keys)

In [6]:
class DocumentCorpus(gensim.corpora.TextCorpus):
    """Text corpus backed by gzipped JSON archives of pre-tokenized documents.

    Parameters
    ----------
    ziped_files : list of str, optional
        Paths of the ``.gz`` archives. When given (and non-empty), the corpus
        dictionary is populated from all documents during construction with
        pruning disabled (``prune_at=None``). When omitted, the dictionary is
        left as created by the base class and ``input`` is ``None``.
    """

    def __init__(self, ziped_files=None):
        # The base class is initialized without input; the archive list is attached afterwards.
        super().__init__()
        self.input = ziped_files
        if not ziped_files:
            return
        self.dictionary.add_documents(self.get_texts(), prune_at=None)

    def get_texts(self):
        """Yield every document stored in the archives listed in ``self.input``.

        Each archive is read completely and parsed as JSON; its items are
        yielded one at a time, with a ``tqdm`` bar over the archives.
        """
        for archive in tqdm(self.input):  # one progress tick per archive
            with GzipFile(archive, 'r') as gz_in:
                raw = gz_in.read()
            yield from json.loads(raw)
                
                
def save_doc_corpus(ziped_files, dir_name, prefix):
    """Build a ``DocumentCorpus`` and persist its dictionary and Matrix Market corpus.

    Parameters
    ----------
    ziped_files : list of str
        Archives passed to ``DocumentCorpus``.
    dir_name : str
        Directory that receives both output files.
    prefix : str
        File-name prefix: ``<prefix>.dict`` and ``<prefix>_corpus.mm``.

    Returns
    -------
    tuple
        ``(dictionary_path, corpus_path)`` of the files written.
    """
    doc_corpus = DocumentCorpus(ziped_files)

    dictionary_path, matrix_path = (
        join(dir_name, template % prefix)
        for template in ('%s.dict', '%s_corpus.mm')
    )

    doc_corpus.dictionary.save(dictionary_path)
    corpora.MmCorpus.serialize(matrix_path, doc_corpus)

    return dictionary_path, matrix_path
                

In [7]:
# Build the unfiltered ("pure") dictionary and corpus and write both under DATA_FOLDER.
# The stored output shows two full progress bars over the archives, so expect two long passes.
save_doc_corpus(ziped_files, DATA_FOLDER, 'pure')

100%|██████████| 1001/1001 [21:12<00:00,  1.01it/s]
100%|██████████| 1001/1001 [23:43<00:00,  1.12s/it]


('../data/pure.dict', '../data/pure_corpus.mm')

# Alternative approach: reuse the saved dictionary and only re-serialize the corpus

In [6]:
# Load the dictionary previously saved as pure.dict instead of rebuilding it from the archives.
dictionary = corpora.Dictionary.load(join(DATA_FOLDER, 'pure.dict'))

In [7]:
# Constructing without archives skips the dictionary-building pass in DocumentCorpus.__init__;
# the archive list and the already-saved dictionary are attached by hand instead.
corpus = DocumentCorpus()
corpus.input = ziped_files
corpus.dictionary = dictionary

# Same output path that save_doc_corpus(..., 'pure') uses, so this overwrites pure_corpus.mm.
corp_name = join(DATA_FOLDER, '%s_corpus.mm' % 'pure')
corpora.MmCorpus.serialize(corp_name, corpus)

100%|██████████| 1001/1001 [23:27<00:00,  1.53s/it]


# Filter out tokens that appear in fewer than 5 documents (absolute number) or in more than 25% of documents, keeping at most 30,000 tokens

In [10]:
# Filter the dictionary. Work on a deep copy so that old_dict keeps the original
# token ids; both dictionaries are needed to map old ids to new ids afterwards.
old_dict = corpora.Dictionary.load(join(DATA_FOLDER, 'pure.dict'))
new_dict = copy.deepcopy(old_dict)
# Requested thresholds: at least 5 documents, at most 25% of documents, at most 30000 tokens.
# NOTE(review): the log output stored with this cell reports "15.0%" and 100000 tokens,
# so it comes from a run with different arguments -- re-run the cell to refresh it.
new_dict.filter_extremes(no_below=5, no_above=0.25, keep_n=30000)
new_dict.save(join(DATA_FOLDER, 'filtered.dict'))

2017-07-18 17:16:53,381 : INFO : loading Dictionary object from ../data/pure.dict
2017-07-18 17:16:55,599 : INFO : loaded ../data/pure.dict
2017-07-18 17:17:09,234 : INFO : discarding 3396256 tokens: [('нефтешламовой', 34), ('вода', 270238), ('нефтехимперерабатывающий', 3), ('направлять', 216990), ('среда', 214378), ('азотно-фосфорные', 26), ('недопать', 38), ('технический', 443065), ('воздействие', 200626), ('продукт', 214096)]...
2017-07-18 17:17:09,235 : INFO : keeping 100000 tokens which were in no less than 5 and no more than 179200 (=15.0%) documents
2017-07-18 17:17:09,752 : INFO : resulting dictionary: Dictionary(100000 unique tokens: ['очистка', 'донный', 'отложение', 'накопитель', 'переработка']...)
2017-07-18 17:17:09,842 : INFO : saving Dictionary object under ../data/filtered.dict, separately None
2017-07-18 17:17:09,889 : INFO : saved ../data/filtered.dict


In [11]:
# Sanity check: show the summary (token count and a few sample tokens) of the filtered dictionary.
print(new_dict)

Dictionary(100000 unique tokens: ['очистка', 'донный', 'отложение', 'накопитель', 'переработка']...)


In [12]:
# Now transform the corpus: open the unfiltered Matrix Market corpus from disk.
corpus = corpora.MmCorpus(join(DATA_FOLDER, 'pure_corpus.mm'))
# Map each surviving token's id in the old dictionary to its id in the filtered one;
# tokens that were filtered out get no entry in the mapping.
old2new = {old_dict.token2id[token]:new_id for new_id, token in new_dict.iteritems()}
# Wrapper that re-indexes documents with the mapping when applied as vt[corpus].
vt = VocabTransform(old2new)

2017-07-18 17:17:10,045 : INFO : loaded corpus index from ../data/pure_corpus.mm.index
2017-07-18 17:17:10,046 : INFO : initializing corpus reader from ../data/pure_corpus.mm
2017-07-18 17:17:10,047 : INFO : accepted corpus with 1194670 documents, 3496256 features, 192176433 non-zero entries


In [13]:
# Write the re-indexed corpus to filtered_corpus.mm, passing the filtered dictionary as id2word.
# The stored log shows this streams all ~1.19M documents and takes several minutes.
corpora.MmCorpus.serialize(join(DATA_FOLDER, 'filtered_corpus.mm'), vt[corpus], id2word=new_dict)

2017-07-18 17:17:13,586 : INFO : storing corpus in Matrix Market format to ../data/filtered_corpus.mm
2017-07-18 17:17:13,599 : INFO : saving sparse matrix to ../data/filtered_corpus.mm
2017-07-18 17:17:13,601 : INFO : PROGRESS: saving document #0
2017-07-18 17:17:14,773 : INFO : PROGRESS: saving document #1000
2017-07-18 17:17:15,970 : INFO : PROGRESS: saving document #2000
2017-07-18 17:17:17,178 : INFO : PROGRESS: saving document #3000
2017-07-18 17:17:18,350 : INFO : PROGRESS: saving document #4000
2017-07-18 17:17:19,469 : INFO : PROGRESS: saving document #5000
2017-07-18 17:17:20,622 : INFO : PROGRESS: saving document #6000
2017-07-18 17:17:21,756 : INFO : PROGRESS: saving document #7000
2017-07-18 17:17:22,928 : INFO : PROGRESS: saving document #8000
2017-07-18 17:17:24,105 : INFO : PROGRESS: saving document #9000
2017-07-18 17:17:25,318 : INFO : PROGRESS: saving document #10000
2017-07-18 17:17:26,535 : INFO : PROGRESS: saving document #11000
2017-07-18 17:17:27,740 : INFO : PR

2017-07-18 17:19:38,092 : INFO : PROGRESS: saving document #122000
2017-07-18 17:19:39,197 : INFO : PROGRESS: saving document #123000
2017-07-18 17:19:40,358 : INFO : PROGRESS: saving document #124000
2017-07-18 17:19:41,434 : INFO : PROGRESS: saving document #125000
2017-07-18 17:19:42,573 : INFO : PROGRESS: saving document #126000
2017-07-18 17:19:43,769 : INFO : PROGRESS: saving document #127000
2017-07-18 17:19:44,896 : INFO : PROGRESS: saving document #128000
2017-07-18 17:19:46,023 : INFO : PROGRESS: saving document #129000
2017-07-18 17:19:47,136 : INFO : PROGRESS: saving document #130000
2017-07-18 17:19:48,244 : INFO : PROGRESS: saving document #131000
2017-07-18 17:19:49,415 : INFO : PROGRESS: saving document #132000
2017-07-18 17:19:50,577 : INFO : PROGRESS: saving document #133000
2017-07-18 17:19:51,993 : INFO : PROGRESS: saving document #134000
2017-07-18 17:19:53,147 : INFO : PROGRESS: saving document #135000
2017-07-18 17:19:54,267 : INFO : PROGRESS: saving document #13

2017-07-18 17:22:02,101 : INFO : PROGRESS: saving document #245000
2017-07-18 17:22:03,305 : INFO : PROGRESS: saving document #246000
2017-07-18 17:22:04,470 : INFO : PROGRESS: saving document #247000
2017-07-18 17:22:05,627 : INFO : PROGRESS: saving document #248000
2017-07-18 17:22:06,780 : INFO : PROGRESS: saving document #249000
2017-07-18 17:22:07,975 : INFO : PROGRESS: saving document #250000
2017-07-18 17:22:09,145 : INFO : PROGRESS: saving document #251000
2017-07-18 17:22:10,355 : INFO : PROGRESS: saving document #252000
2017-07-18 17:22:11,533 : INFO : PROGRESS: saving document #253000
2017-07-18 17:22:12,768 : INFO : PROGRESS: saving document #254000
2017-07-18 17:22:13,919 : INFO : PROGRESS: saving document #255000
2017-07-18 17:22:15,145 : INFO : PROGRESS: saving document #256000
2017-07-18 17:22:16,368 : INFO : PROGRESS: saving document #257000
2017-07-18 17:22:17,566 : INFO : PROGRESS: saving document #258000
2017-07-18 17:22:18,751 : INFO : PROGRESS: saving document #25

2017-07-18 17:24:26,244 : INFO : PROGRESS: saving document #368000
2017-07-18 17:24:27,781 : INFO : PROGRESS: saving document #369000
2017-07-18 17:24:28,931 : INFO : PROGRESS: saving document #370000
2017-07-18 17:24:30,114 : INFO : PROGRESS: saving document #371000
2017-07-18 17:24:31,356 : INFO : PROGRESS: saving document #372000
2017-07-18 17:24:32,550 : INFO : PROGRESS: saving document #373000
2017-07-18 17:24:33,736 : INFO : PROGRESS: saving document #374000
2017-07-18 17:24:34,892 : INFO : PROGRESS: saving document #375000
2017-07-18 17:24:36,140 : INFO : PROGRESS: saving document #376000
2017-07-18 17:24:37,361 : INFO : PROGRESS: saving document #377000
2017-07-18 17:24:38,547 : INFO : PROGRESS: saving document #378000
2017-07-18 17:24:39,790 : INFO : PROGRESS: saving document #379000
2017-07-18 17:24:41,029 : INFO : PROGRESS: saving document #380000
2017-07-18 17:24:42,221 : INFO : PROGRESS: saving document #381000
2017-07-18 17:24:43,430 : INFO : PROGRESS: saving document #38

2017-07-18 17:26:54,462 : INFO : PROGRESS: saving document #491000
2017-07-18 17:26:55,732 : INFO : PROGRESS: saving document #492000
2017-07-18 17:26:57,001 : INFO : PROGRESS: saving document #493000
2017-07-18 17:26:58,301 : INFO : PROGRESS: saving document #494000
2017-07-18 17:26:59,529 : INFO : PROGRESS: saving document #495000
2017-07-18 17:27:00,753 : INFO : PROGRESS: saving document #496000
2017-07-18 17:27:02,102 : INFO : PROGRESS: saving document #497000
2017-07-18 17:27:03,295 : INFO : PROGRESS: saving document #498000
2017-07-18 17:27:04,442 : INFO : PROGRESS: saving document #499000
2017-07-18 17:27:05,590 : INFO : PROGRESS: saving document #500000
2017-07-18 17:27:06,731 : INFO : PROGRESS: saving document #501000
2017-07-18 17:27:07,946 : INFO : PROGRESS: saving document #502000
2017-07-18 17:27:09,104 : INFO : PROGRESS: saving document #503000
2017-07-18 17:27:10,282 : INFO : PROGRESS: saving document #504000
2017-07-18 17:27:11,434 : INFO : PROGRESS: saving document #50

2017-07-18 17:28:29,289 : INFO : PROGRESS: saving document #614000
2017-07-18 17:28:29,524 : INFO : PROGRESS: saving document #615000
2017-07-18 17:28:29,753 : INFO : PROGRESS: saving document #616000
2017-07-18 17:28:29,974 : INFO : PROGRESS: saving document #617000
2017-07-18 17:28:30,198 : INFO : PROGRESS: saving document #618000
2017-07-18 17:28:30,420 : INFO : PROGRESS: saving document #619000
2017-07-18 17:28:30,648 : INFO : PROGRESS: saving document #620000
2017-07-18 17:28:30,880 : INFO : PROGRESS: saving document #621000
2017-07-18 17:28:31,113 : INFO : PROGRESS: saving document #622000
2017-07-18 17:28:31,341 : INFO : PROGRESS: saving document #623000
2017-07-18 17:28:31,571 : INFO : PROGRESS: saving document #624000
2017-07-18 17:28:31,808 : INFO : PROGRESS: saving document #625000
2017-07-18 17:28:32,045 : INFO : PROGRESS: saving document #626000
2017-07-18 17:28:32,261 : INFO : PROGRESS: saving document #627000
2017-07-18 17:28:32,484 : INFO : PROGRESS: saving document #62

2017-07-18 17:28:57,987 : INFO : PROGRESS: saving document #737000
2017-07-18 17:28:58,210 : INFO : PROGRESS: saving document #738000
2017-07-18 17:28:58,440 : INFO : PROGRESS: saving document #739000
2017-07-18 17:28:58,666 : INFO : PROGRESS: saving document #740000
2017-07-18 17:28:58,903 : INFO : PROGRESS: saving document #741000
2017-07-18 17:28:59,152 : INFO : PROGRESS: saving document #742000
2017-07-18 17:28:59,386 : INFO : PROGRESS: saving document #743000
2017-07-18 17:28:59,619 : INFO : PROGRESS: saving document #744000
2017-07-18 17:28:59,847 : INFO : PROGRESS: saving document #745000
2017-07-18 17:29:00,070 : INFO : PROGRESS: saving document #746000
2017-07-18 17:29:00,300 : INFO : PROGRESS: saving document #747000
2017-07-18 17:29:00,518 : INFO : PROGRESS: saving document #748000
2017-07-18 17:29:00,743 : INFO : PROGRESS: saving document #749000
2017-07-18 17:29:00,976 : INFO : PROGRESS: saving document #750000
2017-07-18 17:29:01,204 : INFO : PROGRESS: saving document #75

2017-07-18 17:29:27,844 : INFO : PROGRESS: saving document #860000
2017-07-18 17:29:28,084 : INFO : PROGRESS: saving document #861000
2017-07-18 17:29:28,328 : INFO : PROGRESS: saving document #862000
2017-07-18 17:29:28,560 : INFO : PROGRESS: saving document #863000
2017-07-18 17:29:28,798 : INFO : PROGRESS: saving document #864000
2017-07-18 17:29:29,023 : INFO : PROGRESS: saving document #865000
2017-07-18 17:29:29,257 : INFO : PROGRESS: saving document #866000
2017-07-18 17:29:29,504 : INFO : PROGRESS: saving document #867000
2017-07-18 17:29:29,731 : INFO : PROGRESS: saving document #868000
2017-07-18 17:29:29,964 : INFO : PROGRESS: saving document #869000
2017-07-18 17:29:30,199 : INFO : PROGRESS: saving document #870000
2017-07-18 17:29:30,431 : INFO : PROGRESS: saving document #871000
2017-07-18 17:29:30,660 : INFO : PROGRESS: saving document #872000
2017-07-18 17:29:30,887 : INFO : PROGRESS: saving document #873000
2017-07-18 17:29:31,118 : INFO : PROGRESS: saving document #87

2017-07-18 17:29:56,843 : INFO : PROGRESS: saving document #983000
2017-07-18 17:29:57,063 : INFO : PROGRESS: saving document #984000
2017-07-18 17:29:57,294 : INFO : PROGRESS: saving document #985000
2017-07-18 17:29:57,531 : INFO : PROGRESS: saving document #986000
2017-07-18 17:29:57,756 : INFO : PROGRESS: saving document #987000
2017-07-18 17:29:57,991 : INFO : PROGRESS: saving document #988000
2017-07-18 17:29:58,216 : INFO : PROGRESS: saving document #989000
2017-07-18 17:29:58,447 : INFO : PROGRESS: saving document #990000
2017-07-18 17:29:58,676 : INFO : PROGRESS: saving document #991000
2017-07-18 17:29:58,902 : INFO : PROGRESS: saving document #992000
2017-07-18 17:29:59,140 : INFO : PROGRESS: saving document #993000
2017-07-18 17:29:59,370 : INFO : PROGRESS: saving document #994000
2017-07-18 17:29:59,596 : INFO : PROGRESS: saving document #995000
2017-07-18 17:29:59,832 : INFO : PROGRESS: saving document #996000
2017-07-18 17:30:00,051 : INFO : PROGRESS: saving document #99

2017-07-18 17:30:33,078 : INFO : PROGRESS: saving document #1104000
2017-07-18 17:30:33,233 : INFO : PROGRESS: saving document #1105000
2017-07-18 17:30:33,391 : INFO : PROGRESS: saving document #1106000
2017-07-18 17:30:33,549 : INFO : PROGRESS: saving document #1107000
2017-07-18 17:30:33,708 : INFO : PROGRESS: saving document #1108000
2017-07-18 17:30:33,865 : INFO : PROGRESS: saving document #1109000
2017-07-18 17:30:34,032 : INFO : PROGRESS: saving document #1110000
2017-07-18 17:30:34,193 : INFO : PROGRESS: saving document #1111000
2017-07-18 17:30:34,354 : INFO : PROGRESS: saving document #1112000
2017-07-18 17:30:34,515 : INFO : PROGRESS: saving document #1113000
2017-07-18 17:30:34,673 : INFO : PROGRESS: saving document #1114000
2017-07-18 17:30:34,831 : INFO : PROGRESS: saving document #1115000
2017-07-18 17:30:34,992 : INFO : PROGRESS: saving document #1116000
2017-07-18 17:30:35,152 : INFO : PROGRESS: saving document #1117000
2017-07-18 17:30:35,311 : INFO : PROGRESS: savin