In [1]:
import pandas as pd
import numpy as np
from gensim import corpora
from gensim.models import VocabTransform

from common import *
import os
import glob
from joblib import Parallel, delayed
import itertools
import multiprocessing
import copy 

from gzip import GzipFile
import json
from glob import glob

# Number of CPUs reported by the system; used below as the joblib `n_jobs` value.
cpu_count = multiprocessing.cpu_count()
# Root data directory, given relative to the notebook's working directory.
# Note the trailing slash when concatenating paths by hand.
DATA_FOLDER = '../data/'

# Extract documents and tokenize

In [2]:
class DocumentExtractor(object):
    """Iterable that reads and tokenizes a list of text files one at a time.

    Parameters
    ----------
    doc_list : sequence of str
        Paths of the text files to read, in iteration order.

    Notes
    -----
    ``tqdm`` and ``tokenize`` are not defined in this notebook; they are
    presumably provided by ``from common import *``.
    """

    def __init__(self, doc_list):
        self.doc_list = doc_list

    def __iter__(self):
        """Yield one list of tokens per file in ``doc_list``."""
        for fn in tqdm(self.doc_list):
            # Use a context manager so each handle is closed as soon as the
            # file has been read, instead of leaking one open file per document.
            with open(fn) as fin:
                text = fin.read()
            document = list(tokenize(text))
            yield document
            
def save_documents(doc_list, prefix):
    """Tokenize the files in ``doc_list`` and store them as one gzipped JSON file.

    The output goes to ``<DATA_FOLDER>/documents/<prefix>.json.gz`` and holds a
    UTF-8 encoded JSON array with one list of tokens per input file.

    Parameters
    ----------
    doc_list : sequence of str
        Paths of the text files to tokenize.
    prefix : object
        Base name of the output file; formatted with ``%s``.
    """
    docs = list(DocumentExtractor(doc_list))

    fname = join(DATA_FOLDER, 'documents/%s' % prefix + '.json.gz')
    # Create the target directory on demand. ``exist_ok`` keeps this safe when
    # several parallel workers reach this point at the same time.
    os.makedirs(os.path.dirname(fname), exist_ok=True)

    with GzipFile(fname, 'w') as fout:
        json_bytes = json.dumps(docs).encode('utf-8')
        fout.write(json_bytes)

In [3]:
# Collect the input documents found under DATA_FOLDER.
# NOTE(review): get_all_docs is not defined in this notebook (presumably it comes
# from `common`); the chunking cell below needs len() and iteration on the result.
all_docs = get_all_docs(DATA_FOLDER)

In [None]:
# Tokenize the corpus in parallel with one joblib job per reported CPU.
parallelizer = Parallel(n_jobs=cpu_count)

# Aim for roughly 1000 blocks. Clamp the block size to at least 1 so that a
# corpus with fewer than 1000 documents does not end up with a block size of 0.
block_size = max(1, len(all_docs) // 1000)

# this iterator returns the functions to execute for each task;
# the block index doubles as the output file prefix.
# NOTE(review): grouper is defined outside this notebook. If it pads the last
# block (e.g. with None), save_documents will try to open the padding - confirm.
tasks_iterator = (delayed(save_documents)(list_block, i)
                  for i, list_block in enumerate(grouper(block_size, all_docs)))
result = parallelizer(tasks_iterator)

100%|██████████| 1194/1194 [00:14<00:00, 83.67it/s] 
100%|██████████| 1194/1194 [00:15<00:00, 78.98it/s] 
100%|██████████| 1194/1194 [00:15<00:00, 78.23it/s]
100%|██████████| 1194/1194 [00:15<00:00, 77.95it/s]
100%|██████████| 1194/1194 [00:15<00:00, 77.80it/s]
100%|██████████| 1194/1194 [00:16<00:00, 72.25it/s]
100%|██████████| 1194/1194 [00:16<00:00, 71.37it/s]
100%|██████████| 1194/1194 [00:16<00:00, 69.23it/s]
100%|██████████| 1194/1194 [00:17<00:00, 60.10it/s]
100%|██████████| 1194/1194 [00:17<00:00, 68.52it/s]
100%|██████████| 1194/1194 [00:18<00:00, 63.92it/s]
100%|██████████| 1194/1194 [00:18<00:00, 54.77it/s]
100%|██████████| 1194/1194 [00:20<00:00, 59.32it/s]
100%|██████████| 1194/1194 [00:23<00:00, 51.79it/s] 
  9%|▉         | 108/1194 [00:01<00:16, 65.18it/s]]
100%|██████████| 1194/1194 [00:24<00:00, 48.25it/s]
100%|██████████| 1194/1194 [00:25<00:00, 47.76it/s]
100%|██████████| 1194/1194 [00:25<00:00, 46.10it/s]
100%|██████████| 1194/1194 [00:26<00:00, 45.23it/s]
 91%|████

100%|██████████| 1194/1194 [00:16<00:00, 69.79it/s]
100%|██████████| 1194/1194 [00:17<00:00, 68.68it/s]
100%|██████████| 1194/1194 [00:16<00:00, 74.02it/s]
 21%|██▏       | 254/1194 [00:03<00:11, 78.80it/s]]
 10%|▉         | 117/1194 [00:01<00:12, 89.08it/s]]
100%|██████████| 1194/1194 [00:16<00:00, 73.34it/s]
100%|██████████| 1194/1194 [00:16<00:00, 71.67it/s]
 55%|█████▌    | 661/1194 [00:08<00:10, 52.99it/s]]
 83%|████████▎ | 994/1194 [00:13<00:02, 93.56it/s] 
 83%|████████▎ | 989/1194 [00:14<00:03, 66.83it/s]]
 78%|███████▊  | 928/1194 [00:13<00:06, 43.18it/s]]
100%|██████████| 1194/1194 [00:16<00:00, 70.56it/s]
100%|██████████| 1194/1194 [00:16<00:00, 72.63it/s]
 21%|██▏       | 256/1194 [00:03<00:14, 63.59it/s]]]
100%|██████████| 1194/1194 [00:14<00:00, 80.24it/s] 
100%|██████████| 1194/1194 [00:16<00:00, 71.54it/s]
100%|██████████| 1194/1194 [00:16<00:00, 74.57it/s]
100%|██████████| 1194/1194 [00:16<00:00, 70.53it/s]]
100%|██████████| 1194/1194 [00:16<00:00, 72.99it/s] 
100%|███

100%|██████████| 1194/1194 [00:15<00:00, 76.98it/s]
 97%|█████████▋| 1154/1194 [00:15<00:00, 77.05it/s]
 48%|████▊     | 568/1194 [00:07<00:10, 61.25it/s]]
100%|██████████| 1194/1194 [00:15<00:00, 75.81it/s]
100%|██████████| 1194/1194 [00:15<00:00, 77.52it/s]
 85%|████████▍ | 1009/1194 [00:14<00:02, 72.29it/s]
100%|██████████| 1194/1194 [00:15<00:00, 64.06it/s]
 54%|█████▍    | 648/1194 [00:08<00:07, 73.92it/s]]
 23%|██▎       | 273/1194 [00:03<00:15, 57.60it/s]]
100%|██████████| 1194/1194 [00:18<00:00, 66.02it/s]
 87%|████████▋ | 1033/1194 [00:14<00:02, 65.16it/s]
 33%|███▎      | 397/1194 [00:05<00:10, 76.49it/s]]
100%|██████████| 1194/1194 [00:16<00:00, 74.22it/s]
 76%|███████▌  | 908/1194 [00:13<00:04, 68.85it/s]]]
100%|██████████| 1194/1194 [00:16<00:00, 70.77it/s] 
 18%|█▊        | 213/1194 [00:02<00:12, 77.77it/s]]
100%|██████████| 1194/1194 [00:16<00:00, 72.44it/s]
 64%|██████▎   | 761/1194 [00:11<00:13, 31.19it/s]] 
 54%|█████▎    | 641/1194 [00:08<00:10, 51.22it/s]]
100%|████

# Compute the bag-of-words (BoW) corpus

In [2]:
# Gzipped token files from the documents folder, ordered with natural_keys
# (defined outside this notebook; presumably a natural/numeric sort key).
# os.path.join avoids the doubled separator that plain concatenation produced,
# because DATA_FOLDER already ends with a slash.
flist = sorted(glob(os.path.join(DATA_FOLDER, 'documents', '*.gz')), key=natural_keys)

In [5]:
class DocumentCorpus(corpora.TextCorpus):
    """Text corpus backed by gzipped JSON files of pre-tokenized documents.

    Each input file is expected to contain a JSON array whose items are the
    token lists of individual documents. ``self.input`` is iterated as the
    collection of file paths (presumably the value passed to the constructor).

    The base class is referenced through the explicitly imported ``corpora``
    module, because the bare name ``gensim`` is never imported in this notebook.
    """

    def get_texts(self):
        """Yield every tokenized document from every file in ``self.input``."""
        for fn in tqdm(self.input):  # one progress tick per archive
            with GzipFile(fn, 'r') as myzip:
                text = myzip.read()
            # The archive is fully read and closed before parsing.
            docs = json.loads(text)
            for doc in docs:
                yield doc

                
def save_doc_corpus(ziped_files, dir_name, prefix):
    """Build a DocumentCorpus and persist its dictionary and Matrix Market form.

    Parameters
    ----------
    ziped_files : iterable of str
        Paths of the gzipped JSON document files handed to ``DocumentCorpus``.
    dir_name : str
        Directory that receives both output files.
    prefix : str
        Name prefix; outputs are ``<prefix>.dict`` and ``<prefix>_corpus.mm``.

    Returns
    -------
    tuple of (str, str)
        Path of the saved dictionary and path of the serialized corpus.
    """
    doc_corpus = DocumentCorpus(ziped_files)

    output_paths = (
        join(dir_name, '%s.dict' % prefix),
        join(dir_name, '%s_corpus.mm' % prefix),
    )
    dictionary_path, corpus_path = output_paths

    # Dictionary first, then the corpus itself in .mm format.
    doc_corpus.dictionary.save(dictionary_path)
    corpora.MmCorpus.serialize(corpus_path, doc_corpus)

    return output_paths
                

In [6]:
# Build the unfiltered corpus from all document archives; writes 'pure.dict' and
# 'pure_corpus.mm' into DATA_FOLDER and displays the two resulting paths.
save_doc_corpus(flist, DATA_FOLDER, 'pure')

100%|██████████| 1001/1001 [36:04<00:00,  2.22s/it]
 64%|██████▎   | 637/1001 [18:42<23:27,  3.87s/it]

KeyboardInterrupt: 

### Filter out tokens that appear in fewer than 3 documents (absolute number) or in more than 50% of documents

In [None]:
%%time
# Filter the dictionary on a copy, so the original token ids stay available
# for remapping the corpus afterwards.
# NOTE(review): 'old.dict' is not written by any cell visible in this notebook
# (the corpus-building cell saves 'pure.dict') - confirm where it comes from.
old_dict_path = join(DATA_FOLDER, 'old.dict')
filtered_dict_path = join(DATA_FOLDER, 'filtered.dict')

old_dict = corpora.Dictionary.load(old_dict_path)
new_dict = copy.deepcopy(old_dict)
# no_below=3 sets the minimum document count; keep_n=None lifts the size cap.
# no_above is left at the library default.
new_dict.filter_extremes(no_below=3, keep_n=None)
new_dict.save(filtered_dict_path)

In [None]:
%%time
# Now transform the corpus: load the serialized corpus and prepare the id remapping.
corpus_path = join(DATA_FOLDER, 'corpus.mm')
corpus = corpora.MmCorpus(corpus_path)

# For every token kept in the filtered dictionary, map its id in the old
# dictionary to its id in the new one.
old2new = {}
for new_id, token in new_dict.iteritems():
    old2new[old_dict.token2id[token]] = new_id

vt = VocabTransform(old2new)

In [None]:
%%time
# Write the corpus, remapped through the vocabulary transform, in Matrix Market format.
filtered_corpus_path = join(DATA_FOLDER, 'filtered_corpus.mm')
remapped_corpus = vt[corpus]
corpora.MmCorpus.serialize(filtered_corpus_path, remapped_corpus, id2word=new_dict)

In [55]:
# Copy the corpus files, dictionaries and the document list to ~/Yandex.Disk.
# NOTE(review): the recorded output shows that none of the three source patterns
# matched any file when this cell was last run - verify the paths before relying on it.
!cp ../data/*corpus* ~/Yandex.Disk
!cp ../data/*.dict ~/Yandex.Disk
!cp ../data/all_docs.txt ~/Yandex.Disk

cp: cannot stat '../data/*corpus*': No such file or directory
cp: cannot stat '../data/*.dict': No such file or directory
cp: cannot stat '../data/all_docs.txt': No such file or directory
