In [1]:
import pandas as pd
import numpy as np
from gensim import corpora
from gensim.models import VocabTransform

from common import *
import os
import glob
from joblib import Parallel, delayed
import itertools
import multiprocessing
import copy 

from gzip import GzipFile
import json
from glob import glob

# Number of CPUs reported by the OS; used below as the joblib worker count.
cpu_count = multiprocessing.cpu_count()
# Root folder (relative path) for the input documents and every generated artifact.
DATA_FOLDER = '../data/'

Using TensorFlow backend.


# Extract documents and tokenize

In [2]:
class DocumentExtractor(object):
    """Iterable that lazily reads and tokenizes a list of text files.

    Every iteration pass opens each path in ``doc_list`` in order, reads the
    whole file and yields the tokens produced by ``tokenize`` as a list.
    ``tokenize`` and ``tqdm`` are not defined in this notebook; presumably
    they are provided by ``from common import *``.
    """

    def __init__(self, doc_list):
        """Remember the sequence of file paths to extract documents from."""
        self.doc_list = doc_list

    def __iter__(self):
        """Yield one token list per file, reporting progress through ``tqdm``."""
        for fn in tqdm(self.doc_list):
            # Close each handle deterministically: many files are read per
            # pass, so open descriptors must not be left to the garbage
            # collector.
            with open(fn) as fin:
                text = fin.read()
            document = list(tokenize(text))
            yield document
            
def save_documents(doc_list, prefix):
    """Tokenize the files in ``doc_list`` and store them as one gzipped JSON file.

    The output file is ``documents/<prefix>.json.gz`` under ``DATA_FOLDER``
    and holds a JSON array with one token list per input file, encoded as
    UTF-8.

    Args:
        doc_list: paths of the text files to tokenize.
        prefix: value formatted into the output file name (the block index
            when called from the parallel extraction cell).
    """
    # The whole block is materialized in memory before it is serialized.
    docs = list(DocumentExtractor(doc_list))

    fname = join(DATA_FOLDER, 'documents/%s' % prefix + '.json.gz')

    # GzipFile does not create missing directories; make sure the target
    # folder exists. exist_ok keeps this safe when several workers race.
    out_dir = os.path.dirname(fname)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with GzipFile(fname, 'w') as fout:
        json_str = json.dumps(docs)
        json_bytes = json_str.encode('utf-8')

        fout.write(json_bytes)

In [3]:
# Collect the documents to process. get_all_docs is not defined in this
# notebook (presumably provided by `from common import *`); the result is used
# below as a sized collection of file paths.
all_docs = get_all_docs(DATA_FOLDER)

In [None]:
parallelizer = Parallel(n_jobs=cpu_count)

# Aim for roughly 1000 blocks. Integer division yields 0 when there are fewer
# than 1000 documents, so clamp the block size to at least one document.
block_size = max(1, len(all_docs) // 1000)

# this iterator returns the functions to execute for each task;
# the block number i becomes the output file prefix used by save_documents.
# NOTE(review): grouper is not defined in this notebook (presumably from
# `common`); if it pads the last block with a fill value, save_documents will
# receive entries that are not paths -- confirm.
tasks_iterator = (delayed(save_documents)(list_block, i)
                  for i, list_block in enumerate(grouper(block_size, all_docs)))
result = parallelizer(tasks_iterator)

100%|██████████| 1194/1194 [00:38<00:00, 30.95it/s]
100%|██████████| 1194/1194 [00:40<00:00, 29.74it/s]
100%|██████████| 1194/1194 [00:41<00:00, 28.96it/s]
100%|██████████| 1194/1194 [00:43<00:00, 27.72it/s]
100%|██████████| 1194/1194 [00:40<00:00, 13.58it/s]
100%|██████████| 1194/1194 [00:41<00:00, 28.94it/s]
100%|██████████| 1194/1194 [00:39<00:00, 30.25it/s]
100%|██████████| 1194/1194 [00:40<00:00, 33.16it/s]
100%|██████████| 1194/1194 [00:38<00:00, 30.85it/s]
100%|██████████| 1194/1194 [00:42<00:00, 27.82it/s]
100%|██████████| 1194/1194 [00:40<00:00, 29.53it/s]
100%|██████████| 1194/1194 [00:39<00:00, 29.95it/s]
100%|██████████| 1194/1194 [00:39<00:00, 38.45it/s]
100%|██████████| 1194/1194 [00:39<00:00, 37.92it/s]
100%|██████████| 1194/1194 [00:41<00:00, 28.83it/s]
100%|██████████| 1194/1194 [00:40<00:00, 29.62it/s]
100%|██████████| 1194/1194 [00:39<00:00, 30.48it/s]
100%|██████████| 1194/1194 [00:45<00:00, 26.37it/s]
100%|██████████| 1194/1194 [00:39<00:00, 30.07it/s]
100%|███████

100%|██████████| 1194/1194 [00:48<00:00, 24.39it/s]
100%|██████████| 1194/1194 [00:42<00:00, 34.06it/s]
100%|██████████| 1194/1194 [00:40<00:00, 29.21it/s]
100%|██████████| 1194/1194 [00:42<00:00, 27.98it/s]
100%|██████████| 1194/1194 [00:43<00:00, 11.05it/s]
100%|██████████| 1194/1194 [00:42<00:00, 27.94it/s]
100%|██████████| 1194/1194 [00:39<00:00, 23.53it/s]
100%|██████████| 1194/1194 [00:37<00:00, 30.56it/s]
100%|██████████| 1194/1194 [00:41<00:00, 28.95it/s]
100%|██████████| 1194/1194 [00:39<00:00, 30.03it/s]
100%|██████████| 1194/1194 [00:38<00:00, 31.36it/s]
100%|██████████| 1194/1194 [00:39<00:00, 30.52it/s]
100%|██████████| 1194/1194 [00:45<00:00, 26.37it/s]
100%|██████████| 1194/1194 [00:56<00:00, 18.48it/s]
100%|██████████| 1194/1194 [01:00<00:00, 19.32it/s]
100%|██████████| 1194/1194 [00:59<00:00, 19.93it/s]
100%|██████████| 1194/1194 [01:00<00:00, 24.49it/s]
100%|██████████| 1194/1194 [01:00<00:00, 19.60it/s]
 74%|███████▍  | 887/1194 [00:45<00:14, 21.23it/s]]
100%|███████

100%|██████████| 1194/1194 [00:44<00:00, 27.01it/s]
100%|██████████| 1194/1194 [00:41<00:00, 28.63it/s]
100%|██████████| 1194/1194 [00:44<00:00, 26.98it/s]
100%|██████████| 1194/1194 [00:43<00:00, 27.23it/s]
100%|██████████| 1194/1194 [00:42<00:00, 29.08it/s]
100%|██████████| 1194/1194 [00:41<00:00, 22.64it/s]
 73%|███████▎  | 871/1194 [00:32<00:14, 22.89it/s]]
100%|██████████| 1194/1194 [00:43<00:00, 27.38it/s]
100%|██████████| 1194/1194 [00:39<00:00, 30.07it/s]
100%|██████████| 1194/1194 [00:40<00:00, 31.70it/s]
100%|██████████| 1194/1194 [00:43<00:00, 27.68it/s]
100%|██████████| 1194/1194 [00:42<00:00, 27.88it/s]
100%|██████████| 1194/1194 [00:40<00:00, 29.21it/s]
100%|██████████| 1194/1194 [00:42<00:00, 28.09it/s]
100%|██████████| 1194/1194 [00:43<00:00, 41.49it/s]
100%|██████████| 1194/1194 [00:43<00:00, 27.44it/s]
100%|██████████| 1194/1194 [00:42<00:00, 21.50it/s]
100%|██████████| 1194/1194 [00:43<00:00, 27.37it/s]
100%|██████████| 1194/1194 [00:41<00:00, 28.88it/s]
100%|███████

# Compute the bag-of-words corpus

In [None]:
# Gzipped document blocks found under DATA_FOLDER/documents, ordered with
# natural_keys (not defined in this notebook; presumably a natural-order sort
# key from `common` -- confirm).
ziped_files = sorted(glob(DATA_FOLDER + '/documents/*.gz'), key=natural_keys)

In [None]:
class DocumentCorpus(gensim.corpora.TextCorpus):
    """Streaming corpus over gzipped JSON blocks of tokenized documents.

    Each input file is expected to be a gzip archive containing a JSON array
    of documents, which is the layout written by ``save_documents``.

    NOTE(review): ``gensim`` itself is not imported explicitly in this
    notebook; presumably it is exported by ``from common import *``.
    """

    def __init__(self, ziped_files=None):
        """Create the corpus and, when files are given, build its dictionary.

        Args:
            ziped_files: paths of the ``.gz`` block files. When omitted (or
                empty) no file is read here; ``input`` and ``dictionary`` can
                be assigned on the instance afterwards.
        """
        # The base class is initialized without input; the paths are attached
        # afterwards so that the dictionary scan below is under our control.
        super(DocumentCorpus, self).__init__()
        self.input = ziped_files
        if ziped_files:
            # prune_at=None: do not cap the vocabulary size while scanning.
            self.dictionary.add_documents(self.get_texts(), prune_at=None)

    def get_texts(self):
        """Yield every document of every block file, in file order."""
        for fn in tqdm(self.input):  # for each relevant file
            with GzipFile(fn, 'r') as myzip:
                raw = myzip.read()
            # GzipFile.read() returns bytes. Decode explicitly: json.loads
            # only accepts bytes from Python 3.6 on, and UTF-8 is the
            # encoding save_documents wrote.
            docs = json.loads(raw.decode('utf-8'))
            for doc in docs:
                yield doc
                
                
def save_doc_corpus(ziped_files, dir_name, prefix):
    """Build a DocumentCorpus and persist its dictionary and serialized corpus.

    Writes ``<prefix>.dict`` and ``<prefix>_corpus.mm`` into ``dir_name``.

    Returns:
        Tuple ``(dictionary_path, corpus_path)`` of the two files written.
    """
    doc_corpus = DocumentCorpus(ziped_files)

    # Both output paths share the same directory and prefix.
    dic_name, corp_name = (
        join(dir_name, pattern % prefix)
        for pattern in ('%s.dict', '%s_corpus.mm')
    )

    # Dictionary first, then the corpus itself.
    doc_corpus.dictionary.save(dic_name)
    corpora.MmCorpus.serialize(corp_name, doc_corpus)

    return dic_name, corp_name
                

In [None]:
# Build the dictionary and the corpus from all block files; with the 'pure'
# prefix this writes pure.dict and pure_corpus.mm into DATA_FOLDER.
save_doc_corpus(ziped_files, DATA_FOLDER, 'pure')

# Alternative approach: reuse the saved dictionary and only re-serialize the corpus

In [6]:
# Load the dictionary previously saved under the 'pure' prefix instead of
# rebuilding it from the documents.
dictionary = corpora.Dictionary.load(join(DATA_FOLDER, 'pure.dict'))

In [7]:
# Create the corpus without files so that __init__ does not scan the documents
# to build a dictionary, then attach the inputs and the already-loaded
# dictionary by hand.
corpus = DocumentCorpus()
corpus.input = ziped_files
corpus.dictionary = dictionary

# Same file name that save_doc_corpus(..., 'pure') produces.
corp_name = join(DATA_FOLDER, '%s_corpus.mm' % 'pure')
corpora.MmCorpus.serialize(corp_name, corpus)

100%|██████████| 1001/1001 [23:27<00:00,  1.53s/it]


# Filter out tokens that appear in fewer than 3 documents (absolute number) or in more than 50% of documents

In [None]:
%%time
# filter the dictionary
# NOTE(review): this loads 'old.dict', while the cells above save the
# dictionary as 'pure.dict' -- confirm which file is intended.
old_dict = corpora.Dictionary.load(join(DATA_FOLDER, 'old.dict'))
# Work on a deep copy so old_dict keeps its original token ids; they are
# needed to build the old-id -> new-id mapping for the corpus.
new_dict = copy.deepcopy(old_dict)
# Drop tokens found in fewer than 3 documents; keep_n=None puts no cap on the
# number of tokens kept. no_above is left at the library default.
new_dict.filter_extremes(no_below=3, keep_n=None)
new_dict.save(join(DATA_FOLDER, 'filtered.dict'))

In [None]:
%%time
# now transform the corpus
# NOTE(review): this reads 'corpus.mm', while the cells above write
# 'pure_corpus.mm' -- confirm which file is intended.
corpus = corpora.MmCorpus(join(DATA_FOLDER, 'corpus.mm'))
# Map each surviving token's id in the old dictionary to its id in the
# filtered one. items() is part of the Mapping interface the dictionary
# implements; the Python 2-era iteritems() alias is not available in current
# gensim releases.
old2new = {old_dict.token2id[token]: new_id for new_id, token in new_dict.items()}
vt = VocabTransform(old2new)

In [None]:
%%time
# Write the corpus with remapped token ids: vt[corpus] applies the
# old-id -> new-id mapping, and the filtered dictionary is passed as id2word.
corpora.MmCorpus.serialize(join(DATA_FOLDER, 'filtered_corpus.mm'), vt[corpus], id2word=new_dict)

In [55]:
# Copy the generated corpora, dictionaries and the document list to the local
# Yandex.Disk folder.
# NOTE(review): the recorded output shows all three copies failing with
# "No such file or directory" -- verify the paths / working directory before
# relying on this cell.
!cp ../data/*corpus* ~/Yandex.Disk
!cp ../data/*.dict ~/Yandex.Disk
!cp ../data/all_docs.txt ~/Yandex.Disk

cp: cannot stat '../data/*corpus*': No such file or directory
cp: cannot stat '../data/*.dict': No such file or directory
cp: cannot stat '../data/all_docs.txt': No such file or directory
