In [1]:
import pandas as pd
import numpy as np
from gensim import corpora
from gensim.models import VocabTransform

from common import *
import os
import glob
from joblib import Parallel, delayed
import itertools
import multiprocessing
import copy 

from gzip import GzipFile
import json
from glob import glob

# Number of CPUs reported by the OS; used below as the joblib worker count.
cpu_count = multiprocessing.cpu_count()
# Root folder for the input data and every artifact this notebook writes
# (relative path, so it depends on the notebook's working directory).
DATA_FOLDER = '../data/'

Using TensorFlow backend.


# Extract documents and tokenize

In [2]:
class DocumentExtractor(object):
    """Iterable that lazily reads and tokenizes a list of text files.

    Parameters
    ----------
    doc_list : iterable of str
        Paths of the files to read. Each iteration over the extractor
        re-reads the files from disk.

    Notes
    -----
    ``tqdm`` and ``tokenize`` are not defined in this notebook; they are
    presumably brought in by ``from common import *``.
    """

    def __init__(self, doc_list):
        self.doc_list = doc_list

    def __iter__(self):
        """Yield one list of tokens per file, in the order of ``doc_list``."""
        for fn in tqdm(self.doc_list):
            # Read inside a context manager so the handle is closed right
            # away instead of being left to the garbage collector; the list
            # can hold thousands of files.
            with open(fn) as fin:
                text = fin.read()
            yield list(tokenize(text))
            
def save_documents(doc_list, prefix):
    """Tokenize the files in ``doc_list`` and store them as one gzipped JSON file.

    The output path is ``join(DATA_FOLDER, 'documents/<prefix>.json.gz')``.
    The file holds a single JSON array with one token list per input file,
    encoded as UTF-8. The ``documents`` directory is not created here.

    Parameters
    ----------
    doc_list : iterable of str
        Paths of the text files to tokenize (passed to ``DocumentExtractor``).
    prefix : object
        Base name of the output file; formatted with ``%s``.
    """
    docs = list(DocumentExtractor(doc_list))

    fname = join(DATA_FOLDER, 'documents/%s' % prefix + '.json.gz')
    # The notebook imports the name `GzipFile` (from gzip import GzipFile),
    # not the `gzip` module, so `gzip.GzipFile` would raise NameError.
    with GzipFile(fname, 'w') as fout:
        fout.write(json.dumps(docs).encode('utf-8'))

In [None]:
# Collect the documents to process. `get_all_docs` is not defined in this
# notebook (presumably provided by `common`). The result must support len()
# and is presumably a list of file paths, since its items end up being passed
# to open() by DocumentExtractor — TODO confirm.
all_docs = get_all_docs(DATA_FOLDER)

In [None]:
parallelizer = Parallel(n_jobs=cpu_count)

# Aim for roughly 1000 chunks, but never let the chunk size drop to 0, which
# is what `len(all_docs) // 1000` gives for fewer than 1000 documents.
chunk_size = max(1, len(all_docs) // 1000)

# Lazily build one delayed `save_documents` call per chunk; the chunk index
# is used as the output file prefix.
# NOTE(review): `grouper` is not defined in this notebook — confirm how it
# handles a final, shorter chunk.
tasks_iterator = (
    delayed(save_documents)(list_block, i)
    for i, list_block in enumerate(grouper(chunk_size, all_docs))
)
result = parallelizer(tasks_iterator)

# Compute the bag-of-words (BoW) corpus

In [2]:
# All gzipped chunk files under DATA_FOLDER/documents, ordered with the
# `natural_keys` sort key (not defined in this notebook; presumably a
# numeric-aware key so that chunk 2 sorts before chunk 10 — TODO confirm).
flist = sorted(glob(DATA_FOLDER + '/documents/*.gz'), key=natural_keys)

In [5]:
class DocumentCorpus(corpora.TextCorpus):
    """Text corpus backed by gzipped JSON chunk files.

    ``self.input`` is iterated as a sequence of file paths. Each file is
    expected to hold one JSON array of documents, the layout written by
    ``save_documents``.

    The base class is referenced as ``corpora.TextCorpus`` because the
    notebook imports ``corpora`` from gensim but never imports ``gensim``
    itself.
    """

    def get_texts(self):
        """Yield every document from every chunk file, in file order."""
        for fn in tqdm(self.input):  # for each relevant file
            with GzipFile(fn, 'r') as myzip:
                raw = myzip.read()
            # Decode explicitly: the chunks are written as UTF-8 bytes, and
            # json.loads only accepts bytes from Python 3.6 on.
            docs = json.loads(raw.decode('utf-8'))
            for doc in docs:
                yield doc

                
def save_doc_corpus(ziped_files, dir_name, prefix):
    """Build a ``DocumentCorpus`` from chunk files and persist it to ``dir_name``.

    Two files are written: the corpus dictionary as ``<prefix>.dict`` and the
    corpus itself, serialized through ``corpora.MmCorpus``, as
    ``<prefix>_corpus.mm``.

    Parameters
    ----------
    ziped_files : iterable of str
        Paths of the gzipped JSON chunk files handed to ``DocumentCorpus``.
    dir_name : str
        Directory that receives both output files.
    prefix : object
        Base name of the output files; formatted with ``%s``.

    Returns
    -------
    tuple
        ``(dictionary_path, corpus_path)``.
    """
    doc_corpus = DocumentCorpus(ziped_files)

    dictionary_path = join(dir_name, '%s.dict' % prefix)
    corpus_path = join(dir_name, '%s_corpus.mm' % prefix)

    # Dictionary first, then the corpus.
    doc_corpus.dictionary.save(dictionary_path)
    corpora.MmCorpus.serialize(corpus_path, doc_corpus)

    saved_paths = (dictionary_path, corpus_path)
    return saved_paths
                

In [6]:
# Build the dictionary and corpus from all chunk files and save them in
# DATA_FOLDER as 'pure.dict' and 'pure_corpus.mm'.
# NOTE(review): the filtering cells further down load 'old.dict' and
# 'corpus.mm' — confirm the files are renamed in between, or align the names.
save_doc_corpus(flist, DATA_FOLDER, 'pure')

100%|██████████| 1001/1001 [36:04<00:00,  2.22s/it]
 64%|██████▎   | 637/1001 [18:42<23:27,  3.87s/it]

KeyboardInterrupt: 

### Filter out tokens that appear in fewer than 3 documents (absolute number) or in more than 50% of the documents

In [None]:
%%time
# filter the dictionary
# Load the unfiltered dictionary and prune a deep copy of it, so that
# `old_dict` keeps the original token ids (the corpus remapping cell needs
# both the old and the new ids).
old_dict = corpora.Dictionary.load(join(DATA_FOLDER, 'old.dict'))
new_dict = copy.deepcopy(old_dict)
# no_below=3 drops tokens found in fewer than 3 documents; `no_above` is not
# passed, so the library default applies; keep_n=None puts no cap on the
# number of tokens kept afterwards (see gensim's filter_extremes docs).
new_dict.filter_extremes(no_below=3, keep_n=None)
new_dict.save(join(DATA_FOLDER, 'filtered.dict'))

In [None]:
%%time
# now transform the corpus
# Requires `old_dict` and `new_dict` from the dictionary-filtering cell.
corpus = corpora.MmCorpus(join(DATA_FOLDER, 'corpus.mm'))
# For every token that survived filtering, map its id in the old dictionary
# to its id in the filtered one.
old2new = {old_dict.token2id[token]:new_id for new_id, token in new_dict.iteritems()}
# Transformation that applies the old-id -> new-id mapping when indexed with
# a corpus (used as `vt[corpus]` in the next cell).
vt = VocabTransform(old2new)

In [None]:
%%time
# Pass the original corpus through the id remapping and write the result to
# 'filtered_corpus.mm' in DATA_FOLDER, with the filtered dictionary supplied
# as id2word.
corpora.MmCorpus.serialize(join(DATA_FOLDER, 'filtered_corpus.mm'), vt[corpus], id2word=new_dict)

In [55]:
# Copy the corpus files, the dictionaries and all_docs.txt from ../data into
# the local ~/Yandex.Disk folder. The paths are hard-coded and relative to the
# notebook's working directory; cp fails if no file matches a pattern.
!cp ../data/*corpus* ~/Yandex.Disk
!cp ../data/*.dict ~/Yandex.Disk
!cp ../data/all_docs.txt ~/Yandex.Disk

cp: cannot stat '../data/*corpus*': No such file or directory
cp: cannot stat '../data/*.dict': No such file or directory
cp: cannot stat '../data/all_docs.txt': No such file or directory
