# Generate Gensim dictionary and corpus, then store to disk

In [1]:
import gc
from os import listdir
from os import makedirs
from time import time

# Input directory holding the pre-processed corpus files, and output directory
# for every artefact this notebook writes (dictionary, corpus, title map).
CORPUS_DIR = './processed_wiki/'
OBJECT_DIR = './objects/'

# Create the output directory up front: the first write into it only happens
# after the long dictionary-building step, which is a bad moment to discover
# that the directory is missing.
makedirs(OBJECT_DIR, exist_ok=True)

# listdir() returns entries in arbitrary order; sort them so the list of
# corpus files is the same on every run.
processed_corpus = sorted(listdir(CORPUS_DIR))

In [2]:
from os.path import join

def load_corpus(corpus_file, docs, path=CORPUS_DIR):
    """Read one processed corpus file and merge its documents into ``docs``.

    Every non-blank line is split on single spaces: field 0 is used as the
    document title, field 1 is skipped, and the remaining fields are appended
    to that title's token list. Lines that share a title within the file are
    concatenated in file order.

    Args:
        corpus_file: Name of the file inside ``path``.
        docs: Mapping with an ``update`` method (a plain ``dict`` or a
            ``multiprocessing.Manager().dict()`` proxy). It receives a single
            ``update`` call carrying ``{title: tokens}`` for the whole file.
        path: Directory that contains ``corpus_file``.

    Note:
        The merge is a plain ``update``, so a title that already exists in
        ``docs`` (for example from another file) is overwritten, not extended.
    """
    partial_docs = {}
    with open(join(path, corpus_file), 'r') as f:
        for line in f:
            line = line.strip('\n')
            if not line:
                # A blank line would otherwise register a document titled ''.
                continue
            fields = line.split(' ')
            # Extend in place instead of rebuilding the list on every line,
            # which is quadratic for titles that span many lines.
            partial_docs.setdefault(fields[0], []).extend(fields[2:])
    # One bulk update per file keeps traffic low when ``docs`` is a proxy.
    docs.update(partial_docs)

In [3]:
from multiprocessing import Pool, Process, Manager
from functools import partial

start = time()

N_WORKERS = 12


def _load_corpus_file(corpus_file):
    """Load one corpus file inside a worker and return its {title: tokens} dict."""
    file_docs = {}
    load_corpus(corpus_file, file_docs)
    return file_docs


# Workers hand each file's documents back to the parent, which merges them
# into an ordinary dict. Compared with sharing a Manager dict proxy this
#   * keeps every later `docs[title]` lookup in-process (no IPC round trip),
#   * merges files in the order of `processed_corpus` rather than in the
#     order the workers happen to finish, and
#   * needs no extra manager process.
docs = {}

# The context manager tears the pool down even if a worker raises.
with Pool(processes=N_WORKERS) as pool:
    # imap() yields results in input order and lets each per-file dict be
    # released as soon as it has been merged.
    for file_docs in pool.imap(_load_corpus_file, processed_corpus):
        docs.update(file_docs)

print(len(docs), "docs in all.")
print("takes", time() - start, "seconds")

5396106 docs in all.
takes 134.1604151725769 seconds


In [4]:
from tqdm import tqdm

# Split the loaded documents into two parallel structures keyed by a running
# integer id:
#   titles[doc_id]   -> document title
#   contents[doc_id] -> token list of that document
titles = {}
contents = []

for doc_id, doc_title in enumerate(tqdm(docs.keys())):
    titles[doc_id] = doc_title
    contents.append(docs[doc_title])

# The original mapping is no longer needed; drop it and reclaim memory.
del docs
gc.collect()

100%|██████████| 5396106/5396106 [08:40<00:00, 10359.63it/s] 


0

### Build and filter the dictionary, then store it to disk

In [5]:
from gensim import corpora
from gensim.corpora import Dictionary

start = time()

# Build the vocabulary from every tokenised document.
word_dict = Dictionary(contents)

# Prune the vocabulary by document frequency:
#   no_below -- keep tokens that occur in at least this many documents
#   no_above -- keep tokens that occur in no more than this fraction of all
#               documents (a fraction of the corpus size, not a count)
#   keep_n   -- upper bound on the number of tokens kept
# Recorded run: 2002971 -> 558522
word_dict.filter_extremes(
    no_below=5,
    no_above=0.5,
    keep_n=700000,
)
print("final word dictionary size:", len(word_dict))

# Persist the dictionary; reload it with corpora.Dictionary.load(path).
dict_path = join(OBJECT_DIR, 'wiki_gensim_70.dict')
word_dict.save(dict_path)

elapsed_mins = (time() - start) / 60
print("This step takes", elapsed_mins, "mins")



final word dictionary size: 558522
This step takes 25.36000674565633 mins


### Convert the corpus to bag-of-words and save it to disk

In [6]:
start = time()

# Turn every token list into its bag-of-words vector under `word_dict`.
# NOTE: `contents` is rebound from token lists to BoW vectors here, so this
# cell must not be re-run without rebuilding the token lists first.
contents = list(map(word_dict.doc2bow, contents))

# Write the BoW corpus to disk; reload it with corpora.MmCorpus(path).
mm_path = join(OBJECT_DIR, 'wiki_corpus_gensim_70.mm')
corpora.MmCorpus.serialize(mm_path, contents)

elapsed_mins = (time() - start) / 60
print("This step takes", elapsed_mins, "mins")

This step takes 8.97870710293452 mins


In [7]:
import pickle

# Persist the doc_id -> title mapping next to the dictionary and corpus so
# that row numbers in the serialized corpus can be mapped back to titles.
titles_path = join(OBJECT_DIR, 'titles_gensim_70.pkl')
with open(titles_path, 'wb') as titles_file:
    pickle.dump(titles, titles_file)