# Generate Gensim dictionary and corpus, then store to disk

In [1]:
import gc
from time import time
from os import listdir

# Directory containing the pre-processed corpus files read by this notebook.
CORPUS_DIR = './processed_wiki/'
# Bare entry names (not full paths) of everything inside CORPUS_DIR, in the
# arbitrary order that listdir returns them. Every entry is later handed to
# load_corpus, so the directory is assumed to contain only corpus files.
processed_corpus = listdir(CORPUS_DIR)

In [2]:
from os.path import join

def load_corpus(corpus_file, docs, path=CORPUS_DIR):
    """Read one corpus file and merge its documents into ``docs``.

    Each line is split on single spaces. The first field is used as the
    document title, the second field is ignored, and every remaining field
    is appended to that title's token list, so several lines sharing a title
    are concatenated in file order.

    Args:
        corpus_file: Name of the file to read, relative to ``path``.
        docs: Mutable mapping that receives ``{title: [token, ...]}`` through
            a single ``update`` call once the whole file has been read.
        path: Directory that contains ``corpus_file``.

    Returns:
        None. The result is delivered by mutating ``docs``.
    """
    partial_docs = {}
    with open(join(path, corpus_file), 'r') as f:
        for line in f:
            fields = line.strip('\n').split(' ')
            title = fields[0]
            # NOTE(review): fields[1] is skipped -- presumably a per-line id
            # written by the preprocessing step; confirm against that step.
            # Extend in place: building `old_list + new_tokens` on every line
            # re-copied the accumulated list each time (quadratic in the
            # number of lines per title).
            partial_docs.setdefault(title, []).extend(fields[2:])
    # One bulk update per file instead of one write per title.
    # NOTE(review): update() replaces existing keys, so a title that occurs in
    # more than one corpus file keeps only the tokens of the file merged last.
    docs.update(partial_docs)

In [3]:
from multiprocessing import Pool, Process, Manager
from functools import partial

start = time()

# Shared dict, served by a manager process, that every worker merges its
# per-file documents into.
manager = Manager()
docs = manager.dict()

# Run load_corpus over all corpus files with 12 worker processes.
# The `with` block guarantees the workers are torn down even if a task raises
# (e.g. an unreadable corpus file); previously an exception inside map()
# skipped close()/join() and left the worker processes running.
with Pool(processes=12) as pool:
    pool.map(partial(load_corpus, docs=docs), processed_corpus)
    # Normal path: let the workers finish and exit cleanly before the
    # context manager's terminate() runs.
    pool.close()
    pool.join()

print(len(docs), "docs in all.")
print("takes", time() - start, "seconds")

5396106 docs in all.
takes 99.4082760810852 seconds


In [4]:
from tqdm import tqdm

contents = []  # token list of each document, positioned by doc_id
titles = {}    # {doc_id: title}

# Copy every document out of the shared dict into local structures.
# doc_id is simply the position of the title in docs.keys(), so
# contents[doc_id] and titles[doc_id] always refer to the same document.
for doc_id, title in enumerate(tqdm(docs.keys())):
    titles[doc_id] = title
    contents.append(docs[title])

# The shared dict is no longer needed once its data has been copied locally.
del docs
gc.collect()

100%|██████████| 5396106/5396106 [07:10<00:00, 12527.08it/s]


0

### Save the dictionary and the bag-of-words corpus to disk

In [5]:
from gensim import corpora
from gensim.corpora import Dictionary

# Build the gensim Dictionary from the tokenized documents.
word_dict = Dictionary(contents)

# Persist the dictionary; reload it with corpora.Dictionary.load(path).
word_dict.save('./objects/wiki_gensim.dict')
print(len(word_dict))  # number of entries in the dictionary

# Convert every token list to its bag-of-words representation.
# NOTE(review): this rebinds `contents` from token lists to BoW vectors, so
# re-running this cell requires rebuilding `contents` first.
contents = list(map(word_dict.doc2bow, contents))

# Persist the BoW corpus; reload it with corpora.MmCorpus(path).
corpora.MmCorpus.serialize('./objects/wiki_corpus_gensim.mm', contents)




2002958


In [7]:
import pickle
# Directory where the serialized objects produced by this notebook are stored.
OBJECT_DIR = './objects/'

# Pickle the `titles` lookup built earlier in the notebook.
titles_path = join(OBJECT_DIR, 'titles_gensim.pkl')
with open(titles_path, 'wb') as f_docs:
    pickle.dump(titles, f_docs)