# Sklearn TF-IDF matrix

In [1]:
from os import listdir
from time import time
import gc

# Folder that holds the pre-processed corpus files read by the cells below.
DIR = './objects/'

# listdir() yields entries in arbitrary, platform-dependent order; sorting keeps
# the list of corpus files identical from one run (and one machine) to the next.
processed_corpus = sorted(listdir(DIR))

In [2]:
from os.path import join

def load_corpus(corpus_file, docs, path=DIR):
    """Read one corpus file and merge its documents into ``docs``.

    Every non-empty line is split on the first two spaces into
    ``title``, a sentence id and the sentence text. All sentence texts that
    share a title are concatenated in file order, each followed by a single
    space, and the resulting ``{title: text}`` mapping is written into
    ``docs`` with one ``update`` call.

    Parameters
    ----------
    corpus_file : str
        File name, relative to ``path``.
    docs : dict-like
        Mapping that receives the documents; only ``docs.update`` is used.
        A title that is already present in ``docs`` is overwritten.
    path : str
        Directory that contains ``corpus_file``.

    Raises
    ------
    ValueError
        If a non-empty line has fewer than three space-separated fields.
    """
    sentences_by_title = {}
    # Explicit encoding: titles contain non-ASCII characters, so relying on the
    # platform default would make the result machine-dependent.
    with open(join(path, corpus_file), 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip('\n')
            if not line:
                # A blank (e.g. trailing) line carries no document data.
                continue
            title, _sent_id, sents = line.split(' ', 2)
            sentences_by_title.setdefault(title, []).append(sents)
    # Join once per title instead of growing a string sentence by sentence.
    docs.update({title: " ".join(parts) + " "
                 for title, parts in sentences_by_title.items()})

In [3]:
from multiprocessing import Pool, Manager
from functools import partial

# Number of worker processes used to parse the corpus files in parallel.
NUM_WORKERS = 12

pool = Pool(processes=NUM_WORKERS)
try:
    # Manager-backed dict: a proxy that every worker can write into.
    manager = Manager()
    docs = manager.dict()

    # One task per corpus file; each worker merges its documents into `docs`.
    pool.map(partial(load_corpus, docs=docs), processed_corpus)
finally:
    # Always shut the workers down, even if a task raised, so no worker
    # processes are left behind in the kernel.
    pool.close()
    pool.join()

print(len(docs), "docs in all.")

5396106 docs in all.


In [4]:
from tqdm import tqdm

titles = {}    # doc_id -> title
contents = []  # contents[doc_id] is the text stored in `docs` for that title

for title in tqdm(docs.keys()):
    # The next free position in `contents` doubles as the document id.
    titles[len(contents)] = title
    contents.append(docs[title])

# Total number of documents processed (same final value as a running counter).
i = len(contents)

100%|██████████| 5396106/5396106 [04:50<00:00, 18606.88it/s]


In [6]:
import pickle
# Persist the `titles` dictionary to disk in binary pickle format.
with open('titles_dict.pkl', 'wb') as titles_file:
    pickle.dump(titles, titles_file)

In [7]:
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from time import time

start = time()

# Two stages: raw term counts (English stop words removed), then TF-IDF weighting.
pipeline_steps = [
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
]
tf_idf = Pipeline(pipeline_steps)

# Learn the vocabulary / IDF weights and vectorise every document in one pass.
tf_idf_matrix = tf_idf.fit_transform(contents)
# tfidf_matrix in chunks: https://gist.github.com/pbellon/575041e22320b8bc011929421a9d6263

elapsed_minutes = (time() - start) / 60
print("cost", elapsed_minutes, "mins get all docs vectors")
print(tf_idf_matrix.shape)

cost 5.611696068445841 mins get all docs vectors
(5396106, 3085725)


# Applying Singular Value Decomposition (SVD)
**MemoryError** currently: the matrix is too large!

Solutions:
1. chunk corpus
2. Database
3. Trie instead of dict (https://github.com/pytries/marisa-trie)

Ref: https://medium.com/@AgenceSkoli/how-to-avoid-memory-overloads-using-scikit-learn-f5eb911ae66c

In [None]:
# from sklearn.decomposition import TruncatedSVD

# svd = TruncatedSVD(n_components=1000, random_state=90042)
# truncated_matrix = svd.fit_transform(tf_idf_matrix)

# from scipy.sparse.linalg import svds

# Find most relevant documents

In [11]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# How many of the best-matching documents to keep for a claim.
TOP_K = 50

start = time()
# Vectorise the claim with the already-fitted pipeline (same vocabulary/weights).
claim1 = tf_idf.transform(["Nikolaj Coster-Waldau worked with the Fox Broadcasting Company."])
# cosine_similarity gives one row per document and one column per claim; take
# column 0 directly instead of looping over every row in Python.
cosine_similarities = cosine_similarity(tf_idf_matrix, claim1)[:, 0]
# argsort is ascending: keep the last TOP_K indices and reverse them so the
# most similar document comes first.
doc_ids = np.argsort(cosine_similarities)[-TOP_K:][::-1]

print("takes around", time() - start, "seconds for each claim.")

takes around 8.324501037597656 seconds for each claim.


In [12]:
# Print the title of each retrieved document, in the order given by `doc_ids`.
for matched_id in doc_ids:
    matched_title = titles[matched_id]
    print("title: ", matched_title)

title:  Coster
title:  Ved_verdens_ende
title:  Waldau_-LRB-surname-RRB-
title:  Waldau
title:  New_Amsterdam_-LRB-TV_series-RRB-
title:  Nikolaj
title:  A_Second_Chance_-LRB-2014_film-RRB-
title:  Coster_-LRB-disambiguation-RRB-
title:  A_Thousand_Times_Good_Night
title:  Sven_Coster
title:  The_Baker_-LRB-film-RRB-
title:  Kalle_Coster
title:  Nukaaka_Coster-Waldau
title:  Nikolaj_Coster-Waldau
title:  Jean-Baptiste_De_Coster_-LRB-Jesuit-RRB-
title:  De_Coster
title:  Saskia_De_Coster
title:  Tracy_Coster
title:  Project_Laurens_Janszoon_Coster
title:  The_Other_Woman_-LRB-2014_film-RRB-
title:  Dick_Coster
title:  Salomon_Coster
title:  Shot_Caller_-LRB-film-RRB-
title:  Harry_Waldau
title:  Small_Crimes
title:  Arnold_Coster
title:  Nikolaj_Plads
title:  Kunsthallen_Nikolaj
title:  Lionel_Lockridge
title:  Gustav_Waldau
title:  Samuel_Coster
title:  Nikolaj_Hess
title:  Coster–Kronig_transition
title:  45_-LRB-film-RRB-
title:  Stan_Coster
title:  Fox_House
title:  Tommy_Coster
tit