# ProQuest: Topic Modelling and Search with Top2Vec

In [1]:
import json
from top2vec import Top2Vec

In [2]:
# Read the preprocessed JSON dataset into memory (text mode is open()'s default).
with open("../data/raw/210119_en_deter_preprocessed.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

In [3]:
# Build the training corpus from the records' "fulltext" field.
# Records whose full text is missing, null, or empty are skipped, so no empty
# documents reach the model and a null value cannot break `.startswith`.
# Placeholder entries starting with "Not available." are skipped as well.
docs = [
    text
    for text in (doc.get("fulltext") or "" for doc in data)
    if text and not text.startswith("Not available.")
]

In [4]:
# Train a Top2Vec model on the collected documents using two worker threads.
# NOTE(review): "test-learn" looks like a quick-test speed setting — confirm it
# is intended before relying on the quality of the resulting topics.
top2vec = Top2Vec(
    documents=docs,
    speed="test-learn",
    workers=2,
)
# Alternative configuration kept for reference:
# top2vec = Top2Vec(documents=docs, embedding_model="universal-sentence-encoder")

2021-02-05 11:23:24,410 - top2vec - INFO - Pre-processing documents for training
2021-02-05 11:28:21,436 - top2vec - INFO - Creating joint document/word embedding
2021-02-05 11:39:24,835 - top2vec - INFO - Creating lower dimension embedding of documents
2021-02-05 11:40:28,047 - top2vec - INFO - Finding dense areas of documents
2021-02-05 11:40:32,922 - top2vec - INFO - Finding topics


In [5]:
# Persist the trained model to disk so it can be reused without retraining.
top2vec.save("../models/pq-model")

## Search Topics

In [23]:
# Query the model for 20 topics matching the keyword "russia".
topic_words, word_scores, topic_scores, topic_nums = top2vec.search_topics(
    keywords=["russia"],
    # keywords_neg=[],  # optional negative keywords, currently unused
    num_topics=20,
)

## Search Papers by Topic

In [42]:
# Fetch 5 documents for topic number 20, together with their scores and ids.
documents, document_scores, document_nums = top2vec.search_documents_by_topic(
    topic_num=20,
    num_docs=5,
)

## Search Papers by Keywords

In [51]:
# Fetch 20 documents matching the keyword "russia", with their scores and ids.
documents, document_scores, document_nums = top2vec.search_documents_by_keywords(
    keywords=["russia"],
    # keywords_neg=[],  # optional negative keywords, currently unused
    num_docs=20,
)

## Find Similar Words

In [56]:
# Look up 20 vocabulary words related to "russia"; no negative keywords given.
words, word_scores = top2vec.similar_words(
    keywords=["russia"],
    keywords_neg=[],
    num_words=20,
)

In [57]:
# Display the words returned by the similar-words query.
words

array(['russian', 'moscow', 'russias', 'ukraine', 'kremlin', 'putin',
       'dmitry', 'vladimir', 'kommersant', 'dmitri', 'crimea',
       'kaliningrad', 'eastward', 'golts', 'kiev', 'lavrov', 'izvestia',
       'putins', 'medvedev', 'cis'], dtype='<U11')