In [1]:
from top2vec import Top2Vec

# Location of the serialized Top2Vec model on the local machine.
model_file = "/home/cds/Documents/Models/top2vec.model"

# Load the model once; later cells read it through the global name `model`.
model = Top2Vec.load(model_file)

In [2]:
def topic_parser(file):
    """Read a topics file into a ``{topic_number: topic_text}`` dictionary.

    Each non-blank line is split on ``|``; the first field is converted to
    ``int`` and used as the key, the second field is stored as the value.
    Any further fields are ignored.

    Args:
        file: Path of the topics file to read.

    Returns:
        dict mapping the integer topic number to the topic text.

    Raises:
        ValueError: if the first field of a line is not an integer.
        IndexError: if a non-blank line contains no ``|`` separator.
    """
    topics = dict()
    with open(file, 'r') as fp:
        # Iterate lazily instead of materialising the whole file with readlines().
        for line in fp:
            # Drop the line terminator so it cannot end up glued to the last
            # word of the topic text (which would make that word unmatchable).
            line = line.rstrip("\r\n")
            # Blank lines (e.g. a trailing empty line) carry no topic.
            if not line.strip():
                continue
            elems = line.split('|')
            topics[int(elems[0])] = elems[1]
    return topics

In [3]:
# Topics file, one `<number>|<text>` record per line as read by topic_parser.
topics_file = "/home/cds/Documents/Topics/topics_full_2.txt"

# Global lookup used by later cells: topic number -> topic text.
topics = topic_parser(topics_file)

In [31]:
# Turn the text of topic 51 into a keyword list by splitting on single spaces.
keywords = topics[51].split(" ")

# Remove one occurrence each of three instruction words from the query.
# list.remove raises ValueError when the word is not in the list.
keywords.remove("document")
keywords.remove("mention")
keywords.remove("discuss")
print(keywords)

['government', 'assistance', 'airbus', 'industrie', 'trade', 'dispute', 'airbus', 'aircraft', 'producer', 'issue', 'subsidy']


In [32]:
# Retrieve up to 10000 documents for the current `keywords`;
# the call returns the scores first and the document ids second.
document_scores, document_ids = model.search_documents_by_keywords(keywords=keywords, num_docs=10000)
# Disabled debug output that would print every (doc_id, score) pair:
#for score, doc_id in zip(document_scores, document_ids):
#    print(f"Document: {doc_id}, Score: {score}")
  #  print("-----------")
 #   print()

In [4]:
def save_runs(doc_ids, doc_scores, topic_no, file):
    """Append one topic's ranked results to a run file.

    One line is written per document, in the order given:
    ``<topic_no> Q0 <doc_id> <rank> <score> top2vec`` where ``rank`` is the
    zero-based position of the document in ``doc_ids``.

    Args:
        doc_ids: Document identifiers in ranked order. Any type is accepted;
            each id is converted with ``str`` (integer ids used to raise
            ``TypeError`` during string concatenation).
        doc_scores: Scores aligned with ``doc_ids``; pairs are formed with
            ``zip``, so surplus items on either side are ignored.
        topic_no: Topic number written in the first column.
        file: Path of the run file. It is opened in append mode, so calling
            this repeatedly adds to whatever the file already contains.
    """
    # Format everything before touching the file so a formatting error cannot
    # leave a partially written topic behind.
    lines = [
        str(topic_no) + " Q0 " + str(doc_id) + " " + str(rank) + " " + str(doc_score) + " top2vec" + "\n"
        for rank, (doc_id, doc_score) in enumerate(zip(doc_ids, doc_scores))
    ]
    with open(file, 'a') as fp:
        fp.writelines(lines)


In [33]:
# Run file for the single-topic experiment; save_runs appends to it.
run_file = "/home/cds/Documents/Runs/run_desc.txt"

# Store the current search results under topic number 51.
save_runs(document_ids, document_scores, 51, run_file)

In [5]:
# Collection name -> lowercase document-id prefixes that belong to it.
# result_filter matches these prefixes against lowercased document ids.
# "trcv4_nocr" lists the same prefixes as "trcv4" except "cr".
trcv_manifest = {"trcv1":["wsj87", "wsj88", "wsj89", "fr89", "ap89", "doe", "zf1"],
                 "trcv2":["wsj90", "wsj91", "wsj92", "fr88", "ap88", "zf2"],
                 "trcv3":["sjmn", "ap90", "pt", "zf3"],
                 "trcv4":["ft", "cr", "fr94"],
                 "trcv5":["fbis", "la"],
                 "trcv4_nocr":["ft", "fr94"]}

# "trec<N>" -> names of the collections (keys of trcv_manifest) whose
# documents are kept when filtering results for that edition.
trec_manifest = {"trec1":["trcv1", "trcv2"],
                 "trec2":["trcv1", "trcv2"],
                 "trec3":["trcv1", "trcv2"],
                 "trec4":["trcv2", "trcv3"],
                 "trec5":["trcv2", "trcv4"],
                 "trec6":["trcv4", "trcv5"],
                 "trec7":["trcv4_nocr", "trcv5"],
                 "trec8":["trcv4_nocr", "trcv5"]}

def get_doc_prefix(trec_no):
    """Collect every document-id prefix that is valid for one TREC edition.

    Looks up ``"trec" + str(trec_no)`` in ``trec_manifest`` and merges the
    prefix lists of all collections named there (taken from
    ``trcv_manifest``).

    Args:
        trec_no: Edition number; must form an existing ``trec_manifest`` key.

    Returns:
        set of prefix strings (duplicates across collections collapse).

    Raises:
        KeyError: if the edition or one of its collections is unknown.
    """
    collection_names = trec_manifest["trec" + str(trec_no)]
    return {
        doc_prefix
        for collection in collection_names
        for doc_prefix in trcv_manifest[collection]
    }

def result_filter(doc_scores, doc_ids, trec_no):
    """Keep only the results whose document id belongs to a TREC edition.

    A document is kept when its lowercased id starts with one of the prefixes
    returned by ``get_doc_prefix(trec_no)``. The relative order of the kept
    results is unchanged.

    Args:
        doc_scores: Scores, indexable by position and aligned with ``doc_ids``.
        doc_ids: Document ids (strings) in ranked order.
        trec_no: Edition number passed on to ``get_doc_prefix``.

    Returns:
        ``(filtered_scores, filtered_ids)`` as two parallel lists.
    """
    filtered_scores = list()
    filtered_ids = list()

    # str.startswith accepts a tuple of alternatives: one test per document,
    # so a document can never be appended more than once even if several
    # prefixes happen to match it (the old nested loop had no break).
    prefixes = tuple(get_doc_prefix(trec_no))

    for i, doc_id in enumerate(doc_ids):
        if doc_id.lower().startswith(prefixes):
            filtered_ids.append(doc_id)
            filtered_scores.append(doc_scores[i])

    return filtered_scores, filtered_ids

def get_topic_list(trec_no):
    """Return the 50 consecutive topic numbers assigned to an edition.

    The list runs from ``50 * trec_no + 1`` up to and including
    ``50 * (trec_no + 1)``.
    """
    first_topic = 50 * trec_no + 1
    last_topic = 50 * (trec_no + 1)
    return list(range(first_topic, last_topic + 1))

def check_keywords(keywords):
    """Split keywords into those the loaded model knows and those it does not.

    Membership is tested against ``model.model.wv.vocab`` of the global
    ``model``; the input order is preserved inside each group.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        ``(filtered_keywords, oov_keywords)``: the in-vocabulary words and
        the out-of-vocabulary words, as two lists.
    """
    known_words = model.model.wv.vocab
    in_vocab, out_of_vocab = [], []
    for term in keywords:
        # Route each word to exactly one of the two result lists.
        bucket = in_vocab if term in known_words else out_of_vocab
        bucket.append(term)
    return in_vocab, out_of_vocab

In [8]:
# Edition to evaluate; selects both the topic range and the allowed document prefixes.
trec_no = 5

# save_runs appends to this file, so re-running the cell adds duplicate lines.
run_file = "/home/cds/Documents/Runs/run_top2vec.txt"

# Collects the out-of-vocabulary keywords of all topics.
oov = list()

for topic_no in get_topic_list(trec_no):
    # Query terms are the space-separated words of the topic text.
    keywords = topics[topic_no].split(" ")
    filtered_keywords, oov_keywords = check_keywords(keywords)
    oov.extend(oov_keywords)
    # Baseline: Top2Vec's built-in keyword search on the in-vocabulary terms.
    document_scores, document_ids = model.search_documents_by_keywords(keywords=filtered_keywords, num_docs=10000)
    # Drop documents outside the edition's collections, then append to the run file.
    filtered_scores, filtered_ids = result_filter(document_scores, document_ids, trec_no)
    save_runs(filtered_ids, filtered_scores, topic_no, run_file)

In [9]:
def search_documents_by_topkeys(model=None, keywords=None, num_docs=10000, num_topics=10):
    """Retrieve documents through the topics that best match the keywords.

    The ``num_topics`` topics most similar to ``keywords`` are looked up with
    ``model.search_topics``; the documents of each of those topics are fetched
    with ``model.search_documents_by_topic`` and the pooled results are ranked
    by score.

    The previous version of this function could not run: its ``num_docs``
    default referred to an undefined global, it called
    ``get_documents_by_topic()`` without arguments, passed ``model.get`` as a
    document count and returned nothing.

    Args:
        model: Top2Vec-like model. ``None`` falls back to the notebook's
            global ``model`` at call time.
        keywords: Query keywords. ``None`` falls back to the notebook's
            global ``keywords`` at call time.
        num_docs: Maximum number of documents to return.
        num_topics: How many best-matching topics to pool documents from.

    Returns:
        ``(doc_scores, doc_ids)`` as two aligned numpy arrays, highest score
        first, with at most ``num_docs`` entries.
    """
    # Local import: this cell is located above the notebook's numpy import.
    import numpy as np

    if model is None:
        model = globals()["model"]
    if keywords is None:
        keywords = globals()["keywords"]

    # search_topics rejects requests for more topics than the model has.
    num_topics = min(num_topics, model.get_num_topics())
    _, _, _, topic_nums = model.search_topics(keywords, num_topics)

    # search_documents_by_topic rejects requests larger than the topic itself,
    # so each request is capped at the topic size.
    topic_sizes, sized_topic_nums = model.get_topic_sizes()
    size_of = {int(num): int(size) for num, size in zip(sized_topic_nums, topic_sizes)}

    pooled_scores = list()
    pooled_ids = list()
    for topic_num in topic_nums:
        limit = min(num_docs, size_of.get(int(topic_num), 0))
        if limit < 1:
            continue
        doc_scores, doc_ids = model.search_documents_by_topic(int(topic_num), limit, return_documents=False)
        pooled_scores.extend(doc_scores)
        pooled_ids.extend(doc_ids)

    score_arr = np.asarray(pooled_scores, dtype=float)
    id_arr = np.asarray(pooled_ids)
    # Stable sort keeps the per-topic order for equal scores.
    order = np.argsort(-score_arr, kind="stable")[:num_docs]
    return score_arr[order], id_arr[order]



In [1]:
# Scratch check of the word-vector lookup; `test` is defined but not used in this cell.
test = ["apple", "sads"]
t = ["apple"]
# Needs the global `model`; on a fresh kernel this raises NameError until the loading cell has run.
model._get_word_vectors(t)

NameError: name 'model' is not defined

In [2]:
# Keep the document vectors and the word-vector matrix in globals for ad-hoc inspection.
ds = model._get_document_vectors()
vs = model.model.wv.vectors

In [6]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
import numpy as np
import math

def search_documents_by_tfidf(top2vec, keywords, topn=10000):
    """Rank documents with a TF-IDF-like score built from embedding similarities.

    For every in-vocabulary keyword:

    * the "term frequency" of a document is the similarity between the
      keyword vector and the document vector, as returned by
      ``docvecs.most_similar(..., topn=None)``;
    * the "inverse document frequency" is ``log(N / nt)`` where ``N`` is the
      number of documents and ``nt`` the number of documents assigned to the
      topic closest to the keyword.

    Negative per-keyword contributions are clamped to zero and the
    contributions of all keywords are summed per document.

    Every lookup goes through the ``top2vec`` argument (the previous version
    silently used the global ``model`` for most of them).

    Args:
        top2vec: Top2Vec model to search.
        keywords: Query keywords; words missing from the vocabulary are
            printed and skipped.
        topn: Number of best-scoring documents to return.

    Returns:
        ``(sorted_scores, doc_ids)``: scores rounded to 6 decimals in
        descending order, and the matching document ids.
    """
    doc2vec = top2vec.model
    vocab = doc2vec.wv.vocab

    keywords_valid = list()
    for keyword in keywords:
        if keyword not in vocab:
            print(keyword)
        else:
            keywords_valid.append(keyword)
    keywords_vecs = top2vec._get_word_vectors(keywords_valid)

    N = len(top2vec._get_document_vectors())
    scores = np.zeros(N)
    for word_vec in keywords_vecs:
        # Similarity of this keyword to every document.
        doc_tfs = np.array(doc2vec.docvecs.most_similar(positive=[word_vec], topn=None))

        # Size of the topic nearest to the keyword drives the IDF weight.
        most_sim_top_nums, _ = top2vec._search_vectors_by_vector(top2vec.topic_vectors, word_vec, num_res=1)
        nt = np.sum(top2vec.doc_top == most_sim_top_nums[0])
        if nt == 0:
            # An empty topic would give an infinite weight and poison every score.
            continue

        idf = math.log(N / nt)

        tfidf = np.multiply(doc_tfs, idf)
        tfidf[tfidf < 0] = 0

        # Vectorised accumulation; same result as adding element by element.
        scores += tfidf

    indexes = np.flip(np.argsort(scores)[-topn:])
    sorted_scores = np.array([round(scores[rank], 6) for rank in indexes])
    doc_ids = top2vec._get_document_ids(indexes)
    return sorted_scores, doc_ids


In [8]:
# Score a two-word query with a very large topn (presumably the corpus size — TODO confirm).
ss, dd = search_documents_by_tfidf(model, ["disorder", "syndrome"], 1634243)
# Smallest returned score; it cannot be negative because search_documents_by_tfidf clamps negative contributions to zero.
print(np.min(ss))

0.0


In [20]:
# Look up the word vector for "disorder" and compare it with the document vectors.
wordvecs = model._get_word_vectors(["disorder"])
# With topn=None the result is treated as an array of similarities in this notebook (not a top-n list) — confirm against the gensim docs.
res = model.model.docvecs.most_similar(positive=wordvecs, topn=None)

In [13]:
# Smallest value in `res`.
minim = res.min()

In [9]:
%%time
# Edition to evaluate; selects both the topic range and the allowed document prefixes.
trec_no = 5

# save_runs appends to this file, so re-running the cell adds duplicate lines.
run_file = "/home/cds/Documents/Runs/run_top2vec_tfidf.txt"

# Collects the out-of-vocabulary keywords of all topics.
oov = list()

for topic_no in get_topic_list(trec_no):
    # Progress indicator: one line per topic.
    print(topic_no)
    keywords = topics[topic_no].split(" ")
    filtered_keywords, oov_keywords = check_keywords(keywords)
    oov.extend(oov_keywords)

    # TF-IDF-style scoring on the in-vocabulary terms.
    document_scores, document_ids = search_documents_by_tfidf(model, filtered_keywords, topn=10000)
    # Drop documents outside the edition's collections, then append to the run file.
    filtered_scores, filtered_ids = result_filter(document_scores, document_ids, trec_no)
    save_runs(filtered_ids, filtered_scores, topic_no, run_file)


251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
CPU times: user 4min 43s, sys: 1min 34s, total: 6min 17s
Wall time: 1min 48s


In [27]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
import numpy as np
import math

def search_documents_by_euclidean(top2vec, keywords, topn=10000):
    """Score documents by Euclidean distance to the combined keyword vector.

    The in-vocabulary keyword vectors are merged with
    ``top2vec._get_combined_vec`` and the distance from that vector to every
    document vector is used directly as the document's score.

    Every lookup goes through the ``top2vec`` argument (the previous version
    silently used the global ``model`` for most of them).

    NOTE(review): the score is the raw distance and the ``topn`` LARGEST
    scores are kept, so the documents farthest from the query are returned
    first. This ordering is left exactly as it was; confirm that it is
    intended before relying on the run for retrieval quality.

    Args:
        top2vec: Top2Vec model to search.
        keywords: Query keywords; words missing from the vocabulary are
            printed and skipped.
        topn: Number of documents to return.

    Returns:
        ``(sorted_scores, doc_ids)``: distances in descending order and the
        matching document ids.
    """
    docvecs = top2vec._get_document_vectors()
    vocab = top2vec.model.wv.vocab

    keywords_valid = list()
    for keyword in keywords:
        if keyword not in vocab:
            print(keyword)
        else:
            keywords_valid.append(keyword)
    keywords_vecs = top2vec._get_word_vectors(keywords_valid)

    combined_vec = top2vec._get_combined_vec(keywords_vecs, [])

    # euclidean_distances returns one row per document and a single column.
    distances = euclidean_distances(docvecs, combined_vec.reshape(1, -1))
    scores = distances[:, 0]

    indexes = np.flip(np.argsort(scores)[-topn:])
    sorted_scores = scores[indexes]
    doc_ids = top2vec._get_document_ids(indexes)
    return sorted_scores, doc_ids

In [24]:
# Try the Euclidean search on a two-word query, keeping 100 documents.
ss, dd = search_documents_by_euclidean(model, ["disorder", "syndrome"], 100)
# Smallest distance among the returned documents.
print(np.min(ss))

0.4039933073197139


In [28]:
%%time
# Edition to evaluate; selects both the topic range and the allowed document prefixes.
trec_no = 5

# save_runs appends to this file, so re-running the cell adds duplicate lines.
run_file = "/home/cds/Documents/Runs/run_top2vec_euclidean_increase.txt"

# Collects the out-of-vocabulary keywords of all topics.
oov = list()

for topic_no in get_topic_list(trec_no):
    # Progress indicator: one line per topic.
    print(topic_no)
    keywords = topics[topic_no].split(" ")
    filtered_keywords, oov_keywords = check_keywords(keywords)
    oov.extend(oov_keywords)

    # Euclidean-distance scoring on the in-vocabulary terms.
    document_scores, document_ids = search_documents_by_euclidean(model, filtered_keywords, topn=10000)
    # Drop documents outside the edition's collections, then append to the run file.
    filtered_scores, filtered_ids = result_filter(document_scores, document_ids, trec_no)
    save_runs(filtered_ids, filtered_scores, topic_no, run_file)

251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
CPU times: user 2min 31s, sys: 34.9 s, total: 3min 6s
Wall time: 1min 15s


In [10]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
import numpy as np
import math

def search_documents_by_queryexp(top2vec, keywords, topn=10000):
    """Search documents with the query expanded by topic vectors.

    The query consists of the vectors of all in-vocabulary keywords plus, for
    each of those keywords, the vector of the topic closest to it. The
    expanded set is passed as ``positive`` to ``docvecs.most_similar``.

    Every lookup goes through the ``top2vec`` argument (the previous version
    silently used the global ``model`` for most of them).

    Args:
        top2vec: Top2Vec model to search.
        keywords: Query keywords; words missing from the vocabulary are
            printed and skipped.
        topn: Number of documents requested from ``most_similar``.

    Returns:
        ``(doc_scores, doc_ids)``: similarities rounded to 6 decimals, in the
        order returned by ``most_similar``, and the matching document ids.
    """
    doc2vec = top2vec.model
    vocab = doc2vec.wv.vocab

    keywords_valid = list()
    for keyword in keywords:
        if keyword not in vocab:
            print(keyword)
        else:
            keywords_valid.append(keyword)
    keywords_vecs = top2vec._get_word_vectors(keywords_valid)

    # Start from the keyword vectors themselves ...
    candidate_vecs = list()
    candidate_vecs.extend(keywords_vecs)
    # ... and add the nearest topic vector of every keyword.
    for word_vec in keywords_vecs:
        most_sim_top_nums, _ = top2vec._search_vectors_by_vector(top2vec.topic_vectors, word_vec, num_res=1)
        most_sim_top_vec = top2vec.topic_vectors[most_sim_top_nums]
        candidate_vecs.append(most_sim_top_vec[0])

    sim_docs = doc2vec.docvecs.most_similar(positive=candidate_vecs, topn=topn)
    doc_indexes = [doc[0] for doc in sim_docs]
    doc_scores = np.array([round(doc[1], 6) for doc in sim_docs])

    return doc_scores, top2vec._get_document_ids(doc_indexes)

In [11]:
%%time
# Edition to evaluate; selects both the topic range and the allowed document prefixes.
trec_no = 5

# save_runs appends to this file, so re-running the cell adds duplicate lines.
run_file = "/home/cds/Documents/Runs/run_top2vec_queryexp_by_topicvecs.txt"

# Collects the out-of-vocabulary keywords of all topics.
oov = list()

for topic_no in get_topic_list(trec_no):
    # Progress indicator: one line per topic.
    print(topic_no)
    keywords = topics[topic_no].split(" ")
    filtered_keywords, oov_keywords = check_keywords(keywords)
    oov.extend(oov_keywords)

    # Query expansion with nearest topic vectors on the in-vocabulary terms.
    document_scores, document_ids = search_documents_by_queryexp(model, filtered_keywords, topn=10000)
    # Drop documents outside the edition's collections, then append to the run file.
    filtered_scores, filtered_ids = result_filter(document_scores, document_ids, trec_no)
    save_runs(filtered_ids, filtered_scores, topic_no, run_file)


251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
CPU times: user 1min 7s, sys: 32.3 s, total: 1min 39s
Wall time: 10.6 s


In [45]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
import numpy as np
import math

def search_documents_by_cluster(top2vec, keywords, topn=10000, num_topics=50):
    """Rank only the documents that belong to the topics nearest to the query.

    The in-vocabulary keyword vectors are merged with
    ``top2vec._get_combined_vec``; the ``num_topics`` topics closest to that
    vector are selected and every document assigned to one of them
    (``top2vec.doc_top``) becomes a candidate. Candidates are ranked by their
    similarity to the keyword vectors as returned by
    ``docvecs.most_similar(..., topn=None)``.

    Changes from the previous version: every lookup goes through the
    ``top2vec`` argument instead of the global ``model``; ``topn`` is now
    honoured (it used to be accepted but ignored, so the result could be
    arbitrarily long); the number of topics is a parameter instead of the
    hard-coded 50.

    Args:
        top2vec: Top2Vec model to search.
        keywords: Query keywords; words missing from the vocabulary are
            printed and skipped.
        topn: Maximum number of documents to return; ``None`` returns every
            candidate.
        num_topics: Number of nearest topics whose documents are considered.

    Returns:
        ``(sorted_scores, doc_ids)``: similarities rounded to 6 decimals in
        descending order, and the matching document ids.
    """
    doc2vec = top2vec.model
    vocab = doc2vec.wv.vocab

    keywords_valid = list()
    for keyword in keywords:
        if keyword not in vocab:
            print(keyword)
        else:
            keywords_valid.append(keyword)
    keywords_vecs = top2vec._get_word_vectors(keywords_valid)

    combined_vec = top2vec._get_combined_vec(keywords_vecs, [])

    most_sim_top_nums, _ = top2vec._search_vectors_by_vector(top2vec.topic_vectors, combined_vec, num_res=num_topics)

    # Similarity of every document to the keyword vectors.
    scores = doc2vec.docvecs.most_similar(positive=keywords_vecs, topn=None)

    # Document positions grouped by topic, nearest topic first.
    per_topic = [np.where(top2vec.doc_top == topnum)[0] for topnum in most_sim_top_nums]
    if per_topic:
        cand_docs_nums = np.concatenate(per_topic)
    else:
        cand_docs_nums = np.array([], dtype=int)

    cand_scores = np.asarray(scores)[cand_docs_nums]

    # Best candidates first, truncated to the requested length.
    indexes = np.flip(np.argsort(cand_scores))[:topn]
    sorted_scores = np.array([round(cand_scores[rank], 6) for rank in indexes])

    num_indexes = cand_docs_nums[indexes]

    doc_ids = top2vec._get_document_ids(num_indexes)
    return sorted_scores, doc_ids

In [46]:
%%time
# Edition to evaluate; selects both the topic range and the allowed document prefixes.
trec_no = 5

# save_runs appends to this file, so re-running the cell adds duplicate lines.
run_file = "/home/cds/Documents/Runs/run_top2vec_clusters.txt"

# Collects the out-of-vocabulary keywords of all topics.
oov = list()

for topic_no in get_topic_list(trec_no):
    # Progress indicator: one line per topic.
    print(topic_no)
    keywords = topics[topic_no].split(" ")
    filtered_keywords, oov_keywords = check_keywords(keywords)
    oov.extend(oov_keywords)

    # Cluster-restricted scoring on the in-vocabulary terms.
    document_scores, document_ids = search_documents_by_cluster(model, filtered_keywords, topn=10000)
    # Drop documents outside the edition's collections, then append to the run file.
    filtered_scores, filtered_ids = result_filter(document_scores, document_ids, trec_no)
    save_runs(filtered_ids, filtered_scores, topic_no, run_file)

251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
CPU times: user 1min 6s, sys: 26.5 s, total: 1min 32s
Wall time: 12.9 s


In [36]:
# Scratch check of what np.where returns for a 1-D boolean condition.
a = np.array([1,2,3,4,5, 3,3,3,3])
# b is a one-element tuple holding the index array (see the printed output).
b= np.where(a == 3)
print(b)
print(a)

c = list()
# Extending with the tuple adds the index array as a single element, not the individual indices.
c.extend(b)

(array([2, 5, 6, 7, 8]),)
[1 2 3 4 5 3 3 3 3]
