### Task 0. Train your own Doc2Vec model on a test dataset. Most of the example files use the Parquet file format; a short guide follows below.

In [27]:
import gensim
import gensim.test.utils
import pandas as pd
from gensim.models.doc2vec import TaggedDocument

In [31]:
def read_corpus_from_parquet(parquet_file, text_column="text", tokens_only=False):
    """Stream a tokenised corpus out of one text column of a Parquet file.

    Each row of ``text_column`` is converted to ``str`` and tokenised with
    ``gensim.utils.simple_preprocess``. Rows whose value is missing
    (``None``/``NaN``) produce an empty token list instead of being
    stringified into the bogus tokens ``"none"``/``"nan"``; they are still
    yielded so that tags keep matching the row position.

    Args:
        parquet_file: Source passed straight to ``pd.read_parquet``.
        text_column: Name of the column that holds the raw text.
        tokens_only: If true, yield plain token lists; otherwise yield
            ``TaggedDocument`` objects tagged with the row's position.

    Yields:
        A list of tokens, or a ``TaggedDocument(tokens, [i])`` where ``i`` is
        the zero-based position of the row in the file.
    """
    df = pd.read_parquet(parquet_file)
    texts = df[text_column]
    # Pair every value with its null flag so missing rows can be recognised
    # before str() would turn them into ordinary-looking words.
    for i, (text, is_missing) in enumerate(zip(texts, texts.isna())):
        tokens = [] if is_missing else gensim.utils.simple_preprocess(str(text))
        if tokens_only:
            yield tokens
        else:
            yield TaggedDocument(tokens, [i])

# Parquet chunks: one to fit the model on, one to draw held-out query documents from.
train_parquet = 'train-DataEntity_chunk_15.parquet'
test_parquet = 'train-DataEntity_chunk_1.parquet'

# Training documents carry a row-index tag; held-out documents stay as bare token lists.
train_corpus = [doc for doc in read_corpus_from_parquet(train_parquet)]
test_corpus = [tokens for tokens in read_corpus_from_parquet(test_parquet, tokens_only=True)]

# Sanity check: show the third entry of each corpus, one per line.
print(train_corpus[2], test_corpus[2], sep="\n")


TaggedDocument<['while', 'you', 're', 'enjoying', 'delicious', 'meal', 'thousands', 'of', 'disaster', 'victims', 'children', 'amp', 'refugees', 'suffer', 'from', 'hunger', 'and', 'extreme', 'climate', 'chill', 'heat', 'challenge', 'you', 'to', 'donate', 'the', 'equivalent', 'cost', 'of', 'cup', 'of', 'coffee', 'to', 'unicef', 'victims', 'chill', 'heatasg'], [2]>
['rt', 'mmcrypto', 'usd', 'is', 'just', 'another', 'memecoin']


In [33]:
import gensim.models

# Untrained Doc2Vec model; vocabulary building and training happen in the following cells.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,  # dimensionality of the document vectors
    min_count=2,     # minimum word frequency for a word to be kept
    epochs=40,       # number of passes over the corpus during training
)

In [35]:
# Let the model scan the training corpus and set up its vocabulary before any training is run.
model.build_vocab(train_corpus)

In [37]:
# Train on the tagged corpus; the example count and the number of passes are read back
# from the model itself (set by build_vocab and by the constructor's `epochs` argument).
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [39]:
# Display the model's `dv` attribute: the KeyedVectors object that the later cells
# query with `most_similar` to find training documents close to an inferred vector.
model.dv

<gensim.models.keyedvectors.KeyedVectors at 0x186530bd9c0>

In [41]:
# Infer an embedding for a short, already-tokenised sentence and show it.
sample_tokens = ['only', 'you', 'can', 'prevent', 'forest', 'fires']
vector = model.infer_vector(sample_tokens)
print(vector)

[-0.01817703  0.02002949  0.03789958 -0.03567206 -0.01733046 -0.14292629
  0.15115239  0.19528143 -0.10090808 -0.11357246 -0.00113263 -0.03175327
  0.07791322  0.08267552  0.12688895  0.03368278 -0.00714415 -0.07524869
 -0.1872876  -0.02960718 -0.01637765 -0.01885884  0.21894124 -0.07404923
  0.02480913 -0.11801729 -0.02311789  0.05502791 -0.17846859 -0.19118147
 -0.04015056  0.04998495 -0.29385495  0.13571197 -0.01317876  0.2122886
  0.12050559 -0.03167946 -0.02401617  0.07572369  0.15291314  0.04277322
 -0.07160015  0.07232527  0.17524458  0.02997306 -0.28075707 -0.02115787
  0.00558015  0.06790783]


In [43]:
import collections

# Self-similarity check: re-infer a vector for every training document and record
# where that document's own tag lands in the similarity ordering.
ranks = []
second_ranks = []
for doc_id, doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

    # Rank 0 means the document is the closest match to its own inferred vector.
    ranked_tags = [tag for tag, _ in sims]
    rank = ranked_tags.index(doc_id)
    ranks.append(rank)

    # Also keep the entry in second place of the ordering, as a (tag, similarity) pair.
    second_ranks.append(sims[1])

# Histogram of ranks: how many documents were found at each position.
counter = collections.Counter(ranks)
print(counter)

Counter({0: 1214, 1: 210, 2: 97, 3: 69, 4: 42, 5: 41, 7: 26, 6: 19, 8: 12, 9: 12, 16: 10, 10: 10, 14: 9, 13: 9, 12: 8, 11: 7, 18: 6, 26: 6, 21: 6, 17: 6, 19: 5, 20: 4, 25: 4, 38: 4, 29: 3, 48: 3, 15: 3, 24: 3, 33: 3, 46: 2, 30: 2, 23: 2, 60: 2, 83: 2, 35: 2, 34: 2, 511: 2, 27: 2, 124: 2, 144: 2, 164: 2, 31: 2, 193: 2, 89: 2, 107: 2, 95: 2, 113: 2, 73: 2, 39: 2, 22: 2, 1291: 1, 905: 1, 50: 1, 68: 1, 541: 1, 1201: 1, 368: 1, 1831: 1, 98: 1, 101: 1, 533: 1, 492: 1, 1050: 1, 1316: 1, 1112: 1, 1243: 1, 147: 1, 1508: 1, 350: 1, 182: 1, 478: 1, 787: 1, 434: 1, 540: 1, 526: 1, 416: 1, 117: 1, 158: 1, 116: 1, 407: 1, 142: 1, 523: 1, 240: 1, 901: 1, 40: 1, 1293: 1, 100: 1, 1883: 1, 1929: 1, 1549: 1, 1714: 1, 1612: 1, 1574: 1, 1777: 1, 1596: 1, 1389: 1, 805: 1, 1334: 1, 296: 1, 379: 1, 503: 1, 1048: 1, 1019: 1, 213: 1, 337: 1, 572: 1, 248: 1, 70: 1, 133: 1, 197: 1, 251: 1, 361: 1, 387: 1, 104: 1, 143: 1, 224: 1, 52: 1, 28: 1, 108: 1, 935: 1, 67: 1, 37: 1, 766: 1, 314: 1, 639: 1, 1829: 1, 1790: 1,

In [45]:
import random

# Draw one held-out document at random and embed it with the trained model.
doc_id = random.randint(0, len(test_corpus) - 1)
query_tokens = test_corpus[doc_id]
inferred_vector = model.infer_vector(query_tokens)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Show the query, then the best, middle and worst matches from the training corpus.
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(query_tokens)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
positions = {'MOST': 0, 'MEDIAN': len(sims)//2, 'LEAST': len(sims) - 1}
for label, index in positions.items():
    hit = sims[index]  # (training tag, similarity) pair at this position
    matched_words = train_corpus[hit[0]].words
    print(u'%s %s: «%s»\n' % (label, hit, ' '.join(matched_words)))

Test Document (16061): «seiyans unified liquidity is here openoceanglobal joins sei for the best swap prices in one place seinetwork defi»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>:

MOST (132, 0.7481477856636047): «irreverent vibes reserva por whatsapp laleche puertovallarta dinner food puertovallarta gastronomia fashion white juevesitos»

MEDIAN (1430, 0.5219685435295105): «every outfit is an opportunity to reinvent yourself buy now log on todownload the app now designerstyles outfit fashion churchsuits»

LEAST (1444, -0.42181989550590515): «ash that falls art fashion music»



In [49]:
# Draw a held-out document at random and look up its five nearest training documents.
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=5)

print(f"Test Document ({doc_id}): «{' '.join(test_corpus[doc_id])}»\n")

# Report each neighbour, guarding against tags that do not map to a training row.
print("Most Similar Documents:")
for sim_id, similarity in sims:
    try:
        sim_id = int(sim_id)  # tags are expected to be integer row positions
        if not (0 <= sim_id < len(train_corpus)):
            print(f"Document {sim_id} is out of bounds for train_corpus.\n")
            continue
        print(f"Document {sim_id} (Similarity: {similarity:.4f}): «{' '.join(train_corpus[sim_id].words)}»\n")
    except (ValueError, IndexError) as e:
        print(f"Error processing document {sim_id}: {e}\n")

Test Document (9542): «aliceusdt losing its support for more signals link in bio crypto_news aliceusdt portalusdt fkokiusdt darusdt magicusdt bybit ethusdt btcusdt btc btc binance»

Most Similar Documents:
Document 1707 (Similarity: 0.6293): «vote for trump gt btc to gt dog to»

Document 286 (Similarity: 0.5626): «join me for dice and slots with unlimited faucet grab treasure boxes to share btc bonus pool dice slots btc gaming onlinecasino crypto onlinegaming»

Document 310 (Similarity: 0.5259): «join me for dice and slots with unlimited faucet grab treasure boxes thare btc bonus pool dice slots btc gaming onlinecasino crypto onlinegaming»

Document 1386 (Similarity: 0.5253): «join me for dice and slots with unlimited faucet grab treasure boxes to share btc bonus pool dice slots btc gaming onlinecasino crypto onlinegaming»

Document 333 (Similarity: 0.5189): «join me for dice and slots with unlimited faucet grab treasure boxes to share btc bonus pool dice slots btc gaming onlinecasino 