In [1]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

# Train Doc2Vec model on the articles dataset
def train_doc2vec_model(documents, vector_size=50, min_count=2, epochs=40, **doc2vec_kwargs):
    """Train a gensim Doc2Vec model on raw text documents.

    Each document is lower-cased, tokenized with ``word_tokenize`` and tagged
    with its position in ``documents`` rendered as a string ("0", "1", ...).

    Args:
        documents: Iterable of strings, one per document.
        vector_size: Passed to ``Doc2Vec`` as ``vector_size``.
        min_count: Passed to ``Doc2Vec`` as ``min_count``.
        epochs: Passed to ``Doc2Vec`` as ``epochs``; the same value is used
            for the training call.
        **doc2vec_kwargs: Any further keyword arguments are forwarded
            unchanged to ``Doc2Vec`` (e.g. ``seed`` / ``workers`` when a
            repeatable run is wanted).

    Returns:
        The trained ``Doc2Vec`` model.
    """
    # Materialize the corpus as a list: it is iterated once to build the
    # vocabulary and again for training.
    tagged_data = [
        TaggedDocument(words=word_tokenize(doc.lower()), tags=[str(i)])
        for i, doc in enumerate(documents)
    ]
    model = Doc2Vec(
        vector_size=vector_size,
        min_count=min_count,
        epochs=epochs,
        **doc2vec_kwargs,
    )
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    return model

# Calculate Doc2Vec similarity
def doc2vec_similarity(doc2vec_model, main_articles, articles):
    """Compute cosine similarities between main articles and dataset articles.

    Every text is lower-cased, tokenized with ``word_tokenize`` and embedded
    with ``doc2vec_model.infer_vector``.

    Args:
        doc2vec_model: Object exposing ``infer_vector(list_of_tokens)``.
        main_articles: Iterable of strings; one result row per entry.
        articles: Iterable of strings; one result column per entry.

    Returns:
        A 2-D NumPy array of shape ``(len(main_articles), len(articles))``
        where element ``[i, j]`` is the cosine similarity between main
        article ``i`` and dataset article ``j``. If either input is empty the
        result is an empty array that still has that 2-D shape.
    """
    def _embed(text):
        return doc2vec_model.infer_vector(word_tokenize(text.lower()))

    # Keep the inference order of the original implementation (dataset
    # articles first, then main articles) in case inference depends on the
    # model's internal random state.
    article_vectors = [_embed(article) for article in articles]
    main_vectors = [_embed(main_article) for main_article in main_articles]

    # cosine_similarity cannot be called with zero samples on either side;
    # return a correctly shaped empty matrix instead of failing or collapsing
    # to a 1-D array.
    if not main_vectors or not article_vectors:
        return np.zeros((len(main_vectors), len(article_vectors)))

    # One batched call yields the full (n_main, n_articles) matrix.
    return np.asarray(cosine_similarity(main_vectors, article_vectors))

# Load datasets and prepare texts
articles_df = pd.read_csv('articles_dataset.csv')
main_articles_df = pd.read_csv('main_articles.csv')

# read_csv turns empty cells into NaN (a float), which would crash on the
# `.lower()` call during tokenization. Replace missing values with empty
# strings (rather than dropping rows) so list positions keep matching the
# DataFrame rows.
article_texts = articles_df['content'].fillna('').astype(str).tolist()
main_article_texts = main_articles_df['content'].fillna('').astype(str).tolist()

# Train the Doc2Vec model
doc2vec_model = train_doc2vec_model(article_texts)

# Calculate similarities (rows: main articles, columns: dataset articles)
doc2vec_results = doc2vec_similarity(doc2vec_model, main_article_texts, article_texts)

# Print results for debugging
print("Doc2Vec Similarity Results:")
print(doc2vec_results)


Doc2Vec Similarity Results:
[[0.9992943  0.9978086  0.99901503 0.9969616  0.99825    0.9879755
  0.9882103  0.9905945  0.98555416 0.9868718  0.9729185  0.97818434
  0.9836251  0.97857493 0.98245525 0.9966964  0.9929135  0.99635804
  0.9947279  0.9868724  0.99167717 0.98907524 0.991978   0.9914845
  0.99201125 0.9965451  0.98692256 0.988443   0.9888123  0.99362165]
 [0.9904594  0.9827363  0.9868343  0.9792856  0.9900706  0.9991852
  0.99528784 0.9968176  0.9963294  0.9992756  0.9808547  0.9848486
  0.988054   0.9857073  0.98626906 0.9909829  0.9933637  0.9920384
  0.99562556 0.99339867 0.99681747 0.99305415 0.9969336  0.9959613
  0.9949959  0.98581207 0.9931747  0.99238133 0.99242735 0.9918972 ]
 [0.9807769  0.96946317 0.9736219  0.9669678  0.9809723  0.98463047
  0.98044765 0.98260623 0.98439103 0.9883889  0.99849486 0.99616444
  0.9969496  0.9985168  0.9948186  0.98190725 0.9804834  0.98151654
  0.9858149  0.9808108  0.9831131  0.9821359  0.9844606  0.9869005
  0.9870114  0.97455454 0