In [3]:
pip install tensorflow tensorflow_hub torch transformers nltk gensim scikit-learn


In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import tensorflow_hub as hub
from transformers import BertTokenizer, BertModel
import torch

# Download stopwords and tokenizer models if not available
import nltk
nltk.download('punkt')
nltk.download('stopwords')




  torch.utils._pytree._register_pytree_node(
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kaavy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kaavy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [5]:
# Load datasets
articles_df = pd.read_csv('articles_dataset.csv')
main_articles_df = pd.read_csv('main_articles.csv')


### JACCARD SIMILARITY 
##### The Jaccard similarity compares the overlap of unique words between two documents. 

In [6]:
def jaccard_similarity(doc1, doc2):
    tokens1 = set(word_tokenize(doc1.lower()))
    tokens2 = set(word_tokenize(doc2.lower()))
    intersection = tokens1.intersection(tokens2)
    union = tokens1.union(tokens2)
    return len(intersection) / len(union) if union else 0


### TF-IDF Cosine Similarity
##### TF-IDF (Term Frequency-Inverse Document Frequency) represents text as weighted terms based on their frequency across documents.

In [7]:
# Calculate TF-IDF cosine similarity
def tfidf_cosine_similarity(articles, main_articles):
    documents = articles + main_articles
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    similarity_matrix = cosine_similarity(tfidf_matrix[len(articles):], tfidf_matrix[:len(articles)])
    return similarity_matrix


In [14]:
def train_doc2vec_model(documents):
    tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()), tags=[str(i)]) for i, doc in enumerate(documents)]
    model = Doc2Vec(vector_size=50, min_count=2, epochs=40)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    return model

# Calculate Doc2Vec similarity
def doc2vec_similarity(doc2vec_model, main_articles, articles):
    similarities = []
    
    # Get the vectors for the articles in the dataset
    article_vectors = [doc2vec_model.infer_vector(word_tokenize(article.lower())) for article in articles]

    for main_article in main_articles:
        # Infer vector for the main article
        main_vector = doc2vec_model.infer_vector(word_tokenize(main_article.lower()))
        
        # Calculate cosine similarity between the main article vector and each article vector
        sims = cosine_similarity([main_vector], article_vectors)[0]
        similarities.append(sims)
    
    return np.array(similarities)

In [9]:
# Load USE model
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Calculate USE similarity
def use_similarity(use_model, main_articles, articles):
    embeddings = use_model(articles + main_articles)
    article_embeddings = embeddings[:len(articles)]
    main_article_embeddings = embeddings[len(articles):]
    similarity_matrix = cosine_similarity(main_article_embeddings, article_embeddings)
    return similarity_matrix














In [10]:
# Load BERT model and tokenizer
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Function to get BERT embeddings
def bert_embedding(text):
    inputs = bert_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = bert_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings

# Calculate BERT similarity
def bert_cosine_similarity(main_articles, articles):
    article_embeddings = torch.stack([bert_embedding(doc) for doc in articles])
    main_article_embeddings = torch.stack([bert_embedding(doc) for doc in main_articles])
    similarity_matrix = cosine_similarity(main_article_embeddings.detach().numpy(), article_embeddings.detach().numpy())
    return similarity_matrix


Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

In [37]:
# Prepare documents
article_texts = articles_df['content'].tolist()
main_article_texts = main_articles_df['content'].tolist()

# Calculate similarities
# Jaccard Similarity
jaccard_results = [[jaccard_similarity(main_article, article) for article in article_texts] for main_article in main_article_texts]
print("JACCARD\n",jaccard_results)

most_relevant_indices_jaccard = np.argmax(jaccard_results, axis=1)  # Get the index of the highest Jaccard score for each main article

# Now, print the most relevant articles
for i, index in enumerate(most_relevant_indices_jaccard):
    print(f'Most relevant article to main article {i + 1}: Article ID {articles_df["article_id"].iloc[index]}, Score: {jaccard_results[i][index]}')


JACCARD
 [[0.32231404958677684, 0.17647058823529413, 0.234375, 0.24812030075187969, 0.17777777777777778, 0.09859154929577464, 0.06756756756756757, 0.07236842105263158, 0.0896551724137931, 0.08904109589041095, 0.0763888888888889, 0.10714285714285714, 0.10869565217391304, 0.0851063829787234, 0.09859154929577464, 0.1323529411764706, 0.13138686131386862, 0.11267605633802817, 0.07042253521126761, 0.06756756756756757, 0.0763888888888889, 0.09523809523809523, 0.06944444444444445, 0.05517241379310345, 0.08450704225352113, 0.1450381679389313, 0.07801418439716312, 0.06944444444444445, 0.06993006993006994, 0.14492753623188406], [0.08333333333333333, 0.10638297872340426, 0.08450704225352113, 0.08, 0.06896551724137931, 0.35714285714285715, 0.14925373134328357, 0.14388489208633093, 0.16666666666666666, 0.34782608695652173, 0.08633093525179857, 0.10218978102189781, 0.1037037037037037, 0.09558823529411764, 0.10144927536231885, 0.08695652173913043, 0.07857142857142857, 0.06206896551724138, 0.1044776119

In [36]:
# TF-IDF Cosine Similarity
tfidf_results = tfidf_cosine_similarity(article_texts, main_article_texts)
print("tf-IDF\n", tfidf_results)

most_relevant_indices_tfidf = np.argmax(tfidf_results, axis=1)  # Get the index of the highest TF-IDF score for each main article

# Now, print the most relevant articles
for i, index in enumerate(most_relevant_indices_tfidf):
    print(f'Most relevant article to main article {i + 1}: Article ID {articles_df["article_id"].iloc[index]}, Score: {tfidf_results[i][index]}')



tf-IDF
 [[0.4688665  0.24893798 0.25074102 0.24238164 0.18279797 0.00965629
  0.00821295 0.02451711 0.0142275  0.         0.         0.
  0.01308215 0.         0.00487419 0.10231106 0.0723798  0.03936148
  0.00446883 0.         0.02044007 0.05737888 0.02312622 0.
  0.0727262  0.16288353 0.02875954 0.00717856 0.         0.05234328]
 [0.01361154 0.0146547  0.00857552 0.00513095 0.         0.41302784
  0.13265192 0.11786258 0.12705544 0.41613266 0.         0.
  0.0162036  0.00640428 0.005092   0.01545356 0.0050568  0.01278371
  0.02402345 0.02765484 0.07540858 0.02738852 0.09523189 0.05264441
  0.02048835 0.005362   0.0415268  0.01660042 0.00518938 0.01222151]
 [0.         0.         0.         0.         0.         0.0280058
  0.         0.         0.         0.01380417 0.58234194 0.12446177
  0.1162371  0.16678141 0.2012572  0.01312809 0.         0.
  0.         0.01130896 0.         0.00747022 0.         0.04411599
  0.03419052 0.         0.         0.00781984 0.02259741 0.02344958]]
M

In [35]:
# Doc2Vec Similarity
doc2vec_model = train_doc2vec_model(article_texts)
doc2vec_results = doc2vec_similarity(doc2vec_model, main_article_texts, article_texts)
print("DOC2VEC\n",doc2vec_results)

most_similar_indices_doc2vec = np.argmax(doc2vec_results, axis=1)  # Get the index of the highest similarity score for each main article

# Now, print the most similar articles
for i, index in enumerate(most_similar_indices_doc2vec):
    print(f'Most similar article to main article {i + 1}: Article ID {articles_df["article_id"].iloc[index]}, Score: {doc2vec_results[i][index]}')


DOC2VEC
 [[0.9995862  0.99775237 0.9989814  0.9970363  0.998332   0.9879494
  0.9884095  0.9915683  0.9869577  0.9872266  0.9751868  0.9781804
  0.984184   0.97782177 0.9827848  0.9970736  0.99291795 0.99702764
  0.9944109  0.98695755 0.9920814  0.99057436 0.9918618  0.9920186
  0.993199   0.9963825  0.98819846 0.9895997  0.9892794  0.9941271 ]
 [0.990297   0.9830488  0.98728025 0.9807074  0.99015033 0.9989609
  0.9954284  0.99689007 0.9969426  0.9991759  0.98153555 0.98524094
  0.9884725  0.98435754 0.9868573  0.9910061  0.99344736 0.9925432
  0.9960423  0.9932451  0.9966734  0.99286985 0.99690986 0.9957485
  0.99477947 0.9854026  0.9937084  0.9919193  0.9920916  0.99210066]
 [0.98014253 0.96918756 0.97383493 0.9669619  0.98083764 0.9836617
  0.9804289  0.9823265  0.9848832  0.98813766 0.9986988  0.99582475
  0.9969885  0.998369   0.9955824  0.98202294 0.9802786  0.98088086
  0.9859782  0.98045355 0.9823452  0.98111534 0.984378   0.98651236
  0.98646516 0.97211176 0.9780463  0.9814614

In [38]:
# Universal Sentence Encoder (USE) Similarity
use_results = use_similarity(use_model, main_article_texts, article_texts)
print("USE\n",use_results)

most_similar_indices = np.argmax(use_results, axis=1)  # Get the index of the highest similarity score for each main article

# Now, you can print the most similar articles
for i, index in enumerate(most_similar_indices):
    print(f'Most similar article to main article {i + 1}: Article ID {articles_df["article_id"].iloc[index]}, Score: {use_results[i][index]}')

USE
 [[0.86069465 0.76420885 0.740463   0.76352227 0.7595467  0.19063026
  0.21240547 0.1879184  0.2070472  0.2100079  0.14990816 0.16397636
  0.17742212 0.14600855 0.17950413 0.6065383  0.36086747 0.48561922
  0.05517561 0.21453068 0.22229032 0.32367644 0.22682057 0.19416277
  0.19770594 0.65311444 0.26242983 0.26345915 0.24115716 0.2938951 ]
 [0.22951713 0.19375473 0.17456728 0.1695176  0.18252458 0.8090339
  0.63131773 0.6268543  0.6458459  0.8375203  0.09526568 0.1528458
  0.15937665 0.08314531 0.19652267 0.10925554 0.20254564 0.1765585
  0.36833516 0.4398242  0.5382788  0.2800901  0.4004066  0.16178066
  0.27254206 0.10683735 0.3967726  0.34346431 0.21737628 0.21110648]
 [0.13338564 0.15166816 0.10684954 0.09623087 0.15250912 0.17563823
  0.11751913 0.09462082 0.12815009 0.09429544 0.91972935 0.8196043
  0.7525368  0.84020984 0.7285485  0.14224401 0.09994093 0.1745681
  0.1474084  0.10642788 0.11604126 0.26533926 0.08276558 0.28898537
  0.24927774 0.13183558 0.08518702 0.16668099 

In [33]:
# BERT Cosine Similarity
bert_results = bert_cosine_similarity(main_article_texts, article_texts)
print("BERT\n", bert_results)

# Assuming `bert_results` is your 2D array of BERT similarity scores
most_similar_indices = np.argmax(bert_results, axis=1)  # Get the index of the highest similarity score for each main article

# Now, you can print the most similar articles
for i, index in enumerate(most_similar_indices):
    print(f'Most similar article to main article {i + 1}: Article ID {articles_df["article_id"].iloc[index]}, Score: {bert_results[i][index]}')


BERT
 [[0.9734473  0.952684   0.9555909  0.93727356 0.9566747  0.7709882
  0.7754092  0.7811339  0.76857364 0.77283347 0.70799327 0.7506634
  0.74585706 0.74036765 0.74729294 0.9285645  0.82276434 0.881871
  0.7599436  0.78573954 0.7939458  0.8056215  0.7691313  0.79088104
  0.77249223 0.9178061  0.78859925 0.790879   0.7649845  0.7915998 ]
 [0.7437817  0.75796056 0.7642921  0.7440599  0.76163083 0.9705292
  0.9090331  0.9000414  0.9213861  0.9762016  0.7652032  0.77109146
  0.78228766 0.7657695  0.79718626 0.7354659  0.83528644 0.7883999
  0.83507013 0.8684075  0.86954373 0.8121923  0.8663658  0.8247652
  0.80878794 0.7150284  0.80993736 0.8401869  0.81717443 0.7820946 ]
 [0.6844796  0.6867629  0.6980556  0.6723521  0.70423406 0.75020003
  0.70057404 0.7042655  0.7130294  0.74019855 0.9811785  0.9295438
  0.9017242  0.95280284 0.8681864  0.687739   0.71629876 0.69612914
  0.6955874  0.6975173  0.70476335 0.6943746  0.71442294 0.7603599
  0.7312821  0.6888039  0.6989481  0.70684314 0.7

In [45]:
article_ids = articles_df['article_id'].tolist() 

comparison_data = {
    'Most Similar (Jaccard)': [],
    'Most Similar (TF-IDF)': [],
    'Most Similar (Doc2Vec)': [],
    'Most Similar (USE)': [],
    'Most Similar (BERT)': []
}

# Example for processing the results (ensure that the matrices and lengths match your data)
for i in range(3):  # Assuming you have three main articles
    most_relevant_indices_jaccard = np.argmax(jaccard_results[i])  # Get the index of the highest score
    most_relevant_indices_tfidf = np.argmax(tfidf_results[i])
    most_relevant_indices_doc2vec = np.argmax(doc2vec_results[i])
    most_relevant_indices_use = np.argmax(use_results[i])
    most_relevant_indices_bert = np.argmax(bert_results[i])

    comparison_data['Most Similar (Jaccard)'].append(article_ids[most_relevant_indices_jaccard])
    comparison_data['Most Similar (TF-IDF)'].append(article_ids[most_relevant_indices_tfidf])
    comparison_data['Most Similar (Doc2Vec)'].append(article_ids[most_relevant_indices_doc2vec])
    comparison_data['Most Similar (USE)'].append(article_ids[most_relevant_indices_use])
    comparison_data['Most Similar (BERT)'].append(article_ids[most_relevant_indices_bert])

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df)


   Most Similar (Jaccard)  Most Similar (TF-IDF)  Most Similar (Doc2Vec)  \
0                       1                      1                       1   
1                       6                     10                      10   
2                      11                     11                      11   

   Most Similar (USE)  Most Similar (BERT)  
0                   1                    1  
1                  10                   10  
2                  11                   11  


In [47]:
def calculate_overlap(set1, set2):
    return len(set1.intersection(set2))

# Assuming you have the results stored in sets
overlaps = {}
for algo1, algo2 in [('Jaccard', 'TF-IDF'), ('Jaccard', 'Doc2Vec'), ('Jaccard', 'USE'), ('Jaccard', 'BERT'), 
                     ('TF-IDF', 'Doc2Vec'), ('TF-IDF', 'USE'), ('TF-IDF', 'BERT'), 
                     ('Doc2Vec', 'USE'), ('Doc2Vec', 'BERT'), ('USE', 'BERT')]:
    
    overlaps['jaccard-tfidf'] = calculate_overlap(set(most_relevant_indices_jaccard), 
                                                      set(most_relevant_indices_tfidf)) # Adjust for each pair

print(overlaps)


TypeError: 'numpy.int64' object is not iterable