In [None]:
pip install tensorflow tensorflow_hub torch transformers nltk gensim scikit-learn


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import tensorflow_hub as hub
from transformers import BertTokenizer, BertModel
import torch
import time

# Download the NLTK tokenizer models and stop-word corpus if not available
import nltk
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)

# Elapsed time of each similarity method, keyed by method name; filled in by
# the cells that run the individual methods.
execution_times = {}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load datasets: the candidate corpus and the "main" (query) articles.
articles_df = pd.read_csv('articles_dataset.csv')
main_articles_df = pd.read_csv('main_articles.csv')

# Strip stray whitespace from the CSV headers of BOTH frames, so that later
# column lookups such as ['content'] and ['article_id'] behave the same way
# for the corpus and for the main articles.
articles_df.columns = articles_df.columns.str.strip()
main_articles_df.columns = main_articles_df.columns.str.strip()


### JACCARD SIMILARITY
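##### Jaccard similarity measures the overlap of unique words between two documents: the size of their shared vocabulary divided by the size of their combined vocabulary, $|A \cap B| / |A \cup B|$.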

In [None]:
import nltk

# Download the 'punkt_tab' data package
# NOTE(review): presumably needed by word_tokenize on newer NLTK releases in
# addition to the 'punkt' package — confirm against the installed NLTK version.
nltk.download('punkt_tab')

def jaccard_similarity(doc1, doc2):
    """Return the Jaccard index of the two documents' token sets.

    Each document is lower-cased and tokenised with ``word_tokenize``; the
    score is ``|A & B| / |A | B|`` over the resulting sets of unique tokens.

    Args:
        doc1: First document (string).
        doc2: Second document (string).

    Returns:
        The ratio of shared tokens to all distinct tokens, or ``0`` when
        neither document yields any token.
    """
    vocab_a, vocab_b = (set(word_tokenize(text.lower())) for text in (doc1, doc2))
    combined = vocab_a | vocab_b
    if not combined:
        return 0
    return len(vocab_a & vocab_b) / len(combined)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


### TF-IDF Cosine Similarity
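##### TF-IDF (Term Frequency-Inverse Document Frequency) weights each term by how often it occurs in a document, discounted by how many documents in the corpus contain it. Documents are then compared by the cosine of the angle between their TF-IDF vectors.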

In [None]:
# Calculate TF-IDF cosine similarity
def tfidf_cosine_similarity(articles, main_articles):
    """Cosine similarity between main articles and articles in TF-IDF space.

    A single ``TfidfVectorizer`` (English stop words removed) is fitted on the
    articles followed by the main articles, so both groups share one
    vocabulary and one set of IDF weights.

    Args:
        articles: Iterable of candidate document strings.
        main_articles: Iterable of query document strings.

    Returns:
        The matrix returned by ``cosine_similarity`` with one row per main
        article and one column per article.
    """
    # Coerce to lists so `+` always concatenates; with tuples mixed with
    # lists, NumPy arrays or pandas Series it would raise or act element-wise.
    articles = list(articles)
    main_articles = list(main_articles)
    n_articles = len(articles)

    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(articles + main_articles)

    # Rows [0, n_articles) are the candidates; the remaining rows are the queries.
    return cosine_similarity(tfidf_matrix[n_articles:], tfidf_matrix[:n_articles])


In [None]:
def train_doc2vec_model(documents, vector_size=50, min_count=2, epochs=40, **doc2vec_kwargs):
    """Train a gensim ``Doc2Vec`` model on the given documents.

    Every document is lower-cased, tokenised with ``word_tokenize`` and tagged
    with its position in ``documents`` (as a string).

    Args:
        documents: Iterable of document strings.
        vector_size: Passed to ``Doc2Vec`` (default 50).
        min_count: Passed to ``Doc2Vec`` (default 2).
        epochs: Passed to ``Doc2Vec`` and used as the number of training
            epochs (default 40).
        **doc2vec_kwargs: Any further keyword arguments are forwarded to the
            ``Doc2Vec`` constructor unchanged.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    tagged_data = [
        TaggedDocument(words=word_tokenize(doc.lower()), tags=[str(i)])
        for i, doc in enumerate(documents)
    ]
    model = Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs, **doc2vec_kwargs)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    return model

# Calculate Doc2Vec similarity
def doc2vec_similarity(doc2vec_model, main_articles, articles):
    """Cosine similarity between inferred Doc2Vec vectors.

    Args:
        doc2vec_model: Model exposing ``infer_vector``.
        main_articles: Iterable of query document strings.
        articles: Iterable of candidate document strings.

    Returns:
        ``np.ndarray`` with one row per main article; each row holds the
        cosine similarity of that main article to every article.
    """
    def _embed(text):
        # Same preprocessing for candidates and queries: lower-case, tokenise, infer.
        return doc2vec_model.infer_vector(word_tokenize(text.lower()))

    # Candidate vectors are inferred once, up front, and reused for every query.
    candidate_vectors = [_embed(candidate) for candidate in articles]

    rows = [
        cosine_similarity([_embed(query)], candidate_vectors)[0]
        for query in main_articles
    ]
    return np.array(rows)

In [None]:
# Load the Universal Sentence Encoder (USE) model from TensorFlow Hub
import tensorflow_hub as hub

USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
use_model = hub.load(USE_MODEL_URL)

# Calculate USE similarity
def use_similarity(use_model, main_articles, articles):
    """Cosine similarity between main articles and articles using ``use_model``.

    All documents are embedded in a single call to ``use_model`` (articles
    first, then main articles) and the result is split back into two groups.

    Args:
        use_model: Callable mapping a list of strings to a sliceable
            collection of embeddings, one per input string.
        main_articles: Iterable of query document strings.
        articles: Iterable of candidate document strings.

    Returns:
        The matrix returned by ``cosine_similarity`` with one row per main
        article and one column per article.
    """
    # Coerce to lists so `+` always concatenates; with tuples mixed with
    # lists, NumPy arrays or pandas Series it would raise or act element-wise.
    articles = list(articles)
    main_articles = list(main_articles)
    n_articles = len(articles)

    embeddings = use_model(articles + main_articles)
    article_embeddings = embeddings[:n_articles]
    main_article_embeddings = embeddings[n_articles:]
    return cosine_similarity(main_article_embeddings, article_embeddings)


In [None]:
# Load the pretrained BERT encoder and its matching tokenizer
BERT_CHECKPOINT = "bert-base-uncased"
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# Function to get BERT embeddings
def bert_embedding(text):
    """Embed ``text`` by mean-pooling BERT's last hidden state.

    The text is tokenised with the module-level ``bert_tokenizer`` (padding
    and truncation enabled) and passed through the module-level
    ``bert_model``.

    Args:
        text: The document to embed.

    Returns:
        A ``torch.Tensor`` holding the mean of ``last_hidden_state`` over
        ``dim=1``, with singleton dimensions squeezed out. The tensor carries
        no autograd history.
    """
    inputs = bert_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: without no_grad() every call records an autograd graph,
    # which wastes memory and time for embeddings that are never back-propagated.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # NOTE(review): a plain mean also averages over padding positions if a
    # batch of texts is ever passed in — confirm only single texts are used.
    return outputs.last_hidden_state.mean(dim=1).squeeze()

# Calculate BERT similarity
def bert_cosine_similarity(main_articles, articles):
    """Cosine similarity between main articles and articles in BERT space.

    Every document is embedded individually with ``bert_embedding``; the
    embeddings of each group are stacked into one matrix before comparison.

    Args:
        main_articles: Iterable of query document strings.
        articles: Iterable of candidate document strings.

    Returns:
        The matrix returned by ``cosine_similarity`` with one row per main
        article and one column per article.
    """
    def _stack(texts):
        return torch.stack([bert_embedding(text) for text in texts])

    # Articles are embedded before the main articles.
    candidate_stack = _stack(articles)
    query_stack = _stack(main_articles)

    # Detach from autograd before handing the data to NumPy / scikit-learn.
    query_matrix = query_stack.detach().numpy()
    candidate_matrix = candidate_stack.detach().numpy()
    return cosine_similarity(query_matrix, candidate_matrix)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Normalise the column headers first, so the preview below shows the same
# column names that the later cells index with.
articles_df.columns = articles_df.columns.str.strip()
print(articles_df.head())

   article_id                                              title  \
0           1                        Protect Veterans From Fraud   
1           2                             ‘It’s Green and Slimy’   
2           3  Meteor Showers in 2020 That Will Light Up Nigh...   
3           4           Sync your calendar with the solar system   
4           5  Rocket Launches, Trips to Mars and More 2020 S...   

                                             content  
0  Congress could do much more to protect America...  
1  Christina Iverson and Jeff Chen ring in the Ne...  
2  All year long, Earth passes through streams of...  
3  Never miss an eclipse, a meteor shower, a rock...  
4  A year full of highs and lows in space just en...  


In [None]:
import time

# Prepare documents
# Missing content becomes an empty string, so the similarity functions (which
# call .lower() on every document) never receive a NaN float.
article_texts = articles_df['content'].fillna('').astype(str).tolist()
main_article_texts = main_articles_df['content'].fillna('').astype(str).tolist()

# Calculate similarities
# Jaccard Similarity
start_time = time.perf_counter()

jaccard_results = [[jaccard_similarity(main_article, article) for article in article_texts] for main_article in main_article_texts]

# Stop the clock before printing: dumping the whole matrix is not part of the
# algorithm and would otherwise inflate the recorded time.
execution_times['Jaccard'] = time.perf_counter() - start_time
print("JACCARD\n",jaccard_results)

most_relevant_indices_jaccard = np.argmax(jaccard_results, axis=1)  # Get the index of the highest Jaccard score for each main article

# Now, print the most relevant articles
for i, index in enumerate(most_relevant_indices_jaccard):
    print(f'Most relevant article to main article {i + 1}: Article ID {articles_df["article_id"].iloc[index]}, Score: {jaccard_results[i][index]}')


JACCARD
 [[0.02040816326530612, 0.056818181818181816, 0.08080808080808081, 0.10416666666666667, 0.08163265306122448, 0.07, 0.04, 0.0594059405940594, 0.05263157894736842, 0.08163265306122448, 0.06315789473684211, 0.06796116504854369, 0.052083333333333336, 0.05102040816326531, 0.08080808080808081, 0.06315789473684211, 0.05154639175257732, 0.057692307692307696, 0.09375, 0.03260869565217391, 0.028846153846153848, 0.061224489795918366, 0.061224489795918366, 0.06451612903225806, 0.022727272727272728, 0.056818181818181816, 0.07, 0.08045977011494253, 0.03409090909090909, 0.08163265306122448, 0.10204081632653061, 0.0, 0.07142857142857142, 0.057692307692307696, 0.08163265306122448, 0.061855670103092786, 0.041237113402061855, 0.07291666666666667, 0.06666666666666667, 0.03260869565217391, 0.08, 0.06451612903225806, 0.09574468085106383, 0.04672897196261682, 0.04854368932038835, 0.06593406593406594, 0.0625, 0.07692307692307693, 0.05102040816326531, 0.10204081632653061, 0.02247191011235955, 0.0873786

In [None]:
# TF-IDF Cosine Similarity
start_time = time.perf_counter()

tfidf_results = tfidf_cosine_similarity(article_texts, main_article_texts)

# Stop the clock before printing, so the recorded time covers the computation
# only and not the rendering of the full matrix.
execution_times['TF-IDF'] = time.perf_counter() - start_time
print("tf-IDF\n", tfidf_results)


most_relevant_indices_tfidf = np.argmax(tfidf_results, axis=1)  # Get the index of the highest TF-IDF score for each main article

# Now, print the most relevant articles
for i, index in enumerate(most_relevant_indices_tfidf):
    print(f'Most relevant article to main article {i + 1}: Article ID {articles_df["article_id"].iloc[index]}, Score: {tfidf_results[i][index]}')



tf-IDF
 [[0.         0.0173833  0.03073753 0.         0.01330794 0.
  0.         0.         0.         0.02081091 0.         0.
  0.         0.         0.         0.01097502 0.         0.
  0.         0.         0.         0.         0.02874446 0.
  0.         0.0226208  0.         0.         0.         0.
  0.         0.         0.01212491 0.         0.         0.03543759
  0.         0.         0.         0.         0.         0.01641157
  0.         0.         0.         0.02016429 0.         0.0110524
  0.         0.0116768  0.         0.         0.         0.
  0.01823729 0.0119119  0.         0.         0.         0.02410075
  0.         0.         0.         0.         0.         0.02186644
  0.         0.01283615 0.         0.         0.         0.
  0.02184342 0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.01910159 0.         0.         0.01462339 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.  

In [None]:
# Doc2Vec Similarity
# The timed section covers both training the model and scoring the articles.
start_time = time.perf_counter()

doc2vec_model = train_doc2vec_model(article_texts)
doc2vec_results = doc2vec_similarity(doc2vec_model, main_article_texts, article_texts)

# Stop the clock before printing, so the recorded time covers the computation
# only and not the rendering of the full matrix.
execution_times['Doc2Vec'] = time.perf_counter() - start_time
print("DOC2VEC\n",doc2vec_results)

most_similar_indices_doc2vec = np.argmax(doc2vec_results, axis=1)  # Get the index of the highest similarity score for each main article

# Now, print the most similar articles
for i, index in enumerate(most_similar_indices_doc2vec):
    print(f'Most similar article to main article {i + 1}: Article ID {articles_df["article_id"].iloc[index]}, Score: {doc2vec_results[i][index]}')


DOC2VEC
 [[-0.99483603 -0.9460615  -0.99553496 -0.9917936  -0.9911784  -0.9937661
  -0.99241906 -0.9924186   0.96613526 -0.990114   -0.9931623  -0.9913816
  -0.99394953 -0.9936038  -0.9928282   0.97255015 -0.98875475 -0.9930615
  -0.99157596 -0.9950886  -0.9952273  -0.9937973  -0.98640376 -0.99220544
  -0.99420244 -0.9913173  -0.9827979  -0.8779424  -0.9939197   0.9220609
  -0.9878917  -0.995278   -0.99463093 -0.99135005 -0.73165137 -0.9946455
  -0.85555494 -0.9945583  -0.99381685 -0.9948403  -0.9838815  -0.9932702
  -0.98827916 -0.9923656  -0.99385124 -0.9949657  -0.9740522  -0.9933861
  -0.9930747  -0.9930207  -0.9913932  -0.9937761  -0.99379444 -0.99473745
  -0.99527216 -0.99423677 -0.9941499  -0.9907769  -0.9927721  -0.9928973
  -0.97884685 -0.98933774 -0.9941922  -0.99175626 -0.99177676 -0.9937895
  -0.9904279  -0.98538053 -0.9919865  -0.9943938  -0.99477786 -0.993286
  -0.9946135  -0.98137987 -0.99319035 -0.99314326 -0.99528867 -0.99411565
  -0.9948381  -0.99527085]
 [ 0.8994054 

In [None]:
# Universal Sentence Encoder (USE) Similarity
start_time = time.perf_counter()

use_results = use_similarity(use_model, main_article_texts, article_texts)

# Stop the clock before printing, so the recorded time covers the computation
# only and not the rendering of the full matrix.
execution_times['USE'] = time.perf_counter() - start_time
print("USE\n",use_results)


most_similar_indices = np.argmax(use_results, axis=1)  # Get the index of the highest similarity score for each main article

# Now, you can print the most similar articles
for i, index in enumerate(most_similar_indices):
    print(f'Most similar article to main article {i + 1}: Article ID {articles_df["article_id"].iloc[index]}, Score: {use_results[i][index]}')

USE
 [[ 0.10184351 -0.00564305  0.2120504   0.1082342   0.08979507  0.11046913
  -0.04803941  0.02138601 -0.11103307  0.0675718   0.07270236  0.15105507
  -0.03112235  0.07142918 -0.00925982  0.05794662 -0.00075393  0.12702256
   0.06479087 -0.06175876  0.05222875  0.0439371   0.1435355   0.09038855
   0.05633016 -0.04819982  0.12291719 -0.02597863  0.04794332  0.06869033
  -0.03091208  0.04544554  0.08904423  0.04551072  0.18722008  0.14633697
  -0.09583176 -0.04267279  0.16156605 -0.05499463  0.15007706 -0.01042744
   0.1114385   0.11492153 -0.10887101  0.06045847  0.07620849  0.05502977
  -0.05554678  0.01657506 -0.04129944  0.03371548 -0.07598454  0.11170264
   0.09525298  0.09060757  0.02594017  0.0376205  -0.01774844  0.06740013
   0.0104291   0.14691955  0.10388587 -0.03649531 -0.05357149  0.03229859
   0.12880781  0.01505594 -0.0210156  -0.00119725  0.03084277  0.06309149
  -0.02810375  0.174852    0.01276321 -0.03053543 -0.00387191  0.03254422
   0.13201493  0.01333273]
 [-0.0

In [None]:
# BERT Cosine Similarity
start_time = time.perf_counter()

bert_results = bert_cosine_similarity(main_article_texts, article_texts)

# Stop the clock before printing, so the recorded time covers the computation
# only and not the rendering of the full matrix.
execution_times['BERT'] = time.perf_counter() - start_time
print("BERT\n", bert_results)


most_similar_indices = np.argmax(bert_results, axis=1)  # Index of the highest similarity score for each main article

for i, index in enumerate(most_similar_indices):
    print(f'Most similar article to main article {i + 1}: Article ID {articles_df["article_id"].iloc[index]}, Score: {bert_results[i][index]}')


BERT
 [[0.6494107  0.4761923  0.76134944 0.68800426 0.7055745  0.66193724
  0.58085144 0.6796427  0.5883754  0.71515805 0.6101371  0.7015213
  0.6168039  0.67823625 0.650724   0.64912283 0.69946593 0.6704029
  0.5968019  0.54583925 0.6763689  0.67594796 0.69470847 0.63898736
  0.5091212  0.57600796 0.66694    0.54815793 0.5043409  0.5877644
  0.6608181  0.49189344 0.6800303  0.6644603  0.7020194  0.6931313
  0.5901623  0.6828496  0.6781292  0.5953261  0.7010174  0.61983645
  0.67954576 0.6760391  0.64974797 0.62293196 0.5699592  0.69831413
  0.5931066  0.6980835  0.4898113  0.6542382  0.5474931  0.627667
  0.692111   0.6884595  0.6390749  0.6712745  0.55410755 0.68925667
  0.66154236 0.6527417  0.66384125 0.6672207  0.64718723 0.6371175
  0.6522472  0.6402405  0.5519166  0.6764267  0.6066115  0.6509303
  0.68070954 0.67353487 0.6571313  0.6311752  0.59369    0.6156581
  0.6719749  0.6336035 ]
 [0.6577013  0.54876137 0.6421484  0.6223647  0.69392765 0.67559195
  0.6191943  0.65291345 0.

In [1]:
article_ids = articles_df['article_id'].tolist()

# Score matrices of every method, in the column order of the comparison table.
# Each matrix has one row per main article and one column per article.
similarity_results = {
    'Jaccard': jaccard_results,
    'TF-IDF': tfidf_results,
    'Doc2Vec': doc2vec_results,
    'USE': use_results,
    'BERT': bert_results,
}

# For every method, look up the ID of the top-scoring article of each main
# article. Iterating over the rows themselves (instead of a fixed range(3))
# keeps the table correct for any number of main articles, and using local
# names avoids overwriting the per-method index arrays computed earlier.
comparison_data = {
    f'Most Similar ({method})': [article_ids[np.argmax(row)] for row in scores]
    for method, scores in similarity_results.items()
}

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df)



   Most Similar (Jaccard)  Most Similar (TF-IDF)  Most Similar (Doc2Vec)  \
0                     141                     84                      28   
1                      47                     47                      28   
2                      64                    133                      64   

   Most Similar (USE)  Most Similar (BERT)  
0                   3                    3  
1                  62                   17  
2                  63                   55  


In [2]:
# Print the comparison table as one readable block per main article.
for index, row in comparison_df.iterrows():
    print(f"Article {index + 1}:")
    for method in ("Jaccard", "TF-IDF", "Doc2Vec", "USE", "BERT"):
        column = f"Most Similar ({method})"
        print(f"  {column}: {row[column]}")
    print("-" * 50)

Article 1:
  Most Similar (Jaccard): 141
  Most Similar (TF-IDF): 84
  Most Similar (Doc2Vec): 28
  Most Similar (USE): 3
  Most Similar (BERT): 3
--------------------------------------------------
Article 2:
  Most Similar (Jaccard): 47
  Most Similar (TF-IDF): 47
  Most Similar (Doc2Vec): 28
  Most Similar (USE): 62
  Most Similar (BERT): 17
--------------------------------------------------
Article 3:
  Most Similar (Jaccard): 64
  Most Similar (TF-IDF): 133
  Most Similar (Doc2Vec): 64
  Most Similar (USE): 63
  Most Similar (BERT): 55
--------------------------------------------------


In [None]:
def calculate_overlap(list1, list2):
    """Count the distinct values that occur in both ``list1`` and ``list2``.

    Both inputs are converted to sets first, so duplicates and element
    positions are ignored.
    """
    shared_values = set(list1) & set(list2)
    return len(shared_values)



# Index of the top-scoring article for every main article, per algorithm.
most_relevant_indices_jaccard = [np.argmax(row) for row in jaccard_results]
most_relevant_indices_tfidf = [np.argmax(row) for row in tfidf_results]
most_relevant_indices_doc2vec = [np.argmax(row) for row in doc2vec_results]
most_relevant_indices_use = [np.argmax(row) for row in use_results]
most_relevant_indices_bert = [np.argmax(row) for row in bert_results]


results_dict = {
    "Jaccard": most_relevant_indices_jaccard,
    "TF-IDF": most_relevant_indices_tfidf,
    "Doc2Vec": most_relevant_indices_doc2vec,
    "USE": most_relevant_indices_use,
    "BERT": most_relevant_indices_bert,
}

algorithms = list(results_dict.keys())

# Pairwise overlap ("A vs B" -> score) and, per algorithm, the sum of its
# overlaps with every other algorithm. The totals are accumulated in the same
# loop that scores each pair, so the "A vs B" keys never have to be parsed
# back into algorithm names.
# NOTE(review): calculate_overlap compares the *sets* of picks, so two
# algorithms that choose the same article for different main articles still
# count as overlapping — confirm this is the intended metric.
overlap_scores = {}
overlap_sum = {algo: 0 for algo in algorithms}

for i, algo1 in enumerate(algorithms):
    for algo2 in algorithms[i + 1:]:
        score = calculate_overlap(results_dict[algo1], results_dict[algo2])
        overlap_scores[f"{algo1} vs {algo2}"] = score
        overlap_sum[algo1] += score
        overlap_sum[algo2] += score


# On a tie, max() keeps the first algorithm in dictionary order.
winner = max(overlap_sum, key=overlap_sum.get)

# Display results
print("Overlap Scores Between Algorithms:")
for pair, score in overlap_scores.items():
    print(f"{pair}: {score}")

print("\nTotal Overlap Contributions:")
for algo, score in overlap_sum.items():
    print(f"{algo}: {score}")

print(f"\nWinner: {winner} (based on maximum total overlap)")

Overlap Scores Between Algorithms:
Jaccard vs TF-IDF: 1
Jaccard vs Doc2Vec: 1
Jaccard vs USE: 0
Jaccard vs BERT: 0
TF-IDF vs Doc2Vec: 0
TF-IDF vs USE: 0
TF-IDF vs BERT: 0
Doc2Vec vs USE: 0
Doc2Vec vs BERT: 0
USE vs BERT: 1

Total Overlap Contributions:
Jaccard: 2
TF-IDF: 1
Doc2Vec: 1
USE: 1
BERT: 1

Winner: Jaccard (based on maximum total overlap)


In [None]:

# Report how long each similarity algorithm took
print("Execution Times for Similarity Algorithms:")
for algo in execution_times:
    exec_time = execution_times[algo]
    print(f"{algo}: {exec_time:.4f} seconds")

# Report the algorithm with the smallest recorded time
fastest_algo = min(execution_times, key=lambda name: execution_times.get(name))
print(f"\nFastest Algorithm: {fastest_algo} (Execution Time: {execution_times[fastest_algo]:.4f} seconds)")

Execution Times for Similarity Algorithms:
Jaccard: 0.5669 seconds
TF-IDF: 0.0222 seconds
Doc2Vec: 0.6064 seconds
USE: 0.8050 seconds
BERT: 17.2148 seconds

Fastest Algorithm: TF-IDF (Execution Time: 0.0222 seconds)
