In [42]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import tensorflow_hub as hub
from transformers import BertTokenizer, BertModel
import torch

# Download the NLTK resources used by this notebook: 'punkt' (tokenizer models
# behind word_tokenize) and 'stopwords' (the corpus imported above).
# nltk.download reports "already up-to-date" when the data is present locally.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Print the installed TensorFlow version so the environment used for this run
# is recorded alongside the results.
import tensorflow as tf
print(tf.__version__)


2.17.0


In [53]:
# Load datasets from the current working directory.
# articles_df: candidate articles; later cells read its 'content' and
#   'article_id' columns.
# main_articles_df: the main (query) articles; later cells read its 'content'
#   column.
articles_df = pd.read_csv('articles_dataset.csv')
main_articles_df = pd.read_csv('main_articles.csv')



In [None]:
def jaccard_similarity(doc1, doc2):
    """Return the Jaccard index of the two documents' lower-cased token sets.

    The score is |A ∩ B| / |A ∪ B|, where A and B are the sets of tokens
    produced by ``word_tokenize``. When neither document yields any token the
    function returns 0.
    """
    vocab_a = set(word_tokenize(doc1.lower()))
    vocab_b = set(word_tokenize(doc2.lower()))
    combined = vocab_a | vocab_b
    if not combined:
        # Nothing to compare: avoid dividing by zero.
        return 0
    shared = vocab_a & vocab_b
    return len(shared) / len(combined)


In [None]:
# Calculate TF-IDF cosine similarity
def tfidf_cosine_similarity(articles, main_articles):
    """Score every main article against every article with TF-IDF cosine similarity.

    A single TF-IDF vocabulary (English stop words removed) is fitted on the
    concatenation ``articles + main_articles`` so both groups share one vector
    space.

    Returns the matrix from ``cosine_similarity`` with one row per main article
    and one column per article.
    """
    n_articles = len(articles)
    vectorizer = TfidfVectorizer(stop_words='english')
    weights = vectorizer.fit_transform(articles + main_articles)

    # The first n_articles rows belong to `articles`, the rest to `main_articles`.
    candidate_rows = weights[:n_articles]
    query_rows = weights[n_articles:]
    return cosine_similarity(query_rows, candidate_rows)

In [30]:
def train_doc2vec_model(documents, vector_size=50, min_count=2, epochs=40, **doc2vec_kwargs):
    """Train a gensim Doc2Vec model on raw text documents.

    Each document is lower-cased, tokenized with ``word_tokenize`` and tagged
    with its position in ``documents`` (as a string).

    Parameters
    ----------
    documents : iterable of str
        Raw texts to train on.
    vector_size : int, default 50
        Dimensionality of the learned document vectors.
    min_count : int, default 2
        Passed to ``Doc2Vec`` as ``min_count``.
    epochs : int, default 40
        Number of training epochs.
    **doc2vec_kwargs
        Extra keyword arguments forwarded unchanged to ``Doc2Vec``. For a
        repeatable run pass e.g. ``seed=42, workers=1``.

    Returns
    -------
    Doc2Vec
        The trained model.
    """
    tagged_data = [
        TaggedDocument(words=word_tokenize(doc.lower()), tags=[str(i)])
        for i, doc in enumerate(documents)
    ]
    model = Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs, **doc2vec_kwargs)
    model.build_vocab(tagged_data)
    # corpus_count and epochs are read back from the model so training matches
    # exactly what build_vocab saw and what the constructor was given.
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    return model
# Calculate Doc2Vec similarity
def doc2vec_similarity(doc2vec_model, main_articles, articles):
    """Cosine similarity between inferred Doc2Vec vectors of main articles and articles.

    Every text is lower-cased and tokenized with ``word_tokenize`` before its
    vector is inferred from ``doc2vec_model``.

    Returns a NumPy array with one row per main article and one column per
    article.
    """
    def _infer(text):
        # Same preprocessing for both groups of documents.
        return doc2vec_model.infer_vector(word_tokenize(text.lower()))

    # Candidate vectors are inferred once up front and reused for every main article.
    candidate_vectors = [_infer(article) for article in articles]

    # One row of scores per main article, in input order.
    score_rows = [
        cosine_similarity([_infer(main_article)], candidate_vectors)[0]
        for main_article in main_articles
    ]
    return np.array(score_rows)

In [31]:
# Load USE model
# Loads Universal Sentence Encoder v4 from TF Hub by URL; the resulting object
# is called directly on a list of strings in use_similarity().
# NOTE(review): presumably needs network access on first load — confirm caching.
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Calculate USE similarity
def use_similarity(use_model, main_articles, articles):
    """Score every main article against every article using USE embeddings.

    All texts are embedded in a single call on ``articles + main_articles``;
    the result is then split back into the two groups by position.

    Returns the matrix from ``cosine_similarity`` with one row per main article
    and one column per article.
    """
    split = len(articles)
    vectors = use_model(articles + main_articles)

    # Rows before `split` are the articles, rows from `split` on are the main articles.
    candidate_vectors = vectors[:split]
    query_vectors = vectors[split:]
    return cosine_similarity(query_vectors, candidate_vectors)

In [32]:
# Load BERT model and tokenizer
# Both come from the pretrained "bert-base-uncased" checkpoint and are kept as
# notebook-level globals that bert_embedding() reads directly.
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Function to get BERT embeddings
def bert_embedding(text):
    """Return a mean-pooled BERT embedding for ``text``.

    The text is tokenized with the global ``bert_tokenizer`` (padding and
    truncation enabled) and passed through the global ``bert_model``. The last
    hidden state is averaged over the token dimension (``dim=1``) and singleton
    dimensions are squeezed away.

    Returns a ``torch.Tensor``; for a single input string this is presumably a
    1-D vector of the model's hidden size.
    """
    inputs = bert_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: disable autograd so no computation graph is built or kept
    # alive for every embedded document.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embeddings

# Calculate BERT similarity
def bert_cosine_similarity(main_articles, articles):
    """Score every main article against every article using BERT embeddings.

    Each document is embedded individually with ``bert_embedding``; the
    per-document tensors are stacked, detached and converted to NumPy before
    being handed to ``cosine_similarity``.

    Returns a matrix with one row per main article and one column per article.
    """
    def _embed_all(docs):
        # Stack the per-document tensors into a single 2-D tensor.
        return torch.stack([bert_embedding(doc) for doc in docs])

    # Articles are embedded first, then the main articles.
    candidate_vectors = _embed_all(articles)
    query_vectors = _embed_all(main_articles)

    return cosine_similarity(
        query_vectors.detach().numpy(),
        candidate_vectors.detach().numpy(),
    )


In [44]:
# Prepare documents: plain Python lists of the 'content' column of each frame.
# These two lists are reused by every similarity cell that follows.
article_texts = articles_df['content'].tolist()
main_article_texts = main_articles_df['content'].tolist()

# Calculate similarities
# Jaccard Similarity: nested list with one row per main article and one entry
# per candidate article.
jaccard_results = [[jaccard_similarity(main_article, article) for article in article_texts] for main_article in main_article_texts]
print("JACCARD\n",jaccard_results)

most_relevant_indices_jaccard = np.argmax(jaccard_results, axis=1)  # Per main article (row): position of the highest Jaccard score

# Print the best match per main article. `index` is a row position in
# articles_df (hence .iloc), which is translated to the article's ID.
for i, index in enumerate(most_relevant_indices_jaccard):
    print(f'Most relevant article to main article {i + 1}: Article ID {articles_df["article_id"].iloc[index]}, Score: {jaccard_results[i][index]}')


JACCARD
 [[0.09774436090225563, 0.07633587786259542, 0.10655737704918032, 0.10077519379844961, 0.06299212598425197, 0.11023622047244094, 0.09836065573770492, 0.08527131782945736, 0.078125, 0.047244094488188976, 0.109375, 0.06153846153846154, 0.11764705882352941, 0.07936507936507936, 0.109375, 0.056, 0.06870229007633588, 0.104, 0.07936507936507936, 0.08333333333333333, 0.09322033898305085, 0.06349206349206349, 0.19827586206896552, 0.06611570247933884, 0.08, 0.07142857142857142, 0.06451612903225806, 0.088, 0.08064516129032258, 0.056], [0.08396946564885496, 0.11382113821138211, 0.09166666666666666, 0.078125, 0.11016949152542373, 0.10483870967741936, 0.08333333333333333, 0.14285714285714285, 0.06349206349206349, 0.075, 0.08661417322834646, 0.08064516129032258, 0.10256410256410256, 0.07317073170731707, 0.08661417322834646, 0.0847457627118644, 0.0625, 0.08943089430894309, 0.07317073170731707, 0.0859375, 0.06837606837606838, 0.0743801652892562, 0.08, 0.10619469026548672, 0.09166666666666666, 

In [45]:
# TF-IDF Cosine Similarity: matrix with one row per main article and one
# column per candidate article (see tfidf_cosine_similarity).
tfidf_results = tfidf_cosine_similarity(article_texts, main_article_texts)
print("tf-IDF\n", tfidf_results)

most_relevant_indices_tfidf = np.argmax(tfidf_results, axis=1)  # Per main article (row): position of the highest TF-IDF score

# Print the best match per main article. `index` is a row position in
# articles_df (hence .iloc), which is translated to the article's ID.
for i, index in enumerate(most_relevant_indices_tfidf):
    print(f'Most relevant article to main article {i + 1}: Article ID {articles_df["article_id"].iloc[index]}, Score: {tfidf_results[i][index]}')


tf-IDF
 [[0.0217804  0.00481581 0.06499652 0.         0.0242082  0.01801648
  0.02362924 0.03452992 0.01910044 0.         0.03238434 0.01004606
  0.05569075 0.02081733 0.02705174 0.0043732  0.00449768 0.04023955
  0.00869304 0.         0.03135368 0.00987148 0.20071333 0.00640803
  0.00413691 0.01719533 0.01040261 0.00525467 0.00465664 0.        ]
 [0.02248223 0.08758718 0.01864782 0.         0.06888625 0.05153479
  0.01015005 0.0958795  0.         0.00915999 0.02504953 0.02021558
  0.         0.         0.         0.0201397  0.01926872 0.01605995
  0.         0.         0.         0.01350968 0.01456487 0.03241623
  0.01217191 0.02140081 0.02093306 0.01666734 0.0096415  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.01272873 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.012517
  0.         0.         0.         0.03351879 0.01596522 0.     

In [46]:
# Doc2Vec Similarity
# The model is trained on the candidate articles only; vectors for the main
# articles are inferred afterwards inside doc2vec_similarity.
# NOTE(review): no seed is passed to training, so scores may differ between
# runs — confirm whether reproducibility matters here.
doc2vec_model = train_doc2vec_model(article_texts)
doc2vec_results = doc2vec_similarity(doc2vec_model, main_article_texts, article_texts)
print("DOC2VEC\n",doc2vec_results)

most_similar_indices_doc2vec = np.argmax(doc2vec_results, axis=1)  # Per main article (row): position of the highest Doc2Vec score

# Print the best match per main article. `index` is a row position in
# articles_df (hence .iloc), which is translated to the article's ID.
for i, index in enumerate(most_similar_indices_doc2vec):
    print(f'Most similar article to main article {i + 1}: Article ID {articles_df["article_id"].iloc[index]}, Score: {doc2vec_results[i][index]}')


DOC2VEC
 [[ 0.9973186   0.99713916  0.99738204  0.9976624   0.9984712   0.9975536
   0.99732286  0.99533945  0.99650514  0.9958253   0.99805015  0.99614
   0.99621695  0.9968534   0.9978405   0.9965933   0.9969848   0.9983051
   0.99689424  0.99693084  0.99789953  0.99680096  0.9993549   0.99704283
   0.9951497   0.9960851   0.99633783  0.99747366  0.99664384  0.9958559 ]
 [ 0.9949321   0.99701655  0.9963679   0.9963525   0.9981228   0.99760276
   0.9971326   0.99611473  0.9960313   0.99670374  0.9976909   0.9962214
   0.99619275  0.9951264   0.9972073   0.99591124  0.9959329   0.9965089
   0.9969932   0.99749166  0.9974705   0.9955769   0.9977754   0.99706423
   0.9952171   0.99637115  0.9960503   0.9973905   0.9964137   0.9964267 ]
 [-0.95434403 -0.9573092  -0.95415914 -0.96013534 -0.9570882  -0.9562215
  -0.95451266 -0.9569132  -0.9554475  -0.9587459  -0.95767987 -0.9566163
  -0.9581738  -0.95659184 -0.9563042  -0.957017   -0.9646126  -0.9563819
  -0.9579512  -0.95547533 -0.95586973

In [47]:
# Universal Sentence Encoder (USE) Similarity: matrix with one row per main
# article and one column per candidate article (see use_similarity).
use_results = use_similarity(use_model, main_article_texts, article_texts)
print("USE\n",use_results)

# NOTE(review): `most_similar_indices` is also assigned by the BERT cell, so
# its value depends on which of the two cells ran last.
most_similar_indices = np.argmax(use_results, axis=1)  # Per main article (row): position of the highest USE score

# Print the best match per main article. `index` is a row position in
# articles_df (hence .iloc), which is translated to the article's ID.
for i, index in enumerate(most_similar_indices):
    print(f'Most similar article to main article {i + 1}: Article ID {articles_df["article_id"].iloc[index]}, Score: {use_results[i][index]}')

USE
 [[ 4.02789652e-01  2.29685292e-01  5.89172244e-01  2.72433937e-01
   2.76163518e-01  3.52577507e-01  2.27192163e-01  2.11197421e-01
   2.77394205e-01  3.36850405e-01  2.66531169e-01  1.08863637e-01
   1.53085053e-01  3.25942546e-01  4.04314041e-01  1.66461170e-01
   2.58218706e-01  2.27151513e-01  2.55472720e-01  1.64927751e-01
   1.80166736e-01  1.76287889e-01  6.96550012e-01  1.05479538e-01
   1.53027639e-01  2.55720317e-01  2.33985215e-01  2.05823690e-01
   2.01395139e-01  2.53372282e-01]
 [ 1.59137890e-01  4.35144216e-01  1.69350490e-01  2.55951464e-01
   2.71451473e-01  2.48794556e-01  2.70397365e-01  4.83301818e-01
   1.86897218e-01  1.83519766e-01  1.72435641e-01  3.59144390e-01
   1.76433563e-01  2.73521990e-01  2.93322057e-01  2.79748857e-01
   2.66757965e-01  3.36545527e-01  2.76899457e-01  2.93179393e-01
   2.09583342e-01  1.42115027e-01  1.19277760e-01  3.46502364e-01
   2.62362838e-01  2.12230384e-01  3.48483205e-01  1.97962731e-01
   2.85226732e-01  2.54298061e-01]
 

In [48]:
# BERT Cosine Similarity: matrix with one row per main article and one column
# per candidate article (see bert_cosine_similarity).
bert_results = bert_cosine_similarity(main_article_texts, article_texts)
print("BERT\n", bert_results)

# NOTE(review): `most_similar_indices` is also assigned by the USE cell, so
# its value depends on which of the two cells ran last.
most_similar_indices = np.argmax(bert_results, axis=1)  # Per main article (row): position of the highest BERT score

# Print the best match per main article. `index` is a row position in
# articles_df (hence .iloc), which is translated to the article's ID.
for i, index in enumerate(most_similar_indices):
    print(f'Most similar article to main article {i + 1}: Article ID {articles_df["article_id"].iloc[index]}, Score: {bert_results[i][index]}')


BERT
 [[0.8154442  0.76445353 0.91049993 0.8025923  0.80479634 0.8290639
  0.75234413 0.79188144 0.8034493  0.7703412  0.79176444 0.7600287
  0.7378598  0.8104273  0.812924   0.7594214  0.80785906 0.78706604
  0.77083665 0.79576826 0.7492622  0.78824604 0.9157997  0.7850456
  0.7753484  0.78648454 0.7519563  0.75925255 0.7702719  0.77745   ]
 [0.79281884 0.84747493 0.73648024 0.811086   0.7961228  0.8182022
  0.79741544 0.8626021  0.7959354  0.7617864  0.8071481  0.81285673
  0.7570687  0.8353065  0.7441153  0.80144566 0.81114435 0.8200847
  0.8118242  0.84352535 0.76831675 0.8049456  0.7493039  0.83349544
  0.79247713 0.80129474 0.80984044 0.7874184  0.80164754 0.8080899 ]
 [0.6877612  0.6562478  0.684982   0.68645597 0.665676   0.7076949
  0.68225586 0.703962   0.7003481  0.66074795 0.68599594 0.6727643
  0.6554333  0.71447873 0.6726414  0.68971395 0.69052154 0.6948788
  0.662515   0.6842898  0.66977304 0.6788932  0.69185096 0.7039484
  0.67381227 0.6828054  0.6364924  0.6844574  0.6

In [52]:
article_ids = articles_df['article_id'].tolist()

# Score matrix produced by each algorithm, keyed by the column it fills in the
# comparison table. Every matrix holds one row per main article and one entry
# per candidate article.
results_by_label = {
    'Most Similar (Jaccard)': jaccard_results,
    'Most Similar (TF-IDF)': tfidf_results,
    'Most Similar (Doc2Vec)': doc2vec_results,
    'Most Similar (USE)': use_results,
    'Most Similar (BERT)': bert_results,
}

# Take the number of main articles from the data rather than assuming three.
n_main_articles = len(jaccard_results)

# For every main article, record the ID of the candidate with the highest score.
comparison_data = {
    label: [article_ids[np.argmax(scores[row])] for row in range(n_main_articles)]
    for label, scores in results_by_label.items()
}

# The printing cell iterates over `comparison_df`; it was never built from
# `comparison_data`, which raised NameError on a fresh kernel.
comparison_df = pd.DataFrame(comparison_data)



In [50]:
# Columns of the comparison table, in the order they are reported.
similarity_columns = (
    'Most Similar (Jaccard)',
    'Most Similar (TF-IDF)',
    'Most Similar (Doc2Vec)',
    'Most Similar (USE)',
    'Most Similar (BERT)',
)

# One report block per main article: a 1-based header, each algorithm's pick,
# then a separator line.
for index, row in comparison_df.iterrows():
    print(f"Article {index + 1}:")
    for column in similarity_columns:
        print(f"  {column}: {row[column]}")
    print("-" * 50)  # Add a separator for clarity

Article 1:
  Most Similar (Jaccard): 23
  Most Similar (TF-IDF): 23
  Most Similar (Doc2Vec): 23
  Most Similar (USE): 23
  Most Similar (BERT): 23
--------------------------------------------------
Article 2:
  Most Similar (Jaccard): 8
  Most Similar (TF-IDF): 8
  Most Similar (Doc2Vec): 5
  Most Similar (USE): 8
  Most Similar (BERT): 8
--------------------------------------------------
Article 3:
  Most Similar (Jaccard): 24
  Most Similar (TF-IDF): 28
  Most Similar (Doc2Vec): 23
  Most Similar (USE): 18
  Most Similar (BERT): 14
--------------------------------------------------


In [51]:
def calculate_overlap(list1, list2):
    """Count the distinct values that occur in both ``list1`` and ``list2``.

    Duplicates and positions are ignored: both inputs are reduced to sets
    before they are intersected.
    """
    shared_values = set(list1) & set(list2)
    return len(shared_values)

# Best-scoring candidate position for every main article, per algorithm.
most_relevant_indices_jaccard = [np.argmax(scores) for scores in jaccard_results]
most_relevant_indices_tfidf = [np.argmax(scores) for scores in tfidf_results]
most_relevant_indices_doc2vec = [np.argmax(scores) for scores in doc2vec_results]
most_relevant_indices_use = [np.argmax(scores) for scores in use_results]
most_relevant_indices_bert = [np.argmax(scores) for scores in bert_results]

# Explicit name -> picks mapping. This replaces looking variables up through
# locals() with string-built names, which broke whenever a label and a
# variable name drifted apart (e.g. 'TF-IDF' vs 'tfidf').
indices_by_algorithm = {
    'Jaccard': most_relevant_indices_jaccard,
    'TF-IDF': most_relevant_indices_tfidf,
    'Doc2Vec': most_relevant_indices_doc2vec,
    'USE': most_relevant_indices_use,
    'BERT': most_relevant_indices_bert,
}
algorithm_names = list(indices_by_algorithm)

# Compare every unordered pair of algorithms exactly once, in listing order.
# NOTE(review): calculate_overlap intersects *sets* of picks, so it ignores
# which main article a pick belongs to — confirm that is the intended metric.
overlaps = {}
for position, algo1 in enumerate(algorithm_names):
    for algo2 in algorithm_names[position + 1:]:
        overlaps[f"{algo1.lower()}-{algo2.lower()}"] = calculate_overlap(
            indices_by_algorithm[algo1], indices_by_algorithm[algo2]
        )

print(overlaps)

{'jaccard-tf-idf': 2, 'jaccard-doc2vec': 1, 'jaccard-use': 2, 'jaccard-bert': 2, 'tf-idf-doc2vec': 1, 'tf-idf-use': 2, 'tf-idf-bert': 2, 'doc2vec-use': 1, 'doc2vec-bert': 1, 'use-bert': 2}


In [54]:
!apt-get install git


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.11).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [55]:
# Clone the target repository into ./turbo-funicular (relative to the current
# working directory).
# NOTE(review): git refuses to clone into an existing non-empty directory, so
# re-running this cell in the same session will presumably fail — confirm.
! git clone https://github.com/shruuti321/turbo-funicular.git


Cloning into 'turbo-funicular'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 6 (delta 0), reused 6 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (6/6), 18.40 KiB | 418.00 KiB/s, done.


In [56]:
!git config --global user.name "shruuti321"
!git config --global user.email "goelshruti2003@gmail.com"
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

In [57]:
# Navigate to the repository directory.
# NOTE(review): the path is relative, so this only works while the working
# directory is the one the repository was cloned into; run again from inside
# the repository it presumably fails — confirm.
%cd turbo-funicular

# Check the status of your repo to see changes
!git status

# Stage every change in the working tree (specific files can be named instead)
!git add .

# Commit the staged changes; git reports "nothing to commit" when the tree is clean
!git commit -m "working code"


/content/turbo-funicular
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [58]:
# Step 1: Navigate to the repository directory
%cd /content/turbo-funicular/

# Step 2: Upload a file
from google.colab import files
uploaded = files.upload()

# Step 3: Add the file to git
!git add irsw.ipynb  # Or use `!git add .` for all changes

# Step 4: Commit the changes
!git commit -m "Added my new file"

# Step 5: Push the changes to GitHub
!git push https://shruuti321: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@github.com/shruuti321/turbo-funicular.git


/content/turbo-funicular


Saving articles_dataset.csv to articles_dataset (1).csv
fatal: pathspec 'irsw.ipynb' did not match any files
On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   articles_dataset.csv[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31marticles_dataset (1).csv[m

no changes added to commit (use "git add" and/or "git commit -a")
Everything up-to-date
