In [1]:
!pip install gensim -qq

In [2]:
!pip install rich -qq

In [3]:
!pip install -U sentence-transformers -qq

In [4]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from rich import inspect
import torch 
import re

import pandas as pd
import nltk
import pickle
import numpy as np

In [5]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
data = pd.read_csv("data_ml.csv")
data.drop("Unnamed: 0",axis=1, inplace=True)
data['Id']=data['Id'].str.replace("http://arxiv.org/abs/","")
data['Summary']=data['Summary'].str.replace("\$.*?\$"," ")
data['Title']=data['Title'].str.replace("\$.*?\$"," ")
data.to_csv("cleaned_data")
data.head(2)

  data['Id']=data['Id'].str.replace("http://arxiv.org/abs/","")
  data['Summary']=data['Summary'].str.replace("\$.*?\$"," ")
  data['Title']=data['Title'].str.replace("\$.*?\$"," ")


Unnamed: 0,Title,Date,Id,Summary,URL
0,Mapping Tropical Forest Cover and Deforestatio...,2022-11-17 18:59:44+00:00,2211.09806v1,Monitoring changes in tree cover for rapid ass...,http://arxiv.org/pdf/2211.09806v1
1,Exact Quantum Algorithms for Quantum Phase Rec...,2022-11-17 18:59:20+00:00,2211.09803v1,We explore the relationship between renormaliz...,http://arxiv.org/pdf/2211.09803v1


## TF-IDF and Cosine Similariity

In [7]:
df = data[["Id","Summary", "Title"]]
df_train = df[:9990]
df_test = df[9990:]
# print(df.shape)

In [8]:
# Making vectors
tfidfvectorizer = TfidfVectorizer(ngram_range=(1,1))
tfidfmatrix = tfidfvectorizer.fit_transform(df_train.Summary)

In [9]:
data_frame = pd.DataFrame(tfidfmatrix.toarray())
data_frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33887,33888,33889,33890,33891,33892,33893,33894,33895,33896
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
def consine_test(title):
    cosine_sim = cosine_similarity(tfidfvectorizer.transform([title]).toarray()[0].reshape(1,-1), data_frame)[0]
    tfidf_top10 = cosine_sim.argsort()[-10:]
    title_list = [df_train.loc[i].Title for i in tfidf_top10 ]
    url_list = ["http://arxiv.org/abs/"+ df_train.loc[i].Id for i in tfidf_top10 ]
    summary_list = [df_train.loc[i].Summary for i in tfidf_top10]
    tfidf_scores = [cosine_sim[i] for i in tfidf_top10]
    return title_list, url_list, tfidf_scores

recommendation_list = [consine_test(df_test.loc[i].Summary) for i in range(9990,10000)]

In [12]:
recommendation_list[9]

(['Improving Policy Learning via Language Dynamics Distillation',
  'Replacing Language Model for Style Transfer',
  'FusionVAE: A Deep Hierarchical Variational Autoencoder for RGB Image Fusion',
  'LAD: Language Augmented Diffusion for Reinforcement Learning',
  'Probing for Incremental Parse States in Autoregressive Language Models',
  'Same Pre-training Loss, Better Downstream: Implicit Bias Matters for Language Models',
  'RobBERT-2022: Updating a Dutch Language Model to Account for Evolving Language Use',
  'Understanding Domain Learning in Language Models Through Subpopulation Analysis',
  'Progressive Fusion for Multimodal Integration',
  'Relaxed Attention for Transformer Models'],
 ['http://arxiv.org/abs/2210.00066v1',
  'http://arxiv.org/abs/2211.07343v1',
  'http://arxiv.org/abs/2209.11277v1',
  'http://arxiv.org/abs/2210.15629v1',
  'http://arxiv.org/abs/2211.09748v1',
  'http://arxiv.org/abs/2210.14199v1',
  'http://arxiv.org/abs/2211.08192v1',
  'http://arxiv.org/abs/2210

In [26]:
def final_test(recommendation_list):
    title_test_100 = []
    score_test_100 = []
    recs_test_100 = []
    for i in range(10):
        title_test_100 += [df_test.loc[i + 9990].Title]*10
        score_test_100 += recommendation_list[i][2]
        recs_test_100 += recommendation_list[i][0]
    d = {"Title":title_test_100, "Recommendation":recs_test_100,"Cosine Similarity":score_test_100}
    final_result = pd.DataFrame(data=d)
    return final_result

In [27]:
tfidf_csv = final_test(recommendation_list)
tfidf_csv.head()

Unnamed: 0,Title,Recommendation,Cosine Similarity
98,Fusing Sentence Embeddings Into LSTM-based Aut...,Progressive Fusion for Multimodal Integration,0.243919
99,Fusing Sentence Embeddings Into LSTM-based Aut...,Relaxed Attention for Transformer Models,0.255758


In [29]:
tfidf_csv.to_csv("tfidf_final_test.csv")

## KNN

In [13]:
neigh = NearestNeighbors(n_neighbors=10, metric="cosine")

In [14]:
vectorizer = CountVectorizer()
featurevectors = vectorizer.fit_transform ( df_train.Summary.tolist())
featurevectors

<9990x33897 sparse matrix of type '<class 'numpy.int64'>'
	with 1103301 stored elements in Compressed Sparse Row format>

In [15]:
neigh.fit ( featurevectors)

In [18]:
def knn_recommendation(title, model):
    text = vectorizer.transform([title]).toarray()
    final_knn = model.kneighbors(text, return_distance=True)
    final_knn_score = final_knn[0][0].tolist()
    final_knn_index = final_knn[1][0].tolist()
    title_list = [df.loc[i].Title for i in final_knn_index]
    Url_list = ["http://arxiv.org/abs/"+df.loc[i].Id for i in final_knn_index]
    return title_list, Url_list, final_knn_score

In [19]:
knn_rec_list = [knn_recommendation(df.loc[i].Title,neigh) for i in range(9990,10000) ]

In [21]:
knn_rec_list[9]

(['Bidirectional Language Models Are Also Few-shot Learners',
  'Probing for Incremental Parse States in Autoregressive Language Models',
  'Self-conditioned Embedding Diffusion for Text Generation',
  'SSD-LM: Semi-autoregressive Simplex-based Diffusion Language Model for Text Generation and Modular Control',
  'DiffusER: Discrete Diffusion via Edit-based Reconstruction',
  "Introducing Vision Transformer for Alzheimer's Disease classification task with 3D input",
  'DiffuSeq: Sequence to Sequence Text Generation with Diffusion Models',
  'Non-Autoregressive Sign Language Production via Knowledge Distillation',
  'RobBERT-2022: Updating a Dutch Language Model to Account for Evolving Language Use',
  'PromptCast: A New Prompt-based Learning Paradigm for Time Series Forecasting'],
 ['http://arxiv.org/abs/2209.14500v1',
  'http://arxiv.org/abs/2211.09748v1',
  'http://arxiv.org/abs/2211.04236v1',
  'http://arxiv.org/abs/2210.17432v1',
  'http://arxiv.org/abs/2210.16886v1',
  'http://arxi

In [None]:
knn_csv = final_test(recommendation_list=knn_rec_list)
knn_csv.head(2)

Unnamed: 0,Title,Recommendation,Cosine Similarity
0,DL-DRL: A double-layer deep reinforcement lear...,When does deep learning fail and how to tackle...,0.673729
1,DL-DRL: A double-layer deep reinforcement lear...,Is Multi-Task Learning an Upper Bound for Cont...,0.681929


In [162]:
knn_csv.to_csv("KNN_FINAL_TEST.csv")

## SentenceTransformer

In [22]:
from sentence_transformers import SentenceTransformer, util

In [23]:
model_ST = SentenceTransformer('paraphrase-MiniLM-L3-v2')
sentences1 = df_train.Summary.tolist()
sentences2 = df_test.Summary.tolist()

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [24]:
embeddings1 = model_ST.encode(sentences1, convert_to_tensor=True)
embeddings2 = model_ST.encode(sentences2, convert_to_tensor=True)

In [25]:
cosine_scores = util.cos_sim(embeddings1, embeddings2)
cosine_scores

tensor([[0.1608, 0.2284, 0.2609,  ..., 0.3062, 0.1687, 0.1272],
        [0.1808, 0.2275, 0.3163,  ..., 0.3334, 0.2178, 0.2478],
        [0.1368, 0.2660, 0.1318,  ..., 0.1259, 0.1986, 0.1337],
        ...,
        [0.2703, 0.4043, 0.2972,  ..., 0.3978, 0.3235, 0.4670],
        [0.2977, 0.4607, 0.3381,  ..., 0.4372, 0.3473, 0.3409],
        [0.4257, 0.5019, 0.3317,  ..., 0.3102, 0.7870, 0.3433]],
       device='cuda:0')

In [26]:
top10_list  = [torch.topk(cosine_scores[:,i], 10).indices.tolist() for i in range(10)]

In [27]:
ST_recs = []
for e, top in enumerate(top10_list):
    title_ST = [df_train.iloc[i].Title for i in top]
    url_ST = ["http://arxiv.org/abs/"+ df_train.iloc[i].Id for i in top]
    ST_recs.append({"Title":title_ST, "URL": url_ST, "ST_score":torch.topk(cosine_scores[:,e], 10).values.tolist()})

In [29]:
ST_recs[9]

{'Title': ["Don't Prompt, Search! Mining-based Zero-Shot Learning with Language Models",
  'i-MAE: Are Latent Representations in Masked Autoencoders Linearly Separable?',
  'Few-shot Text Classification with Dual Contrastive Consistency',
  'Masked Modeling Duo: Learning Representations by Encouraging Both Networks to Model the Input',
  'Fine-Tuning Pre-trained Transformers into Decaying Fast Weights',
  'Masked Contrastive Representation Learning',
  'Replacing Language Model for Style Transfer',
  'Probing for Incremental Parse States in Autoregressive Language Models',
  'On Reality and the Limits of Language Data',
  'How Mask Matters: Towards Theoretical Understandings of Masked Autoencoders'],
 'URL': ['http://arxiv.org/abs/2210.14803v1',
  'http://arxiv.org/abs/2210.11470v1',
  'http://arxiv.org/abs/2209.15069v1',
  'http://arxiv.org/abs/2210.14648v1',
  'http://arxiv.org/abs/2210.04243v1',
  'http://arxiv.org/abs/2211.06012v1',
  'http://arxiv.org/abs/2211.07343v1',
  'http://

In [160]:
title_test_100 = []
score_test_100 = []
recs_test_100 = []
for i in range(10):
    title_test_100 += [df_test.loc[i + 9990].Title]*10
    recs_test_100  += ST_recs[i]["Title"]
    score_test_100 += ST_recs[i]["ST_score"]
d = {"Title":title_test_100,"Recommendation":recs_test_100,"Cosine_Similarity":score_test_100}
final_SenTrans = pd.DataFrame(data=d)
final_SenTrans.sample(2)

Unnamed: 0,Title,Recommendation,Cosine_Similarity
25,Node Copying: A Random Graph Model for Effecti...,A Framework for Large Scale Synthetic Graph Da...,0.707458
86,MOVE: Effective and Harmless Ownership Verific...,Efficiently Finding Adversarial Examples with ...,0.751383


In [163]:
final_SenTrans.to_csv("SBERT_FINAL_TEST.csv")