In [1]:
!pip install gradio -qq

In [2]:
!pip install -U sentence-transformers -qq

In [3]:
import gradio as gr
import pandas as pd
import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer, util

In [4]:
# Load the paper table, keep the first 9990 rows, and retain only the
# columns the recommenders read (Id, Title, Summary).
data = (
    pd.read_csv("data_ml.csv")
    .iloc[:9990]
    .loc[:, ["Id", "Title", "Summary"]]
)

In [5]:
# Sentence-embedding model used by ST_recommendation; the weights are
# downloaded the first time this cell runs.
model = SentenceTransformer(model_name_or_path="paraphrase-MiniLM-L3-v2")

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [6]:
def consine_test(title):
    """Recommend papers whose summaries best match ``title`` under TF-IDF.

    A unigram TF-IDF model is fitted on ``data.Summary``, the query text is
    projected into the same space, and the rows with the highest cosine
    similarity are reported, best match first.

    Args:
        title: Free-text query (typically a paper title).

    Returns:
        A string with a numbered "TITLE LIST" section followed by a numbered
        "URL LIST" section, holding up to ten entries each.
    """
    tfidfvectorizer = TfidfVectorizer(ngram_range=(1, 1))
    # Keep the document-term matrix sparse: cosine_similarity accepts sparse
    # input, so there is no need to materialise a dense rows x vocabulary array.
    tfidfmatrix = tfidfvectorizer.fit_transform(data.Summary)
    query_vector = tfidfvectorizer.transform([title])
    scores = cosine_similarity(query_vector, tfidfmatrix)[0]

    # np.argpartition only guarantees *which* entries are the largest, not
    # their order; a descending argsort makes entry 1 the best match.
    top_n = min(10, scores.shape[0])
    ranked = np.argsort(-scores, kind="stable")[:top_n]

    # The indices are row positions in the TF-IDF matrix, so look rows up
    # positionally rather than by index label.
    matches = data.iloc[ranked]
    title_str = "".join(
        f"{rank}.  - {paper_title}\n\n"
        for rank, paper_title in enumerate(matches["Title"], start=1)
    )
    url_str = "".join(
        f"{rank}.  - http://arxiv.org/abs/{paper_id}\n"
        for rank, paper_id in enumerate(matches["Id"], start=1)
    )
    return f"TITLE LIST :\n{title_str}\n\nURL LIST :\n{url_str}"

In [7]:
def knn_recommendation(title):
    """Recommend papers via nearest-neighbour search over bag-of-words counts.

    A ``CountVectorizer`` is fitted on ``data.Summary`` and a cosine-distance
    ``NearestNeighbors`` index is built over the resulting count vectors. The
    query text is vectorised with the same vocabulary and its closest rows
    are reported, nearest first.

    Args:
        title: Free-text query (typically a paper title).

    Returns:
        A string with a numbered "TITLE LIST" section followed by a numbered
        "URL LIST" section, holding up to ten entries each.
    """
    vectorizer = CountVectorizer()
    # Keep the count matrix sparse. `.todense()` produces an `np.matrix`,
    # which recent scikit-learn versions refuse, and a dense copy of the
    # whole corpus is needlessly large.
    featurevectors = vectorizer.fit_transform(data.Summary.tolist())

    # Never ask for more neighbours than there are fitted rows.
    n_neighbors = min(10, featurevectors.shape[0])
    neigh = NearestNeighbors(
        n_neighbors=n_neighbors, algorithm="auto", leaf_size=30, metric="cosine"
    )
    neigh.fit(featurevectors)

    query_vector = vectorizer.transform([title])
    nearest = neigh.kneighbors(query_vector, return_distance=False)[0].tolist()

    # Neighbour indices are row positions in the fitted matrix, so look rows
    # up positionally rather than by index label.
    matches = data.iloc[nearest]
    title_str = "".join(
        f"{rank}.  - {paper_title}\n\n"
        for rank, paper_title in enumerate(matches["Title"], start=1)
    )
    url_str = "".join(
        f"{rank}.  - http://arxiv.org/abs/{paper_id}\n"
        for rank, paper_id in enumerate(matches["Id"], start=1)
    )
    return f"TITLE LIST :\n{title_str}\n\nURL LIST :\n{url_str}"

In [8]:
# Corpus embeddings are expensive to compute, so they are kept between calls.
# The cache is tied to the exact `data` and `model` objects it was built from
# and is rebuilt when either global is rebound.
_ST_CACHE = {"data": None, "model": None, "embeddings": None}


def ST_recommendation(title):
    """Recommend papers using sentence-embedding cosine similarity.

    Every summary in ``data.Summary`` is encoded with the global ``model``
    (once, then cached), the query is encoded the same way, and the rows with
    the highest cosine similarity are reported, best match first.

    Args:
        title: Free-text query (typically a paper title).

    Returns:
        A string with a numbered "TITLE LIST" section followed by a numbered
        "URL LIST" section, holding up to ten entries each.
    """
    # Re-encode the corpus only when `data` or `model` has been replaced,
    # instead of encoding every summary again on each query.
    if _ST_CACHE["data"] is not data or _ST_CACHE["model"] is not model:
        _ST_CACHE["embeddings"] = model.encode(
            data.Summary.tolist(), convert_to_tensor=True
        )
        _ST_CACHE["data"] = data
        _ST_CACHE["model"] = model
    corpus_embeddings = _ST_CACHE["embeddings"]

    query_embedding = model.encode(title, convert_to_tensor=True)
    cosine_scores = util.cos_sim(corpus_embeddings, query_embedding).reshape(1, -1)

    # torch.topk returns indices sorted by descending score; cap k so a corpus
    # with fewer than ten rows does not raise.
    top_k = min(10, cosine_scores.shape[1])
    top10 = torch.topk(cosine_scores, top_k).indices[0].tolist()

    # The indices are row positions in the embedding matrix, so look rows up
    # positionally rather than by index label.
    matches = data.iloc[top10]
    title_str = "".join(
        f"{rank}.  - {paper_title}\n\n"
        for rank, paper_title in enumerate(matches["Title"], start=1)
    )
    url_str = "".join(
        f"{rank}.  - http://arxiv.org/abs/{paper_id}\n"
        for rank, paper_id in enumerate(matches["Id"], start=1)
    )
    return f"TITLE LIST :\n{title_str}\n\nURL LIST :\n{url_str}"

In [10]:
# Example query title; not referenced by the code visible in this notebook.
sample = "A Nonlinear PID-Enhanced Adaptive Latent Factor Analysis Model"

def fn(model_choice, input):
    """Dispatch a query to the recommender selected in the UI.

    Args:
        model_choice: One of "tfidf_CosineSimilarity", "KNN" or
            "SentenceTransformers".
        input: Query text (typically a paper title).

    Returns:
        The formatted recommendation string from the chosen recommender, or a
        short instruction message when the query is blank or the model choice
        is missing or unrecognised.
    """
    # A blank query carries no signal for any recommender; say so instead of
    # returning arbitrary papers.
    if input is None or not str(input).strip():
        return "Please enter a paper title."
    if model_choice == "tfidf_CosineSimilarity":
        return consine_test(input)
    if model_choice == "KNN":
        return knn_recommendation(input)
    if model_choice == "SentenceTransformers":
        return ST_recommendation(input)
    # Previously this fell through and returned None, leaving the output blank.
    return (
        "Please choose a recommendation model: "
        "tfidf_CosineSimilarity, KNN or SentenceTransformers."
    )

# Build and launch the demo UI.
# `gr.Dropdown` replaces the deprecated `gr.inputs.Dropdown` namespace and
# matches the top-level `gr.Textbox` components used alongside it.
# NOTE: share=True publishes the app on a temporary public gradio.live URL.
gr.Interface(
    fn,
    [
        gr.Dropdown(choices=["tfidf_CosineSimilarity", "KNN", "SentenceTransformers"]),
        gr.Textbox(placeholder="Paper title"),
    ],
    gr.Textbox(placeholder="Title and URL of Recommended papers"),
    examples=[
        ["tfidf_CosineSimilarity", "Node Copying: A Random Graph Model for Effective Graph Sampling"],
        ["KNN", "Node Copying: A Random Graph Model for Effective Graph Sampling"],
        ["SentenceTransformers", "Node Copying: A Random Graph Model for Effective Graph Sampling"],
    ],
    title="Research Paper Recommendation",
).launch(share=True)




Running on local URL:  http://127.0.0.1:7861

Setting up a public link... we have recently upgraded the way public links are generated. If you encounter any problems, please report the issue and downgrade to gradio version 3.13.0
.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Running on public URL: https://82c3f9ae-9f86-4c8a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


