In [66]:
import itertools
from urllib.parse import quote

import pandas as pd
import requests
import untangle


class APIOperator:
    """Build arXiv API search URLs, download the feeds and extract article records."""

    def findsubsets(self, s, n):
        """Return every ``n``-element combination of ``s`` as a list of tuples.

        A negative ``n`` yields an empty list rather than the ``ValueError``
        that ``itertools.combinations`` raises for a negative size.
        """
        if n < 0:
            return []
        return list(itertools.combinations(s, n))

    def create_query_urls(self, iterable, max_results=10):
        """Build one arXiv search URL per keyword subset.

        Each subset contains all but two of the given keywords; its members
        are joined with spaces into a single ``all:`` query.

        Args:
            iterable: Keywords (strings). Any iterable is accepted, including
                generators.
            max_results: Value placed in the ``max_results`` URL parameter.

        Returns:
            list[str]: The query URLs. Empty when there are too few keywords
            to form a non-empty query.
        """
        # Materialise once so generators work and len() is always available.
        terms = list(iterable)

        urls = []
        for subset in self.findsubsets(terms, len(terms) - 2):
            query = " ".join(subset)
            if not query:
                # A zero-length subset would produce a blank "all:" search.
                continue
            # Percent-encode so characters such as '&', '#' or '+' inside a
            # keyword cannot break out of the search_query parameter.
            urls.append(
                "http://export.arxiv.org/api/query?search_query=all:"
                + quote(query)
                + "&max_results=" + str(max_results)
            )

        return urls

    def parse_xml(self, url, timeout=30):
        """Download ``url`` and return the ``feed`` root element of its XML.

        Args:
            url: Address to fetch.
            timeout: Seconds passed to ``requests.get`` so a stalled
                connection cannot block forever.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.get(url, timeout=timeout)
        # Fail loudly on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()

        return untangle.parse(response.text).feed

    def get_articles(self, feed):
        """Convert the entries of ``feed`` into a list of article dicts.

        Each dict has the keys ``title``, ``summary``, ``pdf``,
        ``published_date`` and ``authors`` (a list of author names).

        Extraction is best-effort: a feed without entries gives an empty
        list, and an entry that lacks one of the expected fields is skipped
        without discarding the entries that follow it.
        """
        try:
            entries = iter(feed.entry)
        except (AttributeError, TypeError):
            # No entries in the feed, or the object is not a feed at all.
            return []

        articles = []
        for item in entries:
            try:
                article = {
                    "title": item.title.cdata,
                    "summary": item.summary.cdata,
                    # Swap only the "/abs/" path segment; a plain
                    # replace("abs", "pdf") would also rewrite "abs"
                    # appearing elsewhere in the identifier.
                    "pdf": item.id.cdata.replace("/abs/", "/pdf/", 1),
                    "published_date": item.published.cdata,
                    "authors": [author.name.cdata for author in item.author],
                }
            except (AttributeError, TypeError):
                # Malformed entry: skip it and keep processing the rest.
                continue
            articles.append(article)

        return articles

    def get_article_queries(self, urls):
        """Fetch every URL in ``urls`` and return all extracted articles in one list."""
        articleList = []
        for url in urls:
            articleList.extend(self.get_articles(self.parse_xml(url)))

        return articleList


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def get_similarity(doc1, doc2):
    """Score the first document of ``doc2`` against ``doc1`` with TF-IDF cosine similarity.

    The vectorizer is fitted on ``doc1`` only and ``doc2`` is transformed with
    that fitted vocabulary. The similarities between every ``doc1`` document
    and the first ``doc2`` document are averaged, rounded to three decimals
    and multiplied by 100.

    Args:
        doc1: Iterable of reference text documents.
        doc2: Iterable of text documents; only the first one contributes to
            the returned score.

    Returns:
        The mean similarity expressed on a 0-100 scale.
    """
    tfidf = TfidfVectorizer()
    reference_matrix = tfidf.fit_transform(doc1)
    candidate_matrix = tfidf.transform(doc2)

    # Rows follow doc1, columns follow doc2.
    pairwise = cosine_similarity(reference_matrix, candidate_matrix)
    first_candidate_mean = pairwise.mean(axis=0)[0]

    return round(first_candidate_mean, 3) * 100

In [None]:
# Load the two CSV inputs. Both paths are relative, one directory above the
# current working directory.
profile = pd.read_csv("../profile.csv")
# NOTE(review): `res` is not referenced by any other cell shown here.
res = pd.read_csv("../query2.csv")

In [None]:
# Reference texts: the first three rows of the profile's "text" column.
allprofile = profile["text"].iloc[:3].tolist()

In [None]:
# Search terms that are combined into arXiv queries further down.
keywords = [
    'rotation',
    'representation',
    'learning',
    'keypoint',
    'detection',
    'map',
    'face',
    'data',
    'coded',
]

In [67]:
# Build the query URLs: one per combination that leaves out two of the
# keywords (36 URLs for the 9 keywords above), each using the default
# max_results of 10.
api = APIOperator()
urls = api.create_query_urls(keywords)

In [68]:
# Download and parse every query URL (one HTTP request per URL) and collect
# the extracted article dicts into a single list.
rs = api.get_article_queries(urls)

In [69]:
# Score each fetched article's summary against the reference profile texts;
# results[i] is the score of rs[i].
results = [get_similarity(allprofile, [txt['summary']]) for txt in rs]

In [70]:
# Rank the scores from highest to lowest. Column 0 holds the score and the
# index is the article's position in `rs`.
pd.DataFrame(results).sort_values(by=0,ascending=False)

Unnamed: 0,0
212,24.1
38,24.1
65,24.1
74,24.1
348,24.1
...,...
125,9.5
126,9.0
69,6.5
75,6.5
