In [1]:
import os
import re
import json
import arxiv
import yaml
import logging
import argparse
import datetime
import requests

In [2]:
logging.basicConfig(format='[%(asctime)s %(levelname)s] %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)

base_url = "https://arxiv.paperswithcode.com/api/v0/papers/"
github_url = "https://api.github.com/search/repositories"
arxiv_url = "http://arxiv.org/"

In [3]:
def get_authors(authors, first_author = False):
    output = str()
    if first_author == False:
        output = ", ".join(str(author) for author in authors)
    else:
        output = authors[0]
    return output
def sort_papers(papers):
    output = dict()
    keys = list(papers.keys())
    keys.sort(reverse=True)
    for key in keys:
        output[key] = papers[key]
    return output    

In [4]:
def get_code_link(qword:str) -> str:
    """
    This short function was auto-generated by ChatGPT. 
    I only renamed some params and added some comments.
    @param qword: query string, eg. arxiv ids and paper titles
    @return paper_code in github: string, if not found, return None
    """
    # query = f"arxiv:{arxiv_id}"
    query = f"{qword}"
    params = {
        "q": query,
        "sort": "stars",
        "order": "desc"
    }
    r = requests.get(github_url, params=params)
    results = r.json()
    code_link = None
    if results["total_count"] > 0:
        code_link = results["items"][0]["html_url"]
    return code_link

In [5]:
import numpy as np
import pandas as pd
file_path = "CVPR_2022_accepted_papers.xls"
df = pd.read_excel(file_path, 'Sheet1', header=0) 
data_array = np.array(df)
article_name_list = data_array[:, 0].reshape(-1)

In [6]:
query = article_name_list[1]
max_results = 1

In [7]:
# test for the search_engine.results()
search_engine = arxiv.Search(
        query = query,
        max_results = max_results,
        sort_by = arxiv.SortCriterion.Relevance
    )
for result in search_engine.results():

        paper_id            = result.get_short_id()
        paper_title         = result.title
        paper_url           = result.entry_id
        code_url            = base_url + paper_id #TODO
        paper_abstract      = result.summary.replace("\n"," ")
        paper_authors       = get_authors(result.authors)
        paper_first_author  = get_authors(result.authors,first_author = True)
        primary_category    = result.primary_category
        publish_time        = result.published.date()
        update_time         = result.updated.date()
        comments            = result.comment
        print(f"Time = {update_time} title = {paper_title} author = {paper_first_author}")
        print(paper_url)
        

[03/10/2023 19:59:30 INFO] Requesting 1 results at offset 0
[03/10/2023 19:59:30 INFO] Requesting page of results
[03/10/2023 19:59:31 INFO] Got first page; 1 of 1 results available


Time = 2022-03-28 title = Compositional Temporal Grounding with Structured Variational Cross-Graph Correspondence Learning author = Juncheng Li
http://arxiv.org/abs/2203.13049v2


In [8]:
def get_papers_content(topic,query="slam", max_results=2):
    """
    @param topic: str
    @param query: str
    @return paper_with_code: dict
    """
    # output 
    content = dict() 
    content_to_web = dict()
    search_engine = arxiv.Search(
        query = query,
        max_results = max_results,
        sort_by = arxiv.SortCriterion.Relevance
    )
    paper_pdf_url_list = []
    for result in search_engine.results():

        paper_id            = result.get_short_id()
        paper_title         = result.title
        paper_url           = result.entry_id
        code_url            = base_url + paper_id #TODO
        paper_abstract      = result.summary.replace("\n"," ")
        paper_authors       = get_authors(result.authors)
        paper_first_author  = get_authors(result.authors,first_author = True)
        primary_category    = result.primary_category
        publish_time        = result.published.date()
        update_time         = result.updated.date()
        comments            = result.comment
        print(f'successfully get {paper_title}')

        # eg: 2108.09112v1 -> 2108.09112
        ver_pos = paper_id.find('v')
        if ver_pos == -1:
            paper_key = paper_id
        else:
            paper_key = paper_id[0:ver_pos]    
        paper_url = arxiv_url + 'abs/' + paper_key
        # https://arxiv.org/pdf/2010.04159.pdf
        paper_pdf_url = arxiv_url + 'pdf/' + paper_key + '.pdf'
        https_paper_pdf_url = paper_pdf_url.split(':')[0] + 's:' + paper_pdf_url.split(':')[1]
        paper_pdf_url_list.append(https_paper_pdf_url)
        
        try:
            # source code link    
            r = requests.get(code_url).json()
            repo_url = None
            if "official" in r and r["official"]:
                repo_url = r["official"]["url"]
            # TODO: not found, two more chances  
            # else: 
            #    repo_url = get_code_link(paper_title)
            #    if repo_url is None:
            #        repo_url = get_code_link(paper_key)
            if repo_url is not None:
                content[paper_key] = "|**{}**|**{}**|{} et.al.|[{}]({})|[PDF_link]({})|**[Code_link]({})**|\n".format(
                       update_time,paper_title,paper_first_author,paper_key,paper_url,paper_pdf_url,repo_url)
                content_to_web[paper_key] = "- {}, **{}**, {} et.al., Paper: [{}]({}), Code: **[{}]({})**".format(
                       update_time,paper_title,paper_first_author,paper_url,paper_url,repo_url,repo_url)

            else:
                content[paper_key] = "|**{}**|**{}**|{} et.al.|[{}]({})|null|\n".format(
                       update_time,paper_title,paper_first_author,paper_key,paper_url)
                content_to_web[paper_key] = "- {}, **{}**, {} et.al., Paper: [{}]({})".format(
                       update_time,paper_title,paper_first_author,paper_url,paper_url)

            # TODO: select useful comments
            comments = None
            if comments != None:
                content_to_web[paper_key] += f", {comments}\n"
            else:
                content_to_web[paper_key] += f"\n"

        except Exception as e:
            print((f"exception: {e} with id: {paper_key}"))

    data = {topic:content}
    return data, paper_pdf_url_list

In [9]:
paper_pdf_url = 'http://arxiv.org/pdf/2010.04159.pdf'
print(paper_pdf_url.split(':')[0] + 's' + paper_pdf_url.split(':')[1])


https//arxiv.org/pdf/2010.04159.pdf


In [10]:
def update_json_file(filename, data_all):
    with open(filename,"r") as f:
        content = f.read()
        if not content:
            m = {}
        else:
            m = json.loads(content)
            
    json_data = m.copy() 
    
    # update papers in each keywords         
    for data in data_all:
        # print(type(data))
        # print(data)
        for keyword in data.keys():
            papers = data[keyword]

            if keyword in json_data.keys():
                json_data[keyword].update(papers)
            else:
                json_data[keyword] = papers

    with open(filename,"w") as f:
        json.dump(json_data,f)
    
def json_to_md(filename):
    """
    @param filename: str
    @return None
    """
    
    DateNow = datetime.date.today()
    DateNow = str(DateNow)
    DateNow = DateNow.replace('-','.')
    
    with open(filename,"r") as f:
        content = f.read()
        if not content:
            data = {}
        else:
            data = json.loads(content)

    md_filename = "README.md"  
      
    # clean README.md if daily already exist else create it
    with open(md_filename,"w+") as f:
        pass

    # write data into README.md
    with open(md_filename,"a+") as f:
  
        f.write("## Updated on " + DateNow + "\n\n")
        
        for keyword in data.keys():
            day_content = data[keyword]
            if not day_content:
                continue
            # the head of each part
            f.write(f"## {keyword}\n\n")
            f.write("|Publish Date|Title|Authors|Abstract|PDF|Code link|\n" + "|---|---|---|---|---|---|\n")
            # sort papers by date
            day_content = sort_papers(day_content)
        
            for _,v in day_content.items():
                if v is not None:
                    f.write(v)

            f.write(f"\n")
    print("finished")     


In [11]:
import time 
# from selenium import webdriver
import requests
import threading
import os 
import time

def Handler(start, end, url, filename): 
    # specify the starting and ending of the file 
    headers = {'Range': 'bytes=%d-%d' % (start, end)} 
    # request the specified part and get into variable     
    r = requests.get(url, headers=headers, stream=True) 
    # open the file and write the content of the html page into file. 
    with open(filename, "r+b") as fp: 
        fp.seek(start) 
        var = fp.tell() 
        fp.write(r.content)

def download_file(url_of_file,name,number_of_threads): 
    r = requests.head(url_of_file) 
    if name: 
        file_name = name 
    else: 
        file_name = url_of_file.split('/')[-1] 
    try: 
        file_size = int(r.headers['content-length']) 
    except: 
        print("Invalid URL")
        return

    part = int(file_size) / number_of_threads 
    fp = open(file_name, "wb") 
    fp.close() 
    for i in range(number_of_threads): 
        start = int(part * i) 
        end = int(start + part) 
        # create a Thread with start and end locations 
        t = threading.Thread(target=Handler, 
            kwargs={'start': start, 'end': end, 'url': url_of_file, 'filename': file_name}) 
        t.setDaemon(True) 
        t.start() 

    main_thread = threading.current_thread() 
    for t in threading.enumerate(): 
        if t is main_thread: 
            continue
        t.join() 
    print('finish one')

save_path = 'C:/Users/14541/Desktop/CVPR_2022_articles'

In [13]:
data_collector = []
keywords = dict()
for i in article_name_list:
    keywords[f"{i}"] = i
pdf_url_list = []
for topic, keyword in keywords.items():
 
    print("Keyword: " + topic)
    data, pdf_url_list = get_papers_content(topic, query = keyword, max_results = 1)
    data_collector.append(data)
    print("\n")


# update README.md file
json_file = "cv-arxiv-daily.json"
if ~os.path.exists(json_file):
    with open(json_file,'w')as a:
        print("create " + json_file)
# update json data
update_json_file(json_file, data_collector)
# json data to markdown
json_to_md(json_file)
# for pdf_url in pdf_url_list:
#     # download pdf using pdf_url
#     filename = pdf_url[-14:]
#     print('filename:{}, pdf_url:{}.'.format(filename,pdf_url))

#     # pdf_url = 'https://arxiv.org/pdf/1709.06508.pdf'

#     print('\nDownloading {} ...'.format(filename))
#     # pdf_url = 'https://arxiv.org/pdf/{}.pdf'.format(arxiv_id)
#     # filename = filename_replace(paper_title) + '.pdf'
#     ts = time.time()
#     download_file(url_of_file=pdf_url, name=os.path.join(save_path,filename),number_of_threads=1) 
#     te = time.time()
#     print('{:.0f}s [Complete] {}'.format(te-ts, filename))

[03/10/2023 20:02:42 INFO] Requesting 1 results at offset 0
[03/10/2023 20:02:42 INFO] Requesting page of results


Keyword: Cascade Transformers for End-to-End Person Search


[03/10/2023 20:02:43 INFO] Got first page; 1 of 1 results available


successfully get Cascade Transformers for End-to-End Person Search


[03/10/2023 20:02:44 INFO] Requesting 1 results at offset 0
[03/10/2023 20:02:44 INFO] Requesting page of results




Keyword: Compositional Temporal Grounding with Structured Variational Cross-Graph Correspondence Learning


[03/10/2023 20:02:45 INFO] Got first page; 1 of 1 results available


successfully get Compositional Temporal Grounding with Structured Variational Cross-Graph Correspondence Learning


[03/10/2023 20:02:46 INFO] Requesting 1 results at offset 0
[03/10/2023 20:02:46 INFO] Requesting page of results




Keyword: Long-Tailed Recognition via Weight Balancing


[03/10/2023 20:02:47 INFO] Got first page; 1 of 1 results available


successfully get Long-Tailed Recognition via Weight Balancing


[03/10/2023 20:02:47 INFO] Requesting 1 results at offset 0
[03/10/2023 20:02:47 INFO] Requesting page of results




Keyword: InfoGCN: Representation Learning for Human Skeleton-based Action Recognition


[03/10/2023 20:02:48 INFO] Got first page; 1 of 1 results available


successfully get Improving Human Action Recognition by Non-action Classification


[03/10/2023 20:02:49 INFO] Requesting 1 results at offset 0
[03/10/2023 20:02:49 INFO] Requesting page of results




Keyword: Interactive Geometry Editing of Neural Radiance Fields
