In [1]:
import pandas as pd
from tqdm import tqdm
import time

# Rate the papers

In [None]:
# Load the three ranked result tables. Titles are case-folded so the same
# paper can be matched across tables regardless of capitalisation.
# (gs / pc / ss presumably stand for three different search sources -- confirm.)
gs_df = pd.read_csv('./data/gs_results.csv').assign(
    title=lambda frame: frame['title'].str.casefold()
)
pc_df = pd.read_csv('./data/pc_results.csv').assign(
    title=lambda frame: frame['title'].str.casefold()
)
ss_df = pd.read_csv('./data/ss_results.csv').assign(
    title=lambda frame: frame['title'].str.casefold()
)

In [None]:
# Preview the first rows of gs_df to sanity-check the load.
gs_df.head()

In [None]:
# Preview the first rows of pc_df to sanity-check the load.
pc_df.head()

In [None]:
# Preview the first rows of ss_df to sanity-check the load.
ss_df.head()

In [None]:
# Rank cut-offs used to turn a per-source rank into a rating.
TIER1 = 40
TIER2 = 100
TIER3 = 200


def _rank_to_rate(rank):
    """Return 5 for ranks within TIER1, 4 for ranks within TIER2, otherwise 3."""
    if rank <= TIER1:
        return 5
    if rank <= TIER2:
        return 4
    return 3


# gs_df is cut off at TIER3, so every remaining gs rank falls into one of the
# three rating buckets above (the former "else 2" branch could never fire).
gs_rate_df = gs_df.loc[gs_df['rank'] <= TIER3, ['area', 'rank', 'title']].copy()
pc_rate_df = pc_df.copy()
ss_rate_df = ss_df[['area', 'rank', 'title']].copy()
for source_df in (gs_rate_df, pc_rate_df, ss_rate_df):
    source_df['rate'] = source_df['rank'].apply(_rank_to_rate)

# Outer-join the three sources on (area, title) so a paper found by only one
# source is still kept; its missing columns are filled in below.
rate_df = gs_rate_df.merge(pc_rate_df, on=['area', 'title'], how='outer', suffixes=('_gs', '_pc'))
rate_df = rate_df.merge(ss_rate_df, on=['area', 'title'], how='outer')
rate_df = rate_df.rename(columns={'rank': 'rank_ss', 'rate': 'rate_ss'})

# A paper missing from a source gets the lowest rating (1) and a sentinel rank
# beyond that source's range. Filling through the DataFrame (instead of
# `rate_df[col].fillna(..., inplace=True)`) avoids chained assignment, which
# warns in pandas 2.x and silently does nothing under Copy-on-Write.
# NOTE(review): gs uses 401 while pc/ss use 201 -- confirm this asymmetry is intended.
rate_df = rate_df.fillna({
    'rate_ss': 1,
    'rank_ss': 201,
    'rate_pc': 1,
    'rank_pc': 201,
    'rate_gs': 1,
    'rank_gs': 401,
})

# Weighted aggregate across sources (gs 0.3, pc 0.4, ss 0.3).
rate_df['agg_rate'] = (rate_df['rate_gs']*0.3 + rate_df['rate_pc']*0.4 + rate_df['rate_ss']*0.3)
rate_df['agg_rank'] = (rate_df['rank_gs']*0.3 + rate_df['rank_pc']*0.4 + rate_df['rank_ss']*0.3)

# Within each area: best aggregate rating first, ties broken by lower aggregate
# rank; the final per-area position becomes the new 'rank'.
rate_df = rate_df.sort_values(by=['area', 'agg_rate', 'agg_rank'], ascending=[True, False, True]).reset_index(drop=True)
rate_df['rank'] = rate_df.groupby('area').cumcount()+1

In [None]:
# Preview the combined per-area rating table.
rate_df.head()

# Get the total dataframe that contains download information and the pdf link

## Fill info from Semantic Scholar

In [None]:
import requests

In [None]:
# Attach the PDF link (taken from gs_df) and the abstract / citation columns
# (taken from ss_df) to every rated paper. Left joins keep all rows of rate_df.
merge_keys = ['area', 'title']
ss_info_cols = ['abstract', 'citationCount', 'referenceCount', 'influentialCitationCount', 'fieldsOfStudy']
total_df = (
    rate_df
    .merge(gs_df[merge_keys + ['pdf_link']], on=merge_keys, how='left')
    .merge(ss_df[merge_keys + ss_info_cols], on=merge_keys, how='left')
)

In [None]:
# Use '' as the "missing" marker for both text columns; later cells test
# `== ''` to find rows that still need a lookup.
# Filling through the DataFrame avoids `total_df[col].fillna(..., inplace=True)`,
# a chained assignment that warns in pandas 2.x and does nothing under
# Copy-on-Write.
total_df = total_df.fillna({'pdf_link': '', 'abstract': ''})

In [None]:
# Checkpoint the merged table (without the index column) before the slow API lookups below.
total_df.to_csv('./data/total_results.csv', index=False)

In [None]:
# Index labels of papers the lookup cell below could not find; starts empty.
drop_idx = []
# Independent copy of the rows whose abstract is still the '' placeholder.
missing_abstract_mask = total_df['abstract'].eq('')
lack_info_df = total_df.loc[missing_abstract_mask].copy()

In [None]:
# Query the Semantic Scholar paper-search endpoint for every paper that still
# lacks an abstract and copy the returned metadata back into total_df.
# The cell can be re-run: papers not found in the previous run are skipped.
SS_SEARCH_URL = 'https://api.semanticscholar.org/graph/v1/paper/search'
SS_FIELDS = 'abstract,referenceCount,citationCount,influentialCitationCount'
SS_RETRY_WAIT_SECONDS = 120  # pause after the API answers with a 'message'
SS_MAX_ATTEMPTS = 3          # attempts per paper before giving up for this run

# Drop the papers the previous run could not find, then start a fresh list.
# errors='ignore' keeps a re-run from failing on labels that are already gone.
lack_info_df = lack_info_df.drop(index=drop_idx, errors='ignore')
drop_idx = []
for idx, paper in tqdm(lack_info_df.iterrows(), total=lack_info_df.shape[0]):
    if paper['abstract'] != '':
        continue

    payload = None
    for _attempt in range(SS_MAX_ATTEMPTS):
        # `params` lets requests URL-encode the title; hand-building the query
        # string only replaced spaces and broke on characters such as '&' or '#'.
        response = requests.get(
            SS_SEARCH_URL,
            params={'query': paper['title'], 'limit': 1, 'fields': SS_FIELDS},
            timeout=30,
        )
        payload = response.json()
        message = payload.get('message')
        if message is None:
            break
        # The API returned a message instead of results (presumably rate
        # limiting -- confirm). Wait, then retry the SAME paper rather than
        # silently skipping it as the previous version did.
        print(message)
        time.sleep(SS_RETRY_WAIT_SECONDS)
    else:
        # Every attempt returned a message: leave the row untouched so a later
        # re-run of this cell can pick it up again.
        continue

    matches = payload.get('data')
    if not matches:
        drop_idx.append(idx)
        continue

    best_match = matches[0]
    # The API may return null for the abstract; keep the '' placeholder so the
    # later `== ''` checks still recognise the row as missing an abstract.
    paper['abstract'] = best_match.get('abstract') or ''
    paper['citationCount'] = best_match.get('citationCount')
    paper['referenceCount'] = best_match.get('referenceCount')
    paper['influentialCitationCount'] = best_match.get('influentialCitationCount')
    total_df.loc[idx] = paper

In [None]:
# NOTE(review): no visible cell writes './data/total_results_with_info.csv';
# it was presumably saved manually after the Semantic Scholar lookup. A fresh
# "Restart & Run All" fails here unless that file already exists -- confirm.
total_df = pd.read_csv('./data/total_results_with_info.csv')
# Empty CSV fields are read back as NaN; restore the '' placeholder on the
# DataFrame itself (a column-level `fillna(..., inplace=True)` is a chained
# assignment that does nothing under pandas Copy-on-Write).
total_df = total_df.fillna({'pdf_link': '', 'abstract': ''})

## Fill info from arXiv

In [None]:
import xmltodict

In [None]:
# Rows whose pdf_link is still the '' placeholder.
lack_link_df = total_df[total_df['pdf_link'] == '']

In [None]:
# Search the arXiv API by title for every paper without a PDF link and copy
# the PDF link (plus the abstract, if still missing) back into total_df.
# NOTE(review): the first search hit is accepted without checking that its
# title actually matches the paper -- confirm this is acceptable.
ARXIV_API_URL = 'http://export.arxiv.org/api/query'

# `drop_idxs` is only created by this cell, so it does not exist the first
# time the cell runs on a fresh kernel (earlier cells define `drop_idx`).
if 'drop_idxs' not in globals():
    drop_idxs = []
# Skip the papers the previous run could not find, then start a fresh list.
lack_link_df = lack_link_df.drop(index=drop_idxs, errors='ignore')
drop_idxs = []
for idx, paper in tqdm(lack_link_df.iterrows(), total = lack_link_df.shape[0]):
    if paper['pdf_link'] != '':
        continue

    # `params` lets requests URL-encode the title instead of only replacing spaces.
    response = requests.get(
        ARXIV_API_URL,
        params={'search_query': f"ti:{paper['title']}", 'start': 0, 'max_results': 1},
        timeout=30,
    )
    feed = xmltodict.parse(response.text)['feed']
    entry = feed.get('entry')
    if entry is None:
        drop_idxs.append(idx)
        continue
    if isinstance(entry, list):
        entry = entry[0]

    # xmltodict yields a dict for a single <link> and a list for several, so
    # normalise to a list and select the PDF link by its attributes rather
    # than by position.
    links = entry.get('link') or []
    if not isinstance(links, list):
        links = [links]
    pdf_href = next(
        (
            link['@href']
            for link in links
            if link.get('@title') == 'pdf' or link.get('@type') == 'application/pdf'
        ),
        None,
    )
    if pdf_href is not None:
        paper['pdf_link'] = pdf_href
    if paper['abstract'] == '' and entry.get('summary'):
        paper['abstract'] = entry['summary'].replace('\n',' ')
    total_df.loc[idx] = paper

In [None]:
# Number of papers that still have no PDF link.
sum(total_df['pdf_link'] == '')

In [None]:
# Rows whose abstract is still the '' placeholder.
lack_abstract_df = total_df[total_df['abstract'] == '']

In [None]:
# Search the arXiv API by title for every paper that still lacks an abstract
# and copy the returned summary back into total_df.
# NOTE(review): the first search hit is accepted without checking that its
# title actually matches the paper -- confirm this is acceptable.
ARXIV_API_URL = 'http://export.arxiv.org/api/query'

# `drop_idxs` may not exist yet if this cell runs before the link-lookup cell.
if 'drop_idxs' not in globals():
    drop_idxs = []
# The labels in `drop_idxs` can come from the link-lookup cell, whose rows are
# not necessarily in this frame; errors='ignore' skips labels that are absent
# instead of raising KeyError.
lack_abstract_df = lack_abstract_df.drop(index=drop_idxs, errors='ignore')
drop_idxs = []
for idx, paper in tqdm(lack_abstract_df.iterrows(), total = lack_abstract_df.shape[0]):
    if paper['abstract'] != '':
        continue

    # `params` lets requests URL-encode the title instead of only replacing spaces.
    response = requests.get(
        ARXIV_API_URL,
        params={'search_query': f"ti:{paper['title']}", 'start': 0, 'max_results': 1},
        timeout=30,
    )
    feed = xmltodict.parse(response.text)['feed']
    entry = feed.get('entry')
    if isinstance(entry, list):
        entry = entry[0]
    if entry is None or not entry.get('summary'):
        drop_idxs.append(idx)
        continue

    paper['abstract'] = entry['summary'].replace('\n',' ')
    total_df.loc[idx] = paper

In [None]:
# Number of papers that still have no abstract.
sum(total_df['abstract'] == '')

## Re-rank

In [2]:
# Keep only the papers for which a PDF link is known.
has_pdf_link = total_df['pdf_link'].ne('')
total_df = total_df.loc[has_pdf_link]

NameError: name 'total_df' is not defined

In [144]:
# Re-sort within each area (best aggregate rating first, ties broken by lower
# aggregate rank) and renumber 'rank' as the 1-based position inside the area.
total_df = (
    total_df
    .sort_values(by=['area', 'agg_rate', 'agg_rank'], ascending=[True, False, True])
    .reset_index(drop=True)
    .assign(rank=lambda frame: frame.groupby('area').cumcount() + 1)
)

In [147]:
# Save without the index: it was just reset to a plain 0..n-1 range and carries
# no information. Writing it would add an 'Unnamed: 0' column every time the
# file is read back with pd.read_csv and saved again.
total_df.to_csv('./data/total_data.csv', index=False)

# Generate the aria2 download file

In [3]:
import os

In [3]:
# Research areas handled by this notebook. The directory-creation cell below
# makes one './data/pdf/<area>' folder per entry (lower-cased, spaces -> '_').
AREAS = [
'Semantic segmentation',
'Image classification',
'Object detection',
'Object Recognition',
'Domain adaptation',
'Image generation',
'Image Captioning',
'Image augmentation',
'Pose estimation',
'Autonomous vehicles',
'Denoising',
'Super-Resolution',
'Object Tracking',
'Action Recognition',
'Face Recognition',
'Depth Estimation',
'Optical Character Recognition',
'3D Reconstruction',
'Image Retrieval',
'Optical Flow Estimation',
'Style Transfer',
'Image Compression']

In [4]:
# Reload the saved table from disk (presumably so the cells below can run
# without re-executing the API lookups above -- confirm).
total_df = pd.read_csv('./data/total_data.csv')

In [12]:
# Create one download directory per area under ./data/pdf.
# os.makedirs also creates the missing parent './data/pdf' (os.mkdir would
# raise FileNotFoundError), and exist_ok=True makes the cell safe to re-run.
for area in AREAS:
    area_formatted = area.lower().replace(' ', '_')
    os.makedirs(f'./data/pdf/{area_formatted}', exist_ok=True)

In [12]:
# Write the aria2 input file: for each paper one URI line followed by an
# indented `out=` option line naming the target PDF path.
# The output file name is built from the first character of every word in the
# title.
# NOTE(review): two papers in the same area whose titles share the same
# initials map to the same output path -- confirm collisions are acceptable.
file_names = []
with open('data_download_aria2.txt', 'w', encoding='utf-8') as f:
    for idx, paper in tqdm(total_df.iterrows(), total = total_df.shape[0]):
        area_formatted = paper['area'].lower().replace(' ', '_')
        # split() without an argument skips empty strings, so leading, trailing
        # or repeated spaces in a title no longer raise IndexError on word[0].
        file_name = ''.join(word[0] for word in paper['title'].split())
        file_names.append(file_name)
        f.write(paper['pdf_link'] + '\n')
        f.write(f'\tout=./data/pdf/{area_formatted}/{file_name}.pdf\n')

100%|██████████| 11359/11359 [00:00<00:00, 30166.65it/s]


In [13]:
# Store the generated file-name stems; file_names was built in the same row
# order as total_df by the aria2-writing loop.
total_df['file_name'] = file_names

In [14]:
# Save without the index so that reloading the file with pd.read_csv does not
# gain another 'Unnamed: 0' column on every save/load round trip.
total_df.to_csv('./data/total_data.csv', index=False)

In [13]:
from PyPDF2 import PdfFileReader

In [15]:
# Rewrite the aria2 input file so it lists only the papers that still need a
# download: PDFs already on disk that open with at least one page are skipped,
# unreadable ones are deleted and queued again. Links containing 'arxiv' are
# never written to this file.
# NOTE(review): this overwrites the file produced by the earlier aria2 cell.
# NOTE(review): PdfFileReader / getNumPages are the legacy PyPDF2 API. If the
# installed PyPDF2 rejects them, every existing file would be treated as
# invalid and deleted -- confirm the installed version before running.
with open('data_download_aria2.txt', 'w', encoding='utf-8') as f:
    for idx, paper in tqdm(total_df.iterrows(), total = total_df.shape[0]):
        area_formatted = paper['area'].lower().replace(' ', '_')
        pdf_link = paper['pdf_link']
        # split() without an argument skips empty strings, so leading, trailing
        # or repeated spaces in a title no longer raise IndexError on word[0].
        file_name = ''.join(word[0] for word in paper['title'].split())
        file_path = f'./data/pdf/{area_formatted}/{file_name}.pdf'
        if os.path.exists(file_path):
            try:
                # Additionally validate the file via its page count.
                is_valid = PdfFileReader(file_path).getNumPages() >= 1
            except Exception:
                # `except Exception` (not a bare `except`) so that interrupting
                # the cell with Ctrl-C is not mistaken for a corrupt PDF and
                # followed by deleting a good file.
                is_valid = False
            if is_valid:
                continue
            os.remove(file_path)
        if 'arxiv' in pdf_link:
            continue
        f.write(pdf_link + '\n')
        f.write(f'\tout=./data/pdf/{area_formatted}/{file_name}.pdf\n')

100%|██████████| 11359/11359 [00:27<00:00, 414.27it/s]
