In [2]:
import lxml                # import the lxml library
import codecs              # import the codecs library
from bs4 import BeautifulSoup    # import the BeautifulSoup class from the bs4 library
import pandas as pd        # import the pandas library and give it an alias "pd" for easier use
from tqdm.notebook import tqdm  # import the tqdm library's notebook module
import numpy as np         # import the numpy library and give it an alias "np" for easier use
import glob                # import the glob library

In [6]:
xml_file_new = glob.glob("/home/harsh.d/arxiv/xml_new/*")   # creates a list of file paths that match the pattern in the specified directory
xml_file_old = glob.glob("/home/harsh.d/old_xml_files/*")       # creates a list of file paths that match the pattern in the specified directory
xml_files = xml_file_old + xml_file_new                    # concatenates the two lists of file paths into a single list


In [8]:
snap_df=pd.read_json("/home/harsh.d/Shoaib/arxiv_metadata/arxiv-metadata-oai-snapshot.json",lines=True)   # read the specified JSON file into a pandas DataFrame
snap_df['id']=snap_df['id'].str.replace('/','')    # remove forward slashes from the 'id' column of the DataFrame
snap_df['title']=snap_df['title'].str.lower()     # convert the text in the 'title' column of the DataFrame to lowercase


In [4]:
"""Get Unique Papers based on Title+Authors"""
def get_titles_authors(file):

    titles=[]
    authors=[]
    doi=[]
    names=[]
    
    # Read XML
    with codecs.open(file,'r',"utf-8") as tei:
        soup = BeautifulSoup(tei, 'lxml')
    
    title=snap_df[snap_df['id']==file.split('/')[-1][:-8]]['title'].values[0].lower()
    # Paper Title
    if(title==''):
        title=soup.teiheader.title.get_text().lower()
        
#         print("Taken from Snap")
    
    
    # Paper Author Details    
    for author in soup.teiheader.find_all('author'):
    
        if(author.find('persname')!=None):
        
            if(author.find('persname').forename!=None and author.find('persname').surname!=None):
                names.append(author.find('persname').forename.get_text()[0].lower()+" "+author.find('persname').surname.get_text().lower())

            elif(author.find('persname').forename==None):
                names.append(author.find('persname').surname.get_text().lower())

            else:
                names.append(author.find('persname').forename[0].get_text().lower())
                
    titles.append(title)
    authors.append(names)
    
    # Dictionary containg <XML ID: Citation> format
    citations={}
    for targets in soup.find_all('ref'):
        if(targets.get('type')=='bibr' and targets.get('target')!=None):
            citations[targets.get('target')]=targets.get_text().lower()
    
    # Extract Citation, Reference Titles, Reference Authors 
    for ref in soup.back.find_all('biblstruct'):
        
        if('#'+ref.get("xml:id") in citations):
            
            cit=citations['#'+ref.get("xml:id")]
            
            xml_id=ref.get("xml:id")
            ref_title = ""
            for t in ref.find_all('title'):
                if(t.get_text()!=''):
                    ref_title=t.get_text().lower()
                    break
            
            if(ref.find(type="arXiv")!=None):
                ref_doi=ref.find(type="arXiv").get_text().lower()
            elif(ref.find(type="DOI")!=None):
                ref_doi=ref.find(type="DOI").get_text().lower()
            else:
                ref_doi=''
                
#             reference=ref.find_all(type='raw_reference')[0].get_text()

            ref_authors=[]
            for author in ref.find_all('author'):

                if(author.find('persname')!=None):

                    if(author.find('persname').forename!=None and author.find('persname').surname!=None):
                        ref_authors.append(author.find('persname').forename.get_text()[0].lower()+" "+author.find('persname').surname.get_text().lower())

                    elif(author.find('persname').forename==None):
                        ref_authors.append(author.find('persname').surname.get_text().lower())

                    else:
                        ref_authors.append(author.find('persname').forename.get_text()[0].lower())
                        
            titles.append(ref_title)
            authors.append(ref_authors)
            doi.append(ref_doi)
                        
    
    return file.split('/')[-1],titles,doi,authors 

In [9]:
len(xml_files)

1942302

In [10]:
import multiprocessing
outputs=[]

def driver_func():
    
    # Note: Keep Num_Process <= 45 to avoid system hang
    PROCESSES = 40
    
    with multiprocessing.Pool(PROCESSES) as pool:
        results=[pool.apply_async(get_titles_authors, args=(p,)) for p in tqdm(xml_files)]
        
        
        for r in tqdm(results):
            try:
                if(r.get(timeout=5)!=None):
                    outputs.append(r.get())
                
            except Exception as e:
                    pass
        pool.close()
        pool.join()

driver_func()

  0%|          | 0/1942302 [00:00<?, ?it/s]

  0%|          | 0/1942302 [00:00<?, ?it/s]

In [11]:
output_list=[]
for f,titles,doi,authors in tqdm(outputs):
    output_list+=[{'title':t,'authors':a} for t,d,a in zip(titles,doi,authors)]

  0%|          | 0/1939589 [00:00<?, ?it/s]

In [12]:
# remove papers with name starting with 'under consideration...'
temp = pd.DataFrame(output_list)
temp.drop(temp[temp['title'].str.startswith('under consideration')].index,inplace=True)

In [13]:
# remove duplicates
temp['authors_tup']=temp['authors'].apply(lambda x : tuple(x) if type(x) is list else x)
temp.drop_duplicates(subset=['title','authors_tup'],keep='first',inplace=True)

In [14]:
temp.index.name = 'id'
temp.reset_index(inplace=True)
temp=temp[['id','title','authors_tup']]

In [15]:
temp.set_index(['title','authors_tup'],inplace=True)
temp

Unnamed: 0_level_0,Unnamed: 1_level_0,id
title,authors_tup,Unnamed: 2_level_1
"structadmm: a systematic, high-efficiency framework of structured weight\n pruning for dnns","(t zhang, s ye, k zhang, x ma, n liu, l zhang, j tang, k ma, x lin, m fardad, y wang)",0
distributed optimization and statistical learning via the alternating direction method of multipliers,"([ references, boyd)",1
learning both weights and connections for efficient neural network,(),2
convergence analysis of alternating direction method of multipliers for a family of nonconvex problems,"(m luo, z hong, ; luo, hong)",3
learning structured sparsity in deep neural networks,(),4
...,...,...
l1-depth revisited: a robust angle-based outlier factor in high-dimensional space,"(n pham,)",62787071
locality adaptive discriminant analysis framework,"(x li, q wang, f nie, m chen)",62787072
linear discriminant analysis: new formulations and overfit analysis,"(d luo, c ding, h huang)",62787074
"robust 2dpca with non-greedy ¡inline-formula¿ ¡tex-math notation=""latex""¿ 1 ¡/tex-math¿¡/inline-formula¿-norm maximization for image analysis","(r wang, f nie, x yang, f gao, m yao)",62787079


In [16]:
temp.to_pickle("/home/harsh.d/Shoaib/arxiv_metadata/metadata_title_first_ids.pkl")