In [1]:
import pandas as pd
import re
import os
from shutil import copyfile, Error
from jinja2 import Environment, FileSystemLoader

# Initialization

In [2]:
path = '/home/User/Documents'
crp_file  = 'CameraReadyPapers-11-03-2021.xls'
crp_sheet = 'AISTATS2021'
org_pdf_folder = os.path.abspath('org_pdfs')
dest_pdf_folder = os.path.abspath('dest_pdfs')

# Read CMT export file

In [3]:
CRP = pd.read_excel(crp_file,sheet_name=crp_sheet)
CRP.head()

Unnamed: 0,Paper ID,Paper Title,Abstract,Author Names,Author Emails,Primary Contact Author Email,Track Name,Files,Q1 (Camera Ready Submission Instruction),Q2 (Submission Code),Q3 (Student Author)
0,4,On the Effect of Auxiliary Tasks on Representa...,While auxiliary tasks play a key role in shapi...,Clare Lyle (University of Oxford); Mark Rowlan...,clare.lyle@univ.ox.ac.uk; markrowland@google.c...,markrowland@google.com,AISTATS2021,4-Permission.pdf (1538064 bytes); 4-supp.pdf (...,Agreement accepted,NDkwN,No
1,6,LassoNet: Neural Networks with Feature Sparsity,Much work has been done recently to make neura...,Ismael Lemhadri (Stanford University)*; Feng R...,i.lemhadri@gmail.com; fengruan@stanford.edu; t...,i.lemhadri@gmail.com,AISTATS2021,6-Permission.pdf (374474 bytes); 6.pdf (120698...,Agreement accepted,OWEwZ,Yes
2,7,Projection-Free Optimization on Uniformly Conv...,The Frank-Wolfe method solves smooth constrain...,Thomas Kerdreux (INRIA/ ENS)*; Alexandre d'Asp...,thomaskerdreux@gmail.com; aspremon@ens.fr; pok...,thomaskerdreux@gmail.com,AISTATS2021,7-supp.pdf (314961 bytes); 7.pdf (553162 bytes...,Agreement accepted,ODc2Y,No
3,8,Differentiable Greedy Algorithm for Monotone S...,"Motivated by, e.g., sensitivity analysis and e...",Shinsaku Sakaue (The University of Tokyo)*,sakaue@mist.i.u-tokyo.ac.jp,sakaue@mist.i.u-tokyo.ac.jp,AISTATS2021,8.pdf (767828 bytes); 8-supp.pdf (912627 bytes...,Agreement accepted,ZWE1N,
4,12,Graphical Normalizing Flows,Normalizing flows model complex probability di...,Antoine Wehenkel (University of Liège)*; Gille...,antoine.wehenkel@uliege.be; g.louppe@uliege.be,antoine.wehenkel@uliege.be,AISTATS2021,12-supp.pdf (1132933 bytes); 12.pdf (1193165 b...,Agreement accepted,NGEzM,


## Run some basic validations

In [4]:
CRP[["Q3 (Student Author)"]]

Unnamed: 0,Q3 (Student Author)
0,No
1,Yes
2,No
3,
4,
5,
6,Yes
7,Yes
8,Yes
9,No


# Generate the AISTATS2021 Proceeding

## Preprocess the meta data

In [5]:
# a dictionary containing papers as dictionaries. Each paper must consist of
# (1) title 
# (2) author (lastname, firstnames format separated by 'and') 
# (3) pages in “startpage–endpage” format
# (4) abstract
papers = {} 
identifiers = {}

pages_count = 1
num_pages = 9
YY = '21'
for index, row in CRP.iterrows():
    
    # read the paper information (paper id, title, authors, and abstract)
    paper_id = row['Paper ID']
    title    = row['Paper Title'].strip()
    authors  = row['Author Names'].strip()
    abstract = row['Abstract'].strip()
    files    = row['Files'].strip()
    
    # remove the newline character in the abstract
    abstract = " ".join(abstract.split())
    
    # extract filenames
    flist = list(filter(None,re.split(r'\(.*?bytes\);?',files)))
    flist = [f.strip() for f in flist]
    
    # preprocess the author names and extract the identifier (author's lastname) 
    alist = list(filter(None,re.split(r'\(.*?\)\*?;?', authors)))
    alist = [a.strip() for a in alist]
    
    first_author_lastname = alist[0].split()[-1]
    paper_key = first_author_lastname + YY
    if first_author_lastname in identifiers:
        paper_key += chr(ord('a')+identifiers[first_author_lastname])
        identifiers[first_author_lastname] += 1 
    else:
        identifiers[first_author_lastname] = 1
        
    # format the author list
    new_alist = []
    for i in range(len(alist)):  
        new_alist.append(', '.join([' '.join(alist[i].split()[1:]), alist[i].split()[0]]))
    author_list = ' and '.join(new_alist)
        
    # add the paper to the dictionary
    papers[paper_key] = {'key': paper_key,
                         'id': paper_id, 
                         'title': title, 
                         'authors': author_list,
                         'files': flist,
                         'pages':'{}-{}'.format(pages_count,pages_count+num_pages-1), 
                         'abstract': abstract} 
    pages_count += num_pages

In [6]:
print('Number of pages: {}, Number of papers: {}'.format(pages_count,(pages_count-1)/num_pages))

Number of pages: 4096, Number of papers: 455.0


## Preprocess the pdfs

In [7]:
def copy_file(src, dest):
    try:
        copyfile(src, dest)
    except Error as err: 
        errors.extend(err.args[0])    

In [8]:
problematic_papers = {}
form_not_found = 'permission form not found.'
main_not_found = 'main paper not found.'
supp_not_found = 'supplementary file may not exist.'
multiple_supps = 'multiple supplementary files.'

for iden in papers:
    paper_id   = papers[iden]['id']
    main_paper = '{}.pdf'.format(paper_id)
    supplement = '{}-supp'.format(paper_id)
    perm_form  = '{}-Permission.pdf'.format(paper_id)
    
    # process main paper
    if main_paper in papers[iden]['files']:
        org_file  = os.path.join(org_pdf_folder,'{}\CameraReady\{}'.format(paper_id,main_paper))
    else:
        potential_main = [mf for mf in papers[iden]['files'] 
                          if any(subt for subt in ['main','camera','ready'] if subt in mf.lower())]
        if any(potential_main):
            org_file  = os.path.join(org_pdf_folder,'{}\CameraReady\{}'.format(paper_id,potential_main[0]))
        else:
            problematic_papers[iden] = main_not_found
            continue
    
    dest_file = os.path.join(dest_pdf_folder,'{}.pdf'.format(iden))
    copy_file(org_file, dest_file)
    
    # process permission form
    if perm_form in papers[iden]['files']:
        org_file  = os.path.join(org_pdf_folder,'{}\CameraReady\{}'.format(paper_id,perm_form))
    else:
        potential_form = [pf for pf in papers[iden]['files'] 
                          if any(subt for subt in ['permission','pmlr','agreement','license'] if subt in pf.lower())]
        if any(potential_form):
            org_file  = os.path.join(org_pdf_folder,'{}\CameraReady\{}'.format(paper_id,potential_form[0]))
        else:
            problematic_papers[iden] = form_not_found
            continue
        
    dest_file = os.path.join(dest_pdf_folder,'{}-Permission.pdf'.format(iden))
    copy_file(org_file, dest_file)
        
    # process supplementary file
    supplement_file = [sf for sf in papers[iden]['files'] if supplement in sf]
    if any(supplement_file):
        if len(supplement_file) == 1:
            supplement_file = supplement_file[0]
            supp_ext = os.path.splitext(supplement_file)[1]
            org_file  = os.path.join(org_pdf_folder,'{}\CameraReady\{}'.format(paper_id,supplement_file))
        else:
            problematic_papers[iden] = multiple_supps
            continue
            
    else:
        potential_supp = [ps for ps in papers[iden]['files'] 
                          if any(subt for subt in ['sup','supp','supplementary','appendix'] if subt in ps.lower())]
        
        if any(potential_supp):
            supp_ext = os.path.splitext(potential_supp[0])[1]
            org_file  = os.path.join(org_pdf_folder,'{}\CameraReady\{}'.format(paper_id,potential_supp[0]))    
        else:
            problematic_papers[iden] = supp_not_found
            continue
    
    dest_file = os.path.join(dest_pdf_folder,'{}-supp{}'.format(iden,supp_ext))
    copy_file(org_file, dest_file)

In [9]:
no_permission_form = [p for p in problematic_papers if problematic_papers[p] == form_not_found]
no_main_paper      = [p for p in problematic_papers if problematic_papers[p] == main_not_found]
no_supplement      = [p for p in problematic_papers if problematic_papers[p] == supp_not_found]
multiple_supps     = [p for p in problematic_papers if problematic_papers[p] == multiple_supps]

print('Number of problematic papers: {}'.format(len(problematic_papers)))
print('No permission form: {}'.format(len(no_permission_form)))
print('No main paper: {}'.format(len(no_main_paper)))
print('No supplementary: {}'.format(len(no_supplement)))
print('Multiple supplements: {}'.format(len(multiple_supps)))

Number of problematic papers: 36
No permission form: 7
No main paper: 0
No supplementary: 29
Multiple supplements: 0


In [10]:
[papers[key]['id'] for key in no_main_paper]

[]

In [11]:
[papers[key]['id'] for key in no_permission_form]

[339, 420, 434, 690, 1118, 1165, 1342]

In [12]:
[papers[key]['id'] for key in no_supplement]

[50,
 76,
 208,
 292,
 312,
 344,
 406,
 547,
 684,
 796,
 799,
 975,
 1058,
 1062,
 1068,
 1090,
 1159,
 1194,
 1205,
 1233,
 1240,
 1365,
 1436,
 1445,
 1467,
 1565,
 1581,
 1584,
 1671]

## Export the bibtex file

In [13]:
file_loader = FileSystemLoader('templates')
env = Environment(loader=file_loader)
template = env.get_template('bibtex_template.txt')
aistats21_bibtex = template.render(papers=papers)

f = open('aistats21.bib','w')
f.write(aistats21_bibtex)
f.close()