In [None]:
import pandas as pd
import re
import os
from shutil import copyfile, Error
from jinja2 import Environment, FileSystemLoader

# Initialization

In [None]:
# Base directory setting (not referenced by any other cell visible here).
path = '/home/User/Documents'

# CMT camera-ready export: workbook file name and the sheet to read from it.
crp_file, crp_sheet = 'CameraReadyPapers-11-03-2021.xls', 'AISTATS2021'

# Source and destination PDF folders, made absolute against the current
# working directory.
org_pdf_folder, dest_pdf_folder = map(os.path.abspath, ('org_pdfs', 'dest_pdfs'))

# Read CMT export file

In [None]:
# Load the configured sheet of the CMT export into a DataFrame, then show
# the first five rows as the cell output.
CRP = pd.read_excel(io=crp_file, sheet_name=crp_sheet)
CRP.head(5)

## Run some basic validations

In [None]:
# Display the "Q3 (Student Author)" column on its own (as a one-column frame).
CRP.loc[:, ["Q3 (Student Author)"]]

# Generate the AISTATS 2021 Proceedings

## Preprocess the metadata

In [None]:
# Build the per-paper metadata used by the following cells.
#
# `papers` maps a citation key to a dictionary that holds
# (1) title
# (2) authors ("lastname, firstname" entries joined by ' and ')
# (3) pages in "startpage-endpage" format
# (4) abstract
# together with the CMT paper id and the list of uploaded file names.
papers = {}
# How many papers have been seen so far for each first-author last name;
# used to disambiguate citation keys.
identifiers = {}

pages_count = 1   # start page of the next paper to be added
num_pages = 9     # every paper is assigned this many pages
YY = '21'         # suffix appended to the first author's last name in the key


def _format_author(name):
    """Rewrite 'First Rest Of Name' as 'Rest Of Name, First'.

    The first whitespace-separated word is treated as the first name and
    everything after it as the last name.  A single-word name is returned
    unchanged (rather than as ', Name' with an empty last name).
    """
    first, *rest = name.split()
    if not rest:
        return first
    return ', '.join([' '.join(rest), first])


for _index, row in CRP.iterrows():

    # read the paper information (paper id, title, authors, files)
    paper_id = row['Paper ID']
    title    = row['Paper Title'].strip()
    authors  = row['Author Names'].strip()
    files    = row['Files'].strip()

    # collapse newlines and repeated whitespace in the abstract
    abstract = ' '.join(row['Abstract'].split())

    # file names: every entry is followed by "(... bytes)" and an optional ';'.
    # Strip first, then drop empties, so whitespace-only fragments never
    # survive as empty names.
    flist = [f.strip() for f in re.split(r'\(.*?bytes\);?', files) if f.strip()]

    # author names: every entry is followed by a parenthesised group
    # (presumably the affiliation -- confirm against the export), an optional
    # '*' and an optional ';'.
    alist = [a.strip() for a in re.split(r'\(.*?\)\*?;?', authors) if a.strip()]

    # citation key = last word of the first author's name + YY.  The first
    # paper for a given last name keeps the bare key; the second one gets the
    # suffix 'b', the third 'c', and so on.
    first_author_lastname = alist[0].split()[-1]
    paper_key = first_author_lastname + YY
    if first_author_lastname in identifiers:
        paper_key += chr(ord('a') + identifiers[first_author_lastname])
        identifiers[first_author_lastname] += 1
    else:
        identifiers[first_author_lastname] = 1

    # format the author list
    author_list = ' and '.join(_format_author(a) for a in alist)

    # add the paper to the dictionary
    papers[paper_key] = {'key': paper_key,
                         'id': paper_id,
                         'title': title,
                         'authors': author_list,
                         'files': flist,
                         'pages': '{}-{}'.format(pages_count, pages_count + num_pages - 1),
                         'abstract': abstract}
    pages_count += num_pages

In [None]:
# pages_count holds the start page of the *next* paper (it starts at 1 and
# grows by num_pages per paper), so the number of pages assigned is one less.
total_pages = pages_count - 1
print('Number of pages: {}, Number of papers: {}'.format(total_pages, total_pages // num_pages))

## Preprocess the PDFs

In [None]:
# One (source, destination, message) tuple per copy that raised shutil.Error.
errors = []


def copy_file(src, dest):
    """Copy ``src`` to ``dest``, recording a ``shutil.Error`` instead of raising.

    Args:
        src: path of the file to copy.
        dest: path of the copy to create (overwritten if it exists).

    A ``shutil.Error`` (for ``copyfile`` this is the "same file" error) is
    appended to the module-level ``errors`` list as ``(src, dest, message)``.
    Any other exception from ``copyfile`` -- e.g. a missing source file --
    is not caught and propagates to the caller.
    """
    try:
        copyfile(src, dest)
    except Error as err:
        # Append one record; extending the list with the message string would
        # add its characters one by one.
        errors.append((src, dest, str(err)))

In [None]:
# Copy each paper's main PDF, permission form and supplement from the CMT
# download folder to the destination folder, renamed after the citation key.
#
# problematic_papers maps a paper key to the reason string of the FIRST
# problem found for it; once a problem is recorded the remaining steps for
# that paper are skipped.
problematic_papers = {}
form_not_found = 'permission form not found.'
main_not_found = 'main paper not found.'
supp_not_found = 'supplementary file may not exist.'
multiple_supps = 'multiple supplementary files.'


def _camera_ready_path(paper_id, filename):
    """Return <org_pdf_folder>/<paper_id>/CameraReady/<filename>.

    Built from path components so the separator is correct on every
    platform (a literal backslash in the name is not a separator on POSIX).
    """
    return os.path.join(org_pdf_folder, str(paper_id), 'CameraReady', filename)


def _first_with_hint(filenames, hints):
    """Return the first name containing any of ``hints`` (case-insensitive), else None."""
    for name in filenames:
        lowered = name.lower()
        if any(hint in lowered for hint in hints):
            return name
    return None


for iden in papers:
    paper_id = papers[iden]['id']
    uploaded = papers[iden]['files']

    # --- main paper: "<id>.pdf" if present, otherwise the first name that
    # looks like a main/camera-ready file ---
    main_paper = '{}.pdf'.format(paper_id)
    if main_paper in uploaded:
        main_name = main_paper
    else:
        main_name = _first_with_hint(uploaded, ('main', 'camera', 'ready'))
    if main_name is None:
        problematic_papers[iden] = main_not_found
        continue
    copy_file(_camera_ready_path(paper_id, main_name),
              os.path.join(dest_pdf_folder, '{}.pdf'.format(iden)))

    # --- permission form: "<id>-Permission.pdf" if present, otherwise the
    # first name that looks like a permission/license form ---
    perm_form = '{}-Permission.pdf'.format(paper_id)
    if perm_form in uploaded:
        form_name = perm_form
    else:
        form_name = _first_with_hint(uploaded, ('permission', 'pmlr', 'agreement', 'license'))
    if form_name is None:
        problematic_papers[iden] = form_not_found
        continue
    copy_file(_camera_ready_path(paper_id, form_name),
              os.path.join(dest_pdf_folder, '{}-Permission.pdf'.format(iden)))

    # --- supplementary file: names containing "<id>-supp" win; more than one
    # such name is reported instead of guessing ---
    supplement = '{}-supp'.format(paper_id)
    exact_supps = [name for name in uploaded if supplement in name]
    if len(exact_supps) > 1:
        problematic_papers[iden] = multiple_supps
        continue
    if exact_supps:
        supp_name = exact_supps[0]
    else:
        supp_name = _first_with_hint(uploaded, ('sup', 'supp', 'supplementary', 'appendix'))
    if supp_name is None:
        problematic_papers[iden] = supp_not_found
        continue

    # keep the supplement's own extension in the destination name
    supp_ext = os.path.splitext(supp_name)[1]
    copy_file(_camera_ready_path(paper_id, supp_name),
              os.path.join(dest_pdf_folder, '{}-supp{}'.format(iden, supp_ext)))

In [None]:
# Group the problematic paper keys by the reason recorded for them.
no_permission_form = [p for p in problematic_papers if problematic_papers[p] == form_not_found]
no_main_paper      = [p for p in problematic_papers if problematic_papers[p] == main_not_found]
no_supplement      = [p for p in problematic_papers if problematic_papers[p] == supp_not_found]
# Stored under its own name: rebinding `multiple_supps` (the reason string)
# to this list would make a re-run of the cell compare against a list and
# report zero papers.
papers_with_multiple_supps = [p for p in problematic_papers if problematic_papers[p] == multiple_supps]

print('Number of problematic papers: {}'.format(len(problematic_papers)))
print('No permission form: {}'.format(len(no_permission_form)))
print('No main paper: {}'.format(len(no_main_paper)))
print('No supplementary: {}'.format(len(no_supplement)))
print('Multiple supplements: {}'.format(len(papers_with_multiple_supps)))

In [None]:
# CMT ids of the papers recorded as having no main PDF.
[paper['id'] for paper in map(papers.__getitem__, no_main_paper)]

In [None]:
# CMT ids of the papers recorded as having no permission form.
[paper['id'] for paper in map(papers.__getitem__, no_permission_form)]

In [None]:
# CMT ids of the papers recorded as having no supplementary file.
[paper['id'] for paper in map(papers.__getitem__, no_supplement)]

## Export the BibTeX file

In [None]:
# Render the BibTeX template (templates/bibtex_template.txt) with the paper
# dictionary and write the result to aistats21.bib.
file_loader = FileSystemLoader('templates')
env = Environment(loader=file_loader)
template = env.get_template('bibtex_template.txt')
aistats21_bibtex = template.render(papers=papers)

# The context manager closes the file even if the write fails; an explicit
# UTF-8 encoding keeps non-ASCII titles/abstracts from depending on the
# platform default encoding.
with open('aistats21.bib', 'w', encoding='utf-8') as f:
    f.write(aistats21_bibtex)