In [1]:
import os
import tqdm
import pandas as pd
from pdfrw import PdfReader
from pprint import pprint

In [2]:
class Paper():
    """Metadata wrapper around a single PDF file.

    Reads the Info dictionary of the PDF and decides whether the file is a
    "regular" paper, i.e. one from which both a non-empty title and a
    well-formed DOI can be extracted.

    Attributes:
        rawpath: Path of the PDF file exactly as passed by the caller.
        verbose: Verbosity ceiling; log() drops messages above this level.
        info: The ``Info`` attribute of the PdfReader for this file.
        is_goodpaper: True when check() accepted the paper.
        doi: Extracted DOI string, or None when the paper is not regular.
        title: Extracted title string, or None when the paper is not regular.
    """

    def __init__(self, pdf_path, verbose=1):
        """Load the PDF metadata and classify the paper.

        Args:
            pdf_path: Path of the PDF file to read.
            verbose: Highest message level that log() will print.
        """
        # Set up rawpath
        self.rawpath = pdf_path
        # Set up verbose limit
        self.verbose = verbose
        # Get info from the pdf file in pdf_path
        self.info = PdfReader(pdf_path).Info
        # Check regularity of the paper (also sets self.doi / self.title)
        self.is_goodpaper = self.check()

    def check(self):
        """Check whether the paper is regular and record its DOI and title.

        A paper is regular when its info contains '/Subject' and '/Title',
        both the DOI and the title are non-empty, and the DOI has the shape
        ``<a>.<b>/<suffix>`` (exactly one '/', exactly one '.' before it).

        Returns:
            True for a regular paper (self.doi and self.title are filled in),
            False otherwise (self.doi and self.title are None).
        """
        # Always leave the attributes defined, even for irregular papers
        self.doi = None
        self.title = None

        info = self.info
        # No (or empty) Info dictionary: nothing to extract, so irregular
        if not info:
            return False

        # Both /Subject and /Title are required; a missing /Title used to
        # raise instead of classifying the paper as irregular
        if '/Subject' not in info or '/Title' not in info:
            return False

        # Try to fetch the DOI from /Subject: last ';'-separated field.
        # [1:-1] drops the first and last character of the raw value --
        # presumably the '(' ')' delimiters of a PDF literal string; confirm
        # against pdfrw if hex-encoded strings must be supported.
        subject = info['/Subject'][1:-1]
        doi = subject.split(';')[-1].strip()

        # An explicit /doi entry takes precedence over /Subject
        if '/doi' in info:
            doi = info['/doi'][1:-1].strip()

        # Fetch title
        title = info['/Title'][1:-1].strip()

        # If either the DOI or the title is empty, the paper is irregular
        if not all([doi, title]):
            return False

        # Content check: exactly one '/', and a two-part dotted prefix
        doi_parts = doi.split('/')
        if len(doi_parts) != 2:
            return False
        if len(doi_parts[0].split('.')) != 2:
            return False

        # Record the extracted values and report a regular paper
        self.doi = doi
        self.title = title
        return True

    def log(self, message, verbose=1):
        """Pretty-print ``message`` unless its level exceeds self.verbose.

        Args:
            message: Object handed to pprint.
            verbose: Level of this message.

        Returns:
            0 when the message was filtered out, None when it was printed.
        """
        # Filter by verbose
        if verbose > self.verbose:
            return 0
        # Print message
        pprint(message)

    def report(self):
        """Log a summary of the paper, or a bad-paper marker at level 2.

        Returns:
            0 for a bad paper, None otherwise.
        """
        # If this is not a good paper, log it as a bad paper
        if not self.is_goodpaper:
            self.log('[Bad paper]', verbose=2)
            return 0

        # Report information about the good paper
        self.log('-' * 80)
        self.log(dict(
            rawpath=self.rawpath,
            title=self.title,
            doi=self.doi,
        ))
        

In [3]:
def all_pdfs(src_path):
    """List the PDF files directly inside ``src_path``.

    Prints a separator line and ``src_path`` before scanning. The extension
    match is case-insensitive ('.pdf', '.PDF', ...), entries that are not
    regular files are skipped, and the result is sorted by file name so the
    order does not depend on the file system.

    Args:
        src_path: Directory to scan (not recursive).

    Returns:
        List of ``os.path.join(src_path, name)`` for every PDF file found.
    """
    print('=' * 80)
    print(src_path)
    # Keep only regular files whose name ends with .pdf in any letter case
    pdf_names = sorted(
        name for name in os.listdir(src_path)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(src_path, name))
    )
    # Return list of full paths
    return [os.path.join(src_path, name) for name in pdf_names]

In [4]:
# Folder scanned for PDFs, kept in one named constant so it is easy to repoint.
# NOTE(review): absolute, machine-specific path -- consider a relative or
# configurable location.
PDF_DIR = 'C:\\Users\\liste\\OneDrive\\Documents\\schorlar\\buffer'

# Collect one record per regular paper and build the frame once at the end:
# DataFrame.append copied the whole frame on every call and no longer exists
# in pandas 2.x.
records = []
pbar = tqdm.tqdm(all_pdfs(PDF_DIR))
for pdf_path in pbar:
    paper = Paper(pdf_path, verbose=0)
    paper.report()
    if paper.is_goodpaper:
        records.append(dict(
            title=paper.title,
            doi=paper.doi,
            rawpath=paper.rawpath,
        ))
pbar.close()

# Explicit columns keep the schema (and column order) even when no paper passed
df = pd.DataFrame(records, columns=['title', 'doi', 'rawpath'])

  6%|███████▊                                                                                                                                  | 12/211 [00:00<00:01, 112.12it/s]

C:\Users\liste\OneDrive\Documents\schorlar\buffer


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 211/211 [00:02<00:00, 79.32it/s]


In [6]:
# Display the table of collected papers (columns: title, doi, rawpath)
df

Unnamed: 0,title,doi,rawpath
0,Improved estimation of subject-level functiona...,10.1016/j.neuroimage.2018.01.029,C:\Users\liste\OneDrive\Documents\schorlar\buf...
1,Integrating theoretical models with functional...,10.1016/j.jmp.2016.06.008,C:\Users\liste\OneDrive\Documents\schorlar\buf...
2,Deep Learning of Constrained Autoencoders for ...,10.1109/TNNLS.2017.2747861,C:\Users\liste\OneDrive\Documents\schorlar\buf...
3,Feedback Convolutional Neural Network for Visu...,10.1109/TPAMI.2018.2843329,C:\Users\liste\OneDrive\Documents\schorlar\buf...
4,Visual pathways from the perspective of cost f...,10.1016/j.cortex.2017.09.019,C:\Users\liste\OneDrive\Documents\schorlar\buf...
...,...,...,...
79,Visual pathways from the perspective of cost f...,10.1016/j.cortex.2017.09.019,C:\Users\liste\OneDrive\Documents\schorlar\buf...
80,Visual pathways from the perspective of cost f...,10.1016/j.cortex.2017.09.019,C:\Users\liste\OneDrive\Documents\schorlar\buf...
81,What is changing when: Decoding visual informa...,10.1016/j.neuroimage.2017.08.027,C:\Users\liste\OneDrive\Documents\schorlar\buf...
82,What is changing when: Decoding visual informa...,10.1016/j.neuroimage.2017.08.027,C:\Users\liste\OneDrive\Documents\schorlar\buf...


In [10]:
# Export the table as a JSON array with one object per row. Create the target
# folder first so the export does not fail when 'examples' is missing.
os.makedirs('examples', exist_ok=True)
df.to_json(os.path.join('examples', 'papers.json'), orient='records')