In [1]:
import os
import tqdm
import pandas as pd
from pdfrw import PdfReader
from pprint import pprint

In [2]:
class Paper():
    """Title and DOI of one PDF paper, read from the file's Info metadata.

    Attributes:
        rawpath: The path the paper was loaded from.
        verbose: Threshold used by ``log``; messages with a higher level are
            dropped.
        info: The ``Info`` attribute of ``PdfReader(pdf_path)``.
        doi: Extracted DOI, or the placeholder ``'[doi]'``.
        title: Extracted title, or the placeholder ``'[title]'``.
        is_goodpaper: Result of ``check()``; True for a regular paper.
    """

    def __init__(self, pdf_path, verbose=1):
        """Read the metadata of ``pdf_path`` and classify the paper.

        Args:
            pdf_path: Path of the PDF file to read.
            verbose: Highest message level that ``log`` will print.
        """
        # Setup rawpath
        self.rawpath = pdf_path
        # Setup verbose limit
        self.verbose = verbose
        # Get info from pdf file in pdf_path
        self.info = PdfReader(pdf_path).Info
        # Check regularity of the paper
        self.is_goodpaper = self.check()

    def _field(self, key):
        """Return the cleaned text stored under ``key`` in ``self.info``.

        Returns '' when the key is absent or its value is empty, so callers
        never have to slice ``None``.
        """
        if key not in self.info:
            return ''
        raw = self.info[key]
        if not raw:
            return ''
        # Drop the first and last character -- presumably the parentheses of
        # a PDF literal string "(text)".
        # NOTE(review): other encodings of the value (e.g. hex strings) would
        # not be decoded by this slice -- confirm against the PDF library.
        return raw[1:-1].strip()

    def check(self):
        """Decide whether the paper is regular and store its doi and title.

        A paper is regular when its metadata has a ``/Subject`` entry, a
        non-empty title, and a DOI of the form ``<a>.<b>/<suffix>`` (exactly
        one '/', exactly one '.' before it). The DOI comes from ``/doi`` when
        that entry exists, otherwise from the text after the last ';' of
        ``/Subject``.

        Returns:
            True for a regular paper (``self.doi`` / ``self.title`` hold the
            extracted values), False otherwise (both keep their placeholders).
        """
        # Pre-place doi and title
        self.doi = '[doi]'
        self.title = '[title]'

        # Metadata may be missing altogether; treat that as un-regular
        # instead of failing on the membership test below.
        if self.info is None:
            return False

        # If not contain /Subject, it is un-regular
        if '/Subject' not in self.info:
            return False

        # Try to fetch doi from /Subject: the part after the last ';'
        doi = self._field('/Subject').split(';')[-1].strip()

        # Fetch doi, if /doi exists
        if '/doi' in self.info:
            doi = self._field('/doi')

        # Fetch title ('' when the entry is missing)
        title = self._field('/Title')

        # If not both doi and title exist, it is un-regular
        if not (doi and title):
            return False

        # Content check: exactly one '/' and a two-part dotted prefix
        parts = doi.split('/')
        if len(parts) != 2:
            return False
        if len(parts[0].split('.')) != 2:
            return False

        # Setup doi and title
        self.doi = doi
        self.title = title

        # Return True for regular paper
        return True

    def log(self, message, verbose=1):
        """Pretty-print ``message`` unless its level exceeds ``self.verbose``.

        Args:
            message: Object handed to ``pprint``.
            verbose: Level of this message.

        Returns:
            0 when the message was filtered out, None when it was printed.
        """
        # Filter by verbose
        if verbose > self.verbose:
            return 0
        # Print message
        pprint(message)

    def report(self):
        """Log the paper's rawpath, title and doi, or a bad-paper notice.

        Returns:
            0 for a bad paper, None for a good one.
        """
        # If this is not a good paper, logging as bad paper
        if not self.is_goodpaper:
            self.log('[Bad paper]', verbose=2)
            return 0

        # Report information of the good paper
        self.log('-' * 80)
        self.log(dict(
            rawpath=self.rawpath,
            title=self.title,
            doi=self.doi,
        ))
        

In [3]:
def all_pdfs(src_path):
    """List the PDF files directly inside ``src_path``.

    Prints a separator line and ``src_path`` before scanning.

    Args:
        src_path: Directory to scan (not recursive).

    Returns:
        Sorted list of full paths of the regular files in ``src_path`` whose
        name ends with '.pdf', compared case-insensitively.
    """
    print('=' * 80)
    print(src_path)
    found = []
    # Sort the names so the result does not depend on directory order
    for name in sorted(os.listdir(src_path)):
        # Compare in lower case so 'X.PDF' is accepted as well
        if not name.lower().endswith('.pdf'):
            continue
        full_path = os.path.join(src_path, name)
        # Skip directories that merely carry a .pdf-like name
        if os.path.isfile(full_path):
            found.append(full_path)
    return found

In [4]:
# Directory that is scanned for papers
SRC_DIR = 'C:\\Users\\liste\\OneDrive\\Documents\\schorlar\\buffer'

# Column layout shared by both tables
# title: paper title
# doi: paper doi
# rawpath: pdf rawpath, full path
# uid: pdf filename
COLUMNS = ['title', 'doi', 'rawpath', 'uid']

# Collect one record per pdf and build the table once at the end;
# DataFrame.append is gone in pandas 2.0 and copied the whole frame per row.
records = []

# Load tqdm as generator to walk through all pdfs in target dir
with tqdm.tqdm(all_pdfs(SRC_DIR)) as pbar:
    # For each paper
    for pdf_path in pbar:
        # verbose=0 means absolute quiet
        paper = Paper(pdf_path, verbose=0)
        # Debug
        paper.report()
        # Remember the row for this paper
        records.append(dict(
            title=paper.title,
            doi=paper.doi,
            rawpath=paper.rawpath,
            uid=os.path.basename(paper.rawpath),
        ))

# Passing columns keeps the layout even when no pdf was found
raw_df = pd.DataFrame(records, columns=COLUMNS)
# custom_df starts empty with the same columns; this cell does not fill it
custom_df = pd.DataFrame(columns=COLUMNS)

  4%|███▊                                                                                              | 8/209 [00:00<00:02, 79.97it/s]

C:\Users\liste\OneDrive\Documents\schorlar\buffer


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 209/209 [00:02<00:00, 71.33it/s]


In [5]:
# Show the collected table; papers that failed Paper.check() keep the
# '[title]' / '[doi]' placeholders
raw_df

Unnamed: 0,title,doi,rawpath,uid
0,[title],[doi],C:\Users\liste\OneDrive\Documents\schorlar\buf...,! VisualFeatureExtractionfromVoxel-Weighted-Av...
1,[title],[doi],C:\Users\liste\OneDrive\Documents\schorlar\buf...,!10.1093@cercor@bhy123.pdf
2,Improved estimation of subject-level functiona...,10.1016/j.neuroimage.2018.01.029,C:\Users\liste\OneDrive\Documents\schorlar\buf...,!Improved estimation of subject-level function...
3,Integrating theoretical models with functional...,10.1016/j.jmp.2016.06.008,C:\Users\liste\OneDrive\Documents\schorlar\buf...,!Integrating theoretical models with functiona...
4,[title],[doi],C:\Users\liste\OneDrive\Documents\schorlar\buf...,0102181v1.pdf
...,...,...,...,...
204,[title],[doi],C:\Users\liste\OneDrive\Documents\schorlar\buf...,zeilerECCV2014.pdf
205,[title],[doi],C:\Users\liste\OneDrive\Documents\schorlar\buf...,zpq10607.pdf
206,A primer on encoding models in sensory neurosc...,10.1016/j.jmp.2016.06.009,C:\Users\liste\OneDrive\Documents\schorlar\buf...,！A primer on encoding models in sensory neuros...
207,[title],[doi],C:\Users\liste\OneDrive\Documents\schorlar\buf...,！NeuroEncodingandDecodingwithDL-DynamicNatural...


In [6]:
# Write both tables as JSON record lists; create the output directory first
# so the export also works when it does not exist yet.
out_dir = 'paper_jsons'
os.makedirs(out_dir, exist_ok=True)
raw_df.to_json(os.path.join(out_dir, 'raw.json'), orient='records')
custom_df.to_json(os.path.join(out_dir, 'custom.json'), orient='records')