In [None]:
from bs4 import BeautifulSoup as bs
from PyPDF2 import PdfFileReader
import pandas as pd
import numpy as np
import re


In [84]:
# Load the NBER affiliated-scholar paper table from CSV.
input_df = pd.read_csv('nber_scholars/nber_affiliated_scholar_paper.csv')

# Working frame: keep the listed columns (in this order) and rename two of
# them in the same chained expression, so no in-place mutation is needed.
process_df = (
    input_df
    .loc[:, [
        'displaydate', 'nid', 'title', 'type', 'url', 'displaytypename',
        'authors', 'assigned_author', 'assigned_author_uid', 'page_url',
        'pdf_url', 'disclosures', 'doi', 'issue_date',
    ]]
    .rename(columns={'disclosures': 'web_disclosures', 'url': 'pseudo_path'})
)

# Placeholder columns for the values to be pulled out of each PDF.
process_df.loc[:, 'pdf_disclosures'] = None
process_df.loc[:, 'pdf_jel_codes'] = None

# Preview the first row as the cell's displayed result.
process_df.head(1)

Unnamed: 0,displaydate,nid,title,type,pseudo_path,displaytypename,authors,assigned_author,assigned_author_uid,page_url,pdf_url,web_disclosures,doi,issue_date,pdf_disclosures,pdf_jel_codes
0,March 2018,78898,Statistical Non-Significance in Empirical Econ...,working_paper,/papers/w24403,Working Paper,"['<a href=""/people/alberto_abadie"">Alberto Aba...",Alberto Abadie,entity:user:48551,https://www.nber.org/papers/w24403,https://www.nber.org/system/files/working_pape...,"\nI thank Isaiah Andrews, Joshua Angrist, Amy ...",10.3386/w24403,March 2018,,


In [156]:
# Text that follows a line ending in a 4-digit number and precedes one of the
# two NBER boilerplate sentences (searched with re.S so '.' spans newlines).
disclosure_pattern = r'(?<=\s\d{4}\n)(.*)(?=(NBER working papers are circulated for discussion and comment purposes|The views expressed herein are the author?\(s?\) and not necessarily those of the National Bureau of Economic Research\.))'
# Everything after "JEL No." up to the end of that line.
jel_code_pattern = r'(?<=JEL No\.)(.*)(?=\n)'

# Trial run over a fixed 10-row sample (random_state pins the selection).
# NOTE(review): on success the extracted values are only reported via print;
# nothing is written back to process_df except the failure marker below.
for row in process_df.sample(n=10, random_state=28).index.tolist():
    observation = process_df.loc[row, :]
    pseudo_path = observation.pseudo_path
    print('WORKING ON PAPER: {}'.format(pseudo_path))

    document = read_pdf(pseudo_path)

    # read_pdf reports failure with False; a None result is treated the same
    # way because it cannot be paged either.
    if document is False or document is None:
        print('failover')
        process_df.loc[row, 'pdf_disclosures'] = 'PDF read failure - no PDF on NBER website'
        process_df.loc[row, 'pdf_jel_codes'] = 'PDF read failure - no PDF on NBER website'
        continue

    page_0 = pdf_page_to_string(document, 0)
    page_1 = pdf_page_to_string(document, 1)

    # Page 0: disclosure text.
    if check_empty_page(page_0):
        print('this is an empty part of teh if statement. OCR NEEDED?')
    else:
        in_document_disclosures_object = re.search(disclosure_pattern, page_0, flags=re.S)
        if in_document_disclosures_object is None:
            # No match is the expected failure mode; dump the page so the
            # pattern can be checked against what was actually extracted.
            print("ERROR IN PAGE 0")
            print(page_0)
        else:
            in_document_disclosures_cleaned = re.sub(r'\n', ' ', in_document_disclosures_object.group())
            print('\t got pdf disclosures')

    # Page 1: JEL codes.
    if check_empty_page(page_1):
        print('this is an empty part of the if statement. OCR NEEDED?')
    else:
        jel_codes_object = re.search(jel_code_pattern, page_1)
        if jel_codes_object is None:
            print("ERROR IN PAGE 1")
            print(page_1)
        else:
            # Strip all whitespace, then split the comma-separated codes.
            jel_codes_cleaned = re.sub(r'\s', '', jel_codes_object.group())
            jel_codes_list = jel_codes_cleaned.split(',')
            print('\t got pdf jel_codes')

WORKING ON PAPER: /papers/w17373
	 got pdf disclosures
	 got pdf jel_codes
WORKING ON PAPER: /papers/w26755
	 got pdf disclosures
	 got pdf jel_codes
WORKING ON PAPER: /papers/w25190
	 got pdf disclosures
	 got pdf jel_codes
WORKING ON PAPER: /papers/w23100
	 got pdf disclosures
	 got pdf jel_codes
WORKING ON PAPER: /papers/w20325
i think this is an empty page:
this is an empty part of teh if statement. OCR NEEDED?
ERROR IN PAGE 1



ABSTRACT

































WORKING ON PAPER: /papers/w6602
i think this is an empty page:
this is an empty part of teh if statement. OCR NEEDED?
i think this is an empty page:
this is an empty part of the if statement. OCR NEEDED?
WORKING ON PAPER: /papers/w5609
ERROR IN PAGE 0
NBER
WOR~G
PAPER
SERIES
THE
WEALTH
OF
COHORTS:
RETIREMENT
SAVING
AND
THE
CHANGING
ASSETS
OF
OLDER
AMERICANS
Steven
F.
Venti
David
A.
Wise
Working
Paper
5609
NATIONAL
BUREAU
OF
ECONOMIC
RESEARCH
1050
Massachusetts
Avenue
Cambridge,
MA
02138
June
1996
Funding
was
provid



In [143]:
def read_pdf(file_name_code):
    """Open the locally stored PDF named by the paper code in ``file_name_code``.

    The first occurrence of ``w``, ``h`` or ``t`` followed by digits is taken
    as the file stem, and ``nber_working_papers//<stem>.pdf`` is handed to
    ``PdfFileReader``.

    Parameters
    ----------
    file_name_code : str
        Any string containing the paper code, e.g. ``'/papers/w24403'``.

    Returns
    -------
    PdfFileReader or bool
        The reader object, or ``False`` when the reader rejects the file
        because it does not start with ``%PDF-`` (presumably an HTML page
        saved in place of the PDF).

    Raises
    ------
    ValueError
        If ``file_name_code`` contains no paper code.
    Exception
        Any other error raised by ``PdfFileReader`` is re-raised unchanged
        instead of being swallowed.
    """
    code_match = re.search(r'(w|h|t)\d+', file_name_code)
    if code_match is None:
        raise ValueError(
            'no paper code (w/h/t followed by digits) found in {!r}'.format(file_name_code)
        )
    file_name = code_match.group()

    file_path = 'nber_working_papers//' + file_name + '.pdf'
    try:
        return PdfFileReader(file_path)
    except Exception as e:
        # Match on the stable tail of the message so any non-PDF header
        # ('<!DOC', '<html', ...) is recognised, not just one spelling.
        if "'%PDF-' expected" in str(e):
            return False
        # Unknown failure: surface it rather than returning None implicitly.
        raise


In [45]:
def pdf_page_to_string(pdf_obj, page_num):
    """Return the extracted text of page ``page_num`` of ``pdf_obj``.

    Parameters
    ----------
    pdf_obj
        Object whose ``pages`` attribute can be indexed and whose pages
        provide ``extractText()``.
    page_num : int
        Index into ``pdf_obj.pages``.

    Returns
    -------
    str
        The page text, or ``'PLACEHOLDER TEXT'`` when the page cannot be
        fetched (e.g. index out of range) or text extraction fails. The
        error is printed in that case.
    """
    try:
        # The page lookup sits inside the try so a missing page is handled
        # the same way as a failed extraction instead of raising.
        pdf_page_as_string = pdf_obj.pages[page_num].extractText()
    except Exception as e:
        print(e)
        print('failure to read pdf page')
        pdf_page_as_string = 'PLACEHOLDER TEXT'

    return pdf_page_as_string

In [144]:
def check_empty_page(pdf_page):
    """Report whether ``pdf_page`` holds no character other than newlines.

    Prints a notice and returns ``True`` for such a page; returns ``False``
    without printing when any other character is present.
    """
    # '.' without re.DOTALL does not match '\n', so any hit means content.
    first_character = re.search(r'.', pdf_page)
    if first_character is not None:
        return False

    print('i think this is an empty page:')
    return True

In [None]:
# NOTE(review): '' contains no w/h/t + digits paper code, so this call raises
# inside read_pdf instead of returning a reader. Pass a real code such as
# '/papers/w24403' (assuming that PDF is present locally) to get a document.
test_doc = read_pdf('')