## Convert all pdfs to pandas dataframes
Tested under Python 3.7.<br>
This notebook converts all abstract pdfs in one directory into dataframes that have one row for each abstract and one column for each section (e.g., *abstract_title*, *abstract_id*, *submitted_by*). The marker strings that delimit each section are defined in an auxiliary csv.


### Define directories, settings.

In [None]:
# directory with pdfs; it is globbed for '*.pdf' input files, the per-pdf
# '*.df' pickles are read back from it, and the combined pickle is written
# to its 'combined' subdirectory
path = r''

# columns in dataframe; each name is looked up in the 'category' column of
# the csv below to find the markers of the corresponding section
cols = ['abstract_title', 'abstract_id', 'submitted_by', 'source', 'overview', 'summary', 'implications']

# path to csv that holds the tag words that designate the beginning and end of a section
# (the code reads its columns 'category', 'startstring', 'endstring' and 'endstring_alt')
csv_cols_completed = r'cols.csv'

# number of characters of the page text printed for each abstract, for debugging
# (0 prints no page text)
print_chars = 0

# drop rows that have empty cells (e.g., no summary)
# (rows that don't have an abstract_id will always be dropped)
drop_rows = False

### Imports

In [None]:
import os
import glob
import requests
import sys, getopt
import pandas as pd
import time
import numpy as np
import matplotlib.pyplot as plt

### Functions

In [None]:
def timestamp():
    """
    returns the current local time as a string of the form
    '_YYYYmmdd_HHMMSS', used here as a suffix for output file names
    """
    from time import strftime
    stamp_format = '_%Y%m%d_%H%M%S'
    return strftime(stamp_format)

def open_pdf(fname):
    """
    opens fname, returns pdf object
    also works with encrypted pdf files, at least some of them
    input:
        fname: path of the pdf file
    output:
        PyPDF2.PdfFileReader for fname; if the file is encrypted, a
        decryption with the empty password has been attempted
    """
    import PyPDF2
    # The second positional parameter of PdfFileReader is `strict`, not a
    # file mode, so a mode string such as 'rb' there would merely act as a
    # truthy `strict`. Pass the flag by keyword to make that explicit.
    pdf = PyPDF2.PdfFileReader(fname, strict=True)
    if pdf.isEncrypted:
        # many "encrypted" pdfs only carry an empty user password
        pdf.decrypt("")
    return pdf

def remove_specialchars(str, list):
    """
    replaces every occurrence of each item of list in str by a single space
    input:
        str: text to clean
        list: iterable of substrings to be replaced
    output:
        cleaned text
    """
    cleaned = str
    for special in list:
        cleaned = cleaned.replace(special, ' ')
    return cleaned

def page2txt(pdf, n):
    """
    extracts the text of page n from pdf, with newlines replaced by spaces
    input:
        pdf: pdf object as returned by open_pdf
        n: page number, passed on to pdf.getPage
    output:
        text of the page
    """
    chars_to_blank = ['\n']
    raw_txt = pdf.getPage(n).extractText()
    return remove_specialchars(raw_txt, chars_to_blank)

def txt_contains_abstract(txt, df_cols):
    """
    decides whether txt is an abstract
    a text counts as an abstract if it contains most of the section
    markers ("summary", "overview" etc.)
    input:
        txt: txt string
        df_cols: dataframe whose startstring column holds the strings that
                 mark the beginning of a section
    output:
        True if more than 80% of the marker strings occur in txt
        False otherwise
    """
    def marker_found(marker):
        # splitting yields more than one piece only if marker occurs in txt
        return len(txt.split(marker)) > 1

    found = df_cols.startstring.apply(marker_found)
    fraction_found = np.average(np.array(found.astype(float)))
    return fraction_found > .8

def get_section(txt, df_cols, section_name):
    """
    extracts section from txt string
    input:
        txt: txt string
        df_cols: dataframe with one row per section; column category holds
                 the section name, startstring the marker that begins the
                 section, endstring / endstring_alt the markers that end it
        section_name: name of section to be extracted
    output:
        text between the first occurrence of the start marker and the next
        end marker, stripped of surrounding whitespace
        '' if the start marker does not occur in txt
    raises:
        IndexError if section_name is not listed in df_cols.category
    """
    # look the marker row up once instead of once per marker
    markers = df_cols.loc[df_cols.category == section_name]
    startstring = markers.startstring.values[0]
    endstring = markers.endstring.values[0]
    endstring_alt = markers.endstring_alt.values[0]

    sections = txt.split(startstring)
    if len(sections) == 1:
        print('Could not find ', startstring, ' in ', txt[:print_chars], '...')
        return ''
    sections = sections[1:]
    if len(sections) != 1:
        print('Attention, ', section_name, ' appeared > 1x in ', txt[:print_chars], '...')

    # only the first occurrence is returned, so only that one is trimmed
    section = sections[0]
    for end_marker in (endstring, endstring_alt):
        # an empty csv cell is read as NaN (a float); splitting on it would
        # raise, so such a marker is treated as "no end marker given"
        if isinstance(end_marker, str) and end_marker:
            section = section.split(end_marker)[0]
    return section.strip()

def txt2row(txt, df_cols, cols):
    """
    builds one dataframe row from a text string
    input:
        txt: string
        df_cols: dataframe with strings that mark the beginning of a section
        cols: names of the columns (sections) to extract
    output:
        dict that maps each column name to the section extracted from txt
    """
    return {col_name: get_section(txt, df_cols, col_name) for col_name in cols}
    
def drop_rows_with_empty_cells(df):
    """
    removes all rows of df that contain an empty string or a missing value
    df is modified in place and also returned
    """
    # empty strings are turned into NaN first so that dropna catches them
    df.replace(to_replace='', value=np.nan, inplace=True)
    df.dropna(how='any', axis=0, inplace=True)
    return df

def make_index(a, prefix):
    """
    builds index labels from the entries of a
    input:
        a: array of values (e.g. running numbers)
        prefix: string put in front of each label
    output:
        object array shaped like a, each entry being prefix followed by the
        string form of the value, zero-padded to at least 3 characters
    """
    labels = np.zeros_like(a, dtype=object)
    n_items = len(a)
    for pos in range(n_items):
        number = str(a[pos])
        labels[pos] = prefix + number.zfill(3)
    return labels


### Convert each pdf to a dataframe

In [None]:
# Convert every pdf in `path` into a dataframe with one row per abstract and
# pickle it next to the pdf (same file name, extension '.df').
files = glob.glob(os.path.join(path, '*.pdf'))
df_cols = pd.read_csv(csv_cols_completed)

for f in files:
    print(f)
    pdf = open_pdf(f)
    n_pages = pdf.getNumPages()
    # Rows are collected in a plain list and turned into a dataframe once per
    # pdf: DataFrame.append copied the whole frame on every call and no
    # longer exists in pandas >= 2.0.
    rows = []
    for n in range(n_pages):
        print('page ', str(n))
        txt = page2txt(pdf, n)
        print(txt[:print_chars], '....')
        if not txt_contains_abstract(txt, df_cols):
            print('Page does not contain abstract.')
        else:
            print('Page contains abstract.')
            row = txt2row(txt, df_cols, cols)
            print(row['abstract_id'])
            rows.append(row)
        print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    df_abstracts = pd.DataFrame(rows, columns=cols)
    print('number of rows in df: ', len(df_abstracts))
    if drop_rows:
        df_abstracts = drop_rows_with_empty_cells(df_abstracts)
    df_abstracts = df_abstracts[df_abstracts['abstract_id'] != ''] # remove rows with empty abstract_id field
    print('number of rows in df: ', len(df_abstracts))
    df_abstracts = df_abstracts.set_index('abstract_id')
    display(df_abstracts)
    # Swap only the file extension: str.replace would also rewrite '.pdf'
    # inside directory names and would leave e.g. 'x.PDF' unchanged, so the
    # pickle would overwrite the pdf itself.
    df_abstracts.to_pickle(os.path.splitext(f)[0] + '.df')

In [None]:
# show df_abstracts; the conversion loop above reassigns it for every pdf,
# so this is the dataframe of the pdf that was processed last
df_abstracts

### Plot contributions from each contributor ("submitted by")

In [None]:
# Count the abstracts of each contributor in df_abstracts and plot the
# counts in descending order, one x tick per contributor.
authors = np.unique(df_abstracts.submitted_by.values)
author_counts = np.zeros(authors.size)
for i, author in enumerate(authors):
    is_author = df_abstracts.submitted_by == author
    author_counts[i] = np.sum(is_author)

# sort contributors from most to fewest abstracts
s = np.argsort(author_counts)[::-1]
author_counts, authors = author_counts[s], authors[s]

xvalues = np.arange(len(authors))
plt.plot(xvalues, author_counts, 'xr')
plt.xticks(xvalues, authors, rotation=90)
# NOTE: the directory part of this file name is a placeholder to be edited
plt.savefig(r'your path here \submitted_by_plot'+timestamp()+'.png', bbox_inches='tight')
plt.show()

### Combine dataframes into one big dataframe

In [None]:
# Concatenate all per-pdf dataframes ('*.df' pickles in `path`) into one
# dataframe and pickle it as <path>/combined/all_abstracts<timestamp>.df.
# glob returns files in arbitrary order; sort for a reproducible row order
dfs = sorted(glob.glob(os.path.join(path, '*.df')))
if not dfs:
    raise FileNotFoundError('No .df files found in ' + repr(path))

frames = [pd.read_pickle(dfs[0])]
n_rows = len(frames[0])
for dfi in dfs[1:]:
    print('Now appending ', dfi)
    df = pd.read_pickle(dfi)
    print(len(df))
    frames.append(df)
    n_rows += len(df)
    print(n_rows)
# one concat instead of repeated DataFrame.append, which copied the growing
# frame on every call and no longer exists in pandas >= 2.0
df_all = pd.concat(frames)

# to_pickle does not create missing directories
combined_dir = os.path.join(path, 'combined')
os.makedirs(combined_dir, exist_ok=True)
df_all.to_pickle(os.path.join(combined_dir, 'all_abstracts'+timestamp()+'.df'))

In [None]:
# Reload the most recent combined dataframe.
# glob returns matches in arbitrary order; the '_YYYYmmdd_HHMMSS' suffix of
# the file names sorts chronologically, so the last sorted name is the newest.
combined_files = sorted(glob.glob(os.path.join(path, 'combined', 'all_abstracts*.df')))
latest_combined = combined_files[-1]  # IndexError if no combined file exists yet
print(latest_combined)
# NOTE: only unpickle files written by this notebook; pickles from untrusted
# sources can execute arbitrary code when loaded
df_all = pd.read_pickle(latest_combined)
print(len(df_all))