# Convert PRAMS pdf to csv

In [None]:
import os

import camelot
import numpy as np
import pandas as pd

In [None]:
def pdf_to_df(filename):
    """Read every table in a PDF and stack the parsed rows into one DataFrame.

    Args:
        filename: Path of the PDF, passed unchanged to ``camelot.read_pdf``.

    Returns:
        A DataFrame built from the ``get_data_dict`` output of each table
        that has more than one column, concatenated in the order camelot
        yields the tables. An empty DataFrame when no table qualifies.
    """
    # strip_text removes these characters from every cell
    # (presumably footnote markers -- confirm against the PDFs).
    tables = camelot.read_pdf(
        filename, flavor='stream', pages='all', strip_text='•\n§#¶¥*†‡'
    )
    # Build all per-table frames first and concatenate once. Growing a frame
    # with pd.concat inside the loop re-copies everything accumulated so far
    # and concatenating onto an empty frame triggers a pandas warning.
    frames = [
        pd.DataFrame(get_data_dict(table.df))
        for table in tables
        if table.shape[1] > 1  # ignore legend (does this work for all files?)
    ]
    if not frames:
        return pd.DataFrame(data={})
    return pd.concat(frames)

def get_data_dict(df):
    """Flatten one extracted table into column lists, one entry per row/year.

    Rows are visited top to bottom. The most recent category row and
    year-header row are remembered, and ``add_row`` is called for each row
    that is recognised as carrying data.

    Args:
        df: The table as a DataFrame whose cells are treated as strings.
            Column 0 is read as the category label and column 1 as the
            indicator / sub-indicator label.

    Returns:
        dict with the keys 'Category', 'Indicator', 'SubIndicator', 'Year',
        'SampleSize', 'Prevalence' and 'CI', each mapping to a list filled
        by ``add_row``.

    Raises:
        ValueError: If an indicator row is reached before both a category
            row and a year-header row have been seen.
    """
    data = {
        'Category': [],
        'Indicator': [],
        'SubIndicator': [],
        'Year': [],
        'SampleSize': [],
        'Prevalence': [],
        'CI': []
    }
    n_rows = len(df)
    category = None   # label of the most recent category row
    year_dict = None  # year -> column index from the most recent year header
    for ii in range(n_rows):
        if is_category(ii, df):
            category = format_chars(df.iloc[ii, 0])
        if is_year(ii, df):
            year_dict = get_year_dict(ii, df)
        if not is_indicator(ii, df):
            continue
        if category is None or year_dict is None:
            # Previously this surfaced as an UnboundLocalError with no hint
            # about which row of the table was the problem.
            raise ValueError(
                f"Row {ii} is an indicator row, but no category row and/or "
                "year header row precedes it."
            )
        indicator = format_chars(df.iloc[ii, 1])
        if is_data(ii, df):  # data row
            add_row(ii, df, data, category, indicator, year_dict)
        elif is_subindicator(ii + 1, df):  # subindicator rows follow
            # NOTE(review): the rows consumed here are visited again by the
            # outer loop, where they appear to satisfy is_indicator as well
            # and get emitted a second time as plain indicator rows --
            # confirm whether that duplication is intended.
            jj = ii + 1
            # Stop at the end of the table: without the bound, a run of
            # sub-indicators ending on the last row indexed past the frame
            # and raised IndexError.
            while jj < n_rows and is_subindicator(jj, df):
                sub_indicator = format_chars(df.iloc[jj, 1].split('\n')[0].rstrip())
                add_row(jj, df, data, category, indicator, year_dict, sub_indicator)
                jj += 1
        else:  # multiline row
            # NOTE(review): is_indicator only accepts a row without data when
            # the next row is a sub-indicator, so this branch looks
            # unreachable -- confirm the intended multi-line handling.
            indicator = indicator.rstrip() + ' ' + df.iloc[ii + 2, 1]
            add_row(ii + 1, df, data, category, indicator, year_dict)
    return data

def is_category(ii, df):
    """Return whether row ``ii`` is a category row.

    A category row has a non-empty first column and is directly followed by
    an indicator row. The last row (and any index beyond it) never qualifies.
    """
    last_row = len(df) - 1
    if ii < last_row:
        has_label = df.iloc[ii, 0] != ''
        return has_label and is_indicator(ii + 1, df)
    return False

def is_year(ii, df):
    """Return whether the last cell of row ``ii`` contains the text 'Overall'.

    Such a row is treated as the header that holds the year labels.
    """
    row = df.iloc[ii]
    last_cell = row.iloc[-1]
    return 'Overall' in last_cell

def get_year_dict(ii, df):
    """Map each year label in header row ``ii`` to its column position.

    Any cell that is exactly four characters long is taken to be a year.

    Raises:
        ValueError: If row ``ii`` is not a year-header row per ``is_year``.
    """
    if not is_year(ii, df):
        raise ValueError("Row does not contain years.")
    return {
        cell: position
        for position, cell in enumerate(df.iloc[ii])
        if len(cell) == 4
    }

def is_indicator(ii, df):
    """Return whether row ``ii`` is an indicator row.

    An indicator row has an empty first column and either carries data
    itself or is directly followed by a sub-indicator row.

    Args:
        ii: Positional row index; indexes at or past the end give False.
        df: The table being parsed.

    Returns:
        bool
    """
    n_rows = len(df)
    if ii >= n_rows:
        return False
    if df.iloc[ii, 0] != '':
        return False
    if is_data(ii, df):
        # A data-carrying row qualifies on its own. The old blanket guard
        # ``ii >= len(df) - 1`` rejected the final row even in this case,
        # so the last data row of a table was silently dropped.
        return True
    # Only the look-ahead needs a following row to exist.
    return ii + 1 < n_rows and is_subindicator(ii + 1, df)

def is_subindicator(ii, df):
    """Return whether row ``ii`` is a sub-indicator row.

    That is: the row carries data, its first column is empty and its second
    column (the label) is not.
    """
    if not is_data(ii, df):
        return False
    if df.iloc[ii, 0] != '':
        return False
    return df.iloc[ii, 1] != ''


def is_data(ii, df):
    """Return whether row ``ii`` carries data.

    The third column is inspected: an empty string or the literal 'N'
    means the row holds no data.
    """
    third_cell = df.iloc[ii, 2]
    non_data_markers = ('', 'N')
    return third_cell not in non_data_markers

def add_row(ii, df, data, category, indicator, year_dict, sub_indicator=np.nan):
    """Append one record per year from row ``ii`` to ``data``, in place.

    Args:
        ii: Positional index of the row to read.
        df: The table being parsed.
        data: dict of lists with the keys 'Category', 'Indicator',
            'SubIndicator', 'Year', 'SampleSize', 'Prevalence' and 'CI';
            each list receives one new element per year.
        category: Value stored under 'Category'.
        indicator: Value stored under 'Indicator'.
        year_dict: Mapping of year label to the column index of that year's
            value cell; the sample size is read from the column just before.
        sub_indicator: Value stored under 'SubIndicator' (NaN by default).

    Raises:
        ValueError: If row ``ii`` is not a data row according to ``is_data``.
    """
    # Modifies dictionary in place
    if not is_data(ii, df):
        raise ValueError("Row does not contain data.")
    for year, col in year_dict.items():
        # Split the cell once. The first token is the prevalence and the
        # second the CI. is_data only vouches for the third column, so the
        # cell of any given year may be empty or lack a CI; record NaN for
        # the missing part rather than raising IndexError after some of the
        # lists have already been extended.
        parts = df.iloc[ii, col].split()
        data['Category'].append(category)
        data['Indicator'].append(indicator)
        data['SubIndicator'].append(sub_indicator)
        data['Year'].append(year)
        data['SampleSize'].append(df.iloc[ii, col - 1])
        data['Prevalence'].append(parts[0] if parts else np.nan)
        data['CI'].append(parts[1] if len(parts) > 1 else np.nan)

def format_chars(label):
    """Return ``label`` with '≥' and '≤' spelled out as '>=' and '<='."""
    # >= and <= not saved properly in csv
    substitutions = (('≥', '>='), ('≤', '<='))
    for symbol, ascii_form in substitutions:
        if symbol in label:
            label = label.replace(symbol, ascii_form)
    return label

In [None]:
# Convert every PDF in the input directory to a CSV with the same base name.
pdf_dir = '../data/PRAMS/pdf/'
csv_dir = '../data/PRAMS/csv/'
filenames = os.listdir(pdf_dir)
for filename in filenames:
    stem, ext = os.path.splitext(filename)
    if ext.lower() != '.pdf':
        # Skip anything that is not a PDF (hidden files, checkpoints, ...).
        continue
    try:
        df = pdf_to_df(os.path.join(pdf_dir, filename))
        df.to_csv(os.path.join(csv_dir, stem + '.csv'), index=False)
    except Exception as err:
        # Keep going with the remaining files, but say why this one failed.
        # Catching Exception (not a bare except) lets Ctrl-C interrupt the run.
        print(f"Failed: {filename} ({type(err).__name__}: {err})")
        
# TODO: suppress warnings
# Files that currently fail: 2012-2015
# TODO: which files aren't getting read, and what needs to be modified?
# TODO: which files are getting read, and does their output look okay?