# Convert 2012-2015 PRAMS pdf to csv

In [1]:
import os
import warnings

import camelot
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# TODO: year handling needs to be smarter.
# The second page of each PDF has no year header row.
# If any year is missing, that year's whole column is left out of the second page.
# The year dict can be built from the first page,
# but the columns missing on the first page must also be identified in order to
#   translate it into a new year dict for the second page.

In [91]:
def pdf_to_df(filename):
    """Parse a PRAMS PDF into a single long-format DataFrame.

    Reads every table camelot's stream parser finds in *filename* and converts
    the first two of them (the layout is assumed to be two pages, one table
    each) with ``get_data_dict``.

    Args:
        filename: Path to the PDF file.

    Returns:
        A DataFrame holding the records of the parsed tables, with the columns
        produced by ``get_data_dict``. Each table's row index is kept as-is
        (not reset). The frame is empty if camelot found no tables.
    """
    data = camelot.read_pdf(filename, flavor='stream', pages='all', strip_text='\n§#¶¥*†‡◊‖±^')
    # Seed with an empty frame so the final concat also works with zero tables.
    frames = [pd.DataFrame(data={})]
    # Still limited to the first two tables, but bounded by what camelot
    # actually returned so a shorter PDF no longer raises IndexError.
    for ii in range(min(2, len(data))):
        frames.append(pd.DataFrame(get_data_dict(data[ii].df)))
    # One concat at the end instead of re-copying the accumulated frame per page.
    return pd.concat(frames)

def get_data_dict(df):
    """Convert one extracted table into parallel column lists.

    Walks the rows top to bottom, remembering the most recent category row and
    year-header row, and hands every indicator row to ``add_row`` together
    with that context.

    Args:
        df: The table as a DataFrame of strings (``table.df`` from camelot).

    Returns:
        Dict with the keys 'Category', 'Indicator', 'Year', 'SampleSize',
        'Prevalence' and 'CI', each mapping to a list filled by ``add_row``.

    Raises:
        ValueError: If an indicator row is reached before a category row or
            before a year-header row has been seen in this table.
    """
    data = {
        'Category': [],
        'Indicator': [],
        'Year': [],
        'SampleSize': [],
        'Prevalence': [],
        'CI': []
    }
    # Context carried from earlier rows; None means "not seen yet".
    category = None
    year_dict = None
    for ii in range(len(df)):
        if is_category(ii, df):
            category = df.iloc[ii, 0]
        if is_year(ii, df):
            year_dict = get_year_dict(ii, df)
        if is_indicator(ii, df):
            # Fail with a message that names the row instead of an opaque
            # UnboundLocalError when the table lacks the expected header rows.
            if category is None or year_dict is None:
                missing = 'category' if category is None else 'year header'
                raise ValueError(
                    f"Row {ii} is an indicator row but no {missing} row precedes it."
                )
            # Drop the two leading characters (the '-' marker and the one after it).
            indicator = df.iloc[ii, 0][2:]
            print(f"row: {ii}")  # progress trace: last row printed is the one being parsed
            add_row(ii, df, data, category, indicator, year_dict)
    return data

def add_row(ii, df, data, category, indicator, year_dict, sub_indicator=np.nan):
    """Append one record per year that has data in indicator row *ii*.

    Args:
        ii: Positional index of the row in *df*.
        df: The table as a DataFrame of strings.
        data: Dict of lists ('Category', 'Indicator', 'Year', 'SampleSize',
            'Prevalence', 'CI'); extended in place.
        category: Category label recorded with every appended record.
        indicator: Indicator label recorded with every appended record.
        year_dict: Mapping of year -> column position of that year's stats
            cell. The sample size is read from the column immediately to the
            left of it.
        sub_indicator: Currently unused.

    Returns:
        None. *data* is modified in place.

    Raises:
        ValueError: If row *ii* is not an indicator row.
    """
    if not is_indicator(ii, df):
        raise ValueError(f"Row {ii} does not contain data.")

    for year, col in year_dict.items():
        stats = df.iloc[ii, col].strip()
        if not stats:  # only append years with data
            continue
        # Split once: the first token is the prevalence, everything after it
        # is the CI (kept whole even if it contains spaces).
        parts = stats.split(maxsplit=1)
        prevalence = parts[0]
        ci = parts[1] if len(parts) > 1 else np.nan  # cell without a CI
        sample_size = df.iloc[ii, col - 1]
        # All values are resolved before the first append so the six lists
        # can never end up with different lengths.
        data['Category'].append(category)
        data['Indicator'].append(indicator)
        data['Year'].append(year)
        data['SampleSize'].append(sample_size)
        data['Prevalence'].append(prevalence)
        data['CI'].append(ci)

def is_category(ii, df):
    """Return whether row *ii* is a category heading.

    A category heading has a non-empty first cell that does not start with
    '-' and is immediately followed by an indicator row. The last row of the
    table is never a category, since nothing follows it.
    """
    last_row = len(df) - 1
    if ii >= last_row:
        return False
    label = df.iloc[ii, 0]
    if label == '' or label[0] == '-':
        return False
    return is_indicator(ii + 1, df)

def is_year(ii, df):
    """Return whether row *ii* is the year-header row.

    The row is recognised by the text '2012' appearing in its third cell
    (column position 2).
    """
    return '2012' in df.iloc[ii, 2]

def get_year_dict(ii, df):
    """Map each year named in header row *ii* to its column position.

    A cell counts as a year when it contains '20' and does not contain
    'Overall'; the first four characters of the cell become the key.

    Raises:
        ValueError: If row *ii* is not a year-header row.
    """
    if not is_year(ii, df):
        raise ValueError(f"Row {ii} does not contain years.")
    return {
        cell[:4]: col
        for col, cell in enumerate(df.iloc[ii])
        if '20' in cell and 'Overall' not in cell
    }

def is_indicator(ii, df):
    """Return whether the first cell of row *ii* starts with '-'."""
    label = df.iloc[ii, 0]
    # Slicing instead of indexing makes the empty string fall out as False.
    return label[:1] == '-'

def get_name(filename):
    """Build a name from a '-'-separated filename.

    The last two '-'-separated pieces are dropped; every remaining piece is
    capitalised and the pieces are joined without a separator.
    """
    pieces = filename.split('-')[:-2]
    return ''.join(map(str.capitalize, pieces))

In [101]:
# df = pd.DataFrame()
# filenames = os.listdir('../data/PRAMS/pdf/')
# for filename in filenames:
#     if ('2012' in filename) and ('All' not in filename):
#         df_state = pdf_to_df('../data/PRAMS/pdf/' + filename)
#         state_name = get_name(filename)
#         df_state['State'] = state_name
#         df = pd.concat([df, df_state])
#         print(f"State: {state_name}, Rows: {len(df_state)}")
# df.to_csv('../data/PRAMS/csv/PRAMS_2012_2015.csv', index=False)

In [102]:
# Load every table camelot finds in the Arkansas PDF for interactive inspection.
data = camelot.read_pdf(
    '../data/PRAMS/pdf/Arkansas-2012-2015.pdf',
    flavor='stream',
    pages='all',
    strip_text='\n§#¶¥*†‡◊‖±^',
)

In [104]:
# Check the year header on the first table: row 2 is passed as the header row,
# and the result maps each year to the column position of its stats cell.
get_year_dict(2, data[0].df)

{'2012': 2, '2013': 4, '2014': 6, '2015': 8}

In [113]:
# 2014 counts, first table: column 5, i.e. one column to the left of the
# 2014 stats column
data[0].df.iloc[6, 5]

'877'

In [114]:
# 2014 stats, first table: column 6 holds the prevalence and the CI together
# in one string
data[0].df.iloc[6, 6]

'30.6 (27.8-33.6)'

In [116]:
# 2014 counts, second table: found in column 1 here, not in column 5 as on
# the first table
data[1].df.iloc[1, 1]

'870'

In [117]:
# 2014 stats, second table: found in column 2 here, not in column 6 as on
# the first table
data[1].df.iloc[1, 2]

'24.8 (22.0-27.7)'

In [None]:
# On the second page, columns are shifted to the left whenever any preceding
# year is missing, so column positions taken from the first page do not line up.