# Convert 2012-2015 PRAMS PDFs to CSV

In [None]:
import os
import warnings

import camelot
import numpy as np
import pandas as pd

# Suppress every warning for the rest of the session (presumably to keep the
# notebook output readable while the PDFs are parsed).
warnings.filterwarnings("ignore")

In [None]:
def pdf_to_df(filename):
    """Parse one PRAMS PDF into a long-format DataFrame.

    The PDF is read with camelot's ``stream`` flavor across all pages, with
    the footnote/marker symbols listed in ``strip_text`` removed from the
    cell text.  Only the first two extracted tables are used (the code
    assumes a two-page report with one table per page).

    Args:
        filename: Path of the PDF to read.

    Returns:
        A DataFrame built from the ``get_data_dict`` output of both tables,
        stacked in page order.  The index is not reset, so each table's
        rows keep their own 0-based labels.
    """
    tables = camelot.read_pdf(
        filename,
        flavor='stream',
        pages='all',
        strip_text='\n§#¶¥*†‡◊‖±^',
    )
    year_columns = get_year_dict(tables)

    # Start from an empty frame and stack every parsed table in one concat.
    frames = [pd.DataFrame(data={})]
    for page in range(2):  # assuming only two pages
        parsed = get_data_dict(tables[page].df, year_columns[page])
        frames.append(pd.DataFrame(parsed))
    return pd.concat(frames)

def get_data_dict(df, year_dict):
    """Collect the indicator rows of one extracted table into column lists.

    Rows are scanned top to bottom.  A category row updates the current
    category; an indicator row is handed to ``add_row`` together with the
    current category and the indicator label (first-column text with its
    first two characters dropped).

    Args:
        df: Table as a DataFrame of cell strings.
        year_dict: Mapping of year label to the column position holding that
            year's statistics in ``df``.

    Returns:
        A dict with the keys ``Category``, ``Indicator``, ``Year``,
        ``SampleSize``, ``Prevalence`` and ``CI``, each mapped to a list
        with one entry per (indicator, year) pair that has data.
    """
    columns = ('Category', 'Indicator', 'Year', 'SampleSize', 'Prevalence', 'CI')
    data = {column: [] for column in columns}

    # NOTE(review): ``category`` is only bound once a category row has been
    # seen; an indicator row appearing before any category row raises
    # UnboundLocalError.
    for row_idx in range(len(df)):
        if is_category(row_idx, df):
            category = df.iloc[row_idx, 0]
        if not is_indicator(row_idx, df):
            continue
        indicator = df.iloc[row_idx, 0][2:]
        add_row(row_idx, df, data, category, indicator, year_dict)
    return data

def add_row(ii, df, data, category, indicator, year_dict, sub_indicator=np.nan):
    """Append one output record per year that has data in row ``ii``.

    Args:
        ii: Positional index of the row in ``df``.
        df: Table as a DataFrame of cell strings.
        data: Dict of column lists (``Category``, ``Indicator``, ``Year``,
            ``SampleSize``, ``Prevalence``, ``CI``); mutated in place.
        category: Category label stored with every appended record.
        indicator: Indicator label stored with every appended record.
        year_dict: Mapping of year label to the column position of that
            year's statistics cell.
        sub_indicator: Currently unused.

    Raises:
        ValueError: If row ``ii`` is not an indicator row.
    """
    if not is_indicator(ii, df):
        raise ValueError(f"Row {ii} does not contain data.")

    for year, stats_col in year_dict.items():
        stats = df.iloc[ii, stats_col]
        if stats == '':
            # No data reported for this year; skip it.
            continue
        data['Category'].append(category)
        data['Indicator'].append(indicator)
        data['Year'].append(year)
        # The sample size sits in the column immediately left of the stats.
        data['SampleSize'].append(df.iloc[ii, stats_col - 1])
        # The stats cell is whitespace-separated: prevalence first, CI second.
        tokens = stats.split()
        data['Prevalence'].append(tokens[0])
        data['CI'].append(tokens[1])

def is_category(ii, df):
    """Return whether row ``ii`` is a category heading.

    A row counts as a category when its first cell is non-empty, does not
    start with ``'-'``, and the row directly below it is an indicator row.
    The last row can never be a category because nothing follows it.
    """
    last_row = len(df) - 1
    if ii >= last_row:
        return False

    label = df.iloc[ii, 0]
    if label == '' or label[0] == '-':
        return False
    return is_indicator(ii + 1, df)

def get_year_dict(data):
    """Build the year -> column-position mappings for both tables.

    The first mapping is fixed: 2012-2015 at columns 2, 4, 6 and 8.  The
    second mapping contains only the years whose cell in row 6 of the first
    table is non-empty, renumbered to consecutive even columns starting
    at 2.

    Args:
        data: Sequence of extracted tables; only ``data[0].df`` is read.

    Returns:
        A two-element list ``[first_table_mapping, second_table_mapping]``.
    """
    first_table_cols = {'2012': 2, '2013': 4, '2014': 6, '2015': 8}
    first_table = data[0].df

    # NOTE(review): row 6 is presumably the first data row of the first
    # table -- confirm against the PDF layout.
    reported_years = [
        year
        for year, col in first_table_cols.items()
        if first_table.iloc[6, col] != ''
    ]
    second_table_cols = {
        year: 2 + 2 * position for position, year in enumerate(reported_years)
    }
    return [first_table_cols, second_table_cols]

def is_indicator(ii, df):
    """Return whether the first cell of row ``ii`` starts with ``'-'``.

    Empty first cells are never indicators.
    """
    label = df.iloc[ii, 0]
    if label == '':
        return False
    return label[0] == '-'

def get_name(filename):
    """Derive a name from a dash-separated filename.

    The last two dash-separated pieces are dropped; each remaining piece is
    capitalized (first letter upper-case, rest lower-case) and the pieces
    are joined without a separator.
    """
    name_parts = filename.split('-')[:-2]
    return ''.join(part.capitalize() for part in name_parts)

In [None]:
# Parse every matching PDF in the input directory, tag its rows with the
# name derived from the filename, and write the stacked result to one CSV.
pdf_dir = '../data/PRAMS/pdf/'
filenames = os.listdir(pdf_dir)

# The empty seed frame keeps the concat valid even when no file matches.
state_frames = [pd.DataFrame()]
for filename in filenames:
    # Keep only files mentioning 2012; names containing 'All' are skipped
    # (presumably the all-states summary -- confirm).
    if '2012' not in filename or 'All' in filename:
        continue
    df_state = pdf_to_df(pdf_dir + filename)
    state_name = get_name(filename)
    df_state['State'] = state_name
    state_frames.append(df_state)
    print(f"State: {state_name}, Rows: {len(df_state)}")

df = pd.concat(state_frames)
df.to_csv('../data/PRAMS/csv/PRAMS_2012_2015.csv', index=False)