# This notebook processes some of the raw FAC data
* FAC data from 2014-2018 was downloaded in bulk from https://harvester.census.gov/facdissem/PublicDataDownloads.aspx
* Certain fields of interest are extracted from findings.txt and general.txt
* Agency prefix and entity codes tables were generated by hand
* All processed data is saved as pickles in /data/ihwang/data_journalism/allfac/processed for easy access

In [None]:
import numpy as np
import pandas as pd
import re

## Process findings.txt

In [3]:
fac_path = '/data/ihwang/data_journalism/allfac'
findings = '/raw/findings.txt'
###############################################################################


data = []
with open(fac_path + findings, 'r') as f:
    for line in f:
        if line[0] != ' ':
            header = line.split(',')
        else:
            fields = re.sub('\s', '', line).split(',')
            [DBKEY,AUDITYEAR,ELECAUDITSID,
             ELECAUDITFINDINGSID] = [int(field) for field in fields[:4]]
            # Can be multiple FINDINGSREFNUMS which makes it necessary to parse
            # file line-by-line (instead of using pandas.csv_read)
            [*FINDINGSREFNUMS] = fields[4:-9]
            [TYPEREQUIREMENT, MODIFIEDOPINION,OTHERNONCOMPLIANCE,
             MATERIALWEAKNESS,SIGNIFICANTDEFICIENCY,OTHERFINDINGS,QCOSTS,
             REPEATFINDING,PRIORFINDINGREFNUMS] = fields[-9:]
            data.append(
                [
                    DBKEY,AUDITYEAR,ELECAUDITSID,ELECAUDITFINDINGSID,
                    FINDINGSREFNUMS, TYPEREQUIREMENT,MODIFIEDOPINION,
                    OTHERNONCOMPLIANCE,MATERIALWEAKNESS,
                    SIGNIFICANTDEFICIENCY,OTHERFINDINGS,QCOSTS,
                    REPEATFINDING,PRIORFINDINGREFNUMS
                ]
            )
findings_all = pd.DataFrame(data, columns=header)
findings_df = findings_all[
    [
        'DBKEY', 'AUDITYEAR', 'TYPEREQUIREMENT', 'MODIFIEDOPINION', 
        'OTHERNONCOMPLIANCE','MATERIALWEAKNESS', 'SIGNIFICANTDEFICIENCY',
        'OTHERFINDINGS', 'QCOSTS'
    ]
]
findings_df.head()
findings_df.to_pickle(fac_path + '/processed/findings.pkl')

In [4]:
findings_all.columns

Index(['DBKEY', 'AUDITYEAR', 'ELECAUDITSID', 'ELECAUDITFINDINGSID',
       'FINDINGSREFNUMS', 'TYPEREQUIREMENT', 'MODIFIEDOPINION',
       'OTHERNONCOMPLIANCE', 'MATERIALWEAKNESS', 'SIGNIFICANTDEFICIENCY',
       'OTHERFINDINGS', 'QCOSTS', 'REPEATFINDING',
       'PRIORFINDINGREFNUMS                                                                                                                                                                                                         \n'],
      dtype='object')

## Process general.txt

In [21]:
fac_path = '/data/ihwang/data_journalism/allfac/'
general = '/raw/general.txt'
###############################################################################


general_all = pd.read_csv(
    fac_path + general, encoding = 'ISO-8859-1', low_memory=False
)  # low memory parameter for mixed types
general_df = general_all[
    [
        'DBKEY', 'AUDITYEAR', 'STATE', 'CPASTATE', 'CPAPHONE',
        'TYPEOFENTITY', 'OVERSIGHTAGENCY', 'TOTFEDEXPEND',
        'REPORTABLECONDITION', 'MATERIALWEAKNESS', 'MATERIALNONCOMPLIANCE',
        'REPORTABLECONDITION_MP', 'MATERIALWEAKNESS_MP', 'QCOSTS'
    ]
]
general_df = general_df.assign(
    AUDITEENAME=general_all['AUDITEENAME'].str.strip().str.replace(r'[^\w\s]', '')
)
general_df = general_df.assign(
    CPAFIRMNAME=general_all['CPAFIRMNAME'].str.strip().str.replace(r'[^\w\s]', '')
)
general_df = general_df.assign(
    CPACONTACT=general_all['CPACONTACT'].str.strip().str.replace(r'[^\w\s]', '')
)
general_df = general_df.assign(
    CPATITLE=general_all['CPATITLE'].str.strip().str.replace(r'[^\w\s]', '')
)
general_df = general_df.assign(
    ZIPCODE=pd.to_numeric(general_all['ZIPCODE'].str[:5], errors='coerce')
)
general_df = general_df.assign(
    CPAZIPCODE=pd.to_numeric(general_all['CPAZIPCODE'].str[:5], errors='coerce')
)
general_df = general_df.assign(
    TYPEREPORT_FS=general_all['TYPEREPORT_FS'].str.strip()
)
general_df = general_df.assign(
    TYPEREPORT_MP=general_all['TYPEREPORT_MP'].str.strip()
)
general_df = general_df.assign(
    CITY=general_all['CITY'].str.strip()
)
general_df = general_df.assign(
    CPACITY=general_all['CPACITY'].str.strip()
)
general_df = general_df.assign(
    EIN=pd.to_numeric(general_all['EIN'].str.split('-').str.get(-1))
)
general_df.apply(lambda x: x.str.upper() if(x.dtype == 'object') else x)  # convert all text to uppercase
general_df.head()
general_df.to_pickle(fac_path + 'processed/general.pkl')

In [5]:
print(general_df.columns)

Index(['DBKEY', 'AUDITYEAR', 'STATE', 'CPASTATE', 'CPAPHONE', 'TYPEOFENTITY',
       'OVERSIGHTAGENCY', 'TOTFEDEXPEND', 'REPORTABLECONDITION',
       'MATERIALWEAKNESS', 'MATERIALNONCOMPLIANCE', 'REPORTABLECONDITION_MP',
       'MATERIALWEAKNESS_MP', 'QCOSTS', 'AUDITEENAME', 'CPAFIRMNAME',
       'CPACONTACT', 'CPATITLE', 'ZIPCODE', 'CPAZIPCODE', 'TYPEREPORT_FS',
       'TYPEREPORT_MP', 'CITY', 'CPACITY', 'EIN'],
      dtype='object')


## Process agency prefix table
Raw data copied and pasted from https://harvester.census.gov/facdissem/Documents/PublicUserManual.pdf

In [8]:
import pickle


fac_path = '/data/ihwang/data_journalism/allfac'
prefix_table = '/raw/manual/prefix_table.txt'


agency_dict = {}
with open(fac_path + prefix_table, 'r') as f:
    for line in f:
        data = line.strip().split(' ')
        prefix = data[0]
        agency = data[1:-1]
        agency_dict[prefix] = ' '.join(agency)
        
        
with open(fac_path + '/processed/agency_dict.pkl', 'wb') as f_w:
    pickle.dump(agency_dict, f_w)

## Process entity codes table
Raw data copied and pasted from https://harvester.census.gov/facdissem/Documents/PublicUserManual.pdf

In [10]:
fac_path = '/data/ihwang/data_journalism/allfac'
entity_table = '/raw/manual/entity_table.txt'


entity_dict = {}
with open(fac_path + entity_table, 'r') as f:
    for line in f:
        data = line.strip().split(' ')
        entity = data[:-1]
        code = data[-1]
        entity_dict[code] = ' '.join(entity)
        
        
with open(fac_path + '/processed/entity_dict.pkl', 'wb') as f_w:
    pickle.dump(entity_dict, f_w)