# Dump salaries from schedule J in Portland, OR (continuing from part 1)

In [1]:
import unicodecsv as csv
from irsx.xmlrunner import XMLRunner
import pandas as pd

In [2]:
# Pull every row of part 1's output (the orgs that have e-filings) into memory
# up front; the file is short, so there's no need to stream it.
file_rows = []
with open('pdxefilers.csv', 'rb') as infile:
    reader = csv.DictReader(infile)
    # each item the reader yields is one dict per CSV row, keyed by column name
    file_rows.extend(reader)
        

In [3]:
# Column names for the output CSV, in the order they'll be written.
headers = [
    "period",
    "ein",
    "object_id",
    "taxpayer_name",
    "name",
    "business_name1",
    "business_name2",
    "title",
    "org_comp",
    "related_comp",
]

# Where the extracted rows go. The handle is deliberately left open here;
# it gets closed in a later cell once all rows have been written.
outfilename = "employees_simple.csv"
outfile = open(outfilename, 'wb')

# extrasaction='ignore' tells the DictWriter to drop any dict keys that
# aren't listed in headers rather than raising.
dw = csv.DictWriter(outfile, headers, extrasaction='ignore')
dw.writeheader()

In [4]:
# get an XMLRunner -- this is what actually does the parsing.
# A single instance is created here and reused for every filing: the
# extraction loop calls xml_runner.run_filing(object_id) once per input row.
xml_runner = XMLRunner()

## Figure out what to extract

Data from each repeating group should go to its own file; otherwise it won't make sense.

To figure out what to capture, I started by looking at schedule J: http://www.irsx.info/#IRS990ScheduleJ
Then I went to the table details and picked the rows I wanted from the repeating group:
http://www.irsx.info/metadata/groups/SkdJRltdOrgOffcrTrstKyEmpl.html

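Note that it's common for director/employee names in Schedule J to be listed in the business-name fields (`BsnssNmLn1Txt` / `BsnssNmLn2Txt`) instead of the person-name field (`PrsnNm`), so all three are captured.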

Also note that IRSx checks to see if a file has been downloaded before fetching it. Running this the first time will be slow if the filings aren't already downloaded, but much faster if they've already been downloaded.


In [6]:
# Walk the filings from part 1, parse each one with IRSx, and write one CSV
# row per Schedule J employee entry (repeating group SkdJRltdOrgOffcrTrstKyEmpl).
#
# Reads:  file_rows (input rows), xml_runner (parser), dw (open DictWriter)
# Writes: rows to dw; progress / skip messages to stdout
DEMO_MAX = 10   # maximum number of filings to fetch and parse in a demo run
num_rows = 0    # number of filings fetched so far

for row in file_rows:
    # Don't run endlessly during a demo. The cap is checked *before* the
    # filing is fetched so that exactly DEMO_MAX filings are handled, and so
    # that filings which end up being skipped can't slip past the limit.
    if num_rows >= DEMO_MAX:
        break
    num_rows += 1

    this_object_id = row['OBJECT_ID']
    parsed_filing = xml_runner.run_filing(this_object_id)

    if not parsed_filing:
        # if it somehow busted, just note it and move on
        print("Skipping filing %s(filings with pre-2013 schemas are skipped)\n row details: %s" % (this_object_id, row))

    elif 'IRS990ScheduleJ' not in parsed_filing.list_schedules():
        # no Schedule J in this filing at all
        print("No schedule J in filing %s, skipping" % this_object_id)

    else:
        # some schedules can appear multiple times, but schedule j only
        # appears once, so we grab the first one
        parsed_skedj = parsed_filing.get_parsed_sked('IRS990ScheduleJ')[0]

        try:
            # now use the table name we looked up; repeating groups are
            # returned as an array of dicts
            employee_groups = parsed_skedj['groups']['SkdJRltdOrgOffcrTrstKyEmpl']
        except KeyError:
            print("No SkdJRltdOrgOffcrTrstKyEmpl found in %s skipping" % this_object_id)
        else:
            # filing-level values copied from the input csv; these are the
            # same for every employee row written for this filing
            filing_fields = {
                'period': row['TAX_PERIOD_x'],
                'ein': row['EIN'],
                'object_id': row['OBJECT_ID'],
                'taxpayer_name': row['TAXPAYER_NAME'],
            }

            # read through each employee and pull out the data we want
            for employee_group in employee_groups:
                # start each row from a fresh copy so nothing can leak from
                # one employee's row into the next
                outputdata = dict(filing_fields)

                # the remaining columns come from schedule J; .get() yields
                # None (an empty csv cell) when a field is absent
                outputdata['name'] = employee_group.get('PrsnNm')
                outputdata['business_name1'] = employee_group.get('BsnssNmLn1Txt')
                outputdata['business_name2'] = employee_group.get('BsnssNmLn2Txt')
                outputdata['title'] = employee_group.get('TtlTxt')
                outputdata['org_comp'] = employee_group.get('TtlCmpnstnFlngOrgAmt')  # Part II Column (E)
                outputdata['related_comp'] = employee_group.get('TtlCmpnstnRltdOrgsAmt')

                dw.writerow(outputdata)

    # Every iteration reaches this point (no `continue` above), so the
    # progress message also counts filings that were skipped.
    if num_rows % 100 == 0:
        print("Processed %s filings" % num_rows)

No schedule J in filing 201603129349301785, skipping
No schedule J in filing 201743119349301544, skipping


In [7]:
# close the outfile so that everything written through the DictWriter is
# on disk before the file is read back
outfile.close()

# Load the CSV we just wrote into a DataFrame and preview the first rows.
# NOTE(review): read_csv infers column dtypes, so identifier columns such as
# ein / object_id presumably come back numeric -- pass dtype=str if leading
# zeros must be preserved; confirm before using them as join keys.
sked_j_pdx_efilers = pd.read_csv(outfilename)
sked_j_pdx_efilers.head()

Unnamed: 0,period,ein,object_id,taxpayer_name,name,business_name1,business_name2,title,org_comp,related_comp
0,201512,930798039,201633149349303238,KAISER FOUNDATION HEALTH PLAN OF THE NORTHWEST,Stephen Shawn Barton,,,"Exec Dir, Revenue Cycle - NW",0.0,367529.0
1,201512,930798039,201633149349303238,KAISER FOUNDATION HEALTH PLAN OF THE NORTHWEST,Maryann Bodayle,,,Assistant Secretary,0.0,168202.0
2,201512,930798039,201633149349303238,KAISER FOUNDATION HEALTH PLAN OF THE NORTHWEST,Thomas W Chapman EdD,,,Director,0.0,227466.0
3,201512,930798039,201633149349303238,KAISER FOUNDATION HEALTH PLAN OF THE NORTHWEST,Mark Charpentier,,,"VP, Mktg/Sls/Bus Dev/Comm - NW",0.0,203880.0
4,201512,930798039,201633149349303238,KAISER FOUNDATION HEALTH PLAN OF THE NORTHWEST,Erin Downing,,,Special Assistant to the Board,0.0,134369.0
