# Extract from 990, 990PF and 990EZ at the same time

Sometimes you have no choice but to extract from three different forms at the same time to get
the same info. This somewhat more complicated example does that using data structures that are, to some extent, replaceable.

In [1]:
# Registry of output streams, keyed by stream name. Several streams could be
# declared here; this example only needs one.
#
# 'filename' is the stem of the CSV that gets written: with YEAR = 2017 the run
# below reports "employees_detailed2017.csv".
# 'headers' lists the columns of that CSV.
output_streams = {
    'employees': dict(
        filename='employees_detailed',
        headers=[
            "year",
            "ein",
            "object_id",
            "name",
            "business_name1",
            "business_name2",
            "title",
            "org_comp",
            "related_comp",
            "other_cmp",
            "form",
            "source",
            "IndvdlTrstOrDrctrInd",
            "InstttnlTrstInd",
            "OffcrInd",
            "KyEmplyInd",
            "HghstCmpnstdEmplyInd",
            "FrmrOfcrDrctrTrstInd",
        ],
    ),
}

In [2]:

# Shape of the capture spec. Every 'stream_key' must be a key of output_streams.
#
# data_capture_dict = {
#     '<Form name>': {
#         'groups': {
#             '<group name>': {
#                 'stream_key': 'employees',
#                 'ein': {'header': 'ein'},
#                 'object_id': {'header': 'object_id'},
#                 '<variable name>': {'header': '<output column>', 'default': 0},
#             }
#         }
#     }
# }
#
# Data in repeating groups is captured under 'groups'; each instance of a group
# is a row in the output file.


def _column(header):
    """Return the spec that sends a captured variable to output column `header`."""
    return {'header': header}


def _amount(header):
    """Return the spec for a numeric column: like `_column`, but 0 if missing."""
    return {'header': header, 'default': 0}


def _other_comp(*variable_names):
    """Return a 'composite' spec feeding `variable_names` into 'other_cmp'.

    Each listed variable defaults to 0 when missing. For EZ and PF filers,
    other compensation includes benefits and other allowances.
    NOTE(review): presumably the extractor combines the listed amounts into the
    single 'other_cmp' column -- confirm against stream_extractor.
    """
    return {'other_cmp': dict((name, {'default': 0}) for name in variable_names)}


data_capture_dict = {
    'IRS990': {
        'groups': {
            'Frm990PrtVIISctnA': {
                'stream_key': 'employees',
                'ein': _column('ein'),
                'object_id': _column('object_id'),
                'PrsnNm': _column('name'),
                'BsnssNmLn1Txt': _column('business_name1'),
                'BsnssNmLn2Txt': _column('business_name2'),
                'TtlTxt': _column('title'),
                'RprtblCmpFrmOrgAmt': _amount('org_comp'),
                'RprtblCmpFrmRltdOrgAmt': _amount('related_comp'),
                'OthrCmpnstnAmt': _amount('other_cmp'),
                # These indicator variables are written out under their own names.
                'IndvdlTrstOrDrctrInd': _column('IndvdlTrstOrDrctrInd'),
                'InstttnlTrstInd': _column('InstttnlTrstInd'),
                'OffcrInd': _column('OffcrInd'),
                'KyEmplyInd': _column('KyEmplyInd'),
                'HghstCmpnstdEmplyInd': _column('HghstCmpnstdEmplyInd'),
                'FrmrOfcrDrctrTrstInd': _column('FrmrOfcrDrctrTrstInd'),
            },
        },
    },
    'IRS990EZ': {
        'groups': {
            'EZOffcrDrctrTrstEmpl': {
                'stream_key': 'employees',
                'ein': _column('ein'),
                'object_id': _column('object_id'),
                'PrsnNm': _column('name'),
                'BsnssNmLn1': _column('business_name1'),
                'BsnssNmLn2': _column('business_name2'),
                'TtlTxt': _column('title'),
                'CmpnstnAmt': _amount('org_comp'),
                'composite': _other_comp(
                    'EmplyBnftPrgrmAmt',
                    'ExpnsAccntOthrAllwncAmt',
                ),
            },
            'EZCmpnstnHghstPdEmpl': {
                'stream_key': 'employees',
                'ein': _column('ein'),
                'object_id': _column('object_id'),
                'PrsnNm': _column('name'),
                'TtlTxt': _column('title'),
                'CmpnstnAmt': _amount('org_comp'),
                'composite': _other_comp(
                    'EmplyBnftsAmt',
                    'ExpnsAccntAmt',
                ),
            },
        },
    },
    'IRS990PF': {
        'groups': {
            'PFOffcrDrTrstKyEmpl': {
                'stream_key': 'employees',
                'ein': _column('ein'),
                'object_id': _column('object_id'),
                'OffcrDrTrstKyEmpl_PrsnNm': _column('name'),
                'OffcrDrTrstKyEmpl_BsnssNmLn1': _column('business_name1'),
                'OffcrDrTrstKyEmpl_BsnssNmLn2': _column('business_name2'),
                'OffcrDrTrstKyEmpl_TtlTxt': _column('title'),
                'OffcrDrTrstKyEmpl_CmpnstnAmt': _amount('org_comp'),
                'composite': _other_comp(
                    'OffcrDrTrstKyEmpl_EmplyBnftPrgrmAmt',
                    'OffcrDrTrstKyEmpl_ExpnsAccntOthrAllwncAmt',
                ),
            },
            'PFCmpnstnHghstPdEmpl': {
                'stream_key': 'employees',
                'ein': _column('ein'),
                'object_id': _column('object_id'),
                'CmpnstnHghstPdEmpl_PrsnNm': _column('name'),
                'CmpnstnHghstPdEmpl_TtlTxt': _column('title'),
                'CmpnstnHghstPdEmpl_CmpnstnAmt': _amount('org_comp'),
                'composite': _other_comp(
                    'CmpnstnHghstPdEmpl_EmplyBnftsAmt',
                    'CmpnstnHghstPdEmpl_ExpnsAccntAmt',
                ),
            },
        },
    },
    'IRS990ScheduleJ': {
        'groups': {
            'SkdJRltdOrgOffcrTrstKyEmpl': {
                'stream_key': 'employees',
                'ein': _column('ein'),
                'object_id': _column('object_id'),
                'PrsnNm': _column('name'),
                'BsnssNmLn1Txt': _column('business_name1'),
                'BsnssNmLn2Txt': _column('business_name2'),
                'TtlTxt': _column('title'),
                'TtlCmpnstnFlngOrgAmt': _amount('org_comp'),
                'TtlCmpnstnRltdOrgsAmt': _amount('related_comp'),
            },
        },
    },
}

In [3]:
from stream_extractor import StreamExtractor
import unicodecsv as csv

In [4]:
# Filing year to process. This MUST agree with the year of our other data.
YEAR = 2017

# Build the extractor from the stream registry, the capture spec and the year.
extractor = StreamExtractor(
    output_streams,
    data_capture_dict,
    YEAR,
)

Initializing output stream employees_detailed2017.csv


In [5]:
# Load every row of part 1's output (the orgs with e-filings) into memory up
# front; the file is not very long. Each row is whatever csv.DictReader yields
# for a line of 'pdxefilers.csv', which is opened in binary mode.
file_rows = []
with open('pdxefilers.csv', 'rb') as efilers_file:
    file_rows.extend(csv.DictReader(efilers_file))

In [6]:

# Run the extractor on every filing listed in part 1's output, printing a
# running total after each one.
# enumerate(..., start=1) gives the 1-based count directly, so the loop
# variable no longer has to be reassigned inside the body.
for filing_count, row in enumerate(file_rows, start=1):
    extractor.run_filing(row['OBJECT_ID'])
    print("Processed %s filings" % filing_count)

Running sked IRS990
Running sked IRS990ScheduleJ
Processed 1 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 2 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 3 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 4 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 5 filings
Running sked IRS990
Processed 6 filings
Running sked IRS990
Processed 7 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 8 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 9 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 10 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 11 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 12 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 13 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 14 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 15 filings
Running sked IRS9

Running sked IRS990
Running sked IRS990ScheduleJ
Processed 147 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 148 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 149 filings
Running sked IRS990
Processed 150 filings
Running sked IRS990
Processed 151 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 152 filings
Running sked IRS990
Processed 153 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 154 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 155 filings
Running sked IRS990
Processed 156 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 157 filings
Running sked IRS990
Processed 158 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 159 filings
Running sked IRS990
Processed 160 filings
Running sked IRS990
Processed 161 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 162 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 163 fili

Running sked IRS990
Processed 304 filings
Running sked IRS990PF
Processed 305 filings
Running sked IRS990
Processed 306 filings
Running sked IRS990
Processed 307 filings
Running sked IRS990
Processed 308 filings
Running sked IRS990
Processed 309 filings
Running sked IRS990
Processed 310 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 311 filings
Running sked IRS990
Processed 312 filings
Running sked IRS990
Processed 313 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 314 filings
Running sked IRS990
Processed 315 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 316 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 317 filings
Running sked IRS990
Processed 318 filings
Running sked IRS990
Processed 319 filings
Running sked IRS990
Processed 320 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 321 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 322 filings
Running sked IRS990
Proces

Running sked IRS990
Processed 490 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 491 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 492 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 493 filings
Running sked IRS990
Processed 494 filings
Running sked IRS990
Processed 495 filings
Running sked IRS990
Processed 496 filings
Running sked IRS990
Processed 497 filings
Running sked IRS990
Processed 498 filings
Running sked IRS990
Processed 499 filings
Running sked IRS990
Processed 500 filings
Running sked IRS990
Processed 501 filings
Running sked IRS990
Processed 502 filings
Running sked IRS990
Processed 503 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 504 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 505 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 506 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 507 filings
Running sked IRS990
Processed 508 filings

Processed 672 filings
Running sked IRS990
Processed 673 filings
Running sked IRS990
Processed 674 filings
Running sked IRS990
Processed 675 filings
Running sked IRS990
Processed 676 filings
Running sked IRS990
Processed 677 filings
Running sked IRS990
Processed 678 filings
Running sked IRS990
Processed 679 filings
Running sked IRS990
Processed 680 filings
Running sked IRS990
Processed 681 filings
Running sked IRS990
Processed 682 filings
Running sked IRS990
Processed 683 filings
Running sked IRS990
Processed 684 filings
Running sked IRS990
Processed 685 filings
Running sked IRS990
Processed 686 filings
Running sked IRS990
Processed 687 filings
Running sked IRS990
Processed 688 filings
Running sked IRS990
Processed 689 filings
Running sked IRS990
Processed 690 filings
Running sked IRS990PF
Processed 691 filings
Running sked IRS990
Running sked IRS990ScheduleJ
Processed 692 filings
Running sked IRS990
Processed 693 filings
Running sked IRS990
Processed 694 filings
Running sked IRS990
Pro

KeyboardInterrupt: 