# ProPublica NonProfit Tax Data Scrape

Authored by Jeff Bocek
Updated: 4/3/2024

#### Libraries

In [133]:
import requests
from bs4 import BeautifulSoup
from lxml import etree
import pandas as pd
from time import sleep

Notes for Author:

In [134]:
# The values below sit inside a bare string literal, so they are not executed.
# NOTE(review): presumably sample inputs for create_institut_url -- confirm.
'''base_url = 'https://projects.propublica.org/nonprofits/organizations/'
ein = 362167016
'''

"base_url = 'https://projects.propublica.org/nonprofits/organizations/'\nein = 362167016\n"

### Functions

In [135]:
def create_institut_url(base_url, ein):
    '''
    Build the page URL for one institution by appending its EIN to the base URL.

    Parameters
    ----------
    base_url : str
        The URL prefix shared by ProPublica nonprofit organization pages
    ein : int
        The EIN tax identification number of the nonprofit organization

    Returns
    -------
    str
        base_url immediately followed by the EIN rendered as text

    Raises
    ------
    n/a

    '''
    # The EIN is converted to text first so it can be concatenated to the prefix
    ein_suffix = str(ein)
    return base_url + ein_suffix


In [136]:
def get_soup(url):
    '''
    Get the "soup" for parsing a webpage via the BeautifulSoup library.

    The page is requested up to NUM_RETRIES times. An attempt counts as
    failed when the connection fails, the request times out, or the HTTP
    status code is anything other than 200.

    Parameters
    ----------
    url : str
        The URL of the webpage to download and parse

    Returns
    -------
    BeautifulSoup obj or str
        A BeautifulSoup object made from the parsed HTML of the response,
        or the string 'failed' when no attempt produced an HTTP 200 response

    Raises
    ------
    n/a
    '''

    NUM_RETRIES = 3
    # Errors that are worth another attempt. Timeout is listed explicitly
    # because a read timeout is not a ConnectionError and would otherwise
    # escape the retry loop.
    retryable_errors = (requests.exceptions.ConnectionError,
                        requests.exceptions.Timeout)

    # Try to connect to the website NUM_RETRIES times before reporting 'failed'
    for _ in range(NUM_RETRIES):
        try:
            # Only the request itself is guarded; parsing happens outside the try
            response = requests.get(url, timeout = 360)
        except retryable_errors:
            # Connection problem or timeout: move on to the next attempt
            continue
        # 200 is the HTTP status code for success; any other status is retried
        if response.status_code == 200:
            # Create a BeautifulSoup object with the HTML parser from the page content
            return BeautifulSoup(response.content, "html.parser")
    return 'failed'

In [137]:
def get_full990_links(soup):
    '''
    Create a list of URL links for all the years with full 990 data available

    Every table row with the class 'employee-row hide more-employees-link'
    is inspected. The first anchor with an href inside the row is turned
    into text and a fixed slice of that text is placed between the
    full-text URL prefix and the '/IRS990' suffix. Rows without such an
    anchor, or whose slice is empty, are skipped so that no malformed URL
    ends up in the result.

    Parameters
    ----------
    soup : BeautifulSoup obj
        A BeautifulSoup object made from the parsed HTML file

    Returns
    -------
    lst
        A list of URL links for all the years with full 990 data available
        (empty when no matching rows are found)

    Raises
    ------
    n/a
    '''

    url_prefix = 'https://projects.propublica.org/nonprofits/full_text/'
    url_suffix = '/IRS990'

    full990_links = []
    for row in soup.find_all('tr', class_= 'employee-row hide more-employees-link'):
        # First anchor in the row that carries an href attribute
        anchor = row.find('a', href = True)
        if anchor is None:
            # No link in this row: nothing to build a URL from
            continue
        # 18 characters at a fixed offset from the end of the anchor's HTML text.
        # NOTE(review): presumably the filing identifier; this depends on the
        # exact markup ProPublica emits -- confirm if the page layout changes.
        filing_id = str(anchor)[-77:-59]
        if not filing_id:
            # Anchor text too short to contain the expected slice
            continue
        # Make URL for full 990 for specific organization and year
        full990_links.append(url_prefix + filing_id + url_suffix)
    return full990_links

In [138]:
def create_etree(url):
    '''
    Create a parsed etree of a webpage so it can be queried with XPath

    The page is requested up to NUM_RETRIES times. An attempt counts as
    failed when the connection fails, the request times out, or the HTTP
    status code is anything other than 200.

    Parameters
    ----------
    url : str
        The URL of the webpage to download and parse

    Returns
    -------
    obj or str
        The parsed etree, or the string 'failed' when no attempt produced
        an HTTP 200 response

    Raises
    ------
    n/a
    '''

    NUM_RETRIES = 3
    # Errors that are worth another attempt. Timeout is listed explicitly
    # because a read timeout is not a ConnectionError and would otherwise
    # escape the retry loop.
    retryable_errors = (requests.exceptions.ConnectionError,
                        requests.exceptions.Timeout)

    # Try to connect to the website NUM_RETRIES times before reporting 'failed'
    for _ in range(NUM_RETRIES):
        try:
            # Only the request itself is guarded; parsing happens outside the try
            response = requests.get(url, timeout = 360)
        except retryable_errors:
            # Connection problem or timeout: move on to the next attempt
            continue
        # 200 is the HTTP status code for success; any other status is retried
        if response.status_code == 200:
            # Parse with BeautifulSoup first, then hand its serialized HTML to lxml
            soup = BeautifulSoup(response.content, "html.parser")
            return etree.HTML(str(soup))
    return 'failed'
    

In [139]:
def create_attribute_dict():
        '''
        Create dictionary of attributes with description and XPath link
        '''
        
        attributes = {
        'ein': ['EIN number',
                '//*[@id="/AppData/SubmissionHeaderAndDocument/ReturnHeader[1]/Filer[1]/EIN[1]"]', 'Full'],
        'year': ['filling year',
                '//*[@id="/AppData/SubmissionHeaderAndDocument/ReturnHeader[1]/TaxPeriodEndDt[1]"]', 'Full'],
        'name_of_org': ['name of organization',
                '//*[@id="/AppData/SubmissionHeaderAndDocument/ReturnHeader[1]/Filer[1]/BusinessName[1]/BusinessNameLine1Txt[1]"]', 'Full'],
        'another_name': ['name of organization doing buisness as',
                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/DoingBusinessAsName[1]/BusinessNameLine1Txt[1]"]', 'Full'],
        'employees': ['total number of individuals employed in calender year',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/TotalEmployeeCnt[1]"]', 'Full'],
        'volunteers': ['total number of volunteers estimated',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/TotalVolunteersCnt[1]"]', 'Full'],
        'board_members': ['total number of board members',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/GoverningBodyVotingMembersCnt[1]"]', 'Full'],
        'contrib_grants': ['Contributions and grants (Part VIII, line 1h)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/CYContributionsGrantsAmt[1]"]', 'Full'],
        'program_revenue': ['Program service revenus (Part VIII, line 2g)',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/CYProgramServiceRevenueAmt[1]"]', 'Full'],
        'invest_income': ['Investment income (Part VIII) column (A), lines 3, 4, and 7d )',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/CYInvestmentIncomeAmt[1]"]', 'Full'],
        'other_revenue': ['(Part VIII, column (A), lines 5, 6d, 8c, 9c, 10c, and 11e)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/CYOtherRevenueAmt[1]"]', 'Full'],
        'total_revenue': ['—add lines 8 through 11 (must equal Part VIII, column (A), line 12)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/CYTotalRevenueAmt[1]"]', 'Full'],
        'grants_paid': ['Grants and similar amounts paid (Part IX, column (A), lines 1-3 )',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/CYGrantsAndSimilarPaidAmt[1]"]', 'Full'],
        'salaries_benefits': [' Salaries, other compensation, benefits for employees (Part IX, column (A), lines 5-10)',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/CYSalariesCompEmpBnftPaidAmt[1]"]', 'Full'],
        'fundrais_fees': ['professional fundraising fees (Part IX, column (A), line 11e)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/CYTotalProfFndrsngExpnsAmt[1]"]', 'Full'],
        'fundrais_expenses': ['Total fundraising expenses (Part IX, column (D), line 25)',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/CYTotalFundraisingExpenseAmt[1]"]', 'Full'],
        'other_expenses': ['Other expenses (Part IX, column (A), lines 11a-11d, 11f-24e)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/CYOtherExpensesAmt[1]"]', 'Full'],
        'total_expenses': ['Add lines 13-17 (must equal Part IX, column (A), line 25)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/CYTotalExpensesAmt[1]"]', 'Full'],
        'rev_less_exp': ['Revenue less expenses. (Profit), Subtract line 18 from line 12',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/CYRevenuesLessExpensesAmt[1]"]', 'Full'],
        'total_assets': ['total assets (part X, line 16)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/TotalAssetsEOYAmt[1]"]', 'Full'],
        'total_liabil': ['total liabilities (part X line 26)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/TotalLiabilitiesEOYAmt[1]"]', 'Full'],
        'net_assets': ['net assets or fund balances total_assets-total_liabil',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/NetAssetsOrFundBalancesEOYAmt[1]"]', 'Full'],
        'mission': ['brief description of the organizations mission', 
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/MissionDesc[1]"]', 'Full'],
        'program1_expen': ['program service accomplishment measured by expenses',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ExpenseAmt[1]"]', 'Full'],
        'program1_grants': ['amount of grant money included in program1_expen',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/GrantAmt[1]"]', 'Full'],
        'program1_revenue': ['revenue from program service',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/RevenueAmt[1]"]', 'Full'],
        'program1_text': ['text explanation of program service', 
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/Desc[1]"]', 'Full'],
        'program2_expen': ['program service accomplishment measured by expenses',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgSrvcAccomActy2Grp[1]/ExpenseAmt[1]"]', 'Full'],
        'program2_grants': ['amount of grant money included in program1_expen',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgSrvcAccomActy2Grp[1]/GrantAmt[1]"]', 'Full'],
        'program2_revenue': ['revenue from program service',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgSrvcAccomActy2Grp[1]/RevenueAmt[1]"]', 'Full'],
        'program2_text': ['text explanation of program service', 
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgSrvcAccomActy2Grp[1]/Desc[1]"]', 'Full'],
        'program3_expen': ['program service accomplishment measured by expenses',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgSrvcAccomActy3Grp[1]/ExpenseAmt[1]"]', 'Full'],
        'program3_grants': ['amount of grant money included in program1_expen',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgSrvcAccomActy3Grp[1]/GrantAmt[1]"]', 'Full'],
        'program3_revenue': ['revenue from program service',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgSrvcAccomActy3Grp[1]/RevenueAmt[1]"]', 'Full'],
        'program3_text': ['text explanation of program service', 
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgSrvcAccomActy3Grp[1]/Desc[1]"]', 'Full'],
        'total_program_exp': ['total program service expenses',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/TotalProgramServiceExpensesAmt[1]"]', 'Full'],
        'executive1_title': ['top paid executive title',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/Form990PartVIISectionAGrp[1]/TitleTxt[1]"]', 'Full'],
        'executive1_sal': ['salary of top executive',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/Form990PartVIISectionAGrp[1]/ReportableCompFromOrgAmt[1]"]', 'Full'],
        'executive2_title': ['second top paid exectuve title',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/Form990PartVIISectionAGrp[2]/TitleTxt[1]"]', 'Full'],
        'executive2_sal': ['second highest salary of executive',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/Form990PartVIISectionAGrp[2]/ReportableCompFromOrgAmt[1]"]', 'Full'],
        'executive_team_sal': ['total amount of executive team salaries including former employees',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/TotalReportableCompFromOrgAmt[1]"]', 'Full'],
        '100thou_club_employee': ['number of employees that earn more than $100,000',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/IndivRcvdGreaterThan100KCnt[1]"]', 'Full'],
        'indepen_contract1_descr': ['description of services for independant contractor',
                                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ContractorCompensationGrp[1]/ServicesDesc[1]"]', 'Full'],
        'indepen_contract1_comp': ['compensatoin for contractor',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ContractorCompensationGrp[1]/CompensationAmt[1]"]', 'Full'],
        'indepen_contract2_descr': ['description of services for independant contractor',
                                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ContractorCompensationGrp[2]/ServicesDesc[1]"]', 'Full'],
        'indepen_contract2_comp': ['compensatoin for contractor',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ContractorCompensationGrp[2]/CompensationAmt[1]"]', 'Full'],
        'indepen_contract3_descr': ['description of services for independant contractor',
                                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ContractorCompensationGrp[3]/ServicesDesc[1]"]', 'Full'],
        'indepen_contract3_comp': ['compensatoin for contractor',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ContractorCompensationGrp[3]/CompensationAmt[1]"]', 'Full'],
        'indepen_contract4_descr': ['description of services for independant contractor',
                                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ContractorCompensationGrp[4]/ServicesDesc[1]"]', 'Full'],
        'indepen_contract4_comp': ['compensatoin for contractor',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ContractorCompensationGrp[4]/CompensationAmt[1]"]', 'Full'],
        'indepen_contract5_descr': ['description of services for independant contractor',
                                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ContractorCompensationGrp[5]/ServicesDesc[1]"]', 'Full'],
        'indepen_contract5_comp': ['compensatoin for contractor',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ContractorCompensationGrp[5]/CompensationAmt[1]"]', 'Full'],
        '100thou_club_contractr': ['Total number of independent contractors (including) who received more than $100,000 of compensation from the organization',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/CntrctRcvdGreaterThan100KCnt[1]"]', 'Full'],
        'membership_dues_revenue': ['PartIII Membership dues (part of contrib_grants)',
                                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/MembershipDuesAmt[1]"]', 'Full'],
        'fundrais_event_rev': ['fundraising event revenue (part of contrib_grants)',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/FundraisingAmt[1]"]', 'Full'],
        'gov_grants': ['goverment grants (contributions) (part of contrib_grants)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/GovernmentGrantsAmt[1]"]', 'Full'],
        'other_grant_gift': ['All other contributions, gifts, grants, and similar amounts not included above (part of contrib_grants)',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/AllOtherContributionsAmt[1]"]', 'Full'],
        'program_serv_rev1': ['program service revenue type (part of program_revenue)',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgramServiceRevenueGrp[1]/Desc[1]"]', 'Full'],
        'program_serv_rev1_amt': ['program service revenue amount (part of program_revenue)',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgramServiceRevenueGrp[1]/TotalRevenueColumnAmt[1]"]', 'Full'],
        'program_serv_rev2': ['program service revenue type (part of program_revenue)',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgramServiceRevenueGrp[2]/Desc[1]"]', 'Full'],
        'program_serv_rev2_amt': ['program service revenue amount (part of program_revenue)',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgramServiceRevenueGrp[2]/TotalRevenueColumnAmt[1]"]', 'Full'],     
        'program_serv_rev3': ['program service revenue type (part of program_revenue)',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgramServiceRevenueGrp[3]/Desc[1]"]', 'Full'],
        'program_serv_rev3_amt': ['program service revenue amount (part of program_revenue)',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgramServiceRevenueGrp[3]/TotalRevenueColumnAmt[1]"]', 'Full'],
        'program_serv_rev4': ['program service revenue type (part of program_revenue)',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgramServiceRevenueGrp[4]/Desc[1]"]', 'Full'],
        'program_serv_rev4_amt': ['program service revenue amount (part of program_revenue)',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgramServiceRevenueGrp[4]/TotalRevenueColumnAmt[1]"]', 'Full'],
        'program_serv_rev5': ['program service revenue type (part of program_revenue)',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgramServiceRevenueGrp[5]/Desc[1]"]', 'Full'],
        'program_serv_rev5_amt': ['program service revenue amount (part of program_revenue)',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ProgramServiceRevenueGrp[5]/TotalRevenueColumnAmt[1]"]', 'Full'],
        'domestic_grants': ['Grants and other assistance given to domestic organizations and domestic governments above $5,000',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/GrantsToDomesticOrgsGrp[1]/TotalAmt[1]"]', 'Full'],
        'foreign_grants': ['Grants and other assistance given to foreign organizations, foreign governments, and foreign individuals above $5,000',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ForeignGrantsGrp[1]/TotalAmt[1]"]', 'Full'],
        'region1': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[1]/RegionTxt[1]"]', 'F'],
        'region1_reason': ['Activities conducted in region (by type) (such as, fundraising, program services, investments, grants to recipients located in the region)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[1]/TypeOfActivitiesConductedTxt[1]"]', 'F'],
        'region1_amt': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[1]/RegionTotalExpendituresAmt[1]"]', 'F'],
        'region2': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[2]/RegionTxt[1]"]', 'F'],
        'region2_reason': ['Activities conducted in region (by type) (such as, fundraising, program services, investments, grants to recipients located in the region)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[2]/TypeOfActivitiesConductedTxt[1]"]', 'F'],
        'region2_amt': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[2]/RegionTotalExpendituresAmt[1]"]', 'F'],
        'region3': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[3]/RegionTxt[1]"]', 'F'],
        'region3_reason': ['Activities conducted in region (by type) (such as, fundraising, program services, investments, grants to recipients located in the region)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[3]/TypeOfActivitiesConductedTxt[1]"]', 'F'],
        'region3_amt': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[3]/RegionTotalExpendituresAmt[1]"]', 'F'],
        'region4': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[4]/RegionTxt[1]"]', 'F'],
        'region4_reason': ['Activities conducted in region (by type) (such as, fundraising, program services, investments, grants to recipients located in the region)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[4]/TypeOfActivitiesConductedTxt[1]"]', 'F'],
        'region4_amt': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[4]/RegionTotalExpendituresAmt[1]"]', 'F'],
        'region5': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[5]/RegionTxt[1]"]', 'F'],
        'region5_reason': ['Activities conducted in region (by type) (such as, fundraising, program services, investments, grants to recipients located in the region)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[5]/TypeOfActivitiesConductedTxt[1]"]', 'F'],
        'region5_amt': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[5]/RegionTotalExpendituresAmt[1]"]', 'F'],
        'region6': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[6]/RegionTxt[1]"]', 'F'],
        'region6_reason': ['Activities conducted in region (by type) (such as, fundraising, program services, investments, grants to recipients located in the region)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[6]/TypeOfActivitiesConductedTxt[1]"]', 'F'],
        'region6_amt': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[6]/RegionTotalExpendituresAmt[1]"]', 'F'],
        'region7': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[7]/RegionTxt[1]"]', 'F'],
        'region7_reason': ['Activities conducted in region (by type) (such as, fundraising, program services, investments, grants to recipients located in the region)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[7]/TypeOfActivitiesConductedTxt[1]"]', 'F'],
        'region7_amt': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[7]/RegionTotalExpendituresAmt[1]"]', 'F'],
        'region8': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[8]/RegionTxt[1]"]', 'F'],
        'region8_reason': ['Activities conducted in region (by type) (such as, fundraising, program services, investments, grants to recipients located in the region)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[8]/TypeOfActivitiesConductedTxt[1]"]', 'F'],
        'region8_amt': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[8]/RegionTotalExpendituresAmt[1]"]', 'F'],
        'region9': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[9]/RegionTxt[1]"]', 'F'],
        'region9_reason': ['Activities conducted in region (by type) (such as, fundraising, program services, investments, grants to recipients located in the region)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[9]/TypeOfActivitiesConductedTxt[1]"]', 'F'],
        'region9_amt': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[9]/RegionTotalExpendituresAmt[1]"]', 'F'],
        'region10': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[10]/RegionTxt[1]"]', 'F'],
        'region10_reason': ['Activities conducted in region (by type) (such as, fundraising, program services, investments, grants to recipients located in the region)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[10]/TypeOfActivitiesConductedTxt[1]"]', 'F'],
        'region10_amt': ['region of international expenditures',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleF[1]/AccountActivitiesOutsideUSGrp[10]/RegionTotalExpendituresAmt[1]"]', 'F'],
        'current_exec_team_sals': ['Compensation of current officers, directors, trustees, and key employees',
                                '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/CompCurrentOfcrDirectorsGrp[1]/TotalAmt[1]"]', 'Full'],
        'other_sals': ['other salaries and wages',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/OtherSalariesAndWagesGrp[1]/TotalAmt[1]"]', 'Full'],
        'legal_exp': ['legal expensis',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/FeesForServicesLegalGrp[1]/TotalAmt[1]"]', 'Full'],
        'lobbying_exp': ['lobbying expenses',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/FeesForServicesLobbyingGrp[1]/TotalAmt[1]"]', 'Full'],
        'advertis_exp': ['Advertising and promotion',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/AdvertisingGrp[1]/TotalAmt[1]"]', 'Full'],
        'i_t__exp': ['Information Technology expenses',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/InformationTechnologyGrp[1]/TotalAmt[1]"]', 'Full'],
        'insurance_exp': ['insurance expenses',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/InsuranceGrp[1]/TotalAmt[1]"]', 'Full'],
        'travel_exp': ['travel expenses',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/TravelGrp[1]/TotalAmt[1]"]', 'Full'],
        'conf_meeting_exp': ['conference, conventions, and meeting expenses',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/ConferencesMeetingsGrp[1]/TotalAmt[1]"]', 'Full'],
        'lobby_grants': ['grants to other organizations for lobbying purposes',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleC[1]/GrantsOtherOrganizationsAmt[1]"]', 'C'],
        'legislator_lobby': ['direct contact with legislators, their staffs, government officials, or a lgesilative body',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleC[1]/DirectContactLegislatorsAmt[1]"]', 'C'],
        'quasi_endow': ['estimated percentage of the current year end balance held as board designated or quasi-endowment',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleD[1]/BoardDesignatedBalanceEOYPct[1]"]', 'D'],
        'permanent_endow': ['estimated percentage of the current year end balance held as permanent endowment',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleD[1]/PrmnntEndowmentBalanceEOYPct[1]"]', 'D'],
        'term_endow': ['estimated percentage of the current year end balance held as term endowment',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleD[1]/TermEndowmentBalanceEOYPct[1]"]', 'D'],
        'fundevnt1': ['fundraising event name',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleG[1]/FundraisingEventInformationGrp[1]/Event1Nm[1]"]', 'G'],    
        'fundevnt1_gross': ['fundraising event gross receipts',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleG[1]/FundraisingEventInformationGrp[1]/GrossReceiptsEvent1Amt[1]"]', 'G'],
        'fundevnt1_contrib': ['fundraising event Less: Contributions',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleG[1]/FundraisingEventInformationGrp[1]/CharitableContriEvent1Amt[1]"]', 'G'],
        'fundevnt1_exp1_prize': ['fundraising event cash prizes; expense',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleG[1]/FundraisingEventInformationGrp[1]/CashPrizesEvent1Amt[1]"]', 'G'],
        'fundevnt1_exp2_prize': ['fundraising event non cash prizes; expense',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleG[1]/FundraisingEventInformationGrp[1]/NonCashPrizesEvent1Amt[1]"]', 'G'],
        'fundevnt1_exp3_facilty': ['fundraising event rent/facility costs',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleG[1]/FundraisingEventInformationGrp[1]/RentFacilityCostsEvent1Amt[1]"]', 'G'],
        'fundevnt1_exp4_food': ['fundraising event food and drink cost',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleG[1]/FundraisingEventInformationGrp[1]/FoodAndBeverageEvent1Amt[1]"]', 'G'],
        'fundevnt1_exp5_ent': ['fundraising event entertainment costs',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleG[1]/FundraisingEventInformationGrp[1]/EntertainmentEvent1Amt[1]"]', 'G'],
        'fundevnt1_exp6_oth': ['fundraising event other direct expenses',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleG[1]/FundraisingEventInformationGrp[1]/OtherDirectExpensesEvent1Amt[1]"]', 'G'],
        'fundevnts_gross_tot': ['total fundraising events gross receipts',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleG[1]/FundraisingEventInformationGrp[1]/GrossReceiptsTotalAmt[1]"]', 'G'],
        'fundevnts_contrib_tot': ['total fundraising events contributions',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleG[1]/FundraisingEventInformationGrp[1]/CharitableContributionsTotAmt[1]"]', 'G'],
        'fundevnts_exp_tot': ['total fundraising events expenses',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleG[1]/FundraisingEventInformationGrp[1]/DirectExpenseSummaryEventsAmt[1]"]', 'G'],
        'gov_grants': ['goverment grants (contributions) (part of contrib_grants)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990[1]/GovernmentGrantsAmt[1]"]', 'Full'],
        'exec1_title': ['exectutive who earns move than $150k on Schedule J part II listed title)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[1]/TitleTxt[1]"]', 'J'],
        'exec1_base': ['executive base compensation)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[1]/BaseCompensationFilingOrgAmt[1]"]', 'J'],
        'exec1_bonus': ['executive bonus and incentive compensation)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[1]/BonusFilingOrganizationAmount[1]"]', 'J'],
        'exec1_nontax': ['executive nontaxable benefits)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[1]/NontaxableBenefitsFilingOrgAmt[1]"]', 'J'],
        'exec1_total': ['total compensation for executive',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[1]/TotalCompensationFilingOrgAmt[1]"]', 'J'],
        'exec2_title': ['exectutive who earns move than $150k on Schedule J part II listed title)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[2]/TitleTxt[1]"]', 'J'],
        'exec2_base': ['executive base compensation)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[2]/BaseCompensationFilingOrgAmt[1]"]', 'J'],
        'exec2_bonus': ['executive bonus and incentive compensation)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[2]/BonusFilingOrganizationAmount[1]"]', 'J'],
        'exec2_nontax': ['executive nontaxable benefits)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[2]/NontaxableBenefitsFilingOrgAmt[1]"]', 'J'],
        'exec2_total': ['total compensation for executive',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[2]/TotalCompensationFilingOrgAmt[1]"]', 'J'],
        'exec3_title': ['exectutive who earns move than $150k on Schedule J part II listed title)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[3]/TitleTxt[1]"]', 'J'],
        'exec3_base': ['executive base compensation)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[3]/BaseCompensationFilingOrgAmt[1]"]', 'J'],
        'exec3_bonus': ['executive bonus and incentive compensation)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[3]/BonusFilingOrganizationAmount[1]"]', 'J'],
        'exec3_nontax': ['executive nontaxable benefits)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[3]/NontaxableBenefitsFilingOrgAmt[1]"]', 'J'],
        'exec3_total': ['total compensation for executive',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[3]/TotalCompensationFilingOrgAmt[1]"]', 'J'],
        'exec4_title': ['exectutive who earns move than $150k on Schedule J part II listed title)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[4]/TitleTxt[1]"]', 'J'],
        'exec4_base': ['executive base compensation)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[4]/BaseCompensationFilingOrgAmt[1]"]', 'J'],
        'exec4_bonus': ['executive bonus and incentive compensation)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[4]/BonusFilingOrganizationAmount[1]"]', 'J'],
        'exec4_nontax': ['executive nontaxable benefits)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[4]/NontaxableBenefitsFilingOrgAmt[1]"]', 'J'],
        'exec4_total': ['total compensation for executive',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[4]/TotalCompensationFilingOrgAmt[1]"]', 'J'],
        'exec5_title': ['exectutive who earns move than $150k on Schedule J part II listed title)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[5]/TitleTxt[1]"]', 'J'],
        'exec5_base': ['executive base compensation)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[5]/BaseCompensationFilingOrgAmt[1]"]', 'J'],
        'exec5_bonus': ['executive bonus and incentive compensation)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[5]/BonusFilingOrganizationAmount[1]"]', 'J'],
        'exec5_nontax': ['executive nontaxable benefits)',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[5]/NontaxableBenefitsFilingOrgAmt[1]"]', 'J'],
        'exec5_total': ['total compensation for executive',
                        '//*[@id="/AppData/SubmissionHeaderAndDocument/SubmissionDocument/IRS990ScheduleJ[1]/RltdOrgOfficerTrstKeyEmplGrp[5]/TotalCompensationFilingOrgAmt[1]"]', 'J']
        }
        return attributes

In [140]:
def init_results_dict(attributes):
    '''
    Build the empty container that scraped values are accumulated into.

    Parameters
    ----------
    attributes : dict
        Feature definitions keyed by feature name (only the keys are used here)

    Return
    ------
    dict
        A new dictionary mapping every feature name to its own empty list

    Raises
    ------
    n/a
    '''
    # The comprehension creates a separate list object for every feature
    return {feature: [] for feature in attributes.keys()}
    

In [141]:
def get_attr_value(schedule_trees, xpath_str, xpath_tree):
    '''
    Get attribute value from parsed tree

    Parameters
    ----------
    schedule_trees : dict
        A dictionary of different etrees for different tax schedule documents (from create_sched_etrees());
        each value is a list whose second item, when present, is the parsed tree
    xpath_str : str
        The XPath string for the location of the value
    xpath_tree : str
        Key of the tree to use (which tax schedule document)

    Return
    ------
    str or None
        Text of the first element matched by the XPath, or None when there is no usable
        tree for the requested schedule or the XPath matches nothing

    Raises
    ------
    n/a
    '''
    try:
        # Get the specific etree needed from the dictionary
        tree = schedule_trees[xpath_tree][1]
    except (KeyError, IndexError):
        # Unknown schedule key, or no tree was stored for this schedule
        return None
    # 'failed' is the sentinel create_sched_etrees() compares against; it is not a tree
    if tree is None or tree == 'failed':
        return None
    matches = tree.xpath(xpath_str)
    # The field is absent from this filing
    if not matches:
        return None
    return matches[0].text

In [142]:
def store_attr_values(schedule_trees, attributes, full990_values):
    '''
    Store attribute value from parsed tree in dictionary

    Parameters
    ----------
    schedule_trees : dict
        A dictionary of different etrees for different tax schedule documents (from create_sched_etrees())
    attributes : dict
        A dictionary of attributes/features; each value is a list holding the description,
        the XPath string, and the key of the schedule tree to search
    full990_values : dict
        A dictionary with the feature names as the keys and lists of collected values as the values

    Return
    ------
    dict
        The same dictionary, with one more value appended to every feature's list
        (None when the value could not be read)

    Raises
    ------
    n/a
    '''
    # For each feature...
    for key, spec in attributes.items():
        # The XPath string for the location of the value
        xpath_str = spec[1]
        # Which tree to use (which tax schedule document)
        xpath_tree = spec[2]
        try:
            value = get_attr_value(schedule_trees, xpath_str, xpath_tree)
        except Exception:
            # Best effort: any lookup error is recorded as a missing value.
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # still stop a long-running scrape.
            value = None
        # Always append so every feature list keeps the same length
        full990_values[key].append(value)
    return full990_values

In [143]:
def dict_to_dataframe(dict1):
    '''
    Convert a dictionary into a pandas DataFrame

    Parameters
    ----------
    dict1 : dict
        The dictionary to convert; it is passed unchanged to DataFrame.from_dict

    Return
    ------
    obj
        The resulting pandas DataFrame

    Raises
    ------
    n/a
    '''
    return pd.DataFrame.from_dict(dict1)

In [144]:
def create_sched_etrees(url):
    '''
    Create an etree for the full 990 and for Schedule C, D, F, G, and J and store them in a dictionary

    Parameters
    ----------
    url : str
        The base url for the nonprofit organization's filing; each schedule's
        suffix is appended to it

    Return
    ------
    dict
        Dictionary keyed by schedule label. Each value is a list of [url suffix, etree];
        when create_etree() returns 'failed' the list holds only the url suffix

    Raises
    ------
    n/a
    '''
    schedule_trees = {'Full': [''], 'C': ['ScheduleC'], 'D': ['ScheduleD'], 'F': ['ScheduleF'], 'G': ['ScheduleG'], 'J': ['ScheduleJ']}
    for entry in schedule_trees.values():
        url_sch = url + entry[0]
        tree = create_etree(url_sch)
        # If there is no tree don't store anything, so the 'failed' sentinel
        # is never handed on as if it were a parsed document
        if tree == 'failed':
            continue
        # Add tree to dictionary
        entry.append(tree)
    return schedule_trees

In [145]:
def get_institut_data(full990_links, attributes, full990_values):
    '''
    Collect the feature values for every available full 990 filing of one institution

    Parameters
    ----------
    full990_links : lst
        URL links, one per year with full 990 data available
    attributes : dict
        A dictionary of attributes/features with description and XPath link
    full990_values : dict
        A dictionary with the feature name as the key and a list of values as the value

    Return
    ------
    dict
        The results dictionary after the values of every listed filing have been stored

    Raises
    ------
    n/a
    '''
    for filing_url in full990_links:
        # Parse the full 990 and its schedules for this year
        trees_by_schedule = create_sched_etrees(filing_url)
        # Record this year's value for every feature
        full990_values = store_attr_values(trees_by_schedule, attributes, full990_values)
        # Pause between each year of 990s
        sleep(15)
    return full990_values

In [146]:
def ProPublica_scrape(ein, attributes, full990_values):
    '''
    Scrape selected tax-form data for one institution from ProPublica into the results dictionary

    Parameters
    ----------
    ein : str
        EIN number for nonprofit organization
    attributes : dict
        A dictionary of attributes/features with description and XPath link
    full990_values : dict
        A dictionary with the feature name as the key and a list of values as the value

    Return
    ------
    dict
        The results dictionary with this institution's values added for the years available;
        returned unchanged when get_soup() reports 'failed'

    Raises
    ------
    n/a
    '''
    # Build the institution's page address from the ProPublica base URL and the EIN
    institution_url = create_institut_url(
        'https://projects.propublica.org/nonprofits/organizations/', ein
    )
    page = get_soup(institution_url)
    # Nothing to add when the page could not be fetched
    if page == 'failed':
        return full990_values
    # Links to every year with full 990 data available
    yearly_links = get_full990_links(page)
    return get_institut_data(yearly_links, attributes, full990_values)

In [147]:
def load_EIN_df():
    '''
    Read the EIN numbers from the Excel workbook

    Parameters
    ----------
    n/a

    Return
    ------
    obj
        Dataframe holding only the 'EIN#' column of sheet 'List' in 'USA_zoos.xlsx'

    Raises
    ------
    n/a
    '''
    # usecols restricts the load to the column containing the EIN numbers
    return pd.read_excel('USA_zoos.xlsx', sheet_name='List', usecols=['EIN#'])

In [148]:
def get_ein_list():
    '''
    Transform EIN numbers from df column to list without NAs

    Parameters
    ----------
    n/a

    Return
    ------
    lst
        A list of EIN numbers as strings with the dashes removed

    Raises
    ------
    n/a
    '''
    # Load the dataframe containing the EIN numbers
    df = load_EIN_df()
    # Remove NAs first so that only real entries are formatted
    eins = df.loc[df['EIN#'].notnull(), 'EIN#']
    ein_list = []
    for raw in eins:
        # A numeric cell in a column that also has blanks is read as a float
        # (e.g. 362167016.0); drop the spurious fractional part
        if isinstance(raw, float) and raw.is_integer():
            raw = int(raw)
        # str() keeps numeric cells, which the .str accessor would have turned
        # into NaN and silently dropped; then remove the dashes
        ein_list.append(str(raw).replace('-', ''))
    return ein_list

In [149]:
def main():
    '''
    Scrape tax data for the selected nonprofit institutions from ProPublica's website and save it to an Excel file

    Parameters
    ----------
    n/a

    Return
    ------
    obj
        Dataframe of all the nonprofit institutions with their selected tax information
    '''
    eins = get_ein_list()
    # Feature definitions: description and XPath link for every attribute
    feature_specs = create_attribute_dict()
    # One empty list per feature, filled as institutions are scraped
    scraped = init_results_dict(feature_specs)
    # For testing, slice the list (e.g. eins[:#])
    for ein in eins:
        scraped = ProPublica_scrape(ein, feature_specs, scraped)
    results_df = dict_to_dataframe(scraped)
    # Save dataframe to excel file
    results_df.to_excel('Zoo990data_4_4_24.xlsx')
    return results_df
     

In [150]:
# Run the whole scrape. main() returns the results as a DataFrame and also
# writes them to an Excel file.
full990_values = main()

In [151]:
# Show the scraped results (the DataFrame returned by main()) as the cell output
full990_values

Unnamed: 0,ein,year,name_of_org,another_name,employees,volunteers,board_members,contrib_grants,program_revenue,invest_income,...,exec4_title,exec4_base,exec4_bonus,exec4_nontax,exec4_total,exec5_title,exec5_base,exec5_bonus,exec5_nontax,exec5_total
0,34-6003866,12-31-2022,AKRON ZOOLOGICAL PARK,,328,189,30,16350981,4724598,37980,...,VICE-PRESIDENT LIFE SCIENCES (THRU 1,144148,14384,10764,181270,CHIEF DEVELOPMENT OFFICER & SR VP,138510,21895,1760,180339
1,34-6003866,12-31-2021,AKRON ZOOLOGICAL PARK,,292,170,29,14557065,4282244,2562,...,CHIEF DEVELOPMENT OFFICER & SR VP,130323,13333,1556,159276,,,,,
2,34-6003866,12-31-2020,AKRON ZOOLOGICAL PARK,,254,223,34,10221507,1933590,104868,...,VP FINANCE,120362,14375,23576,171940,"VP OF DEVELOPMENT, CDO",132016,14375,1450,161734
3,34-6003866,12-31-2019,AKRON ZOOLOGICAL PARK,,338,186,31,11441724,3896881,-265007,...,VP FINANCE,106760,14025,20656,154386,,,,,
4,34-6003866,12-31-2018,AKRON ZOOLOGICAL PARK,,301,174,32,10500376,3569203,148466,...,VP FINANCE,102519,13900,23055,152036,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,36-2167918,12-31-2017,SHEDD AQUARIUM SOCIETY,,746,807,55,19137568,37496688,2105995,...,SVP BOARD LIASON & ORGANIZATIONAL GO,176615,25000,10540,248397,"SVP - MARKETING, GUEST EXPERIENCE, &",165844,15000,4677,214626
557,36-2167918,12-31-2016,SHEDD AQUARIUM SOCIETY,,717,844,56,16712405,37287973,-1257975,...,COO/EXECUTIVE VP,264048,45000,19771,358873,FORMER EXECUTIVE VICE PRESIDENT,199496,20000,11524,255909
558,36-2167918,12-31-2015,SHEDD AQUARIUM SOCIETY,,666,858,56,20351694,37535864,859808,...,Executive Vice President,213002,25000,20367,290561,Senior Vice President,178338,20000,7186,237370
559,36-2167918,12-31-2014,SHEDD AQUARIUM SOCIETY,,670,966,53,16884628,35998442,28098506,...,EXECUTIVE VICE PRESIDENT,230513,25000,19891,308066,EXECUTIVE VICE PRESIDENT,187692,25000,8023,228461
