In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dask import dataframe as dd
import os

In [None]:
# Import state FIPS codes and abbreviations
from datetime import datetime
import json
import requests

# Locate the root folder of the replication package from the current working
# directory. `str.index` raises ValueError if the notebook is run from outside
# the package, which is the desired loud failure.
path = os.getcwd()
_root_name = "GrahamKnittel_ECF_PNAS_ReplicationMaterials"
index = path.index(_root_name)
subpath = path[:index + len(_root_name)]

# Build every data path with os.path.join rather than hard-coded "\" separators:
# the backslash literals were invalid escape sequences (SyntaxWarning on recent
# Python versions) and only resolved on Windows.
_emp_data_dir = os.path.join(subpath, "Data", "empData")

# State abbreviations -> FIPS codes (FIPS kept as strings to preserve leading zeros)
stateFIPS = pd.read_csv(os.path.join(_emp_data_dir, "Temp", "stateFIPS.csv"),
                        dtype={'FIPS': str},
                        index_col='Abbr')

# Import geographies used by LEHD
geographies = pd.read_csv(os.path.join(_emp_data_dir, "Input", "label_geography.csv"),
                          dtype={'geography': str})

# Import industries used by LEHD
industries = pd.read_csv(os.path.join(_emp_data_dir, "Input", "label_industry.csv"),
                         dtype={'industry': str, 'ind_level': str})

# Load in date coverage information and attach each state's FIPS code
# (left join on the state abbreviation, which is the index of stateFIPS).
date_coverage = pd.read_csv(os.path.join(_emp_data_dir, "Input", "date_coverage.csv"))
date_coverage = pd.merge(date_coverage,
                         stateFIPS.loc[:, ['FIPS']],
                         how='left',
                         left_on='State',
                         right_index=True
                         )

# Set-up
# Census API key. Set the CENSUS_API_KEY environment variable to use your own
# activated key; otherwise the key shipped with the notebook is used.
# NOTE(review): a key committed to source control should be treated as public —
# consider rotating it and relying on the environment variable only.
key = os.environ.get('CENSUS_API_KEY', '1b5678a1a374e17f9c32c8981fbe81d8f437edc7')

# Specify variables for API call
endpoints = ['sa?', 'se?', 'rh?']
demo = ['sex', 'agegrp', 'race', 'ethnicity', 'education']

In [None]:
# ------------- DEFINE FUNCTION TO PULL LEHD DATA FROM CENSUS API -------------#
def _naics_range_prefixes(naics):
    '''
    Return the 2-digit sector prefixes covered by a ranged NAICS code of the form 'NN-NN'
    (e.g. '31-33' -> ['31', '32', '33']), or None when naics is not such a range.
    '''
    if not isinstance(naics, str):
        return None
    low, dash, high = naics.partition('-')
    is_range = (dash == '-' and len(low) == 2 and len(high) == 2
                and low.isdecimal() and high.isdecimal() and int(low) <= int(high))
    if not is_range:
        return None
    return [f'{sector:02d}' for sector in range(int(low), int(high) + 1)]


def _lehd_industry_codes(naics, naics_granularity):
    '''
    Return the list of LEHD industry codes at level naics_granularity that fall under naics.
    '''
    # '00' is requested as-is, regardless of granularity.
    if naics == '00':
        return ['00']

    codes = industries.loc[industries['ind_level'] == naics_granularity, 'industry']
    prefixes = _naics_range_prefixes(naics)
    if prefixes is not None:
        # Ranged sectors ('31-33', '44-45', ...): match on any 2-digit sector in the range.
        keep = codes.str[:2].isin(prefixes)
    else:
        keep = codes.str.startswith(naics)
    return list(codes[keep])


def _latest_available_year(state, year):
    '''
    Return year, capped at the year of the state's 'End Quarter' entry in date_coverage.
    '''
    coverage = date_coverage.loc[date_coverage['FIPS'] == state, 'End Quarter']
    if coverage.empty:
        raise KeyError(f'No date coverage information for state FIPS {state}.')
    last_year = coverage.iloc[0][:4]
    return last_year if int(last_year) < int(year) else year


def getLEHDemp(year, naics_granularity, naics, states='all', endpoint='sa?', demos=demo[:2], write_to_csv=False):
    '''
    Returns dataframe of county-level LEHD employment data for specified NAICS granularity and subsectors.

    One Census QWI API request is made per (state, industry) pair. Quarterly employment ('Emp')
    is averaged over the quarters returned, and only the rows where demos[0] == '0' and
    demos[1] == 'A00' are kept.

    Parameters:
    ----------------------
    year (str) - Year to pull data for. If a state's data coverage ends before this year, the last covered year is requested for that state instead.
    naics_granularity (str) - Single digit number indicating the NAICS code granularity of the employment data desired.
    naics (str) - The NAICS subsector to pull employment data for. If len(naics) < naics_granularity, will return employment data for all NAICS subsectors within the specified naics industry with a naics granularity equal to that specified by naics_granularity. E.g. if naics_granularity is '3', and naics is '21', then data will be returned for NAICS 211, 212, etc. Ranged sectors of the form 'NN-NN' (e.g. '31-33', '44-45') are also accepted.
    states (str or list) - default 'all'. Either 'all' to return nationwide data, a single 2-char state abbreviation, or a list of 2-char state abbreviations (characters must be uppercase).
    endpoint (str) - default 'sa?'. QWI endpoint segment inserted into the request URL (including the trailing '?').
    demos (list of str) - Demographic variables requested from the API. The first two entries are used as grouping keys and for the final filter.
    write_to_csv (boolean, default False) - Specifies whether to write the results to a csv file. Recommended when pulling a lot of data, to avoid long computation times in future by just reading data from csv file.

    Returns:
    ----------------------
    pandas.DataFrame with one row per state/county/year/industry, the mean 'Emp', and a 'FIPS' column (state + county codes).

    Raises:
    ----------------------
    TypeError - if year, naics_granularity or naics are badly formatted.
    ValueError - if the API answers with HTTP 400, or if no data at all was returned.
    '''

    # Check formats of function inputs
    if not isinstance(year, str):
        raise TypeError('year must be a string.')
    if (not isinstance(naics_granularity, str) or len(naics_granularity) != 1
            or not naics_granularity.isdecimal()):
        raise TypeError(
            'naics_granularity must be a single-digit number in string format.')
    if _naics_range_prefixes(naics) is None:
        if not isinstance(naics, str) or len(naics) > int(naics_granularity):
            raise TypeError(
                'naics must be a string of length <= naics_granularity')

    # Determine list of industries to consider
    industry_list = _lehd_industry_codes(naics, naics_granularity)

    # Prepare demographic input
    demos_txt = ','.join(demos)
    indicators = ['Emp']
    group_cols = ['state', 'county', 'year', 'industry', demos[0], demos[1]]

    # Specify state scope. A bare abbreviation such as 'CA' is wrapped in a list; otherwise
    # .loc would return a scalar FIPS string and list() would split it into characters.
    if isinstance(states, str) and states == 'all':
        geo = list(stateFIPS['FIPS'])
    else:
        if isinstance(states, str):
            states = [states]
        geo = list(stateFIPS.loc[states, 'FIPS'])

    # To store dataframes on employment at the county-level, keyed by state FIPS
    dfs_ovr = {}

    # Iterate through each state
    for state in geo:

        # The coverage check depends only on the state, so do it once per state
        # rather than once per industry.
        emp_year = _latest_available_year(state, year)

        # To store dataframes of employment data for each industry within a given state.
        df_list = []

        for industry in industry_list:
            # Make an API call and store the response.
            url = (
                f'https://api.census.gov/data/timeseries/qwi/{endpoint}get={indicators[0]},{demos_txt}'
                f'&for=county:*&in=state:{state}&year={emp_year}&quarter=1&quarter=2&quarter=3&quarter=4&'
                f'industry={industry}&key={key}'
            )
            data = requests.get(url)

            # Check that all calls return data
            if data.status_code == 400:
                raise ValueError(
                    f'More than 400,000 cells returned at {url}. Specify an alternate query that lessens output.')

            # A body that cannot be decoded as JSON is treated as "no employment for this
            # industry in this state" and skipped. Only the decoding is guarded so that
            # genuine processing errors further down are not silently mislabelled.
            try:
                available_data = data.json()
            except ValueError:
                print(
                    f'0 employment for NAICS {industry} in state {state}, therefore passed')
                continue

            # First row of the response holds the column names.
            df = pd.DataFrame(available_data[1:], columns=available_data[0])
            df['Emp'] = pd.to_numeric(df['Emp'])

            # Annualize data by taking mean across quarters. Only 'Emp' is averaged:
            # pandas >= 2.0 no longer drops non-numeric leftover columns and would raise.
            df = df.groupby(by=group_cols, as_index=False)['Emp'].mean()

            # Only keep overall data
            df = df[(df[demos[0]] == '0') & (df[demos[1]] == 'A00')].reset_index(drop=True)

            # Append to the list of dataframes for each industry
            df_list.append(df)

        # Store the state's result, or report that nothing was found for this state.
        if df_list:
            dfs_ovr[state] = pd.concat(df_list)
        else:
            print(f'No employees in NAICS {naics} in state {state}. Pass.')

    if not dfs_ovr:
        raise ValueError(
            f'No LEHD employment data returned for NAICS {naics} in the requested states.')

    # Concatenate dataframes for each state into one big dataframe. Add county FIPS column.
    emp_ovr = pd.concat(dfs_ovr.values()).reset_index(drop=True)
    emp_ovr['FIPS'] = emp_ovr['state'] + emp_ovr['county']

    # Drop any duplicate rows
    emp_ovr = emp_ovr.drop_duplicates(ignore_index=True)

    if write_to_csv:
        emp_ovr.to_csv(os.path.join(
            subpath, 'Data', 'empData', 'Temp',
            f'emp_ovr_{naics}_{naics_granularity}dig_{year}.csv'))

    return emp_ovr