In [1]:
import pandas as pd
import numpy as np

In [2]:
import urllib.request
import urllib

# *******************************************************************************
# qcewCreateDataRows : This function takes a raw csv string (or raw bytes) and
# splits it into a two-dimensional array containing the header row and the data
# rows of the csv file. Both bytes and str input are meant to be handled.
def qcewCreateDataRows(csv):
    """Split raw CSV content into a list of rows, each a list of field strings.

    Parameters
    ----------
    csv : bytes or str
        Raw CSV content with '\\r\\n' line endings. Bytes are decoded with the
        default codec (UTF-8) first.

    Returns
    -------
    list of list of str
        One inner list per line, split on ','. Quotation marks are NOT removed
        and quoted commas are not treated specially. Content that ends with
        '\\r\\n' yields a trailing [''] row.
    """
    # The original used `except er:` with `er` undefined, so str input raised a
    # NameError instead of falling back. Check the type explicitly instead.
    if isinstance(csv, (bytes, bytearray)):
        csv = csv.decode()
    return [line.split(',') for line in csv.split('\r\n')]
# *******************************************************************************




# *******************************************************************************
# qcewGetAreaData : This function takes a year, quarter, and area argument and
# returns an array containing the associated area data. Use 'a' for annual
# averages. 
# For all area codes and titles see:
# http://www.bls.gov/cew/doc/titles/area/area_titles.htm
#
def qcewGetAreaData(year,qtr,area):
    """Download the QCEW CSV for one area and return it as a list of rows.

    Parameters
    ----------
    year : str or int
        Year to request; converted with str() before being put in the URL.
    qtr : str or int
        Quarter to request, or 'a' for annual averages; lower-cased.
    area : str
        Area code; upper-cased. Must be a string (it is not coerced, since an
        integer would lose any leading zeros).

    Returns
    -------
    list of list of str
        The result of qcewCreateDataRows on the downloaded content.

    Raises
    ------
    Whatever urllib.request.urlopen raises for a failed request.
    """
    urlPath = "http://data.bls.gov/cew/data/api/[YEAR]/[QTR]/area/[AREA].csv"
    urlPath = urlPath.replace("[YEAR]", str(year))
    urlPath = urlPath.replace("[QTR]", str(qtr).lower())
    urlPath = urlPath.replace("[AREA]", area.upper())
    # The context manager closes the stream even if read() raises.
    with urllib.request.urlopen(urlPath) as httpStream:
        csv = httpStream.read()
    return qcewCreateDataRows(csv)
# *******************************************************************************




# *******************************************************************************
# qcewGetIndustryData : This function takes a year, quarter, and industry code
# and returns an array containing the associated industry data. Use 'a' for 
# annual averages. Some industry codes contain hyphens. The CSV files use
# underscores instead of hyphens. So 31-33 becomes 31_33. 
# For all industry codes and titles see:
# http://www.bls.gov/cew/doc/titles/industry/industry_titles.htm
#
def qcewGetIndustryData(year,qtr,industry):
    """Download the QCEW CSV for one industry and return it as a list of rows.

    Parameters
    ----------
    year : str or int
        Year to request; converted with str() before being put in the URL.
    qtr : str or int
        Quarter to request, or 'a' for annual averages; lower-cased.
    industry : str or int
        Industry code. Hyphens are replaced with underscores, because the CSV
        file names use underscores (31-33 becomes 31_33).

    Returns
    -------
    list of list of str
        The result of qcewCreateDataRows on the downloaded content.

    Raises
    ------
    Whatever urllib.request.urlopen raises for a failed request.
    """
    urlPath = "http://data.bls.gov/cew/data/api/[YEAR]/[QTR]/industry/[IND].csv"
    urlPath = urlPath.replace("[YEAR]", str(year))
    urlPath = urlPath.replace("[QTR]", str(qtr).lower())
    urlPath = urlPath.replace("[IND]", str(industry).replace("-", "_"))
    # The context manager closes the stream even if read() raises.
    with urllib.request.urlopen(urlPath) as httpStream:
        csv = httpStream.read()
    return qcewCreateDataRows(csv)
# *******************************************************************************




# *******************************************************************************
# qcewGetSizeData : This function takes a year and establishment size class code
# and returns an array containing the associated size data. Size data
# is only available for the first quarter of each year.
# For all establishment size classes and titles see:
# http://www.bls.gov/cew/doc/titles/size/size_titles.htm
#
def qcewGetSizeData(year,size):
    """Download the QCEW CSV for one establishment size class as a list of rows.

    The request always targets quarter 1 (the URL hard-codes '/1/'), as size
    data is only available for the first quarter of each year.

    Parameters
    ----------
    year : str or int
        Year to request; converted with str() before being put in the URL.
    size : str or int
        Establishment size class code; converted with str().

    Returns
    -------
    list of list of str
        The result of qcewCreateDataRows on the downloaded content.

    Raises
    ------
    Whatever urllib.request.urlopen raises for a failed request.
    """
    urlPath = "http://data.bls.gov/cew/data/api/[YEAR]/1/size/[SIZE].csv"
    urlPath = urlPath.replace("[YEAR]", str(year))
    urlPath = urlPath.replace("[SIZE]", str(size))
    # The context manager closes the stream even if read() raises.
    with urllib.request.urlopen(urlPath) as httpStream:
        csv = httpStream.read()
    return qcewCreateDataRows(csv)
# *******************************************************************************

In [3]:
## Define a function that pulls and cleans data in a way that is useful for my analysis
def getAnnualCountyIndustryData(year, naics, print_results = True, return_total_US_emp = False):
    '''
    Connect to QCEW API and extract and clean county-level data for a given industry (NAICS code) and year.
    Can print an assessment of the county-level coverage of QCEW employment figures vs the national employment figure for the industry, and can choose whether to return this national total or the county-level data.
    Use naics = '10' to obtain employment across all industries.

    Parameters
    ----------
    year : passed through to qcewGetIndustryData.
    naics : industry code, passed through to qcewGetIndustryData.
    print_results : bool, default True
        Print the national total (when it is returned) or the disclosed county-level employment and its share of the national total.
    return_total_US_emp : bool, default False
        If true, return only the 'annual_avg_emplvl' value of the first 'US000' record instead of the dataframe.

    Returns
    -------
    A number (national employment) when return_total_US_emp is true; otherwise a dataframe with one row per
    (area_fips, industry_code, agglvl_code, size_code, year, qtr, disclosure_code) combination, holding the
    numeric columns summed over ownership codes. Rows whose disclosure_code is 'N' are kept in the dataframe.

    Raises
    ------
    IndexError if the national total is needed but the data contains no 'US000' record.
    '''

    # Call industry data as per function above
    data_raw = qcewGetIndustryData(year, 'a', naics)

    # Remove quotation marks in returned array
    data = [[field.strip('"') for field in row] for row in data_raw]

    # Store in a df: the first row is the header and the last row is dropped
    # (the row splitter leaves an empty trailing row when the content ends with a line break)
    df = pd.DataFrame(data[1:-1], columns = data[0])

    # Convert strings to numerics (anything unparseable becomes NaN)
    numeric_cols = ['annual_avg_estabs',
       'annual_avg_emplvl', 'total_annual_wages', 'taxable_annual_wages',
       'annual_contributions', 'annual_avg_wkly_wage', 'avg_annual_pay',
       'lq_disclosure_code', 'lq_annual_avg_estabs', 'lq_annual_avg_emplvl',
       'lq_total_annual_wages', 'lq_taxable_annual_wages',
       'lq_annual_contributions', 'lq_annual_avg_wkly_wage',
       'lq_avg_annual_pay', 'oty_disclosure_code', 'oty_annual_avg_estabs_chg',
       'oty_annual_avg_estabs_pct_chg', 'oty_annual_avg_emplvl_chg',
       'oty_annual_avg_emplvl_pct_chg', 'oty_total_annual_wages_chg',
       'oty_total_annual_wages_pct_chg', 'oty_taxable_annual_wages_chg',
       'oty_taxable_annual_wages_pct_chg', 'oty_annual_contributions_chg',
       'oty_annual_contributions_pct_chg', 'oty_annual_avg_wkly_wage_chg',
       'oty_annual_avg_wkly_wage_pct_chg', 'oty_avg_annual_pay_chg',
       'oty_avg_annual_pay_pct_chg']

    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors = 'coerce')

    # Keep only annual averages, only 'all establishment sizes', and remove the 'total covered' ownership rows
    df = df[(df['qtr'] == 'A') & (df['size_code'] == '0') & (df['own_code'] != '0')]

    # Group by all other categorical variables to effectively rebuild the 'total covered' value.
    # Only the numeric columns are summed: summing the string own_code column is meaningless, and whether
    # it is kept or dropped depends on the pandas version, which shifts column positions.
    group_keys = ['area_fips', 'industry_code', 'agglvl_code', 'size_code', 'year', 'qtr', 'disclosure_code']
    df = df.groupby(by = group_keys, as_index = False)[numeric_cols].sum()

    # Separate national total from rest of data
    is_national = df['area_fips'] == 'US000'
    df_total = df[is_national]
    df = df[~is_national]

    # The national figure is looked up by column name (not by position) and only when it is needed
    total_us_emp = None
    if return_total_US_emp or print_results:
        if df_total.empty:
            raise IndexError("No national 'US000' record found for this industry and year")
        total_us_emp = df_total['annual_avg_emplvl'].tolist()[0]

    # If specified in function input, return only the total US employment for this industry. Otherwise, return the final county-level dataframe
    if return_total_US_emp:
        if print_results:
            print('Total U.S. employment in this industry:', total_us_emp)
        return total_us_emp

    # Drop all non-county level entries: codes starting with 'C' or 'U', and state totals ('xx000').
    # Only the '000' suffix is tested, because real county codes can end in a single '0' (e.g. 02020, 24510).
    fips = df['area_fips']
    is_non_county = fips.str[0].isin(['C', 'U']) | (fips.str[-3:] == '000')
    df = df[~is_non_county].reset_index(drop = True)

    # Print comparison of national total employment to disclosed county-level employment
    if print_results:
        disclosed_emp = df.loc[df['disclosure_code'] != 'N', 'annual_avg_emplvl'].sum()
        print('Total disclosed employment in QCEW:', disclosed_emp)
        print('Percent of total U.S. employment disclosed at county level by QCEW:', np.round( disclosed_emp / total_us_emp * 100, 1), '%')

    return df

In [39]:
## Define a function that pulls and cleans state-level data in a way that is useful for my analysis
def getAnnualStateIndustryData(year, naics, print_results = True, return_total_US_emp = False):
    '''
    Connect to QCEW API and extract and clean state-level data for a given industry (NAICS code) and year.
    Can print an assessment of the state-level coverage of QCEW employment figures vs the national employment figure for the industry, and can choose whether to return this national total or the state-level data.

    Parameters
    ----------
    year : passed through to qcewGetIndustryData.
    naics : industry code, passed through to qcewGetIndustryData.
    print_results : bool, default True
        Print the national total (when it is returned) or the disclosed state-level employment and its share of the national total.
    return_total_US_emp : bool, default False
        If true, return only the 'annual_avg_emplvl' value of the first 'US000' record instead of the dataframe.

    Returns
    -------
    A number (national employment) when return_total_US_emp is true; otherwise a dataframe with one row per
    (area_fips, industry_code, agglvl_code, size_code, year, qtr, disclosure_code) combination, holding the
    numeric columns summed over ownership codes. Rows whose disclosure_code is 'N' are removed from the dataframe.

    Raises
    ------
    IndexError if the national total is needed but the data contains no 'US000' record.
    '''

    # Call industry data as per function above
    data_raw = qcewGetIndustryData(year, 'a', naics)

    # Remove quotation marks in returned array
    data = [[field.strip('"') for field in row] for row in data_raw]

    # Store in a df: the first row is the header and the last row is dropped
    # (the row splitter leaves an empty trailing row when the content ends with a line break)
    df = pd.DataFrame(data[1:-1], columns = data[0])

    # Convert strings to numerics (anything unparseable becomes NaN)
    numeric_cols = ['annual_avg_estabs',
       'annual_avg_emplvl', 'total_annual_wages', 'taxable_annual_wages',
       'annual_contributions', 'annual_avg_wkly_wage', 'avg_annual_pay',
       'lq_disclosure_code', 'lq_annual_avg_estabs', 'lq_annual_avg_emplvl',
       'lq_total_annual_wages', 'lq_taxable_annual_wages',
       'lq_annual_contributions', 'lq_annual_avg_wkly_wage',
       'lq_avg_annual_pay', 'oty_disclosure_code', 'oty_annual_avg_estabs_chg',
       'oty_annual_avg_estabs_pct_chg', 'oty_annual_avg_emplvl_chg',
       'oty_annual_avg_emplvl_pct_chg', 'oty_total_annual_wages_chg',
       'oty_total_annual_wages_pct_chg', 'oty_taxable_annual_wages_chg',
       'oty_taxable_annual_wages_pct_chg', 'oty_annual_contributions_chg',
       'oty_annual_contributions_pct_chg', 'oty_annual_avg_wkly_wage_chg',
       'oty_annual_avg_wkly_wage_pct_chg', 'oty_avg_annual_pay_chg',
       'oty_avg_annual_pay_pct_chg']

    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors = 'coerce')

    # Keep only annual averages, only 'all establishment sizes', and remove the 'total covered' ownership rows
    df = df[(df['qtr'] == 'A') & (df['size_code'] == '0') & (df['own_code'] != '0')]

    # Group by all other categorical variables to effectively rebuild the 'total covered' value.
    # Only the numeric columns are summed: summing the string own_code column is meaningless, and whether
    # it is kept or dropped depends on the pandas version, which shifts column positions.
    group_keys = ['area_fips', 'industry_code', 'agglvl_code', 'size_code', 'year', 'qtr', 'disclosure_code']
    df = df.groupby(by = group_keys, as_index = False)[numeric_cols].sum()

    # Separate national total from rest of data
    is_national = df['area_fips'] == 'US000'
    df_total = df[is_national]
    df = df[~is_national]

    # The national figure is looked up by column name (not by position) and only when it is needed
    total_us_emp = None
    if return_total_US_emp or print_results:
        if df_total.empty:
            raise IndexError("No national 'US000' record found for this industry and year")
        total_us_emp = df_total['annual_avg_emplvl'].tolist()[0]

    # If specified in function input, return only the total US employment for this industry. Otherwise, return the final state-level dataframe
    if return_total_US_emp:
        if print_results:
            print('Total U.S. employment in this industry:', total_us_emp)
        return total_us_emp

    # Drop all non-state level entries: codes starting with 'C' or 'U', and anything that does not end in '000'
    fips = df['area_fips']
    is_non_state = fips.str[0].isin(['C', 'U']) | (fips.str[-3:] != '000')
    df = df[~is_non_state]

    # Keep only disclosed records
    df = df[df['disclosure_code'] != 'N'].reset_index(drop = True)

    # Print comparison of national total employment to disclosed state-level employment
    if print_results:
        disclosed_emp = df['annual_avg_emplvl'].sum()
        print('Total disclosed employment in QCEW:', disclosed_emp)
        print('Percent of total U.S. employment disclosed at state level by QCEW:', np.round( disclosed_emp / total_us_emp * 100, 1), '%')

    return df