In [112]:
# Imports
import pandas as pd
import requests

# Get urls
from config import summary_url, characteristics_url, business_owners_url, tech_characteristics_url

# Download each Census table and load it into its own DataFrame.
# A timeout keeps a stalled connection from hanging the notebook forever.
REQUEST_TIMEOUT_SECONDS = 60

census_data = []
for url in [summary_url, characteristics_url, business_owners_url, tech_characteristics_url]:
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Stop on HTTP errors instead of trying to parse an error body as a table
    response.raise_for_status()
    payload = response.json()
    # The first row of the payload is used as the header; the rest are records
    census_data.append(pd.DataFrame(payload[1:], columns=payload[0]))

In [113]:
def commonClean(data, code_groups=None, meaningless_groups=None, rename_dict=None, number_cols=None):
    """Drop unwanted columns, rename the remainder and cast the numeric columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to clean. It is not modified; a cleaned copy is returned.
    code_groups : list of str, optional
        Columns dropped first.
    meaningless_groups : list of str, optional
        Columns dropped after the suffix-based removal described below.
    rename_dict : dict, optional
        Mapping from current column names to output column names.
    number_cols : list of str, optional
        Names (after renaming) of the columns to convert to numbers. A name
        containing 'Perc' is cast to float; any other name is cast to the
        nullable 'Int64' dtype. Zeros are then replaced by missing values.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.

    Raises
    ------
    KeyError
        If a name in code_groups, meaningless_groups or number_cols is not a
        column at the point where it is used.
    ValueError
        If a column in number_cols holds text that cannot be parsed as a
        number. Other conversion errors also propagate unchanged.
    """
    # None sentinels avoid sharing mutable default objects between calls
    code_groups = [] if code_groups is None else code_groups
    meaningless_groups = [] if meaningless_groups is None else meaningless_groups
    rename_dict = {} if rename_dict is None else rename_dict
    number_cols = [] if number_cols is None else number_cols

    # Remove codes groups
    data = data.drop(columns=code_groups)

    # Remove flags and error groups. Match on the suffix only: a substring
    # test would also discard label columns such as 'OWNER_SEX_LABEL',
    # which merely contains '_S'.
    flag_or_error_cols = [
        column for column in data.columns
        if str(column).endswith(('_F', '_S'))
    ]
    data = data.drop(columns=flag_or_error_cols)

    # Remove columns with no meaning
    data = data.drop(columns=meaningless_groups)

    # Rename labels
    data = data.rename(columns=rename_dict)

    # Handle nulls and cast all number columns
    for column in number_cols:
        numeric = pd.to_numeric(data[column])
        if 'Perc' in column:
            numeric = numeric.astype(float)
        else:
            # Nullable integers keep whole numbers whole even with gaps
            numeric = numeric.astype('Int64')

        # Cast zeros as nulls; assigning the whole column avoids chained
        # indexing, which may silently write to a temporary copy
        data[column] = numeric.mask(numeric.eq(0).fillna(False))

    # Return clean data
    return data

# Helpful Documentation: https://www2.census.gov/programs-surveys/abs/technical-documentation/api/ABS_API_CB-10-4-2021.pdf
# https://api.census.gov/data/2018/abscs/variables/FIRMPDEMP_F.html
# https://api.census.gov/data/2017/abscs/variables/RCPPDEMP_F.html

In [114]:
# Business summary table: raw column name -> output column name
summary_label_dict = {
    'SEX_LABEL': 'Sex',
    'RACE_GROUP_LABEL': 'Race',
    'ETH_GROUP_LABEL': 'Ethnicity',
    'VET_GROUP_LABEL': 'VetStatus',
    'NAICS2017_LABEL': 'Industry',
    'YEAR': 'Year',
    'EMP': 'EmployeeCt',
    'EMP_PCT': 'EmployeeCtPerc',
    'BUSCHAR_LABEL': 'SpousalOwnershipSharing',
    'FIRMPDEMP': 'EmployerFirmCt',
    'FIRMPDEMP_PCT': 'EmployerFirmCtPerc',
    'RCPPDEMP': 'RevenueInThousands',
    'RCPPDEMP_PCT': 'RevenueInThousandsPerc',
    'PAYANN': 'AnnualPayrollInThousands',
    'PAYANN_PCT': 'AnnualPayrollInThousandsPerc',
}

# Measures to cast; each one also has a '<name>Perc' column
summary_measures = ['EmployerFirmCt', 'RevenueInThousands', 'EmployeeCt', 'AnnualPayrollInThousands']

# Clean the first fetched table
summary_data = commonClean(
    census_data[0],
    code_groups=['GEO_ID', 'SEX', 'RACE_GROUP', 'ETH_GROUP', 'VET_GROUP', 'NAICS2017', 'BUSCHAR'],
    meaningless_groups=['QDESC', 'QDESC_LABEL', 'us', 'NAME'],
    rename_dict=summary_label_dict,
    # Plain measures first, then their percentage columns in the same order
    number_cols=summary_measures + [measure + 'Perc' for measure in summary_measures],
)

In [115]:
# Firm table: raw column name -> output column name
firm_label_dict = {
    'SEX_LABEL': 'Sex',
    'RACE_GROUP_LABEL': 'Race',
    'ETH_GROUP_LABEL': 'Ethnicity',
    'VET_GROUP_LABEL': 'VetStatus',
    'NAICS2017_LABEL': 'Industry',
    'YEAR': 'Year',
    'EMP': 'EmployeeCt',
    'EMP_PCT': 'EmployeeCtPerc',
    'BUSCHAR_LABEL': 'SpousalOwnershipSharing',
    'FIRMPDEMP': 'EmployerFirmCt',
    'FIRMPDEMP_PCT': 'EmployerFirmCtPerc',
    'RCPPDEMP': 'RevenueInThousands',
    'RCPPDEMP_PCT': 'RevenueInThousandsPerc',
    'PAYANN': 'AnnualPayrollInThousands',
    'PAYANN_PCT': 'AnnualPayrollInThousandsPerc',
}

# Measures to cast; each one also has a '<name>Perc' column
firm_measures = ['EmployerFirmCt', 'RevenueInThousands', 'EmployeeCt', 'AnnualPayrollInThousands']

# Clean the second fetched table
firm_data = commonClean(
    census_data[1],
    code_groups=['GEO_ID', 'SEX', 'RACE_GROUP', 'ETH_GROUP', 'VET_GROUP', 'NAICS2017', 'BUSCHAR'],
    meaningless_groups=['QDESC', 'QDESC_LABEL', 'us', 'NAME'],
    rename_dict=firm_label_dict,
    # Plain measures first, then their percentage columns in the same order
    number_cols=firm_measures + [measure + 'Perc' for measure in firm_measures],
)

In [116]:
# Owner table: raw column name -> output column name
owner_label_dict = {
    'OWNER_SEX_LABEL': 'OwnerSex',
    'OWNER_RACE_LABEL': 'OwnerRace',
    'OWNER_ETH_LABEL': 'OwnerEthnicity',
    'OWNER_VET_LABEL': 'OwnerVetStatus',
    'NAICS2017_LABEL': 'Industry',
    'YEAR': 'Year',
    'OWNCHAR_LABEL': 'OwnedSince',
    'OWNPDEMP': 'OwnerCt',
    'OWNPDEMP_PCT': 'OwnerCtPerc',
}

# Columns handed to commonClean for removal
owner_code_cols = ['GEO_ID', 'OWNER_SEX', 'OWNER_RACE', 'OWNER_ETH', 'OWNER_VET', 'NAICS2017', 'OWNCHAR']
owner_unused_cols = ['QDESC', 'QDESC_LABEL', 'us', 'NAME']

# Clean the third fetched table
owner_data = commonClean(
    census_data[2],
    code_groups=owner_code_cols,
    meaningless_groups=owner_unused_cols,
    rename_dict=owner_label_dict,
    number_cols=['OwnerCt', 'OwnerCtPerc'],
)

In [117]:
# Tech table: raw column name -> output column name
tech_label_dict = {
    'SEX_LABEL': 'Sex',
    'RACE_GROUP_LABEL': 'Race',
    'ETH_GROUP_LABEL': 'Ethnicity',
    'VET_GROUP_LABEL': 'VetStatus',
    'NAICS2017_LABEL': 'Industry',
    'YEAR': 'Year',
    'NSFSZFI_LABEL': 'CompanySize',
    'FACTORS_P_LABEL': 'RestrictionReason',
    'RCPPDEMP': 'RevenueInThousands',
    'RCPPDEMP_PCT': 'RevenueInThousandsPerc',
    'PAYANN': 'AnnualPayrollInThousands',
    'PAYANN_PCT': 'AnnualPayrollInThousandsPerc',
    'EMP': 'EmployeeCt',
    'EMP_PCT': 'EmployeeCtPerc',
    'FIRMPDEMP': 'EmployerFirmCt',
    'FIRMPDEMP_PCT': 'EmployerFirmCtPerc',
}

# Measures to cast; each one is immediately followed by its '<name>Perc' column
tech_measures = ['RevenueInThousands', 'AnnualPayrollInThousands', 'EmployeeCt', 'EmployerFirmCt']

# Clean the fourth fetched table
tech_data = commonClean(
    census_data[3],
    code_groups=['GEO_ID', 'SEX', 'RACE_GROUP', 'ETH_GROUP', 'VET_GROUP', 'NAICS2017', 'NSFSZFI', 'FACTORS_P'],
    meaningless_groups=['us', 'NAME'],
    rename_dict=tech_label_dict,
    number_cols=[
        name
        for measure in tech_measures
        for name in (measure, measure + 'Perc')
    ],
)

In [118]:
# Write each cleaned table to its own CSV file
csv_targets = {
    'data/summary_dataset.csv': summary_data,
    'data/firm_dataset.csv': firm_data,
    'data/owner_dataset.csv': owner_data,
    'data/tech_dataset.csv': tech_data,
}
for csv_path, frame in csv_targets.items():
    frame.to_csv(csv_path)