In [164]:
# Imports
import pandas as pd
import requests

# Get urls
from config import summary_url, characteristics_url, business_owners_url, tech_characteristics_url

# Download each Census dataset and load it into a DataFrame.
# census_data keeps the same order as the URL list below:
# [summary, characteristics, business owners, tech characteristics].
census_data = []
for url in [summary_url, characteristics_url, business_owners_url, tech_characteristics_url]:
    # A timeout keeps a stalled connection from hanging the cell forever.
    r = requests.get(url, timeout=60)
    # Fail loudly on HTTP errors instead of trying to parse an error body as data.
    r.raise_for_status()
    data = r.json()
    # The first row of the payload is used as the header; the rest are records.
    census_data.append(pd.DataFrame(data[1:], columns=data[0]))

In [165]:
# TL business dataset
#
# Helpful Documentation: https://www2.census.gov/programs-surveys/abs/technical-documentation/api/ABS_API_CB-10-4-2021.pdf
# https://api.census.gov/data/2018/abscs/variables/FIRMPDEMP_F.html
# https://api.census.gov/data/2017/abscs/variables/RCPPDEMP_F.html
#
# Built as one chain from the first downloaded frame: drop every column that is
# not needed, then give the remaining ones descriptive names.
summary_data = (
    census_data[0]
    .drop(columns=[
        # Codes groups
        'GEO_ID', 'SEX', 'RACE_GROUP', 'ETH_GROUP', 'VET_GROUP', 'NAICS2017', 'BUSCHAR',
        # Flags groups
        'FIRMPDEMP_F', 'FIRMPDEMP_PCT_F', 'RCPPDEMP_F', 'RCPPDEMP_PCT_F',
        'EMP_F', 'EMP_PCT_F', 'PAYANN_F', 'PAYANN_PCT_F',
        'FIRMPDEMP_S_F', 'FIRMPDEMP_PCT_S_F', 'RCPPDEMP_S_F', 'RCPPDEMP_PCT_S_F',
        'EMP_S_F', 'EMP_PCT_S_F', 'PAYANN_S_F', 'PAYANN_PCT_S_F',
        # Standard error groups
        'FIRMPDEMP_S', 'FIRMPDEMP_PCT_S', 'RCPPDEMP_S', 'RCPPDEMP_PCT_S',
        'EMP_S', 'EMP_PCT_S', 'PAYANN_S', 'PAYANN_PCT_S',
        # Columns with no meaning
        'QDESC', 'QDESC_LABEL', 'us', 'NAME',
    ])
    .rename(columns={
        # Descriptive labels
        'SEX_LABEL': 'Sex',
        'RACE_GROUP_LABEL': 'Race',
        'ETH_GROUP_LABEL': 'Ethnicity',
        'VET_GROUP_LABEL': 'VetStatus',
        'NAICS2017_LABEL': 'Industry',
        'BUSCHAR_LABEL': 'SpousalOwnershipSharing',
        'YEAR': 'Year',
        # Measures and their percentage counterparts
        'FIRMPDEMP': 'EmployerFirmCt',
        'FIRMPDEMP_PCT': 'EmployerFirmCtPerc',
        'RCPPDEMP': 'RevenueInThousands',
        'RCPPDEMP_PCT': 'RevenueInThousandsPerc',
        'EMP': 'EmployeeCt',
        'EMP_PCT': 'EmployeeCtPerc',
        'PAYANN': 'AnnualPayrollInThousands',
        'PAYANN_PCT': 'AnnualPayrollInThousandsPerc',
    })
)

# NOTE: Business codes has a lot of different meanings, including operation status, business sharing

# Type casting and handling nulls
#
# Each column is converted as a whole and assigned back to the frame. Writing
# through `frame[col].loc[mask] = ...` is chained assignment: it warns on older
# pandas, is a silent no-op under Copy-on-Write, and leaves the column as object
# dtype even when it does take effect.
for column in ['EmployerFirmCt', 'RevenueInThousands', 'EmployeeCt', 'AnnualPayrollInThousands',
               'EmployerFirmCtPerc', 'RevenueInThousandsPerc', 'EmployeeCtPerc', 'AnnualPayrollInThousandsPerc']:
    # Missing entries become NaN; anything that is not numeric raises, as before.
    numeric = pd.to_numeric(summary_data[column])

    # Cast zeros as nulls
    numeric = numeric.mask(numeric == 0)

    if 'Perc' not in column:
        # Nullable integer dtype keeps the non-percentage columns integral even
        # though they now contain nulls (plain int64 cannot hold missing values).
        numeric = numeric.astype('Int64')

    summary_data[column] = numeric

# Save data as CSV

summary_data.to_csv('data/summary_dataset.csv')

In [166]:
# TL firm dataset
#
# Built as one chain from the second downloaded frame: drop every column that
# is not needed, then give the remaining ones descriptive names.
firm_data = (
    census_data[1]
    .drop(columns=[
        # Codes groups
        'GEO_ID', 'SEX', 'RACE_GROUP', 'ETH_GROUP', 'VET_GROUP', 'NAICS2017', 'BUSCHAR',
        # Flags groups
        'FIRMPDEMP_F', 'FIRMPDEMP_PCT_F', 'RCPPDEMP_F', 'RCPPDEMP_PCT_F',
        'EMP_F', 'EMP_PCT_F', 'PAYANN_F', 'PAYANN_PCT_F',
        'FIRMPDEMP_S_F', 'FIRMPDEMP_PCT_S_F', 'RCPPDEMP_S_F', 'RCPPDEMP_PCT_S_F',
        'EMP_S_F', 'EMP_PCT_S_F', 'PAYANN_S_F', 'PAYANN_PCT_S_F',
        # Standard error groups
        'FIRMPDEMP_S', 'FIRMPDEMP_PCT_S', 'RCPPDEMP_S', 'RCPPDEMP_PCT_S',
        'EMP_S', 'EMP_PCT_S', 'PAYANN_S', 'PAYANN_PCT_S',
        # Columns with no meaning
        'QDESC', 'QDESC_LABEL', 'us', 'NAME',
    ])
    .rename(columns={
        # Descriptive labels
        'SEX_LABEL': 'Sex',
        'RACE_GROUP_LABEL': 'Race',
        'ETH_GROUP_LABEL': 'Ethnicity',
        'VET_GROUP_LABEL': 'VetStatus',
        'NAICS2017_LABEL': 'Industry',
        'BUSCHAR_LABEL': 'SpousalOwnershipSharing',
        'YEAR': 'Year',
        # Measures and their percentage counterparts
        'FIRMPDEMP': 'EmployerFirmCt',
        'FIRMPDEMP_PCT': 'EmployerFirmCtPerc',
        'RCPPDEMP': 'RevenueInThousands',
        'RCPPDEMP_PCT': 'RevenueInThousandsPerc',
        'EMP': 'EmployeeCt',
        'EMP_PCT': 'EmployeeCtPerc',
        'PAYANN': 'AnnualPayrollInThousands',
        'PAYANN_PCT': 'AnnualPayrollInThousandsPerc',
    })
)

# NOTE: Business codes has a lot of different meanings, including operation status, business sharing

# Type casting and handling nulls
#
# Each column is converted as a whole and assigned back to the frame. Writing
# through `frame[col].loc[mask] = ...` is chained assignment: it warns on older
# pandas, is a silent no-op under Copy-on-Write, and leaves the column as object
# dtype even when it does take effect.
for column in ['EmployerFirmCt', 'RevenueInThousands', 'EmployeeCt', 'AnnualPayrollInThousands',
               'EmployerFirmCtPerc', 'RevenueInThousandsPerc', 'EmployeeCtPerc', 'AnnualPayrollInThousandsPerc']:
    # Missing entries become NaN; anything that is not numeric raises, as before.
    numeric = pd.to_numeric(firm_data[column])

    # Cast zeros as nulls
    numeric = numeric.mask(numeric == 0)

    if 'Perc' not in column:
        # Nullable integer dtype keeps the non-percentage columns integral even
        # though they now contain nulls (plain int64 cannot hold missing values).
        numeric = numeric.astype('Int64')

    firm_data[column] = numeric

# Save data as CSV

firm_data.to_csv('data/firm_dataset.csv')

In [167]:
# TL owner dataset
#
# Built as one chain from the third downloaded frame: drop every column that
# is not needed, then give the remaining ones descriptive names.
owner_data = (
    census_data[2]
    .drop(columns=[
        # Codes groups
        'GEO_ID', 'OWNER_SEX', 'OWNER_RACE', 'OWNER_ETH', 'OWNER_VET', 'NAICS2017', 'OWNCHAR',
        # Flags groups
        'OWNPDEMP_F', 'OWNPDEMP_PCT_F', 'OWNPDEMP_S_F', 'OWNPDEMP_PCT_S_F',
        # Standard error groups
        'OWNPDEMP_S', 'OWNPDEMP_PCT_S',
        # Columns with no meaning
        'QDESC', 'QDESC_LABEL', 'us', 'NAME',
    ])
    .rename(columns={
        # Descriptive labels
        'OWNER_SEX_LABEL': 'OwnerSex',
        'OWNER_RACE_LABEL': 'OwnerRace',
        'OWNER_ETH_LABEL': 'OwnerEthnicity',
        'OWNER_VET_LABEL': 'OwnerVetStatus',
        'NAICS2017_LABEL': 'Industry',
        'OWNCHAR_LABEL': 'OwnedSince',
        'YEAR': 'Year',
        # Measure and its percentage counterpart
        'OWNPDEMP': 'OwnerCt',
        'OWNPDEMP_PCT': 'OwnerCtPerc',
    })
)

# Type casting and handling nulls
#
# Each column is converted as a whole and assigned back to the frame. Writing
# through `frame[col].loc[mask] = ...` is chained assignment: it warns on older
# pandas, is a silent no-op under Copy-on-Write, and leaves the column as object
# dtype even when it does take effect.
for column in ['OwnerCt', 'OwnerCtPerc']:
    # Missing entries become NaN; anything that is not numeric raises, as before.
    numeric = pd.to_numeric(owner_data[column])

    # Cast zeros as nulls
    numeric = numeric.mask(numeric == 0)

    if 'Perc' not in column:
        # Nullable integer dtype keeps the non-percentage column integral even
        # though it now contains nulls (plain int64 cannot hold missing values).
        numeric = numeric.astype('Int64')

    owner_data[column] = numeric

# Save data as CSV

owner_data.to_csv('data/owner_dataset.csv')