In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')


In [2]:
def read_file_details(file,year):
    """Build a one-row table of population estimates per race-count category.

    Only the first six rows of the CSV are used. The first row supplies the
    total population (its ``Estimate`` is a count such as ``"79,608"``).
    Rows whose ``Estimate`` is a percentage (such as ``"86.3%"``) are turned
    into head counts as ``percentage * total population``; every other row
    is discarded.

    Parameters
    ----------
    file : str or path-like
        CSV file with at least ``Label``, ``Estimate`` and
        ``Margin of Error`` columns.
    year : int
        Used as the single row label of the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``year``; one column per percentage row, named by
        that row's ``Label`` with leading whitespace removed.

    Raises
    ------
    FileNotFoundError
        If ``file`` does not exist (raised by ``pandas.read_csv``).
    KeyError
        If one of the expected columns is missing.
    """
    # Keep only full population stats (first six rows). Copy the slice so the
    # column assignments below act on an independent frame, not on a view.
    df = pd.read_csv(file).iloc[:6].copy()

    # Get rid of leading spaces in Labels
    df['Label'] = df['Label'].str.lstrip()

    # The total population is the first row's Estimate, e.g. "79,608"
    total_population = int(df.iloc[0]['Estimate'].replace(',',''))

    # Convert percentage estimates to fractions. Anything that is not a
    # percentage string (plain counts, blank cells read as NaN) becomes missing
    # instead of raising on the `'%' in x` membership test.
    df['Percentage'] = df['Estimate'].apply(
        lambda x: float(x.rstrip('%'))/100 if isinstance(x, str) and '%' in x else None
    ).astype(float)

    df = (
        df
        # Drop only rows without a percentage; a blank in another column
        # (e.g. Margin of Error) must not discard a valid category.
        .dropna(subset=['Percentage'])
        # Calculate population estimates
        .assign(**{'Population Estimate': lambda d: d['Percentage']*total_population})
        # Drop unnecessary columns
        .drop(columns=['Estimate','Margin of Error','Percentage'])
        # Change index and transpose table: labels become the columns
        .set_index('Label')
        .T
    )

    # Change index to year
    df.index = [year]

    return df

In [3]:
# Probe every year's file so that missing years are known before loading.
for year in range(2010,2023):
    file = 'data/'+str(year)+'/TOTAL_NUMBER_OF_RACES_REPORTED.CSV'

    try:
        read_file_details(file,year)
    except FileNotFoundError:
        # Only a missing file is expected here. Any other failure (unexpected
        # columns, unparsable values) propagates instead of being reported
        # as a missing file.
        print(f"No file for year '{year}'")

No file for year '2020'


In [4]:
# Sanity check: run the loader on a single year and display the resulting row
read_file_details(
    'data/2014/TOTAL_NUMBER_OF_RACES_REPORTED.CSV',
    2014,
)

Label,One race,Two races,Three races,Four or more races
2014,68701.704,10030.608,875.688,0.0


In [5]:
# Apply the loader to each year's dataset, newest first. 2020 is left out
# because no file exists for it. The path is built once from the year instead
# of being repeated per line; the per-year names are kept because the
# concatenation cell reads them.
(
    TOTAL_NUMBER_OF_RACES_REPORTED_2022,
    TOTAL_NUMBER_OF_RACES_REPORTED_2021,
    TOTAL_NUMBER_OF_RACES_REPORTED_2019,
    TOTAL_NUMBER_OF_RACES_REPORTED_2018,
    TOTAL_NUMBER_OF_RACES_REPORTED_2017,
    TOTAL_NUMBER_OF_RACES_REPORTED_2016,
    TOTAL_NUMBER_OF_RACES_REPORTED_2015,
    TOTAL_NUMBER_OF_RACES_REPORTED_2014,
    TOTAL_NUMBER_OF_RACES_REPORTED_2013,
    TOTAL_NUMBER_OF_RACES_REPORTED_2012,
    TOTAL_NUMBER_OF_RACES_REPORTED_2011,
    TOTAL_NUMBER_OF_RACES_REPORTED_2010,
) = (
    read_file_details(f'data/{year}/TOTAL_NUMBER_OF_RACES_REPORTED.CSV', year)
    for year in (2022, 2021, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010)
)

In [6]:
# Stack the one-row yearly frames into a single table, newest year first
TOTAL_NUMBER_OF_RACES_REPORTED = pd.concat(
    [
        TOTAL_NUMBER_OF_RACES_REPORTED_2022,
        TOTAL_NUMBER_OF_RACES_REPORTED_2021,
        TOTAL_NUMBER_OF_RACES_REPORTED_2019,
        TOTAL_NUMBER_OF_RACES_REPORTED_2018,
        TOTAL_NUMBER_OF_RACES_REPORTED_2017,
        TOTAL_NUMBER_OF_RACES_REPORTED_2016,
        TOTAL_NUMBER_OF_RACES_REPORTED_2015,
        TOTAL_NUMBER_OF_RACES_REPORTED_2014,
        TOTAL_NUMBER_OF_RACES_REPORTED_2013,
        TOTAL_NUMBER_OF_RACES_REPORTED_2012,
        TOTAL_NUMBER_OF_RACES_REPORTED_2011,
        TOTAL_NUMBER_OF_RACES_REPORTED_2010,
    ]
)
TOTAL_NUMBER_OF_RACES_REPORTED

Label,One race,Two races,Three races,Four or more races
2022,76347.138,11773.58,2173.584,181.132
2021,74352.81,11228.07,2210.25,618.87
2019,76197.964,12503.352,1540.268,362.416
2018,76154.734,11374.336,977.482,355.448
2017,70902.37,12137.95,669.68,83.71
2016,64998.72,9403.75,601.84,225.69
2015,70203.52,10775.424,571.424,163.264
2014,68701.704,10030.608,875.688,0.0
2013,72954.528,10016.64,250.416,250.416
2012,67645.858,9529.786,859.243,0.0


In [7]:
# Adjust axes: turn the year index into a regular 'Year' column.
# Guarded so that re-running this cell does not insert a second index column
# (an unguarded reset_index + rename would produce a duplicate 'Year').
if 'Year' not in TOTAL_NUMBER_OF_RACES_REPORTED.columns:
    TOTAL_NUMBER_OF_RACES_REPORTED = (
        TOTAL_NUMBER_OF_RACES_REPORTED
        .rename_axis('Year')
        .reset_index()
    )
TOTAL_NUMBER_OF_RACES_REPORTED

Label,Year,One race,Two races,Three races,Four or more races
0,2022,76347.138,11773.58,2173.584,181.132
1,2021,74352.81,11228.07,2210.25,618.87
2,2019,76197.964,12503.352,1540.268,362.416
3,2018,76154.734,11374.336,977.482,355.448
4,2017,70902.37,12137.95,669.68,83.71
5,2016,64998.72,9403.75,601.84,225.69
6,2015,70203.52,10775.424,571.424,163.264
7,2014,68701.704,10030.608,875.688,0.0
8,2013,72954.528,10016.64,250.416,250.416
9,2012,67645.858,9529.786,859.243,0.0


In [8]:
# Persist the combined table. NOTE: to_csv writes the row index by default,
# so the file starts with an unnamed 0..n column; pass index=False if
# downstream readers do not expect it.
TOTAL_NUMBER_OF_RACES_REPORTED.to_csv(
    path_or_buf="data/final/TOTAL_NUMBER_OF_RACES_REPORTED.CSV"
)