## Create 2022 Race Census Demo CSV

In [1]:
import pandas as pd

In [2]:
# For comma-separated values
race_2022_df = pd.read_csv('2022_census_race.csv', sep=',')

In [3]:
race_2022_df.head()

Unnamed: 0,state,white,black_or_african_american,american_indian_and_alaska_native,asian,native_hawaiian_and_other_pacific-islander,hispanic_or_latino_any_race,state_id,congressional_district,year
0,Alabama,65.4,25.7,0.6,1.6,0.0,3.6,1,1,2022
1,Alabama,60.5,31.0,0.2,1.7,0.0,4.3,1,2,2022
2,Alabama,67.7,23.8,0.1,1.8,0.1,3.6,1,3,2022
3,Alabama,81.4,6.3,1.2,0.8,0.3,7.6,1,4,2022
4,Alabama,70.8,17.7,0.6,1.7,0.1,5.9,1,5,2022


In [4]:
race_2022_df.columns

Index(['state', 'white', 'black_or_african_american',
       'american_indian_and_alaska_native', 'asian',
       'native_hawaiian_and_other_pacific-islander',
       'hispanic_or_latino_any_race', 'state_id', 'congressional_district',
       'year'],
      dtype='object')

In [5]:
race_2022_df.dtypes

state                                          object
white                                         float64
black_or_african_american                     float64
american_indian_and_alaska_native             float64
asian                                         float64
native_hawaiian_and_other_pacific-islander    float64
hispanic_or_latino_any_race                   float64
state_id                                        int64
congressional_district                          int64
year                                            int64
dtype: object

In [6]:
# Remove any spaces
race_2022_df.columns = race_2022_df.columns.str.strip()

In [7]:
# Drop 'state_id' and 'congressional_district' columns
race_2022_df = race_2022_df.drop(columns=['state_id', 'congressional_district'])

# Display the updated DataFrame
race_2022_df.head()

Unnamed: 0,state,white,black_or_african_american,american_indian_and_alaska_native,asian,native_hawaiian_and_other_pacific-islander,hispanic_or_latino_any_race,year
0,Alabama,65.4,25.7,0.6,1.6,0.0,3.6,2022
1,Alabama,60.5,31.0,0.2,1.7,0.0,4.3,2022
2,Alabama,67.7,23.8,0.1,1.8,0.1,3.6,2022
3,Alabama,81.4,6.3,1.2,0.8,0.3,7.6,2022
4,Alabama,70.8,17.7,0.6,1.7,0.1,5.9,2022


In [8]:
# Check for null values in the entire DataFrame
null_values = race_2022_df.isnull().sum()

# Display columns with null values
print(null_values[null_values > 0])

Series([], dtype: int64)


In [9]:
# List of demographic percentage columns to convert to fractions for averaging
demographic_percentage_columns = [
    'white',
    'black_or_african_american',
    'american_indian_and_alaska_native',
    'asian',
    'native_hawaiian_and_other_pacific-islander',
    'hispanic_or_latino_any_race'
]

# Convert the demographic percentage columns to fractions by dividing by 100
race_2022_df[demographic_percentage_columns] = race_2022_df[demographic_percentage_columns] / 100

# Group by 'state' and calculate the average of all numerical columns
state_averages2022_df = race_2022_df.groupby('state').mean().reset_index()

# Convert the specific percentage columns back to percentages by multiplying by 100
state_averages2022_df[demographic_percentage_columns] = state_averages2022_df[demographic_percentage_columns] * 100

# Optionally, round the values to one decimal place
state_averages2022_df[demographic_percentage_columns] = state_averages2022_df[demographic_percentage_columns].round(1)

# Convert year to whole integer
state_averages2022_df['year'] = state_averages2022_df['year'].astype(int)

# Display the results
state_averages2022_df



Unnamed: 0,state,white,black_or_african_american,american_indian_and_alaska_native,asian,native_hawaiian_and_other_pacific-islander,hispanic_or_latino_any_race,year
0,Alabama,65.0,25.7,0.5,1.5,0.1,4.8,2022
1,Alaska,59.6,3.0,13.0,6.3,2.0,7.7,2022
2,Arizona,57.9,4.6,4.1,3.6,0.2,32.3,2022
3,Arkansas,69.1,14.4,0.8,1.6,0.5,8.3,2022
4,California,38.8,5.5,1.3,15.7,0.4,40.3,2022
5,Colorado,70.3,4.1,1.2,3.2,0.1,22.5,2022
6,Connecticut,65.0,10.5,0.3,4.8,0.0,18.2,2022
7,Delaware,59.9,22.1,0.4,4.1,0.0,10.3,2022
8,District of Columbia,38.4,42.1,0.5,4.2,0.1,11.7,2022
9,Florida,55.7,15.1,0.4,2.9,0.0,27.3,2022


In [10]:
# Export DataFrame as cleaned csv
state_averages2022_df.to_csv('2022_Census_Race_et.csv', index=False)

In [19]:
# Load the CSV files into DataFrames
state_averages2022_df = pd.read_csv('2022_Census_Race_et.csv')
merged_df = pd.read_csv('../censusoutputcsv/merged_race_data.csv')

# Merge the DataFrames on 'state' using an outer join to include all states
combined_df = pd.merge(state_averages2022_df, merged_df, on='state', how='outer')
combined_df

Unnamed: 0,state,white_x,black_or_african_american_x,american_indian_and_alaska_native_x,asian_x,native_hawaiian_and_other_pacific-islander_x,hispanic_or_latino_any_race_x,year_x,white_2008,black_or_african_american_2008,...,native_hawaiian_and_other_pacific-islander_2016,hispanic_or_latino_any_race_2016,year_2016,white_2020,black_or_african_american_2020,american_indian_and_alaska_native_2020,asian_2020,native_hawaiian_and_other_pacific-islander_2020,hispanic_or_latino_any_race_2020,year_2020
0,Alabama,65.0,25.7,0.5,1.5,0.1,4.8,2022,69.5,27.0,...,0.0,4.1,2016,63.7,26.2,0.7,1.5,0.1,5.3,2020
1,Alaska,59.6,3.0,13.0,6.3,2.0,7.7,2022,69.1,3.6,...,1.3,6.9,2016,59.4,3.0,15.2,6.0,1.7,6.8,2020
2,Arizona,57.9,4.6,4.1,3.6,0.2,32.3,2022,80.0,3.7,...,0.2,30.8,2016,60.2,4.8,4.6,3.6,0.2,30.9,2020
3,Arkansas,69.1,14.4,0.8,1.6,0.5,8.3,2022,78.4,16.0,...,0.3,7.0,2016,70.2,15.5,0.9,1.7,0.4,8.2,2020
4,California,38.8,5.5,1.3,15.7,0.4,40.3,2022,62.0,6.2,...,0.4,39.0,2016,41.1,5.6,1.6,15.3,0.4,39.7,2020
5,Colorado,70.3,4.1,1.2,3.2,0.1,22.5,2022,84.5,3.9,...,0.2,21.3,2016,70.7,4.0,1.3,3.4,0.2,21.9,2020
6,Connecticut,65.0,10.5,0.3,4.8,0.0,18.2,2022,80.1,9.5,...,0.0,15.7,2016,66.5,10.7,0.5,4.8,0.0,17.2,2020
7,Delaware,59.9,22.1,0.4,4.1,0.0,10.3,2022,72.7,20.6,...,0.2,9.2,2016,60.4,22.1,0.5,4.3,0.0,10.5,2020
8,District of Columbia,38.4,42.1,0.5,4.2,0.1,11.7,2022,37.5,53.4,...,0.0,10.9,2016,39.6,41.4,0.5,4.9,0.1,11.3,2020
9,Florida,55.7,15.1,0.4,2.9,0.0,27.3,2022,76.9,16.0,...,0.0,24.7,2016,57.5,15.2,0.4,3.0,0.1,26.5,2020


In [20]:
# Define the mapping of old column names to new column names
rename_mapping = {
    'white_x': 'white_2022',
    'black_or_african_american_x': 'black_or_african_american_2022',
    'american_indian_and_alaska_native_x': 'american_indian_and_alaska_native_2022',
    'asian_x': 'asian_2022',
    'native_hawaiian_and_other_pacific-islander_x': 'native_hawaiian_and_other_pacific-islander_2022',
    'hispanic_or_latino_any_race_x': 'hispanic_or_latino_any_race_2022',
    'year_x': 'year_2022'
}

# Rename the columns in the DataFrame
combined_df = combined_df.rename(columns=rename_mapping)

# Display the first few rows of the updated DataFrame
combined_df.head()


Unnamed: 0,state,white_2022,black_or_african_american_2022,american_indian_and_alaska_native_2022,asian_2022,native_hawaiian_and_other_pacific-islander_2022,hispanic_or_latino_any_race_2022,year_2022,white_2008,black_or_african_american_2008,...,native_hawaiian_and_other_pacific-islander_2016,hispanic_or_latino_any_race_2016,year_2016,white_2020,black_or_african_american_2020,american_indian_and_alaska_native_2020,asian_2020,native_hawaiian_and_other_pacific-islander_2020,hispanic_or_latino_any_race_2020,year_2020
0,Alabama,65.0,25.7,0.5,1.5,0.1,4.8,2022,69.5,27.0,...,0.0,4.1,2016,63.7,26.2,0.7,1.5,0.1,5.3,2020
1,Alaska,59.6,3.0,13.0,6.3,2.0,7.7,2022,69.1,3.6,...,1.3,6.9,2016,59.4,3.0,15.2,6.0,1.7,6.8,2020
2,Arizona,57.9,4.6,4.1,3.6,0.2,32.3,2022,80.0,3.7,...,0.2,30.8,2016,60.2,4.8,4.6,3.6,0.2,30.9,2020
3,Arkansas,69.1,14.4,0.8,1.6,0.5,8.3,2022,78.4,16.0,...,0.3,7.0,2016,70.2,15.5,0.9,1.7,0.4,8.2,2020
4,California,38.8,5.5,1.3,15.7,0.4,40.3,2022,62.0,6.2,...,0.4,39.0,2016,41.1,5.6,1.6,15.3,0.4,39.7,2020


In [21]:
combined_df.columns

Index(['state', 'white_2022', 'black_or_african_american_2022',
       'american_indian_and_alaska_native_2022', 'asian_2022',
       'native_hawaiian_and_other_pacific-islander_2022',
       'hispanic_or_latino_any_race_2022', 'year_2022', 'white_2008',
       'black_or_african_american_2008',
       'american_indian_and_alaska_native_2008', 'asian_2008',
       'native_hawaiian_and_other_pacific-islander_2008',
       'hispanic_or_latino_any_race_2008', 'year_y', 'white_y',
       'black_or_african_american_y', 'american_indian_and_alaska_native_y',
       'asian_y', 'native_hawaiian_and_other_pacific-islander_y',
       'hispanic_or_latino_any_race_y', 'year_2012', 'white_2016',
       'black_or_african_american_2016',
       'american_indian_and_alaska_native_2016', 'asian_2016',
       'native_hawaiian_and_other_pacific-islander_2016',
       'hispanic_or_latino_any_race_2016', 'year_2016', 'white_2020',
       'black_or_african_american_2020',
       'american_indian_and_alaska_n

In [22]:
# Export DataFrame as cleaned csv
combined_df.to_csv('Combined_Census_Race_Data.csv', index=False)