## Create 2022Age Census Demo CSV

In [4]:
import pandas as pd

In [5]:
# For comma-separated values
age_2022_df = pd.read_csv('../censusoutputcsv/2022_census_age.csv', sep=',')

In [6]:
# Display dataframe head
age_2022_df.head()

Unnamed: 0,state,18_and_over_population_male,18_and_over_population_female,20_to_24_years\t,25_to_34_years,35_to_44_years,45_to_54_years,55_to_59_years,60_to_64_years\t,65_to_74_years\t,75_to_84_years\t,85_years_and_over\t,median_age(years)\t,state_id,congressional_district,year
0,Alabama,47.6,52.4,5.8,12.8,12.0,12.5,6.0,7.2,11.5,5.9,2.0,40.9,1,1,2022
1,Alabama,47.5,52.5,6.1,13.8,12.4,11.9,6.5,6.3,10.6,5.9,1.6,39.1,1,2,2022
2,Alabama,47.7,52.3,8.0,12.4,12.2,12.1,6.5,6.7,11.2,5.6,1.5,39.4,1,3,2022
3,Alabama,48.3,51.7,6.3,11.8,12.5,12.3,6.7,6.5,11.2,6.0,1.7,40.8,1,4,2022
4,Alabama,48.7,51.3,6.4,13.1,12.8,12.5,6.5,7.5,10.1,5.6,1.6,40.2,1,5,2022


In [4]:
# Remove unnecessary spaces
age_2022_df.columns = age_2022_df.columns.str.strip()

In [5]:
# Check dataframe columns
age_2022_df.columns

Index(['state', '18_and_over_population_male', '18_and_over_population_female',
       '20_to_24_years', '25_to_34_years', '35_to_44_years', '45_to_54_years',
       '55_to_59_years', '60_to_64_years', '65_to_74_years', '75_to_84_years',
       '85_years_and_over', 'median_age(years)', 'state_id',
       'congressional_district', 'year'],
      dtype='object')

In [6]:
# Check dataframe types
age_2022_df.dtypes

state                             object
18_and_over_population_male      float64
18_and_over_population_female    float64
20_to_24_years                   float64
25_to_34_years                   float64
35_to_44_years                   float64
45_to_54_years                   float64
55_to_59_years                   float64
60_to_64_years                   float64
65_to_74_years                   float64
75_to_84_years                   float64
85_years_and_over                float64
median_age(years)                float64
state_id                           int64
congressional_district             int64
year                               int64
dtype: object

In [7]:
# Drop 'state_id' and 'congressional_district' columns
age_2022_df = age_2022_df.drop(columns=['state_id', 'congressional_district'])

# Display the updated DataFrame
age_2022_df

Unnamed: 0,state,18_and_over_population_male,18_and_over_population_female,20_to_24_years,25_to_34_years,35_to_44_years,45_to_54_years,55_to_59_years,60_to_64_years,65_to_74_years,75_to_84_years,85_years_and_over,median_age(years),year
0,Alabama,47.6,52.4,5.8,12.8,12.0,12.5,6.0,7.2,11.5,5.9,2.0,40.9,2022
1,Alabama,47.5,52.5,6.1,13.8,12.4,11.9,6.5,6.3,10.6,5.9,1.6,39.1,2022
2,Alabama,47.7,52.3,8.0,12.4,12.2,12.1,6.5,6.7,11.2,5.6,1.5,39.4,2022
3,Alabama,48.3,51.7,6.3,11.8,12.5,12.3,6.7,6.5,11.2,6.0,1.7,40.8,2022
4,Alabama,48.7,51.3,6.4,13.1,12.8,12.5,6.5,7.5,10.1,5.6,1.6,40.2,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
432,Wisconsin,50.4,49.6,6.3,11.6,12.3,12.0,6.9,7.9,12.0,6.2,2.3,42.7,2022
433,Wisconsin,50.7,49.3,4.9,10.5,11.8,12.0,7.5,8.4,13.6,6.8,2.2,45.4,2022
434,Wisconsin,50.0,50.0,5.7,12.2,12.6,12.1,7.4,7.2,11.4,5.4,2.0,41.3,2022
435,Wyoming,51.0,49.0,6.3,12.5,14.0,11.1,6.0,6.6,11.8,5.4,1.4,39.1,2022


In [8]:
# Check for null values in the entire DataFrame
null_values = age_2022_df.isnull().sum()

# Display columns with null values
print(null_values[null_values > 0])

Series([], dtype: int64)


In [9]:
# Group by 'State' and calculate the median of 'Median_Age' to reduce rows 
st_medians_df = age_2022_df.groupby('state')['median_age(years)'].median().reset_index()

st_medians_df['median_age(years)'] = st_medians_df['median_age(years)'].astype(int)

# Rename the columns if desired
st_medians_df.columns = ['state', 'median_age(years)']

# Display the results
print(st_medians_df)

                    state  median_age(years)
0                 Alabama                 39
1                  Alaska                 35
2                 Arizona                 41
3                Arkansas                 39
4              California                 38
5                Colorado                 37
6             Connecticut                 40
7                Delaware                 41
8    District of Columbia                 34
9                 Florida                 42
10                Georgia                 37
11                 Hawaii                 40
12                  Idaho                 37
13               Illinois                 40
14                Indiana                 38
15                   Iowa                 39
16                 Kansas                 37
17               Kentucky                 39
18              Louisiana                 38
19                  Maine                 45
20               Maryland                 39
21        

In [10]:
# List of columns with percentage data to convert to fractions for averaging
columns_to_convert = [
    '18_and_over_population_male',
    '18_and_over_population_female',
    '20_to_24_years',
    '25_to_34_years',
    '35_to_44_years',
    '45_to_54_years',
    '55_to_59_years',
    '60_to_64_years',
    '65_to_74_years',
    '75_to_84_years',
    '85_years_and_over',
]

# Convert the columns to fractions by dividing by 100
age_2022_df[columns_to_convert] = age_2022_df[columns_to_convert] / 100

# Group by 'state' and calculate the average of all numerical columns
state_averages2022_df = age_2022_df.groupby('state').mean().reset_index()

# Convert the specific columns back to plain numbers with one or two decimal places
state_averages2022_df[columns_to_convert] = state_averages2022_df[columns_to_convert] * 100

# Format the specific columns to show as plain numbers with two decimal places
for col in columns_to_convert:
    state_averages2022_df[col] = state_averages2022_df[col].map(lambda x: round(x, 2))

# Display the results
state_averages2022_df


Unnamed: 0,state,18_and_over_population_male,18_and_over_population_female,20_to_24_years,25_to_34_years,35_to_44_years,45_to_54_years,55_to_59_years,60_to_64_years,65_to_74_years,75_to_84_years,85_years_and_over,median_age(years),year
0,Alabama,47.6,52.4,6.8,12.86,12.47,12.17,6.27,6.73,10.77,5.57,1.69,39.528571,2022.0
1,Alaska,53.2,46.8,7.1,15.5,14.3,11.2,5.9,5.9,9.4,3.7,0.8,35.9,2022.0
2,Arizona,49.64,50.36,7.11,13.86,12.58,11.61,5.66,6.2,10.63,6.42,1.78,39.1,2022.0
3,Arkansas,48.75,51.25,6.9,12.62,12.7,11.95,6.0,6.6,10.38,5.58,1.78,39.075,2022.0
4,California,49.39,50.61,6.75,14.74,13.83,12.51,6.04,5.98,9.14,4.82,1.83,37.959615,2022.0
5,Colorado,50.54,49.46,6.78,15.71,14.72,12.06,5.61,6.16,9.61,4.58,1.45,37.95,2022.0
6,Connecticut,48.16,51.84,6.7,12.42,12.72,12.56,7.08,7.12,10.54,5.5,2.22,40.9,2022.0
7,Delaware,47.7,52.3,5.9,12.8,12.3,11.1,6.4,7.7,12.5,6.5,1.9,41.5,2022.0
8,District of Columbia,46.5,53.5,7.2,21.6,16.6,10.7,4.6,4.9,7.5,4.1,1.4,34.9,2022.0
9,Florida,48.38,51.62,6.03,12.54,12.6,12.3,6.62,6.83,11.67,7.33,2.49,43.2,2022.0


In [11]:
# Convert the 'year' column to integer
state_averages2022_df['year'] = state_averages2022_df['year'].astype(int)
state_averages2022_df['median_age(years)'] = state_averages2022_df['median_age(years)'].astype(int)

state_averages2022_df.head()

Unnamed: 0,state,18_and_over_population_male,18_and_over_population_female,20_to_24_years,25_to_34_years,35_to_44_years,45_to_54_years,55_to_59_years,60_to_64_years,65_to_74_years,75_to_84_years,85_years_and_over,median_age(years),year
0,Alabama,47.6,52.4,6.8,12.86,12.47,12.17,6.27,6.73,10.77,5.57,1.69,39,2022
1,Alaska,53.2,46.8,7.1,15.5,14.3,11.2,5.9,5.9,9.4,3.7,0.8,35,2022
2,Arizona,49.64,50.36,7.11,13.86,12.58,11.61,5.66,6.2,10.63,6.42,1.78,39,2022
3,Arkansas,48.75,51.25,6.9,12.62,12.7,11.95,6.0,6.6,10.38,5.58,1.78,39,2022
4,California,49.39,50.61,6.75,14.74,13.83,12.51,6.04,5.98,9.14,4.82,1.83,37,2022


In [12]:
# Export DataFrame as cleaned csv
state_averages2022_df.to_csv('2022_Census_Age_et.csv', index=False)