In [67]:
import pandas as pd
import numpy as np

df_raw = pd.read_csv("census2021.csv", encoding='latin-1')

# Clean column names
df_raw.columns = df_raw.columns.str.strip()

In [68]:
vars_needed = [
    'Population, 2021',
    'Median age of the population',
    'Median total income of household in 2020 ($)',
    "Bachelor's degree or higher"
]

df_filtered = df_raw[df_raw['CHARACTERISTIC_NAME'].isin(vars_needed)].copy()

In [69]:
# Remove commas and special symbols
df_filtered['C1_COUNT_TOTAL'] = (
    df_filtered['C1_COUNT_TOTAL']
    .replace({',': '', '…': np.nan}, regex=True)
)

df_filtered['C10_RATE_TOTAL'] = (
    df_filtered['C10_RATE_TOTAL']
    .replace({',': '', '…': np.nan}, regex=True)
)

df_filtered['C1_COUNT_TOTAL'] = pd.to_numeric(df_filtered['C1_COUNT_TOTAL'], errors='coerce')
df_filtered['C10_RATE_TOTAL'] = pd.to_numeric(df_filtered['C10_RATE_TOTAL'], errors='coerce')

In [70]:
df_filtered['clean_value'] = df_filtered['C1_COUNT_TOTAL']

# If count is missing (like bachelor's %), use rate instead
df_filtered.loc[
    df_filtered['clean_value'].isna(),
    'clean_value'
] = df_filtered['C10_RATE_TOTAL']

In [71]:
df_wide = df_filtered.pivot(
    index='GEO_NAME',
    columns='CHARACTERISTIC_NAME',
    values='clean_value'
).reset_index()

In [72]:
### Calculating Pop Density

In [73]:
import geopandas as gpd

gdf = gpd.read_file("lfsa000b21a_e.shp")

In [74]:
print(gdf.columns)

Index(['CFSAUID', 'DGUID', 'PRUID', 'PRNAME', 'LANDAREA', 'geometry'], dtype='str')


In [75]:
gdf = gdf.merge(
    df_wide,
    left_on='CFSAUID',
    right_on='GEO_NAME',
    how='left'
)

In [76]:
gdf = gdf.to_crs(epsg=3347)   # Statistics Canada Lambert

In [77]:
gdf['population_density'] = gdf['Population, 2021'] / gdf['LANDAREA']

In [78]:
gdf['population_density'].describe()

count     1643.000000
mean      1653.648504
std       2862.300774
min          0.000000
25%         30.675269
50%        437.908614
75%       2206.407704
max      35036.658306
Name: population_density, dtype: float64

In [79]:
df_model = gdf.drop(columns='geometry')

In [80]:
df_model.drop(columns=['DGUID','PRUID','CFSAUID'], inplace=True)

In [81]:
ontario_fsa = df_model[df_model['GEO_NAME'].str[0].isin(['K', 'L', 'M', 'N', 'P'])]

In [82]:
ontario_fsa.reset_index(drop=True, inplace=True)

In [83]:
ontario_fsa.rename(columns={'GEO_NAME': 'FSA'}, inplace=True)

In [86]:
ontario_fsa.to_csv('ontario_fsa.csv', index=False)

In [85]:
ontario_fsa.head()

Unnamed: 0,PRNAME,LANDAREA,FSA,Median age of the population,"Population, 2021",population_density
0,Ontario,3072.8212,K0A,44.0,111626.0,36.326878
1,Ontario,1223.8974,K0B,50.4,21020.0,17.174642
2,Ontario,2470.942,K0C,48.4,52838.0,21.383748
3,Ontario,1979.2436,K0E,49.6,39649.0,20.0324
4,Ontario,2675.2361,K0G,50.4,39862.0,14.900367
