In [1]:
import pandas as pd
import dask.dataframe as dd

In [2]:
def parse_columns(df, cols):
    """Parses columns for a groupby-ed df"""
    cnt = df[cols]

    cnt = cnt.T
    cnt = cnt.reset_index(drop=True)
    cnt.columns = cnt.iloc[0]           # set first row to header
    cnt = cnt.drop(cnt.index[0])  # drop first row
    
    # create column for totals by sex
    if cols[1].endswith('Sex'):
        cnt.index =['Total']
    if cols[1].endswith('Male'):
        cnt.index =['Male']
    if cols[1].endswith('Female'):
        cnt.index =['Female']
    
    cnt.columns = range(len(cnt.columns))
    return cnt

In [3]:
def parse_group(df, dim):
    """parses each of the three 2nd dimensions (total, male, female)"""
    total = parse_columns(df, [dim, 'Dim: Sex (3): Member ID: [1]: Total - Sex'])
    male = parse_columns(df, [dim, 'Dim: Sex (3): Member ID: [2]: Male'])
    female = parse_columns(df, [dim, 'Dim: Sex (3): Member ID: [3]: Female'])

    unioned = pd.concat([total, male, female])
    return unioned

In [4]:
def parse_census_profile(df, dask=False):
    """parse census profile. input a pandas df"""
    
    # Get item to groupby on
    cols = df.columns
    level = [s for s in df.columns if s.startswith('DIM:')][0] 
    #count_dim = [s for s in df.columns if s.startswith('Dim:')]
    
    # Map integers to column names
    column_map = {i:v for i,v in enumerate(df.loc[df.GEO_NAME == 'Canada', level].tolist())}
    
    if dask:
        df = dd.from_pandas(df, npartitions=64)
        df = df.set_index('GEO_CODE (POR)')
        df = df.persist()
        parsed_df = df.groupby('GEO_CODE (POR)').apply(parse_group, level, meta={i:object for i in range(2247)}).compute(scheduler='processes')
    else:
        parsed_df = df.groupby('GEO_CODE (POR)').apply(parse_group, level)
        
    parsed_df = parsed_df.rename(columns=column_map)
    
    return parsed_df

## Federal Election Districts

In [5]:
df = pd.read_csv("98-401-X2016045_English_CSV_data.csv", low_memory=False)

In [18]:
%time fed = parse_census_profile(df)

CPU times: user 2min 28s, sys: 0 ns, total: 2min 28s
Wall time: 2min 28s


In [25]:
%time fed2 = parse_census_profile(df, dask=True)

CPU times: user 38.4 s, sys: 58.6 ms, total: 38.5 s
Wall time: 56.7 s


## Census Tract

In [14]:
ct = pd.read_csv('98-401-X2016043_English_CSV_data.csv', low_memory=False)

FileNotFoundError: [Errno 2] File b'98-401-X2016043_English_CSV_data.csv' does not exist: b'98-401-X2016043_English_CSV_data.csv'

In [9]:
ctp = parse_census_profile(ct, dask=True)

# dissemination area

In [5]:
import os
files_dir = 'da'
files = [f'{files_dir}/' + f for f in os.listdir(files_dir)]

In [7]:
files

['da/98-401-X2016044_ONTARIO_English_CSV_data.csv',
 'da/98-401-X2016044_QUEBEC_English_CSV_data.csv',
 'da/98-401-X2016044_ATLANTIC_English_CSV_data.csv',
 'da/98-401-X2016044_TERRITORIES_English_CSV_data.csv',
 'da/98-401-X2016044_BRITISH_COLUMBIA_English_CSV_data.csv',
 'da/98-401-X2016044_PRAIRIES_English_CSV_data.csv']

In [36]:
%time da_t = parse_census_profile(df, dask=True)

CPU times: user 29.9 s, sys: 1.39 s, total: 31.3 s
Wall time: 37.2 s


In [8]:
df = pd.read_csv(files[0], low_memory=False)

In [None]:
df[1] = pd.read_csv(files[1], low_memory=False)

In [21]:
da_t.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,"Population, 2016","Population, 2011","Population percentage change, 2011 to 2016",Total private dwellings,Private dwellings occupied by usual residents,Population density per square kilometre,Land area in square kilometres,Total - Age groups and average age of the population - 100% data,0 to 14 years,0 to 4 years,...,External migrants,Total - Mobility status 5 years ago - 25% sample data,Non-movers,Movers,Non-migrants,Migrants,Internal migrants,Intraprovincial migrants,Interprovincial migrants,External migrants
GEO_CODE (POR),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,Total,35151728,7740,99140,900,3535,37060,67335,355045,15380,9590,...,31446,10102285,35174,1775035,76419,14072080,92764,76171,3967765,88249
1,Male,...,4135,51730,500,1855,19690,34755,185775,7935,5075,...,...,...,...,1313385,...,...,...,...,...,...
1,Female,...,3605,47410,400,1675,17375,32580,169270,7445,4515,...,...,...,...,461650,...,...,...,...,...,...
59,Total,255,95,125,105,0,620,420,770,210,150,...,541915,35701,61280,1340055,88466,77108,1881970,90354,75209,31899
59,Male,135,70,60,65,0,305,275,460,130,90,...,...,...,...,...,...,...,...,...,...,...


In [37]:
da_t.shape

(1011, 2247)

In [68]:
df_test = df_test.assign(dim=df_test.groupby('GEO_CODE (POR)').cumcount())

In [69]:
df_test.head(10)

Unnamed: 0,GEO_CODE (POR),DIM: Profile of Dissemination Areas (2247),Dim: Sex (3): Member ID: [1]: Total - Sex,dim
0,1,"Population, 2016",35151728.0,0
1,1,"Population, 2011",33476688.0,1
2,1,"Population percentage change, 2011 to 2016",5.0,2
3,1,Total private dwellings,15412443.0,3
4,1,Private dwellings occupied by usual residents,14072079.0,4
5,1,Population density per square kilometre,3.9,5
6,1,Land area in square kilometres,8965588.85,6
7,1,Total - Age groups and average age of the popu...,35151730.0,7
8,1,0 to 14 years,5839570.0,8
9,1,0 to 4 years,1898790.0,9


In [74]:
def groupby_pivot(df, dim):
    return df.pivot(index='GEO_CODE (POR)', 
                    columns=dim,
                    values='Dim: Sex (3): Member ID: [1]: Total - Sex')

In [77]:
%time df_test.groupby('GEO_CODE (POR)').apply(groupby_pivot, 'dim')

CPU times: user 48.9 s, sys: 22 ms, total: 48.9 s
Wall time: 48.9 s


Unnamed: 0_level_0,dim,0,1,2,3,4,5,6,7,8,9,...,2237,2238,2239,2240,2241,2242,2243,2244,2245,2246
GEO_CODE (POR),GEO_CODE (POR),Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,35151728,33476688,5.0,15412443,14072079,3.9,8965588.85,35151730,5839570,1898790,...,372475,32568565,20134760,12433805,6755630,5678175,4296720,3467675,829050,1381460
60,60,35874,33897,5.8,17987,15215,0.1,474712.68,35875,6280,2140,...,245,32985,18040,14945,8240,6700,5515,1230,4280,1185
61,61,41786,41462,0.8,17666,14981,0.0,1143793.86,41785,8870,3125,...,250,38035,21380,16650,9000,7655,6760,1745,5010,890
62,62,35944,31906,12.7,11433,9819,0.0,1877778.53,35945,11690,4160,...,55,31430,18460,12980,8745,4230,4045,1515,2535,185
6001,6001,35874,33897,5.8,17987,15215,0.1,474712.68,35870,6280,2140,...,250,32985,18040,14950,8240,6705,5515,1230,4280,1190
6101,6101,6372,6712,-5.1,2741,2248,0.0,365094.40,6375,1540,575,...,15,5635,3245,2385,1385,1005,955,365,590,50
6102,6102,2433,2341,3.9,1111,858,0.0,220752.16,2430,555,180,...,10,2230,1240,985,465,525,490,215,275,30
6103,6103,2751,2812,-2.2,861,705,0.1,25080.97,2750,805,270,...,10,2470,1980,490,315,175,170,90,80,0
6104,6104,3160,3256,-2.9,1495,1138,0.0,194494.15,3160,610,205,...,0,2920,2115,800,470,330,325,130,195,0
6105,6105,6980,6616,5.5,3199,2692,0.0,153468.37,6980,1375,455,...,25,6405,3890,2520,1295,1220,1180,450,725,40


In [85]:
%time df_test_pivoted = df_test.pivot(index='GEO_CODE (POR)', columns='dim', values='Dim: Sex (3): Member ID: [1]: Total - Sex')

CPU times: user 315 ms, sys: 0 ns, total: 315 ms
Wall time: 326 ms


In [86]:
df_test

Unnamed: 0,GEO_CODE (POR),DIM: Profile of Dissemination Areas (2247),Dim: Sex (3): Member ID: [1]: Total - Sex,dim
0,1,"Population, 2016",35151728,0
1,1,"Population, 2011",33476688,1
2,1,"Population percentage change, 2011 to 2016",5.0,2
3,1,Total private dwellings,15412443,3
4,1,Private dwellings occupied by usual residents,14072079,4
5,1,Population density per square kilometre,3.9,5
6,1,Land area in square kilometres,8965588.85,6
7,1,Total - Age groups and average age of the popu...,35151730,7
8,1,0 to 14 years,5839570,8
9,1,0 to 4 years,1898790,9


### using pivot...

In [1]:
import pandas as pd
import re
import os

In [13]:
files_dir = 'dissemination_area'
files = [f'{files_dir}/' + f for f in os.listdir(files_dir) if f.startswith('98-401-X2016044')]

In [14]:
file_re = re.compile("98-401-X2016044_(.+)_English_CSV_data\.csv")

In [20]:
files

['dissemination_area/98-401-X2016044_ONTARIO_English_CSV_data.csv',
 'dissemination_area/98-401-X2016044_QUEBEC_English_CSV_data.csv',
 'dissemination_area/98-401-X2016044_ATLANTIC_English_CSV_data.csv',
 'dissemination_area/98-401-X2016044_TERRITORIES_English_CSV_data.csv',
 'dissemination_area/98-401-X2016044_BRITISH_COLUMBIA_English_CSV_data.csv',
 'dissemination_area/98-401-X2016044_PRAIRIES_English_CSV_data.csv']

In [None]:
colmap = None

for file in files:
    df = pd.read_csv(file, low_memory=False)
    region = file_re.search(file).groups()[0]
    print(file, region)
    
    # Replace dimension headers as INTs
    df = df.assign(dim=df.groupby('GEO_CODE (POR)').cumcount())
    if colmap is None:
        colmap = dict(df.loc[df['GEO_CODE (POR)'] == 1, ['DIM: Profile of Dissemination Areas (2247)', 'dim']].to_records(index=False))

    # Pivot each gender category then concat them
    df_pivoted_total = df.pivot(index='GEO_CODE (POR)', 
                                columns='dim', 
                                values='Dim: Sex (3): Member ID: [1]: Total - Sex')
    df_pivoted_total = df_pivoted_total.assign(sex='total')

    df_pivoted_male = df.pivot(index='GEO_CODE (POR)', 
                                columns='dim', 
                                values='Dim: Sex (3): Member ID: [2]: Male')
    df_pivoted_male = df_pivoted_male.assign(sex='male')

    df_pivoted_female = df.pivot(index='GEO_CODE (POR)', 
                                columns='dim', 
                                values='Dim: Sex (3): Member ID: [3]: Female')
    df_pivoted_female = df_pivoted_female.assign(sex='female')

    df_pivoted = pd.concat([df_pivoted_total, df_pivoted_male, df_pivoted_female])
    df_pivoted = df_pivoted.assign(region=region)
    df_pivoted = df_pivoted.reset_index()
    df_pivoted.to_csv(f"{files_dir}/parsed_{region}.csv", index=False)
    
    del df, df_pivoted

dissemination_area/98-401-X2016044_ONTARIO_English_CSV_data.csv ONTARIO
dissemination_area/98-401-X2016044_ATLANTIC_English_CSV_data.csv ATLANTIC
dissemination_area/98-401-X2016044_TERRITORIES_English_CSV_data.csv TERRITORIES
dissemination_area/98-401-X2016044_BRITISH_COLUMBIA_English_CSV_data.csv BRITISH_COLUMBIA
dissemination_area/98-401-X2016044_PRAIRIES_English_CSV_data.csv PRAIRIES


In [25]:
# Merge all parsed files into one dataframe
all_df = pd.concat([pd.read_csv(f"{files_dir}/{f}", low_memory=False) for f in os.listdir(files_dir) if f.startswith('parsed')])

In [26]:
all_df.head(3)

Unnamed: 0,GEO_CODE (POR),0,1,2,3,4,5,6,7,8,...,2239,2240,2241,2242,2243,2244,2245,2246,sex,region
0,1,35151728,33476688,5.0,15412443,14072079,3.9,8965588.85,35151730,5839570,...,20134760,12433805,6755630,5678175,4296720,3467675,829050,1381460,total,QUEBEC
1,24,8164361,7903001,3.3,3858943,3531663,6.0,1356625.27,8164365,1333255,...,4825870,2695090,1507670,1187425,949170,893800,55370,238250,total,QUEBEC
2,2401,12475,12781,-2.4,6477,5534,59.3,210.3,12475,1405,...,9515,2275,1510,765,750,710,35,20,total,QUEBEC


In [47]:
cols = df.loc[df['GEO_CODE (POR)'] == 1, ['Member ID: Profile of Dissemination Areas (2247)', 'DIM: Profile of Dissemination Areas (2247)']]
cols.columns = ['dim', 'value']
cols.to_csv('column_lookup.csv', index=False)

In [49]:
cols.head(10)

Unnamed: 0,dim,value
0,1,"Population, 2016"
1,2,"Population, 2011"
2,3,"Population percentage change, 2011 to 2016"
3,4,Total private dwellings
4,5,Private dwellings occupied by usual residents
5,6,Population density per square kilometre
6,7,Land area in square kilometres
7,8,Total - Age groups and average age of the popu...
8,9,0 to 14 years
9,10,0 to 4 years


In [51]:
all_df.to_csv('census2016.csv', index=False)