In [1]:
import pandas as pd
import dask.dataframe as dd

In [2]:
def parse_columns(df, cols):
    """Parses columns for a groupby-ed df
    df: df for $group
    group: grouping
    """
    cnt = df[cols]

    cnt = cnt.T
    cnt = cnt.reset_index(drop=True)
    cnt.columns = cnt.iloc[0]           # set first row to header
    cnt = cnt.drop(cnt.index[0])  # drop first row
    cnt.columns = range(len(cnt.columns))
    return cnt

In [3]:
def parse_group(df, dim):
    total = parse_columns([dim, 'Dim: Sex (3): Member ID: [1]: Total - Sex'], df)
    male = parse_columns([dim, 'Dim: Sex (3): Member ID: [2]: Male'], df)
    female = parse_columns([dim, 'Dim: Sex (3): Member ID: [3]: Female'], df)

    unioned = pd.concat([total, male, female])
    return unioned

# dissemination area

In [4]:
df = pd.read_csv('98-401-X2016044_ATLANTIC_English_CSV_data.csv', low_memory=False)

In [14]:
df.GEO_NAME.nunique()

5313

In [5]:
column_map = {i:v for i,v in enumerate(df.loc[df.GEO_NAME == 'Canada', 'DIM: Profile of Dissemination Areas (2247)'].tolist())}

In [6]:
ddf = dd.from_pandas(df, npartitions=8)

In [9]:
sddf = ddf.groupby(['GEO_CODE (POR)', 'GEO_NAME']).apply(parse_columns, 
                                                         ['DIM: Profile of Dissemination Areas (2247)', 'Dim: Sex (3): Member ID: [1]: Total - Sex'],
                                                         meta={i:object for i in range(2247)})

In [10]:
total = sddf.compute()

In [15]:
total.shape

(5385, 2247)

# FED

In [51]:
df = pd.read_csv("98-401-X2016045_English_CSV_data.csv", low_memory=False)

In [86]:
df.head(1)

Unnamed: 0,CENSUS_YEAR,GEO_CODE (POR),GEO_LEVEL,GEO_NAME,GNR,GNR_LF,DATA_QUALITY_FLAG,ALT_GEO_CODE,DIM: Profile of Federal Electoral Districts (2013 Representation Order) (2247),Member ID: Profile of Federal Electoral Districts (2013 Representation Order) (2247),Notes: Profile of Federal Electoral Districts (2013 Representation Order) (2247),Dim: Sex (3): Member ID: [1]: Total - Sex,Dim: Sex (3): Member ID: [2]: Male,Dim: Sex (3): Member ID: [3]: Female
0,2016,1,0,Canada,4.0,5.1,20000,1,"Population, 2016",1,1.0,35151728,...,...


In [87]:
for _, g in df.groupby(['GEO_CODE (POR)', 'GEO_NAME']):
    break

In [134]:
ddf = dd.from_pandas(df, npartitions=8)

In [64]:
statscan_df = df.groupby(['GEO_CODE (POR)', 'GEO_NAME']).apply(parse_group, 'DIM: Profile of Federal Electoral Districts (2013 Representation Order) (2247)')

In [11]:
statscan_df = statscan_df.reset_index()
statscan_df = statscan_df.drop(columns='level_2')
statscan_df.head(5)

Unnamed: 0,GEO_CODE (POR),GEO_NAME,"Population, 2016","Population, 2011","Population percentage change, 2011 to 2016",Total private dwellings,Private dwellings occupied by usual residents,Population density per square kilometre,Land area in square kilometres,Total - Age groups and average age of the population - 100% data,...,Total - Mobility status 5 years ago - 25% sample data,Non-movers,Movers,Non-migrants,Migrants,Internal migrants,Intraprovincial migrants,Interprovincial migrants,External migrants,sex
0,1,Canada,35151728,33476688,5.0,15412443,14072079,3.9,8965588.85,35151730,...,32568560,20134755,12433810,6755630,5678175,4296715,3467670,829050,1381460,total
1,1,Canada,...,...,...,...,...,...,...,17264195,...,16004320,9902640,6101675,3314730,2786950,2104325,1689950,414375,682625,total
2,1,Canada,...,...,...,...,...,...,...,17887530,...,16564245,10232115,6332130,3440905,2891225,2192395,1777720,414675,698835,total
3,10,Newfoundland and Labrador,519716,514536,1.0,265739,218673,1.4,370514.08,519715,...,489800,347170,142625,69955,72670,66305,42755,23545,6370,total
4,10,Newfoundland and Labrador,...,...,...,...,...,...,...,253925,...,239505,169930,69575,33920,35660,32245,20410,11840,3410,total


In [12]:
fed_df = statscan_df[statscan_df['GEO_CODE (POR)'] > 1000]

In [13]:
fed_df.GEO_NAME.nunique()

338

In [44]:
fed_df.shape

(1014, 2250)

In [62]:
fed_df.head(1)

Unnamed: 0,GEO_CODE (POR),GEO_NAME,"Population, 2016","Population, 2011","Population percentage change, 2011 to 2016",Total private dwellings,Private dwellings occupied by usual residents,Population density per square kilometre,Land area in square kilometres,Total - Age groups and average age of the population - 100% data,...,Total - Mobility status 5 years ago - 25% sample data,Non-movers,Movers,Non-migrants,Migrants,Internal migrants,Intraprovincial migrants,Interprovincial migrants,External migrants,sex
42,10001,Avalon,86494,81590,6.0,44009,34766,13.4,6457.79,86495,...,81115,58485,22630,9550,13075,12770,9300,3480,305,total


In [65]:
statscan_df.dtypes

0
Population, 2016                                 object
Population, 2011                                 object
Population percentage change, 2011 to 2016       object
Total private dwellings                          object
Private dwellings occupied by usual residents    object
                                                  ...  
Internal migrants                                object
Intraprovincial migrants                         object
Interprovincial migrants                         object
External migrants                                object
sex                                              object
Length: 2248, dtype: object

In [61]:
variables = df[df.GEO_NAME == 'Canada']['DIM: Profile of Federal Electoral Districts (2013 Representation Order) (2247)'].tolist()
len(variables)

2247

In [143]:
df.head(1)

Unnamed: 0,CENSUS_YEAR,GEO_CODE (POR),GEO_LEVEL,GEO_NAME,GNR,GNR_LF,DATA_QUALITY_FLAG,ALT_GEO_CODE,DIM: Profile of Federal Electoral Districts (2013 Representation Order) (2247),Member ID: Profile of Federal Electoral Districts (2013 Representation Order) (2247),Notes: Profile of Federal Electoral Districts (2013 Representation Order) (2247),Dim: Sex (3): Member ID: [1]: Total - Sex,Dim: Sex (3): Member ID: [2]: Male,Dim: Sex (3): Member ID: [3]: Female
0,2016,1,0,Canada,4.0,5.1,20000,1,"Population, 2016",1,1.0,35151728,...,...


In [15]:
ddf = dd.from_pandas(df, npartitions=8)

In [142]:
statscan_df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,"Population, 2016","Population, 2011","Population percentage change, 2011 to 2016",Total private dwellings,Private dwellings occupied by usual residents,Population density per square kilometre,Land area in square kilometres,Total - Age groups and average age of the population - 100% data,0 to 14 years,0 to 4 years,...,Total - Mobility status 5 years ago - 25% sample data,Non-movers,Movers,Non-migrants,Migrants,Internal migrants,Intraprovincial migrants,Interprovincial migrants,External migrants,sex
GEO_CODE (POR),GEO_NAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,Canada,1,35151728,33476688,5.0,15412443,14072079,3.9,8965588.85,35151730,5839565,1898790,...,32568560,20134755,12433810,6755630,5678175,4296715,3467670,829050,1381460,total


In [144]:
sddf = ddf.groupby(['GEO_CODE (POR)', 'GEO_NAME']).apply(parse_columns, 
                                                         ['DIM: Profile of Federal Electoral Districts (2013 Representation Order) (2247)', 'Dim: Sex (3): Member ID: [1]: Total - Sex'],
                                                         meta={i:object for i in range(2247)})

In [145]:
sddf.compute()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,3,4,5,6,7,8,9,...,2237,2238,2239,2240,2241,2242,2243,2244,2245,2246
GEO_CODE (POR),GEO_NAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,Canada,1,35151728,33476688,5.0,15412443,14072079,3.9,8965588.85,35151730,5839565,1898790,...,372475,32568560,20134755,12433810,6755630,5678175,4296715,3467670,829050,1381460
12,Nova Scotia,1,923598,921727,0.2,458568,401990,17.4,52942.27,923600,133830,42005,...,6470,866425,582710,283720,169570,114150,94880,50325,44555,19270
46,Manitoba,1,1278365,1208268,5.8,539748,489050,2.3,552370.99,1278365,243820,81065,...,14705,1161235,718640,442600,261015,181585,117140,86295,30840,64445
48,Alberta,1,4067175,3645257,11.6,1654129,1527678,6.4,640330.46,4067175,779155,266520,...,46470,3714635,2052580,1662055,899165,762890,559860,333770,226090,203025
11002,Charlottetown,1,36094,34562,4.4,17193,16100,814.1,44.34,36095,5010,1585,...,900,33180,17355,15830,8570,7260,4640,2340,2300,2620
12002,Central Nova,1,71962,74597,-3.5,38651,31579,7.7,9307.83,71960,10345,3165,...,245,67470,49385,18095,9265,8830,8220,5475,2735,610
12003,Cumberland--Colchester,1,80590,82321,-2.1,43823,35843,10.2,7905.99,80590,11620,3480,...,250,75510,51410,24100,13095,11010,10270,6985,3280,740
13001,Acadie--Bathurst,1,77791,79340,-2.0,38754,34544,15.4,5063.17,77790,8910,2660,...,110,73145,56920,16225,9160,7065,6815,5055,1755,255
13004,Fundy Royal,1,79943,79386,0.7,34819,31562,11.1,7230.86,79940,13740,4050,...,225,75095,55475,19620,9090,10530,9935,7685,2255,595
13008,New Brunswick Southwest,1,65287,66197,-1.4,32907,27100,6.6,9911.43,65280,10310,3210,...,180,60905,44685,16215,7310,8910,8415,5925,2495,490


### Census Tract

In [None]:
ct = pd.read_csv('98-401-X2016043_English_CSV_data.csv')