# Parsing Stats Can data

In [1]:
import pandas as pd
import numpy as np
import re
import os

In [443]:
# Notebook-wide pandas settings:
#  - show up to 100 characters per text column
#  - turn off the chained-assignment warning
pd.set_option('display.max_colwidth', 100)
pd.set_option('mode.chained_assignment', None)

In [2]:
# Paths of every entry in the data directory whose name starts with the
# 98-401-X2016044 table prefix (order is whatever os.listdir returns).
files_dir = 'dissemination_area'
files = [
    f'{files_dir}/' + entry
    for entry in os.listdir(files_dir)
    if entry.startswith('98-401-X2016044')
]

In [3]:
# Regex to extract the region from a file name (capture group 1).
# Raw string: "\." inside a normal string literal is an invalid escape sequence.
file_re = re.compile(r"98-401-X2016044_(.+)_English_CSV_data\.csv")

In [4]:
# Show the collected file paths
files

['dissemination_area/98-401-X2016044_ONTARIO_English_CSV_data.csv',
 'dissemination_area/98-401-X2016044_QUEBEC_English_CSV_data.csv',
 'dissemination_area/98-401-X2016044_ATLANTIC_English_CSV_data.csv',
 'dissemination_area/98-401-X2016044_TERRITORIES_English_CSV_data.csv',
 'dissemination_area/98-401-X2016044_BRITISH_COLUMBIA_English_CSV_data.csv',
 'dissemination_area/98-401-X2016044_PRAIRIES_English_CSV_data.csv']

### Explore

In [519]:
# Load the first listed file. low_memory=False has pandas parse the file in one
# pass rather than in chunks, which avoids mixed-type column inference.
df = pd.read_csv(files[0], low_memory=False)

In [501]:
# Number of rows per (geo code, geo level, geo name) combination
da_size = (
    df
    .groupby(['GEO_CODE (POR)', 'GEO_LEVEL', 'GEO_NAME'])
    .size()
    .reset_index()
)

In [502]:
# How many distinct geographies exist at each GEO_LEVEL
da_size.groupby('GEO_LEVEL').size()

GEO_LEVEL
0       1
1       4
2      47
3     853
4    4480
dtype: int64

In [503]:
# Get length of 'GEO_CODE (POR)' for each GEO_LEVEL
def get_unique_length(g):
    """Return the distinct lengths of the values in `g` once rendered as strings."""
    code_lengths = g.map(lambda code: len(str(code)))
    return code_lengths.unique()

# One array of distinct code lengths per GEO_LEVEL
da_size.groupby('GEO_LEVEL')['GEO_CODE (POR)'].apply(get_unique_length)

GEO_LEVEL
0    [1]
1    [2]
2    [4]
3    [7]
4    [8]
Name: GEO_CODE (POR), dtype: object

### Clean input data

In [520]:
# Only keep dissemination-area rows (GEO_LEVEL == 4) and the relevant columns.
# .copy() makes the result an independent frame, so the in-place column edits
# in later cells do not rely on silencing pandas' chained-assignment warning.
df = df.loc[df.GEO_LEVEL == 4,
            ['GEO_CODE (POR)',
             'DIM: Profile of Dissemination Areas (2247)',
             'Member ID: Profile of Dissemination Areas (2247)',
             'Dim: Sex (3): Member ID: [1]: Total - Sex',
             'Dim: Sex (3): Member ID: [2]: Male',
             'Dim: Sex (3): Member ID: [3]: Female'
            ]].copy()

In [521]:
# Positional rename: six short names for the six columns kept so far
df.columns = ['da', 'feature_desc', 'feature_num', 'total', 'male', 'female']

# Coerce the three count columns to numbers; unparseable entries become NaN
df[['total', 'male', 'female']] = (
    df[['total', 'male', 'female']].apply(pd.to_numeric, errors='coerce')
)

df.head(10)

Unnamed: 0,da,feature_desc,feature_num,total,male,female
8988,35010159,"Population, 2016",1,517.0,,
8989,35010159,"Population, 2011",2,,,
8990,35010159,"Population percentage change, 2011 to 2016",3,,,
8991,35010159,Total private dwellings,4,227.0,,
8992,35010159,Private dwellings occupied by usual residents,5,213.0,,
8993,35010159,Population density per square kilometre,6,14.4,,
8994,35010159,Land area in square kilometres,7,36.0,,
8995,35010159,Total - Age groups and average age of the population - 100% data,8,515.0,275.0,240.0
8996,35010159,0 to 14 years,9,70.0,45.0,25.0
8997,35010159,0 to 4 years,10,20.0,15.0,10.0


# PARSE

In [148]:
### Extract relevant rows

In [446]:
def parse_features(g,
                   feature_nums,
                   name,
                   alt_feature_names=None,
                   agg=False):
    """Select features from `g` and reshape them to one row per feature and sex.

    Parameters
    ----------
    g : DataFrame
        Must contain the columns 'da', 'feature_desc' and 'feature_num'; every
        other column is treated as a value column and becomes a level of 'sex'.
    feature_nums : sequence
        Values of 'feature_num' to keep.
    name : str
        Label written to the 'feature' column of every returned row.
    alt_feature_names : sequence of str, optional
        Replacement descriptions, one per entry of `feature_nums` (same order).
    agg : bool, default False
        If True, add a 'pct' column: each count divided by the sum of the counts
        of the selected features for the same sex.

    Returns
    -------
    DataFrame
        Columns 'da', 'feature_desc', 'feature_num', 'sex', 'count', 'feature'
        (plus 'pct' when `agg` is True). Rows containing any missing value are
        dropped.

    Raises
    ------
    ValueError
        If `alt_feature_names` and `feature_nums` differ in length.
    """
    # Explicit copy: the relabelling below must never write into the caller's frame.
    df = g[g.feature_num.isin(feature_nums)].copy()

    if alt_feature_names is not None and len(alt_feature_names) > 0:
        if len(alt_feature_names) != len(feature_nums):
            raise ValueError(
                'alt_feature_names must have one entry per feature number '
                f'({len(alt_feature_names)} names for {len(feature_nums)} features)'
            )
        # Relabel by feature number rather than by row position, so the result
        # does not depend on row order or on every feature being present.
        relabel = dict(zip(feature_nums, alt_feature_names))
        df['feature_desc'] = df['feature_num'].map(relabel)

    df = pd.melt(df, id_vars=['da', 'feature_desc', 'feature_num'], var_name='sex', value_name='count')
    df = df.dropna().assign(feature=name)

    if agg:
        df = df.assign(pct=df['count'] / df.groupby('sex')['count'].transform('sum'))
    return df

In [435]:
# Total population, density
# NOTE(review): `g` (the frame passed as parse_features' first argument) is not
# defined in any visible cell; it must already exist in the kernel.
parse_features(g, [1, 6, 7, 58, 73], 'overall stats')

Unnamed: 0,da,feature_desc,feature_num,sex,count,feature
0,46010069,"Population, 2016",1,total,398.0,overall stats
1,46010069,Population density per square kilometre,6,total,19.7,overall stats
2,46010069,Land area in square kilometres,7,total,20.16,overall stats
3,46010069,Average household size,58,total,1.7,overall stats
4,46010069,Average size of census families,73,total,2.1,overall stats


In [384]:
# Age
# NOTE(review): `g` is not defined in any visible cell; it must already exist in the kernel.
parse_features(g, [39, 40], 'age', alt_feature_names=['Average', 'Median'])

Unnamed: 0,da,feature_desc,feature_num,sex,count,feature
0,46010069,Average,39,total,61.7,age
1,46010069,Median,40,total,66.2,age
2,46010069,Average,39,male,59.6,age
3,46010069,Median,40,male,64.8,age
4,46010069,Average,39,female,64.0,age
5,46010069,Median,40,female,66.8,age


# Household

In [432]:
# private dwelling
# NOTE(review): `g` is not defined in any visible cell; it must already exist in the kernel.
parse_features(g, [41, 42, 43, 44, 50, 57], 'dwelling')

Unnamed: 0,da,feature_desc,feature_num,sex,count,feature
0,46010069,Total - Occupied private dwellings by structur...,41,total,235.0,dwelling
1,46010069,Single-detached house,42,total,230.0,dwelling
2,46010069,Apartment in a building that has five or more ...,43,total,0.0,dwelling
3,46010069,Other attached dwelling,44,total,0.0,dwelling
4,46010069,Movable dwelling,50,total,0.0,dwelling
5,46010069,Number of persons in private households,57,total,400.0,dwelling


In [434]:
# private households by household size (1 to 5+ persons), with shares
# NOTE(review): `g` is not defined in any visible cell; it must already exist in the kernel.
parse_features(g,
               [52, 53, 54, 55, 56],
               'household dwelling',
               alt_feature_names=['1 person', '2 persons', '3 persons', '4 persons', '5+ persons'],
               agg=True)

Unnamed: 0,da,feature_desc,feature_num,sex,count,feature,pct
0,46010069,1 person,52,total,100.0,household dwelling,0.434783
1,46010069,2 persons,53,total,115.0,household dwelling,0.5
2,46010069,3 persons,54,total,5.0,household dwelling,0.021739
3,46010069,4 persons,55,total,5.0,household dwelling,0.021739
4,46010069,5+ persons,56,total,5.0,household dwelling,0.021739


# Census family

In [445]:
# Census-family rows of `g`, selected by feature number
g.loc[g['feature_num'].isin([68, 76, 77, 79, 80, 82, 83, 91])]

Unnamed: 0,da,feature_desc,feature_num,total,male,female
107923,46010069,Total - Census families in private households by family size - 100% data,68,130.0,,
107931,46010069,Married couples,76,115.0,,
107932,46010069,Common-law couples,77,10.0,,
107934,46010069,Female parent,79,5.0,,
107935,46010069,Male parent,80,5.0,,
107937,46010069,Couples without children,82,110.0,,
107938,46010069,Couples with children,83,10.0,,
107946,46010069,Total - Persons not in census families in private households - 100% data,91,105.0,55.0,55.0


In [389]:
# Census families by family size (features 68-72) in `g`
g.loc[g['feature_num'].isin([68, 69, 70, 71, 72])]

Unnamed: 0,da,feature_desc,feature_num,total,male,female
107923,46010069,Total - Census families in private households ...,68,130.0,,
107924,46010069,2 persons,69,120.0,,
107925,46010069,3 persons,70,10.0,,
107926,46010069,4 persons,71,5.0,,
107927,46010069,5 or more persons,72,0.0,,


In [393]:
# income
# NOTE(review): `g` is not defined in any visible cell; it must already exist in the kernel.
parse_features(g, [663, 671, 674, 682], name='income')

Unnamed: 0,da,feature_desc,feature_num,sex,count,feature
0,46010069,Median total income in 2015 among recipients ($),663,total,35456.0,income
1,46010069,Median employment income in 2015 among recipie...,671,total,20288.0,income
2,46010069,Average total income in 2015 among recipients ($),674,total,66921.0,income
3,46010069,Average employment income in 2015 among recipi...,682,total,25811.0,income
4,46010069,Median total income in 2015 among recipients ($),663,male,41088.0,income
5,46010069,Median employment income in 2015 among recipie...,671,male,19712.0,income
6,46010069,Average total income in 2015 among recipients ($),674,male,78280.0,income
7,46010069,Average employment income in 2015 among recipi...,682,male,24769.0,income
8,46010069,Median total income in 2015 among recipients ($),663,female,29888.0,income
9,46010069,Median employment income in 2015 among recipie...,671,female,21568.0,income


In [397]:
### HOUSEHOLD income
# NOTE(review): `g` is not defined in any visible cell; it must already exist in the kernel.
parse_features(g, [741, 742, 744, 745, 747, 748, 751, 754, 757], name='household income')

Unnamed: 0,da,feature_desc,feature_num,sex,count,feature
0,46010069,Total - Income statistics in 2015 for private ...,741,total,235.0,household income
1,46010069,Median total income of households in 2015 ($),742,total,57216.0,household income
2,46010069,Total - Income statistics in 2015 for one-pers...,744,total,100.0,household income
3,46010069,Median total income of one-person households i...,745,total,41280.0,household income
4,46010069,Total - Income statistics in 2015 for two-or-m...,747,total,130.0,household income
5,46010069,Median total income of two-or-more-person hous...,748,total,75648.0,household income
6,46010069,Average total income of households in 2015 ($),751,total,108901.0,household income
7,46010069,Average total income of one-person households ...,754,total,85236.0,household income
8,46010069,Average total income of two-or-more-person hou...,757,total,127361.0,household income


In [398]:
# household income brackets (features 760-774 and 776-779), with shares
# NOTE(review): `g` is not defined in any visible cell; it must already exist in the kernel.
parse_features(g, np.concatenate([np.arange(760, 775), np.arange(776, 780)]), name='household income', agg=True)

Unnamed: 0,da,feature_desc,feature_num,sex,count,feature,pct
0,46010069,"Under $5,000",760,total,10.0,household income,0.042553
1,46010069,"$5,000 to $9,999",761,total,5.0,household income,0.021277
2,46010069,"$10,000 to $14,999",762,total,0.0,household income,0.0
3,46010069,"$15,000 to $19,999",763,total,10.0,household income,0.042553
4,46010069,"$20,000 to $24,999",764,total,15.0,household income,0.06383
5,46010069,"$25,000 to $29,999",765,total,5.0,household income,0.021277
6,46010069,"$30,000 to $34,999",766,total,10.0,household income,0.042553
7,46010069,"$35,000 to $39,999",767,total,10.0,household income,0.042553
8,46010069,"$40,000 to $44,999",768,total,15.0,household income,0.06383
9,46010069,"$45,000 to $49,999",769,total,15.0,household income,0.06383


In [408]:
# * A person could be in more than 1 category
#   (the category counts can add up to more than the total row)

# Ethnic origin
g.loc[g['feature_num'].isin([1338, 1339, 1343, 1353, 1527, 1448, 1473, 1541, 1607])]

Unnamed: 0,da,feature_desc,feature_num,total,male,female
109193,46010069,Total - Ethnic origin for the population in pr...,1338,380.0,210.0,170.0
109194,46010069,North American Aboriginal origins,1339,55.0,30.0,25.0
109198,46010069,Other North American origins,1343,80.0,50.0,30.0
109208,46010069,European origins,1353,345.0,200.0,145.0
109303,46010069,"Latin, Central and South American origins",1448,0.0,0.0,0.0
109328,46010069,African origins,1473,0.0,10.0,0.0
109382,46010069,Rwandan,1527,0.0,0.0,0.0
109396,46010069,Asian origins,1541,15.0,20.0,0.0
109462,46010069,Oceania origins,1607,0.0,0.0,0.0


### User Level

In [495]:
def parse_user_level_features(g):
    """Build the long-format person-level feature table for the rows in `g`.

    Each feature family listed below is extracted with `parse_features`
    (always with ``agg=True``, so every row carries a 'pct' share) and the
    results are concatenated in the order listed.

    Parameters
    ----------
    g : DataFrame
        Rows to parse; passed unchanged to `parse_features`. In this notebook
        it is one group of ``df.groupby('da')``.

    Returns
    -------
    DataFrame
        Columns 'da', 'feature', 'feature_num', 'feature_desc', 'sex', 'count',
        'pct', with a fresh 0..n-1 index.
    """
    # (feature numbers, feature label, replacement descriptions or None)
    feature_specs = [
        # Age
        ([10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 30, 31, 32, 33],
         'age', None),
        # marital status
        ([61, 62, 64, 65, 66, 67],
         'marital status',
         ['Married', 'Common law', 'Never married', 'Separated', 'Divorced', 'Widowed']),
        # knowledge of official language
        ([101, 102, 103, 104], 'knowledge of official language', None),
        # mother tongue
        ([115, 116, 117, 377, 378, 379, 380], 'mother tongue', None),
        # language spoken at home
        ([384, 385, 386, 646, 647, 648, 649], 'language spoken at home', None),
        # total income
        (np.arange(695, 708), 'total income', None),
        # employment income
        (np.arange(728, 741), 'employment income', None),
        # low income
        ([848, 850, 851], 'low income', None),
        # citizenship
        ([1136, 1139], 'citizenship', None),
        # Immigrants
        ([1141, 1142], 'immigrants', None),
        # visible minority
        (np.arange(1325, 1338), 'visible minority', None),
        # education
        ([1684, 1685, 1687, 1690, 1691, 1693, 1694, 1695, 1696, 1697], 'education', None),
        # employment
        ([1867, 1868, 1869], 'employment', None),
    ]

    parsed = [
        parse_features(g, nums, label, alt_feature_names=alt_names, agg=True)
        for nums, label, alt_names in feature_specs
    ]
    final_df = pd.concat(parsed)

    return final_df[['da', 'feature', 'feature_num', 'feature_desc', 'sex', 'count', 'pct']].reset_index(drop=True)

In [518]:
# single-threaded split-apply-combine with pandas
% time x1 = df.groupby('da').apply(parse_user_level_features)

CPU times: user 9min 40s, sys: 1.39 s, total: 9min 41s
Wall time: 9min 41s


# DASK

In [506]:
import dask.dataframe as dd

In [522]:
# Wrap the pandas frame as a dask DataFrame split into 64 partitions
ddf = dd.from_pandas(df, npartitions=64)

In [523]:
# Same per-'da' parsing as the pandas run, declared lazily on the dask frame.
# `meta` tells dask the column names and dtypes of the frame each group returns.
final_df = (
    ddf
    .groupby('da')
    .apply(
        parse_user_level_features,
        meta={
            'da': 'int64',
            'feature': 'object',
            'feature_num': 'int64',
            'feature_desc': 'object',
            'sex': 'object',
            'count': 'float',
            'pct': 'float',
        },
    )
)

In [524]:
# Run the lazy dask computation with the process-based scheduler and time it.
# The computed frame is only displayed here; it is not bound to a name.
%time final_df.compute(scheduler='processes')

CPU times: user 6min 26s, sys: 25.2 s, total: 6min 51s
Wall time: 13min 47s


Unnamed: 0_level_0,Unnamed: 1_level_0,da,feature,feature_num,feature_desc,sex,count,pct
da,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
35010163,0,35010163,age,10,0 to 4 years,total,30.0,0.059406
35010163,1,35010163,age,11,5 to 9 years,total,15.0,0.029703
35010163,2,35010163,age,12,10 to 14 years,total,30.0,0.059406
35010163,3,35010163,age,14,15 to 19 years,total,30.0,0.059406
35010163,4,35010163,age,15,20 to 24 years,total,20.0,0.039604
35010163,5,35010163,age,16,25 to 29 years,total,20.0,0.039604
35010163,6,35010163,age,17,30 to 34 years,total,25.0,0.049505
35010163,7,35010163,age,18,35 to 39 years,total,40.0,0.079208
35010163,8,35010163,age,19,40 to 44 years,total,25.0,0.049505
35010163,9,35010163,age,20,45 to 49 years,total,30.0,0.059406
