# Processing of Inputs to Model

The code in this section is concerned with pre-processing the predictor data.

In [None]:
# For reproducibility
import random
import numpy as np
# One seed shared by every random number generator used in this notebook.
r_state = 42

# Seed NumPy's legacy global generator and Python's built-in generator.
np.random.seed(r_state)
random.seed(r_state)

In [None]:
import pandas as pd
import os
import re

# Directory layout used by this notebook: `lkp` holds the LSOA lookup
# pickles, `canonical` holds the input CSVs grouped into one sub-directory
# per theme, and `analytical` receives the combined predictor files.
lkp        = os.path.join('data','lkp')
canonical  = os.path.join('data','canonical')
analytical = os.path.join('data','analytical')
travel     = os.path.join(canonical,'travel')
household  = os.path.join(canonical,'households')
housing    = os.path.join(canonical,'housing')
greenspace = os.path.join(canonical,'greenspace')
dwellings  = os.path.join(canonical,'dwellings')
incomes    = os.path.join(canonical,'incomes')
work       = os.path.join(canonical,'work')
scores     = os.path.join(canonical,'scores')

# Make sure every directory defined above exists (including `incomes`).
# exist_ok=True makes the call idempotent and avoids the race between a
# separate existence check and the creation.
for d in [lkp,analytical,canonical,greenspace,dwellings,travel,household,housing,incomes,work,scores]:
    os.makedirs(d, exist_ok=True)

In [None]:
# Read the two London LSOA lookup frames from the lookup directory.
lsoa_2011_pkl = os.path.join(lkp,'LSOAs 2011.pkl')
lsoa_2004_pkl = os.path.join(lkp,'LSOAs 2004.pkl')
ldn2011 = pd.read_pickle(lsoa_2011_pkl)
ldn2004 = pd.read_pickle(lsoa_2004_pkl)

# Report how many rows each lookup holds.
# NOTE(review): the 2004 lookup is reported under the "2001" label --
# presumably the 2004 boundaries are the ones paired with 2001 data; confirm.
rows_2004 = len(ldn2004)
rows_2011 = len(ldn2011)
print("Have built London LSOA filter data for use where needed...")
print("\t2001: " + str(rows_2004) + " rows.")
print("\t2011: " + str(rows_2011) + " rows.")

## Selecting Variables for Inclusion

The `sources` dictionary below offers a fairly simple way to group variables together into classes (households, housing, etc.) and to include them in, or exclude them from, the analysis.

<span style="color:red;font-weight:bolder">Note that the four variables used in the scoring process are automatically included in the next notebook.</span>

In [None]:
# Schema of predictor inputs, as a dict of dicts:
#   group (sub-directory of the canonical data folder)
#       -> {dataset key: CSV file stem}
# Comment an entry out to exclude that dataset from the analysis.
sources = {
    'households': {
        'age_struct': 'Age Structure',
        'birth_ctry': 'Country of Birth',
        'dep_child': 'Dependent Children',
        'ethnic': 'Ethnicity',
        'gen_hlth': 'General Health',
        'hh_comp': 'Household Composition',
        'mrtl_sts': 'Marital Status',
        'rel': 'Religion',
    },
    'housing': {
        # 'price': 'Values',  # Included in next notebook
        'hsng_tnr': 'Tenure',
        'density': 'Density',
    },
    'work': {
        'nssec': 'NS-Sec',
        'hrs_wrkd': 'Hours Worked',
        'industry': 'Industry',
        'active': 'Economic Activity',
        # 'occ': 'Occupations',     # Included in next notebook
        # 'quals': 'Qualifications',  # Included in next notebook
        # 'inc': 'Income',          # Included in next notebook
    },
    'travel': {
        'crs_vns': 'Cars and Vans',
        'mode': 'TTW',
        'dist_to_dt': 'Travel Time To Bank',
        'zone_data': 'Fare Zone',
    },
    'dwellings': {
        'acc_type': 'Type',
        'dwelling_age_data': 'Age',
    },
    'greenspace': {
        'os_data': 'Share',
        'os_acc_data': 'Access',
    },
}

# Dataset keys listed here are skipped by the 'convert to percentages'
# step and kept as raw values. The list lives next to `sources` because
# the two _must_ be configured together.
not_to_pct = [
    'density', 'dist_to_dt', 'zone_data',
    'os_data', 'os_acc_data',
    'trans', 'inc', 'price', 'scores',
]

In [None]:
# Convert raw data in datasets to percentages --
# needed for some variables, but not all.
def to_share(df):
    """Return each column of `df` as a share of the row total.

    A column named 'TOTAL' or 'Total' is treated as the 'total' column.
    If no total column is present, the row-wise sum of all columns is
    used instead. The total column itself is not part of the result.

    The input frame is left untouched: renaming and the optional total
    column are applied to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw values, one row per area and one column per category.

    Returns
    -------
    pandas.DataFrame
        Same index and column order as `df` (minus the total column),
        holding value / total for every cell. Rows whose total is zero
        are not special-cased, so they yield NaN or inf.
    """
    # rename() without inplace returns a new frame, so the caller's data
    # is not modified.
    shares = df.rename(columns={'TOTAL':'total','Total':'total'})

    if 'total' not in shares.columns.values:
        shares = shares.assign(total=shares.sum(axis=1))  #  Total number of people

    totals = shares['total']

    # Divide every remaining column by the row total in one vectorised step.
    return shares.drop(['total'], axis=1).div(totals, axis=0)

# Load data sets for each Census year based on schema above
def load_data(year, template):
    """Load every dataset named in `template` for one Census year.

    For each `template[group][ds]` file stem the function looks under
    data/canonical/<group>/ for '<stem>-<year>.csv' first and falls back
    to '<stem>.csv'. If neither file exists a message is printed and the
    dataset is skipped, so it will be absent from the result.

    Each loaded frame is indexed by its 'lsoacd' column. Unless the
    dataset key is listed in the module-level `not_to_pct`, the frame is
    then converted to row shares with `to_share`.

    Parameters
    ----------
    year : int
        Census year used to build the year-specific file name.
    template : dict
        Mapping of group name -> {dataset key: file stem}.

    Returns
    -------
    dict
        Mapping of dataset key -> pandas.DataFrame for every dataset
        whose file was found.
    """
    datasets = {}
    for group, members in template.items():
        print("Dataset group: " + group)
        for ds, stem in members.items():
            print("\tLoading dataset: " + ds)

            # Tentative path (without year suffix or extension)
            tpath = os.path.join('data','canonical',group,stem)
            year_csv  = "-".join([tpath,str(year)])+".csv"
            plain_csv = tpath+".csv"

            # Prefer the year-specific file, then the year-independent one
            if os.path.isfile(year_csv):
                print("\t\tFound: " + year_csv)
                frame = pd.read_csv(year_csv)

            elif os.path.isfile(plain_csv):
                print("\t\tFound: " + plain_csv)
                frame = pd.read_csv(plain_csv)

            else:
                print("==> Couldn't find data for: " + stem + " <==")
                print("Tried: " + "-".join([tpath,str(year)]) + "; " + plain_csv)
                # Nothing was loaded for this key; skip it rather than
                # failing with a KeyError on the lookup below.
                continue

            if frame.index.name != 'lsoacd':
                frame = frame.set_index('lsoacd') #  predictor variables only

            if ds not in not_to_pct:
                frame = to_share(frame)
            else:
                print("\t\tNot converting to percent.")

            datasets[ds] = frame

    return datasets

In [None]:
# Load the predictor datasets for both Census years. `sets` is keyed by
# the two-digit year suffix.
sets = {}

print("Loading 2001 datasets...")
sets['01'] = load_data(2001, sources)

print("Loading 2011 datasets...")
sets['11'] = load_data(2011, sources)

# Per-year aliases of the same dictionaries.
datasets_01, datasets_11 = sets['01'], sets['11']

print("Done.")

In [None]:
# Combine the per-topic datasets of each year into one wide frame per year.
main_datasets_dict = dict()

for year, dataset in sets.items():
    #  Create combined dataset
    # Both years start from the index of the 2001 'acc_type' dataset, so
    # the two combined frames share the same rows.
    main_dataset = pd.DataFrame(index=sets['01']['acc_type'].index) # Initialise the df
    # Sorted by key so the datasets are merged in a fixed order.
    for key, value in sorted(dataset.items()):
        # Compare by value: `is not` tests object identity, which for
        # strings depends on interning and triggers a SyntaxWarning on
        # Python 3.8+.
        if key != 'scores':
            print("Merging " + key + " on to dataset.")
            # Left join keeps every row of the base index; rows absent
            # from `value` become NaN.
            main_dataset = main_dataset.merge(value, left_index=True, right_index=True, how='left')
    
    main_datasets_dict[year] = main_dataset
    
    #  Check for missing values
    print("Missing values (if any) to be filled:")
    for c in main_dataset.columns[main_dataset.isnull().any()]:
        print("\t" + c + " has " + str(main_dataset[c].isnull().sum()) + " null values.")
        # str() each label so the join also works for non-string indexes.
        print("\t\t" + ", ".join(map(str, main_dataset[main_dataset[c].isnull()].index.values)))
    
    # In-place fill: the frame stored in main_datasets_dict is this same
    # object, so it is updated too.
    main_dataset.fillna(0, inplace=True)
    
    print("Main dataset built for 20" + year + ".")
    print(" ")

# Strip the trailing year from column names so both years use the same names.
main_datasets_dict['01'].rename(columns=lambda x: re.sub(' 2001$','',x), inplace=True)
main_datasets_dict['11'].rename(columns=lambda x: re.sub(' 2011$','',x), inplace=True)

print("2001 Shape: " + str(main_datasets_dict['01'].shape))
print("2011 Shape: " + str(main_datasets_dict['11'].shape))

### Sanity Check

We should have no difference (an empty result set) at this point since the data sets should have been fully aligned.

In [None]:
# Column names present in each year's combined frame.
s01 = set(main_datasets_dict['01'].columns)
s11 = set(main_datasets_dict['11'].columns)

# Names found in one year but not the other; both sets should be empty.
only_in_11 = s11 - s01
only_in_01 = s01 - s11
print("2011 variables diff'd against 2001 variables: " + str(only_in_11))
print("2001 variables diff'd against 2011 variables: " + str(only_in_01))

In [None]:
# Preview three rows of the combined 2011 frame; passing random_state=r_state
# makes the same rows come back on every run.
main_datasets_dict['11'].sample(3, random_state=r_state)

In [None]:
# Write one gzip-compressed CSV per year into the analytical directory.
# This may take some time as there is quite a bit of data!
for yr, frame in main_datasets_dict.items():
    print("Saving data for 20" + yr)
    out_path = os.path.join(analytical, 'Predictor-20' + yr + '-Data.csv.gz')
    frame.to_csv(out_path, compression='gzip', index=True)

print("Done.")