In [None]:
import pandas as pd

### Raw data from the 2017 Philippine [Demographic and Health Survey](https://dhsprogram.com/Data/) (household recode group)

In [None]:
# Stata .DO dictionary that accompanies the dataset; its 'label variable'
# statements are parsed to map coded column names to readable labels.
dhs_dict_file = 'PHHR71FL.DO'
# Stata .DTA dataset holding the DHS household recode, read with pd.read_stata.
dhs_file = 'PHHR71FL.DTA'

### DHS helper function, sourced from Thinking Machines' DHS preprocessing [GitHub repository](https://github.com/thinkingmachines/ph-poverty-mapping/blob/master/notebooks/00_dhs_prep.ipynb)

In [None]:
def get_dhs_dict(dhs_dict_file):
    """Map DHS variable codes to their descriptive labels.

    Scans a Stata ``.DO`` dictionary file for lines containing
    ``label variable``. For each such line the third whitespace-separated
    token is taken as the variable code, and the remaining tokens (with
    leading/trailing double quotes stripped from each token, joined by single
    spaces) form the label.

    Parameters
    ----------
    dhs_dict_file : str or path-like
        Path to the ``.DO`` file to parse.

    Returns
    -------
    dict
        ``{variable_code: label}``. If a code appears more than once, the
        last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    dhs_dict = dict()
    # errors='replace' substitutes undecodable bytes instead of aborting the scan.
    with open(dhs_dict_file, 'r', errors='replace') as file:
        # Iterate the handle directly so that every line is examined,
        # including the very first one (a priming readline() followed by
        # another readline() inside the loop would skip it).
        for line in file:
            if 'label variable' not in line:
                continue
            tokens = line.split()
            if len(tokens) < 3:
                # Statement without a variable code: nothing to record.
                continue
            code = tokens[2]
            dhs_dict[code] = ' '.join(token.strip('"') for token in tokens[3:])
    return dhs_dict

In [None]:
# Code -> label lookup parsed from the .DO dictionary file.
dhs_dict = get_dhs_dict(dhs_dict_file)

# Load the household recode with its raw numeric codes
# (convert_categoricals=False), swap coded column names for their labels,
# and drop every column that contains at least one missing value.
dhs = (
    pd.read_stata(dhs_file, convert_categoricals=False)
    .rename(columns=dhs_dict)
    .dropna(axis=1)
)
print('Data Dimensions: {}'.format(dhs.shape))

Data Dimensions: (27496, 339)


### Manually checking which columns we could use. In this case, we use the house resiliency metrics and the wealth index.

In [None]:
# Lift pandas' display limits so every column label is printed on a single
# (very wide) line, which makes it possible to scan the labels by eye.
for option_name, option_value in {
    'display.max_columns': None,
    'display.width': 100000,
}.items():
    pd.set_option(option_name, option_value)

print(dhs.head())

  Case Identification Country code and phase  Cluster number  Household number  Respondent's line number (answering Household questionnaire)  Ultimate area unit  Household sample weight (6 decimals)  Month of interview  Year of interview  Date of interview (CMC)  Date of interview Century Day Code (CDC)  Number of household members  Number of eligible women in household  Number of de jure members  Number of de facto members  Number of children 5 and under (de jure)  Result of household interview  Day of interview  Number of visits  Interviewer identification  Ever-married sample  Primary sampling unit  Sample strata for sampling errors  Stratification used in sample design  Region  Type of place of residence  Household selected for male interview  Household weight for male subsample (6 decimals)  Field supervisor  Household selected for hemoglobin  Household selected for Domestic Violence module  Language of questionnaire  Language of interview  Native language of respondent  Translato

In [None]:
# Columns kept for the analysis: three housing-material codes plus the
# combined wealth index factor score.
features = [
    'Main floor material',
    'Main wall material',
    'Main roof material',
    'Wealth index factor score combined (5 decimals)',
]
data = dhs.loc[:, features]

### To interpret the wealth index, we use the [DHS-IV Recode Map](https://dhsprogram.com/pubs/pdf/DHSG4/Recode4Map.pdf).

### For the materials, we use the [DHS-II Recode Map](https://dhsprogram.com/pubs/pdf/DHSG4/Recode2Map.pdf).

In [None]:
# Preview the first ten households of the selected features.
#
# Reading the values (per the recode maps referenced in this notebook):
#   * Material codes: a higher value indicates a more resilient material
#     (10 to 35; 40 means "other", i.e. a material not listed).
#   * Wealth index: lower = less affluent, higher = more affluent
#     (roughly -115000 to 150000).
# NOTE(review): the quoted wealth-index range looks approximate -- the sample
# output includes scores above 150000; verify against the full column.
print(data.head(10))

   Main floor material  Main wall material  Main roof material  Wealth index factor score combined (5 decimals)
0                   33                  31                  31                                           192371
1                   33                  32                  31                                           251936
2                   34                  24                  31                                           -11558
3                   21                  26                  31                                          -104086
4                   21                  24                  31                                           -25633
5                   31                  26                  31                                          -107475
6                   21                  26                  31                                          -101771
7                   21                  21                  12                                          

In [None]:
# Inspect the survey cluster identifier assigned to each household row.
cluster_numbers = dhs.loc[:, 'Cluster number']
print(cluster_numbers)

0           1
1           1
2           1
3           1
4           1
         ... 
27491    1250
27492    1250
27493    1250
27494    1250
27495    1250
Name: Cluster number, Length: 27496, dtype: int16


In [None]:
# The data covers the whole Philippines; additional processing is needed to keep only Butuan City.
# To do this, match 'Cluster number' here to the DHSClust value in the shapefile for the specified region/area.
#data.to_csv('dhs_features_ph.csv')