
# Investigating features from LSMS


In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
geo_data = pd.read_csv('raw_data/geo_eth.csv', sep=',')
cons_data = pd.read_csv('raw_data/cons_eth.csv', sep=',')

In [12]:
import pandas as pd

def process_ethiopia_all_consumptions(cons_data, geo_data):
    consumption_pc_col = 'total_cons_ann'  # per capita (per person)
    hhsize_col = 'hh_size'  # people in household

    lat_col = 'lat_dd_mod'
    lon_col = 'lon_dd_mod'

    # purchasing power parity for Ethiopia in 2015
    ppp = 7.882

    # Define the columns for the additional consumption categories
    food_cons_col = 'food_cons_ann'
    nonfood_cons_col = 'nonfood_cons_ann'
    educ_cons_col = 'educ_cons_ann'

    df = cons_data
    df['total_cons_ph_day'] = df[consumption_pc_col] / ppp / 365
    df['total_cons_ph_month'] = df[consumption_pc_col] / ppp / 12
    df['total_cons_ph_year'] = df[consumption_pc_col] / ppp

    df['food_cons_ph_day'] = df[food_cons_col] / ppp / 365
    df['food_cons_ph_month'] = df[food_cons_col] / ppp / 12
    df['food_cons_ph_year'] = df[food_cons_col] / ppp

    df['nonfood_cons_ph_day'] = df[nonfood_cons_col] / ppp / 365
    df['nonfood_cons_ph_month'] = df[nonfood_cons_col] / ppp / 12
    df['nonfood_cons_ph_year'] = df[nonfood_cons_col] / ppp

    df['educ_cons_ph_day'] = df[educ_cons_col] / ppp / 365
    df['educ_cons_ph_month'] = df[educ_cons_col] / ppp / 12
    df['educ_cons_ph_year'] = df[educ_cons_col] / ppp

    df_geo = geo_data
    df_cords = df_geo[['household_id2', lat_col, lon_col]]
    df_cords.rename(columns={lat_col: 'cluster_lat', lon_col: 'cluster_lon'}, inplace=True)
    df_combined = pd.merge(df, df_cords, on='household_id2')
    df_combined.drop(['household_id2'], axis=1, inplace=True)
    df_combined.dropna(inplace=True)  # can't use na values

    df_clusters = df_combined.groupby(['cluster_lat', 'cluster_lon']).sum().reset_index()
    
    df_clusters['total_cons_pc_day'] = df_clusters['total_cons_day'] / df_clusters[hhsize_col]
    df_clusters['total_cons_pc_week'] = df_clusters['total_cons_week'] / df_clusters[hhsize_col]
    df_clusters['total_cons_pc_month'] = df_clusters['total_cons_month'] / df_clusters[hhsize_col]
    
    df_clusters['food_cons_pc_day'] = df_clusters['food_cons_day'] / df_clusters[hhsize_col]
    df_clusters['food_cons_pc_month'] = df_clusters['food_cons_month'] / df_clusters[hhsize_col]
    df_clusters['food_cons_pc_year'] = df_clusters['food_cons_year'] / df_clusters[hhsize_col]
    
    df_clusters['nonfood_cons_pc_day'] = df_clusters['nonfood_cons_day'] / df_clusters[hhsize_col]
    df_clusters['nonfood_cons_pc_month'] = df_clusters['nonfood_cons_month'] / df_clusters[hhsize_col]
    df_clusters['nonfood_cons_pc_year'] = df_clusters['nonfood_cons_year'] / df_clusters[hhsize_col]
    
    df_clusters['educ_cons_pc_day'] = df_clusters['educ_cons_day'] / df_clusters[hhsize_col]
    df_clusters['educ_cons_pc_month'] = df_clusters['educ_cons_month'] / df_clusters[hhsize_col]
    df_clusters['educ_cons_pc_year'] = df_clusters['educ_cons_year'] / df_clusters[hhsize_col]

    df_clusters['country'] = 'eth'
    
    return df_clusters[['country', 'cluster_lat', 'cluster_lon', 
                        'total_cons_ph_year', 'total_cons_ph_month', 'total_cons_ph_day',
                        'educ_cons_ph_year', 'educ_cons_ph_month', 'educ_cons_ph_day',
                        'nonfood_cons_ph_year', 'nonfood_cons_ph_month', 'nonfood_cons_ph_day',
                        'food_cons_ph_year', 'food_cons_ph_month', 'food_cons_ph_day',
                        'total_cons_pc_day', 'total_cons_pc_week', 'total_cons_pc_month',
                        'food_cons_pc_day', 'food_cons_pc_month', 'food_cons_pc_year',
                        'nonfood_cons_pc_day', 'nonfood_cons_pc_month', 'nonfood_cons_pc_year',
                        'educ_cons_pc_day', 'educ_cons_pc_month', 'educ_cons_pc_year']]

In [13]:
data = process_ethiopia_all_consumptions(cons_data, geo_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cords.rename(columns={lat_col: 'cluster_lat', lon_col: 'cluster_lon'}, inplace=True)


In [14]:
data.head(20)

Unnamed: 0,country,cluster_lat,cluster_lon,total_cons_ph_year,total_cons_ph_month,total_cons_ph_day,educ_cons_ph_year,educ_cons_ph_month,educ_cons_ph_day,nonfood_cons_ph_year,...,total_cons_pc_month,food_cons_pc_day,food_cons_pc_month,food_cons_pc_year,nonfood_cons_pc_day,nonfood_cons_pc_month,nonfood_cons_pc_year,educ_cons_pc_day,educ_cons_pc_month,educ_cons_pc_year
0,eth,3.455701,39.515994,52844.980801,4403.7484,144.780769,896.346105,74.695509,2.455743,14343.187008,...,62.948161,1.493168,44.795054,545.006489,0.569513,17.085393,207.872275,0.03559,1.067714,12.990523
1,eth,3.549937,39.184234,58691.265007,4890.938751,160.797986,144.887084,12.073924,0.396951,4976.782543,...,53.599329,1.630733,48.922004,595.217721,0.1515,4.545007,55.297584,0.004411,0.132317,1.609856
2,eth,3.864243,39.101366,30231.34669,2519.278891,82.825607,31.717838,2.643153,0.086898,2950.647044,...,46.882419,1.40858,42.257402,514.131727,0.152528,4.575829,55.672586,0.00164,0.049188,0.59845
3,eth,3.982931,38.491368,22562.514868,1880.209572,61.815109,94.392286,7.866024,0.258609,1509.007866,...,45.230568,1.400542,42.016268,511.197923,0.100836,3.025074,36.80507,0.006308,0.189226,2.302251
4,eth,4.048194,41.930928,1738.322136,144.860178,4.762526,0.0,0.0,0.0,536.412078,...,35.718948,0.823226,24.696781,300.477499,0.367406,11.022166,134.10302,0.0,0.0,0.0
5,eth,4.235341,38.385913,9023.022071,751.918506,24.720608,17.635118,1.469593,0.048315,138.543517,...,32.244272,1.056205,31.686159,385.51493,0.016503,0.495093,6.023631,0.002101,0.06302,0.766744
6,eth,4.281844,41.875076,39310.638559,3275.886547,107.70038,199.695509,16.641292,0.547111,5140.192844,...,50.484553,1.454227,43.62682,530.792974,0.220043,6.601275,80.315513,0.008549,0.256458,3.120242
7,eth,4.767689,39.269749,23038.868228,1919.905686,63.120187,147.170769,12.264231,0.403208,3566.099975,...,43.036491,1.203337,36.10012,439.218125,0.222049,6.661457,81.047727,0.009164,0.274914,3.34479
8,eth,4.807098,36.08184,13726.800632,1143.900053,37.607673,65.338747,5.444896,0.17901,2524.232428,...,33.183241,0.89744,26.923198,327.565574,0.203403,6.102093,74.24213,0.005265,0.15795,1.921728
9,eth,4.865832,39.173673,1095.153514,91.262793,3.000421,0.0,0.0,0.0,191.321999,...,30.004206,0.825417,24.762507,301.277172,0.174723,5.241699,63.774,0.0,0.0,0.0


In [15]:
data.to_csv('raw_data/ethiopia_all_consumptions.csv', index=False)