In [1]:
from data_cleaner import *

df = load_training_df()\
.pipe(clean_targets)\
.pipe(clean_non_numerics)\
.pipe(clean_missing_values)

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
is_target_consistent = df[target_column].groupby(household_id).apply(lambda x: x.nunique() == 1)
inconsistent_targets = is_target_consistent[is_target_consistent != True]
print('There are %d households with inconsistent target values' % len(inconsistent_targets))

There are 0 households with inconsistent target values


In [3]:
def get_column_dtypes(df):
    columns_by_dtype = df.columns.groupby(df.dtypes)
    return {k.name: v for k, v in columns_by_dtype.items()}

get_column_dtypes(df)

{'int64': Index(['hacdor', 'rooms', 'hacapo', 'v14a', 'refrig', 'v18q', 'r4h1', 'r4h2',
        'r4h3', 'r4m1',
        ...
        'SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe', 'SQBhogar_nin',
        'agesq', 'Target', 'SQBedjefa', 'owes-montly-payments',
        'residence-stability'],
       dtype='object', length=136),
 'float64': Index(['v2a1', 'v18q1', 'dependency', 'meaneduc', 'overcrowding',
        'SQBovercrowding', 'SQBdependency', 'SQBmeaned'],
       dtype='object')}

In [4]:
nulls = df.isnull().sum(axis=0)
nulls[nulls!=0]/len(df)

Series([], dtype: float64)

### New features for individuals

Before considering data at a household level there are some new features that may be useful to generate at an individual's level.

There are 9 columns used as a binary one-hot encoding of the individuals level of education. We can compress this down to a single value to represent how far through education this individual has been.

In [5]:
df = df.pipe(compress_columns, new_col='education-level', 
        cols_to_compress=['instlevel1', 'instlevel2', 'instlevel3', 'instlevel6', 'instlevel4', 'instlevel7', 
                          'instlevel5', 'instlevel8', 'instlevel9'])

### New features for households

All our new features from this point on will be descriptions at a household level so we'll append them all to a DataFrame indexed at household level.

In [6]:
hh_df = pd.DataFrame(index=df.index.get_level_values(0).drop_duplicates())
hh_size = df.groupby(household_id).size().rename('hh_size').reindex(hh_df.index)

The data given to us calculates a dependency rate which looks at the number of adults between 19 and 64 (working age) vs the number of children or adults of 65+. This is likely to be due to the fact adults of working age will be supporting the household. Let's define a couple of terms:
 - `supporter` : Household member aged 19-64 who has not been marked as having a disability
 - `dependent` : Household member aged 0-19, 65+, or is disabled

We saw when cleaning the data that there are cases in which households have no supporters. We can add a couple of features to indicate whether there are no supporters in the household, or also no dependents in the household.

In [7]:
supporters = df[(df['age']>=18) & (df['age']<=64) & (df['dis']==0)]
dependents = df[(df['age']<=18) | (df['age']>=64) | (df['dis']==1)]

hh_df['num_supporters'] = supporters.groupby(household_id).size().reindex(hh_df.index, fill_value=0)
hh_df['num_dependents'] = dependents.groupby(household_id).size().reindex(hh_df.index, fill_value=0)

hh_df['0_supporters'] = (hh_df['num_supporters']==0).astype(int)
hh_df['0_dependents'] = (hh_df['num_dependents']==0).astype(int)

We already have our dependency calculation which was regenerated during the data cleanup, let's add this and the square of it's value as these were already present in the original data and are likely to be useful in this prediction. This value is consistent across all individuals so we'll just take the first one we see for each household.

In [8]:
hh_df['dependency'] = df['dependency'].groupby(household_id).first()
hh_df['SQBdependency'] = df['SQBdependency'].groupby(household_id).first()

It may be useful to know the gender breakdown of supporters since there is a gender driven pay gap in most countries and this may have some effect on the wealth of the family.

In [9]:
m_supporters = supporters[supporters['male']==1] 
f_supporters = supporters[supporters['female']==1] 

hh_df['num_m_supporters'] = m_supporters.groupby(household_id).size().reindex(hh_df.index, fill_value=0)
hh_df['num_f_supporters'] = f_supporters.groupby(household_id).size().reindex(hh_df.index, fill_value=0)

Education-level of household supporters is likely to have a large impact on the wealth of the family as well. We already have the mean education of adults in the household, but let's make a new value for supporters, and supporters broken down by gender.

In [10]:
hh_df['meaneduc_s'] = supporters['escolari'].groupby(household_id).mean().round(2).reindex(hh_df.index, fill_value=0)
hh_df['meaneduc_m'] = m_supporters['escolari'].groupby(household_id).mean().round(2).reindex(hh_df.index, fill_value=0)
hh_df['meaneduc_f'] = f_supporters['escolari'].groupby(household_id).mean().round(2).reindex(hh_df.index, fill_value=0)

hh_df['ed_lev_s'] = supporters['education-level'].groupby(household_id).mean().round(2).reindex(hh_df.index, fill_value=0)
hh_df['ed_lev_m'] = m_supporters['education-level'].groupby(household_id).mean().round(2).reindex(hh_df.index, fill_value=0)
hh_df['ed_lev_f'] = f_supporters['education-level'].groupby(household_id).mean().round(2).reindex(hh_df.index, fill_value=0)

Since a member of the household has been assigned 'head-of-household' it's possible that details relating this individual offer significant information about the household. We can add extra features from combinations of details about them.

In [11]:
hoh = df[(df[head_of_household]==1)].groupby(household_id).first()

hh_df['male_hoh'] = (hoh['male']==1).astype(int).reindex(hh_df.index, fill_value=0)
hh_df['educ_hoh'] = hoh['escolari'].reindex(hh_df.index, fill_value=0)
hh_df['educ_hoh_m'] = hoh['edjefe'].reindex(hh_df.index, fill_value=0)
hh_df['educ_hoh_f'] = hoh['edjefa'].reindex(hh_df.index, fill_value=0)
hh_df['SQeduc_hoh_m'] = hoh['SQBedjefe'].reindex(hh_df.index, fill_value=0)
hh_df['SQeduc_hoh_f'] = hoh['SQBedjefa'].reindex(hh_df.index, fill_value=0)
hh_df['ed_lev_hoh'] = hoh['education-level'].reindex(hh_df.index, fill_value=0)

hh_df['hoh_is_sup'] = (((hoh['age']>=18) & (hoh['age']<=64) & (hoh['dis']==0))
                       .astype(int)
                       .reindex(hh_df.index, fill_value=0))

Missing education is more significant for children as this indicates that they are falling behind rather than just showing the number of years they have been in education. Let's check for those under 18 who are falling behind in school. We'll only consider children without disabilities else the disability itself might be the cause of falling behind in school, rather than indicating it being due to wealth issues.

In [12]:
minors = df[(df['age']>=18) & (df['dis']==0)]

hh_df['missing_school'] = minors['rez_esc'].mean(level=household_id).round(2).reindex(hh_df.index, fill_value=0)
hh_df['missing_school_m'] = minors[minors['male']==1]['rez_esc'].mean(level=household_id).round(2).reindex(hh_df.index, fill_value=0)
hh_df['missing_school_f'] = minors[minors['female']==1]['rez_esc'].mean(level=household_id).round(2).reindex(hh_df.index, fill_value=0)

We can add in the values we were working with for whether monthly rent payments are owed, how much, and the stability of the household's residence.

In [13]:
hh_df['rent'] = df['v2a1'].groupby(household_id).first().reindex(hh_df.index, fill_value=0)
hh_df['pays_rent'] = df['owes-montly-payments'].groupby(household_id).first().reindex(hh_df.index, fill_value=0)
hh_df['residence-stability'] = df['residence-stability'].groupby(household_id).first().reindex(hh_df.index, fill_value=0)

What if a household owes rent payments but has no supporters:

In [14]:
hh_df['rent_problems'] = ((hh_df['pays_rent']==1) & (hh_df['0_supporters']==1)).astype(int)

While cleaning the data we also saw the feature describing the number of tablets a household owns. Let's add that and look more into features around the possessions of the household.

In [15]:
# Binary values
hh_df['refrig'] = df['refrig'].groupby(household_id).first().reindex(hh_df.index)
hh_df['computer'] = df['computer'].groupby(household_id).first().reindex(hh_df.index)
hh_df['television'] = df['television'].groupby(household_id).first().reindex(hh_df.index)
# Count of how many owned
hh_df['tablets_ratio'] = (df['v18q1'].groupby(household_id).first().reindex(hh_df.index)/hh_size).round(2)
hh_df['mobilephones_ratio'] = (df['qmobilephone'].groupby(household_id).first().reindex(hh_df.index)/hh_size).round(2)

Combinations of electronic possessions like these probably give some indication to wealth.

In [17]:
hh_df['electronics'] = (hh_df['refrig'] + hh_df['computer'] + hh_df['television'] +
                        (hh_df['tablets_ratio']>0).astype(int) +
                        (hh_df['mobilephones_ratio']>0).astype(int))

Information has been provided about numbers of rooms and overcrowding. Let's take a look.

Overcrowding is the number of people divided by the number of bedrooms. Where overcrowding is 3 or above the binary value for overcrowding by bedroom is set to True.

In [21]:
df[['rooms','hacapo','bedrooms','hacdor','overcrowding','hhsize']].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rooms,hacapo,bedrooms,hacdor,overcrowding,hhsize
idhogar,Id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
21eb7fcc1,ID_279628684,3,0,1,0,1.0,1
0e5d7a658,ID_f29eb3ddd,4,0,1,0,1.0,1
2c7317ea8,ID_68de51c94,8,0,2,0,0.5,1
2b58d945f,ID_d671db89c,5,0,3,0,1.333333,4
2b58d945f,ID_d56d6f5f5,5,0,3,0,1.333333,4


In [31]:
overcrowded = df[df['hacapo']==1]
overcrowded['p2room'] = overcrowded['hhsize']/overcrowded['rooms']
overcrowded['p2bedroom'] = overcrowded['hhsize']/overcrowded['bedrooms']
overcrowded[['p2room','hacapo','p2bedroom','hacdor','rooms','bedrooms','overcrowding','hhsize']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,Unnamed: 1_level_0,p2room,hacapo,p2bedroom,hacdor,rooms,bedrooms,overcrowding,hhsize
idhogar,Id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3e16fab89,ID_848b0fdf1,3.000000,1,3.0,1,1,1,3.0,3
3e16fab89,ID_298d857e5,3.000000,1,3.0,1,1,1,3.0,3
3e16fab89,ID_1d5bf8ddf,3.000000,1,3.0,1,1,1,3.0,3
63f11d6ea,ID_912ff9986,3.333333,1,5.0,1,3,2,5.0,10
63f11d6ea,ID_3f1c6eec1,3.333333,1,5.0,1,3,2,5.0,10
63f11d6ea,ID_44bb758b6,3.333333,1,5.0,1,3,2,5.0,10
63f11d6ea,ID_fa7b8e81a,3.333333,1,5.0,1,3,2,5.0,10
63f11d6ea,ID_bc8ba77e7,3.333333,1,5.0,1,3,2,5.0,10
63f11d6ea,ID_fe1ee7296,3.333333,1,5.0,1,3,2,5.0,10
63f11d6ea,ID_715f2450c,3.333333,1,5.0,1,3,2,5.0,10


In [18]:
nulls = hh_df.isnull().sum(axis=0)
nulls[nulls!=0]/len(hh_df)

Series([], dtype: float64)

In [None]:
# hh_df[['refrig','computer','television','tablets_ratio','mobilephones_ratio','electronics']].join(hh_size)