Our data is now clean, in the sense that no column contains empty values any more. Data imputation is complete.

It's time to analyze the data further.

One point that stands out from the data dictionary:

idhogar - this is a unique identifier for each household. This can be used to create household-wide features, etc. All rows in a given household will have a matching value for this identifier.

parentesco1 - indicates if this person is the head of the household.

In [1]:
import pandas as panda
import numpy as np
import warnings
warnings.simplefilter('ignore')

In [2]:
# Load the training set from disk and show its (rows, columns) size.
train_data = panda.read_csv(filepath_or_buffer='data/train_1.csv')
train_data.shape

(9557, 145)

In [3]:
# Normalise every column label to a lower-case string so later cells can
# refer to columns by their lower-case names.
train_data.columns = train_data.columns.map(lambda label: str(label).lower())

In [4]:
# Count the distinct household identifiers (idhogar) in the training set.
household_ids = train_data['idhogar'].unique()
print('number of unique households', len(household_ids))

number of unique households 2988


In [5]:
# Two sample households used to understand the data better.
# They were originally picked as elements [10] and [11] of
# list(set(train_data.idhogar.values)); the ids are pinned here, presumably
# so the cells below keep showing the same households on every run.
h1, h2 = '85595a252', '40f1c3c06'

In [6]:
# Show every row belonging to either of the two sample households.
train_data.loc[train_data['idhogar'].isin([h1, h2])]

Unnamed: 0,id,monthly_rent_paid,hacdor,rooms,hacapo,v14a,refrig,tablet_owner,num_of_tablets,r4h1,...,sqbhogar_total,sqbedjefe,sqbhogar_nin,sqbovercrowding,sqbdependency,sqbmeaned,agesq,target,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_mean
4176,ID_28cee0839,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,1369,4,0.0,0.46
4177,ID_186156bfd,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,1156,4,0.0,0.46
4178,ID_242c9edb1,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,225,4,1.0,1.0
4179,ID_fd6615cb7,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,121,4,0.0,0.0
5403,ID_9b559d1bf,83333.0,0,3,0,1,1,0,0.0,0,...,4,121,0,1.0,0.0,72.25,676,4,0.0,0.46
5404,ID_6e6ed4b49,83333.0,0,3,0,1,1,0,0.0,0,...,4,121,0,1.0,0.0,72.25,841,4,0.0,0.46


In [7]:
# Tally the rows flagged as head of household (1) versus everyone else (0).
train_data.loc[:, 'parentesco1'].value_counts()

0    6584
1    2973
Name: parentesco1, dtype: int64

In [8]:
# All rows (household members) of the first sample household.
train_data[train_data['idhogar'].eq(h1)]

Unnamed: 0,id,monthly_rent_paid,hacdor,rooms,hacapo,v14a,refrig,tablet_owner,num_of_tablets,r4h1,...,sqbhogar_total,sqbedjefe,sqbhogar_nin,sqbovercrowding,sqbdependency,sqbmeaned,agesq,target,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_mean
4176,ID_28cee0839,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,1369,4,0.0,0.46
4177,ID_186156bfd,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,1156,4,0.0,0.46
4178,ID_242c9edb1,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,225,4,1.0,1.0
4179,ID_fd6615cb7,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,121,4,0.0,0.0


In [9]:
# Head-of-household row for household h1.
# Both conditions are combined into one boolean mask. Chaining a second .loc
# would apply a mask built on the full frame to the already-filtered rows and
# depend on pandas silently re-aligning that mask by index.
train_data.loc[(train_data['idhogar'] == h1) & (train_data['parentesco1'] == 1)]

Unnamed: 0,id,monthly_rent_paid,hacdor,rooms,hacapo,v14a,refrig,tablet_owner,num_of_tablets,r4h1,...,sqbhogar_total,sqbedjefe,sqbhogar_nin,sqbovercrowding,sqbdependency,sqbmeaned,agesq,target,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_mean
4177,ID_186156bfd,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,1156,4,0.0,0.46


#### Observations

1. There are multiple rows for the same household having the same target value

2. As reported in the data dictionary, the scoring is made only for head of the family.

3. We would need to transform the data in such a way that one row corresponds to one single household while, at the same time,
it retains the information provided, e.g. the number of children below a certain age can be a very valuable indicator of whether aid is required.

4. Separate out the columns that describe material conditions (number of refrigerators, access to sanitation, etc.) from the columns that describe human statistics (age, education, household demographics, etc.).

5. Join on household id column to form a table with large number of features but still retaining all data required



In [10]:
# Mapping from the terse survey column codes to readable names; it is passed
# to DataFrame.rename further down.
demo_column_name_dict = {
    # r4h*: male head-counts
    'r4h1': 'males_below_12',
    'r4h2': 'males_above_12',
    'r4h3': 'total_males',
    # r4m*: female head-counts
    'r4m1': 'females_below_12',
    'r4m2': 'females_above_12',
    'r4m3': 'total_females',
    # r4t*: head-counts regardless of gender
    'r4t1': 'person_below_12',
    'r4t2': 'person_above_12',
    'r4t3': 'total_num_person',
    'tamviv': 'total_living',
    # schooling
    'escolari': 'yrs_of_schooling',
    'rez_esc': 'yrs_behind_in_school',
    'meaneduc': 'average_education',
    # household composition
    'hogar_nin': 'num_children',  # number of children 0 to 19 in household
    'hogar_adul': 'num_adults',
    'hogar_mayor': 'num_senior',
    'hogar_total': 'total_individuals',
}

In [11]:
# Readable demographic column names (the values of the rename map), followed
# by the 'male' and 'female' columns.
demographic_columns = [*demo_column_name_dict.values(), 'male', 'female']
# Starts empty; nothing in this section adds material-description columns yet.
material_columns = []

In [12]:
# Display the assembled list of demographic column names.
demographic_columns

['males_below_12',
 'males_above_12',
 'total_males',
 'females_below_12',
 'females_above_12',
 'total_females',
 'person_below_12',
 'person_above_12',
 'total_num_person',
 'total_living',
 'yrs_of_schooling',
 'yrs_behind_in_school',
 'average_education',
 'num_children',
 'num_adults',
 'num_senior',
 'total_individuals',
 'male',
 'female']

In [13]:
# All prototyping is done on this two-household sample, to see what the final
# data frame would look like.
# .copy() makes the sample independent of train_data: columns are dropped from
# it in place further down, and doing that on a boolean-filtered slice can
# raise SettingWithCopyWarning (silenced by the global warnings filter).
trial_table = train_data[train_data['idhogar'].isin([h1, h2])].copy()
trial_table

Unnamed: 0,id,monthly_rent_paid,hacdor,rooms,hacapo,v14a,refrig,tablet_owner,num_of_tablets,r4h1,...,sqbhogar_total,sqbedjefe,sqbhogar_nin,sqbovercrowding,sqbdependency,sqbmeaned,agesq,target,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_mean
4176,ID_28cee0839,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,1369,4,0.0,0.46
4177,ID_186156bfd,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,1156,4,0.0,0.46
4178,ID_242c9edb1,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,225,4,1.0,1.0
4179,ID_fd6615cb7,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,121,4,0.0,0.0
5403,ID_9b559d1bf,83333.0,0,3,0,1,1,0,0.0,0,...,4,121,0,1.0,0.0,72.25,676,4,0.0,0.46
5404,ID_6e6ed4b49,83333.0,0,3,0,1,1,0,0.0,0,...,4,121,0,1.0,0.0,72.25,841,4,0.0,0.46


In [14]:
# Keep only the members who are NOT head of household (parentesco1 == 0).
# .copy() detaches the result from trial_table: its columns are renamed in
# place further down, and doing that on a boolean-filtered slice can raise
# SettingWithCopyWarning.
trial_table_non_primary = trial_table[trial_table['parentesco1'] == 0].copy()
trial_table_non_primary

Unnamed: 0,id,monthly_rent_paid,hacdor,rooms,hacapo,v14a,refrig,tablet_owner,num_of_tablets,r4h1,...,sqbhogar_total,sqbedjefe,sqbhogar_nin,sqbovercrowding,sqbdependency,sqbmeaned,agesq,target,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_mean
4176,ID_28cee0839,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,1369,4,0.0,0.46
4178,ID_242c9edb1,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,225,4,1.0,1.0
4179,ID_fd6615cb7,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,121,4,0.0,0.0
5403,ID_9b559d1bf,83333.0,0,3,0,1,1,0,0.0,0,...,4,121,0,1.0,0.0,72.25,676,4,0.0,0.46


In [15]:
# Rename the terse survey columns to meaningful terms.
# The result is reassigned rather than applied with inplace=True, because the
# frame comes from boolean filtering and mutating it in place can raise
# SettingWithCopyWarning.
trial_table_non_primary = trial_table_non_primary.rename(columns=demo_column_name_dict)

In [16]:
# Display the non-head rows again to confirm the renamed column labels.
trial_table_non_primary

Unnamed: 0,id,monthly_rent_paid,hacdor,rooms,hacapo,v14a,refrig,tablet_owner,num_of_tablets,males_below_12,...,sqbhogar_total,sqbedjefe,sqbhogar_nin,sqbovercrowding,sqbdependency,sqbmeaned,agesq,target,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_mean
4176,ID_28cee0839,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,1369,4,0.0,0.46
4178,ID_242c9edb1,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,225,4,1.0,1.0
4179,ID_fd6615cb7,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,121,4,0.0,0.0
5403,ID_9b559d1bf,83333.0,0,3,0,1,1,0,0.0,0,...,4,121,0,1.0,0.0,72.25,676,4,0.0,0.46


In [17]:
# Household key plus the schooling/education columns that get aggregated per
# household below.
agg_columns = [
    'idhogar',
    'yrs_behind_in_school_nan_with_max_appearing',
    'yrs_behind_in_school_nan_with_mean',
    'yrs_of_schooling',
    'average_education',
    'male_education_years',
    'female_education_years',
]
# Preview only those columns for the non-head members.
trial_table_non_primary.loc[:, agg_columns]

Unnamed: 0,idhogar,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_mean,yrs_of_schooling,average_education,male_education_years,female_education_years
4176,85595a252,0.0,0.46,14,12.5,0,11
4178,85595a252,1.0,1.0,7,12.5,0,11
4179,85595a252,0.0,0.0,4,12.5,0,11
5403,40f1c3c06,0.0,0.46,6,8.5,11,0


In [18]:
# Group the non-head members by household id so statistics can be computed
# per household.
groupby_idhogar = trial_table_non_primary.groupby(by=['idhogar'])

In [19]:
# Per-household summary statistics over the non-head members.
# The two *_education_years columns get the five basic statistics; the other
# columns additionally get a distinct-value count.
basic_stats = ['sum', 'mean', 'max', 'min', 'std']
stats_with_nunique = ['nunique'] + basic_stats

aggregation_spec = {
    'male_education_years': basic_stats,
    'female_education_years': basic_stats,
    'average_education': stats_with_nunique,
    'yrs_of_schooling': stats_with_nunique,
    'yrs_behind_in_school_nan_with_mean': stats_with_nunique,
    'yrs_behind_in_school_nan_with_max_appearing': stats_with_nunique,
}

t = groupby_idhogar.agg(aggregation_spec)


In [20]:
# Move the household id out of the index into an ordinary column. The column
# labels are still two-level (column, statistic) at this point.
t = t.reset_index()
t.head()

Unnamed: 0_level_0,idhogar,male_education_years,male_education_years,male_education_years,male_education_years,male_education_years,female_education_years,female_education_years,female_education_years,female_education_years,...,yrs_behind_in_school_nan_with_mean,yrs_behind_in_school_nan_with_mean,yrs_behind_in_school_nan_with_mean,yrs_behind_in_school_nan_with_mean,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_max_appearing
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,max,min,std,sum,mean,max,min,...,mean,max,min,std,nunique,sum,mean,max,min,std
0,40f1c3c06,11,11,11,11,,0,0,0,0,...,0.46,0.46,0.46,,1,0.0,0.0,0.0,0.0,
1,85595a252,0,0,0,0,0.0,33,11,11,11,...,0.486667,1.0,0.0,0.500533,2,1.0,0.333333,1.0,0.0,0.57735


In [21]:
# Flatten the two-level (column, statistic) labels into single strings of the
# form 'occupants_<column>_<statistic>'. The household id column has an empty
# statistic level, so it becomes 'occupants_idhogar_'.
new_cols = [f'occupants_{column}_{statistic}' for column, statistic in t.columns]
t.columns = new_cols

In [22]:
# Restore the household key's plain name after flattening, then replace the
# NaN cells with 0 (std is NaN for a household with a single non-head member,
# as the 40f1c3c06 row shows).
t = t.rename(columns={'occupants_idhogar_': 'idhogar'}).fillna(0)
t.head()

Unnamed: 0,idhogar,occupants_male_education_years_sum,occupants_male_education_years_mean,occupants_male_education_years_max,occupants_male_education_years_min,occupants_male_education_years_std,occupants_female_education_years_sum,occupants_female_education_years_mean,occupants_female_education_years_max,occupants_female_education_years_min,...,occupants_yrs_behind_in_school_nan_with_mean_mean,occupants_yrs_behind_in_school_nan_with_mean_max,occupants_yrs_behind_in_school_nan_with_mean_min,occupants_yrs_behind_in_school_nan_with_mean_std,occupants_yrs_behind_in_school_nan_with_max_appearing_nunique,occupants_yrs_behind_in_school_nan_with_max_appearing_sum,occupants_yrs_behind_in_school_nan_with_max_appearing_mean,occupants_yrs_behind_in_school_nan_with_max_appearing_max,occupants_yrs_behind_in_school_nan_with_max_appearing_min,occupants_yrs_behind_in_school_nan_with_max_appearing_std
0,40f1c3c06,11,11,11,11,0.0,0,0,0,0,...,0.46,0.46,0.46,0.0,1,0.0,0.0,0.0,0.0,0.0
1,85595a252,0,0,0,0,0.0,33,11,11,11,...,0.486667,1.0,0.0,0.500533,2,1.0,0.333333,1.0,0.0,0.57735


In [23]:
# Look at the full sample (heads and non-heads) that the aggregates are
# merged into below.
trial_table.head()

Unnamed: 0,id,monthly_rent_paid,hacdor,rooms,hacapo,v14a,refrig,tablet_owner,num_of_tablets,r4h1,...,sqbhogar_total,sqbedjefe,sqbhogar_nin,sqbovercrowding,sqbdependency,sqbmeaned,agesq,target,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_mean
4176,ID_28cee0839,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,1369,4,0.0,0.46
4177,ID_186156bfd,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,1156,4,0.0,0.46
4178,ID_242c9edb1,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,225,4,1.0,1.0
4179,ID_fd6615cb7,0.0,0,6,0,1,1,1,1.0,1,...,16,0,4,1.777778,1.0,156.25,121,4,0.0,0.0
5403,ID_9b559d1bf,83333.0,0,3,0,1,1,0,0.0,0,...,4,121,0,1.0,0.0,72.25,676,4,0.0,0.46


In [24]:
# Drop the two columns that are not carried into the merged table.
# The result is reassigned rather than applied with inplace=True, because
# trial_table comes from boolean filtering and mutating it in place can raise
# SettingWithCopyWarning.
trial_table = trial_table.drop(columns=['sqbmeaned', 'rez_esc'])

In [47]:
# Size of the sample, and a per-row flag telling whether any cell is missing.
(trial_table.shape, trial_table.isna().any(axis='columns'))

((6, 143), 4176    False
 4177    False
 4178    False
 4179    False
 5403    False
 5404    False
 dtype: bool)

In [51]:
# Attach the per-household occupant aggregates to every member row.
# A left join keeps households that have no non-head member (and therefore no
# row in t); an inner join would drop those households entirely.
# validate='many_to_one' asserts that t holds exactly one row per household.
tt = panda.merge(trial_table, t, how='left', on='idhogar', validate='many_to_one')

# Households without non-head members get 0 for every occupant statistic,
# matching the zero fill already applied to t.
occupant_columns = [column for column in t.columns if column != 'idhogar']
tt[occupant_columns] = tt[occupant_columns].fillna(0)

In [52]:
# Size of the aggregate table (one row per household).
t.shape

(2, 35)

In [53]:
# Size of the merged table: the member rows with the aggregate columns added.
tt.shape

(6, 177)

In [54]:
# Confirm the merge introduced no missing values: one flag per row, True if
# any cell in that row is null.
tt.isna().any(axis='columns')

0    False
1    False
2    False
3    False
4    False
5    False
dtype: bool