Our data is now clean, in the sense that no column contains empty values any more: data imputation is complete.

It's time to analyze the data further.

Two fields stand out from the data dictionary:

idhogar - this is a unique identifier for each household. This can be used to create household-wide features, etc. All rows in a given household will have a matching value for this identifier.

parentesco1 - indicates if this person is the head of the household.

In [1]:
import pandas as panda
import numpy as np
import warnings
warnings.simplefilter('ignore')

In [2]:
# Load the training data; the notebook's opening notes describe this file as already imputed.
train_data = panda.read_csv('data/train_1.csv')
# Display (rows, columns) as a quick sanity check on the load.
train_data.shape

(9557, 145)

In [3]:
# Normalise every column label to a lower-case string so later cells can refer to columns by lower-case name.
train_data.columns = [str(label).lower() for label in train_data.columns]

In [11]:
# Remove 'sqbmeaned' and 'rez_esc' from the working frame.
# The result is assigned back (no inplace=True) and errors='ignore' is passed so
# that re-running this cell after the columns are already gone does not raise KeyError.
train_data = train_data.drop(columns=['sqbmeaned', 'rez_esc'], errors='ignore')

In [13]:
# Count the distinct household identifiers (idhogar) across all person rows.
unique_households = set(train_data['idhogar'].values)
print('number of unique households', len(unique_households))

number of unique households 2988


#### Observations

1. There are multiple rows for the same household having the same target value

2. As reported in the data dictionary, the scoring is made only for head of the family.

3. We would need to transform the data in such a way that one row corresponds to one single household while
it still retains the information provided, e.g. the number of children below a certain age can be a very valuable indicator of whether aid is required.

4. Separate out the columns that describe material conditions (number of refrigerators, access to sanitation, etc.) from the columns that describe the people (age, education, household demographics, etc.).

5. Join on household id column to form a table with large number of features but still retaining all data required



In [26]:
# Readable names for the household-composition and education columns.
# Only the mapping is defined here; it is applied to the frame in a later cell.
# NOTE(review): 'rez_esc' is dropped from train_data in an earlier cell (In [11]),
# so its entry below currently renames nothing - confirm whether it should stay.
demo_column_name_dict = dict([
    # head counts by sex and age band
    ('r4h1', 'males_below_12'),
    ('r4h2', 'males_above_12'),
    ('r4h3', 'total_males'),
    ('r4m1', 'females_below_12'),
    ('r4m2', 'females_above_12'),
    ('r4m3', 'total_females'),
    ('r4t1', 'person_below_12'),
    ('r4t2', 'person_above_12'),
    ('r4t3', 'total_num_person'),
    ('tamviv', 'total_living'),
    # education
    ('escolari', 'yrs_of_schooling'),
    ('rez_esc', 'yrs_behind_in_school'),
    ('meaneduc', 'average_education'),
    # household composition
    ('hogar_nin', 'num_children'),  # Number of children 0 to 19 in household
    ('hogar_adul', 'num_adults'),
    ('hogar_mayor', 'num_senior'),
    ('hogar_total', 'total_individuals'),
])

In [32]:
# Apply the readable column names. The renamed frame is assigned back rather than
# mutated in place; mapping keys that are not present as columns are left alone by rename.
train_data = train_data.rename(columns=demo_column_name_dict)

In [33]:
# Names of the people-related columns: every renamed column plus 'male' and 'female'.
# NOTE(review): this list includes 'yrs_behind_in_school', but its source column
# 'rez_esc' is dropped in an earlier cell - verify the name exists in train_data
# before selecting columns with this list.
demographic_columns = [*demo_column_name_dict.values(), 'male', 'female']
# Empty container for the material-description columns; nothing is added to it in this cell.
material_columns = []

In [34]:
# Show the assembled list of demographic column names.
demographic_columns

['males_below_12',
 'males_above_12',
 'total_males',
 'females_below_12',
 'females_above_12',
 'total_females',
 'person_below_12',
 'person_above_12',
 'total_num_person',
 'total_living',
 'yrs_of_schooling',
 'yrs_behind_in_school',
 'average_education',
 'num_children',
 'num_adults',
 'num_senior',
 'total_individuals',
 'male',
 'female']

In [35]:
# Household key followed by the education-related columns of interest.
# NOTE(review): no other cell visible here references agg_columns; the aggregation
# spec is written out separately - confirm whether this list is still needed.
agg_columns = ['idhogar']
agg_columns += [
    'yrs_behind_in_school_nan_with_max_appearing',
    'yrs_behind_in_school_nan_with_mean',
    'yrs_of_schooling',
    'average_education',
    'male_education_years',
    'female_education_years',
]


In [36]:
# Group the person-level rows by household identifier for the aggregations that follow.
groupby_idhogar = train_data.groupby(by=['idhogar'])

In [37]:
# Per-household summary statistics of the education columns.
# The two per-sex columns get the five basic statistics; the remaining four
# columns additionally get a distinct-value count, listed first.
base_stats = ['sum', 'mean', 'max', 'min', 'std']
stats_with_nunique = ['nunique'] + base_stats

household_agg_spec = {
    **{
        col: base_stats
        for col in ('male_education_years', 'female_education_years')
    },
    **{
        col: stats_with_nunique
        for col in (
            'average_education',
            'yrs_of_schooling',
            'yrs_behind_in_school_nan_with_mean',
            'yrs_behind_in_school_nan_with_max_appearing',
        )
    },
}

t = groupby_idhogar.agg(household_agg_spec)


In [38]:
# Move idhogar from the index back into an ordinary column so it can be used as a
# merge key later. The result is assigned back instead of mutating t in place.
t = t.reset_index()
t.head()

Unnamed: 0_level_0,idhogar,male_education_years,male_education_years,male_education_years,male_education_years,male_education_years,female_education_years,female_education_years,female_education_years,female_education_years,...,yrs_behind_in_school_nan_with_mean,yrs_behind_in_school_nan_with_mean,yrs_behind_in_school_nan_with_mean,yrs_behind_in_school_nan_with_mean,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_max_appearing,yrs_behind_in_school_nan_with_max_appearing
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,max,min,std,sum,mean,max,min,...,mean,max,min,std,nunique,sum,mean,max,min,std
0,001ff74ca,0,0,0,0,0.0,32,16,16,16,...,0.46,0.46,0.46,0.0,1,0.0,0.0,0.0,0.0,0.0
1,003123ec2,24,6,6,6,0.0,0,0,0,0,...,0.46,0.46,0.46,0.0,1,0.0,0.0,0.0,0.0,0.0
2,004616164,6,3,3,3,0.0,0,0,0,0,...,0.23,0.46,0.0,0.325269,1,0.0,0.0,0.0,0.0,0.0
3,004983866,0,0,0,0,0.0,16,8,8,8,...,1.23,2.0,0.46,1.088944,2,2.0,1.0,2.0,0.0,1.414214
4,005905417,0,0,0,0,0.0,27,9,9,9,...,0.306667,0.46,0.0,0.265581,1,0.0,0.0,0.0,0.0,0.0


In [39]:
# Flatten the two-level (column, statistic) labels into single
# 'occupants_<column>_<statistic>' names. The idhogar label has an empty second
# level, so it becomes 'occupants_idhogar_'.
# The guard makes this cell safe to re-run: once the labels are flat strings,
# unpacking them as (column, statistic) pairs would raise.
if isinstance(t.columns, panda.MultiIndex):
    new_cols = ['occupants_' + i + '_' + j for (i, j) in t.columns.tolist()]
    t.columns = new_cols

In [40]:
# Restore the plain key name: flattening the column labels turned the idhogar
# label into 'occupants_idhogar_'.
# Then replace NaN with 0. NaN appears in the std columns for households that
# have a single row, where the sample standard deviation is undefined.
t = t.rename(columns={'occupants_idhogar_': 'idhogar'}).fillna(value=0)
t.head()

Unnamed: 0,idhogar,occupants_male_education_years_sum,occupants_male_education_years_mean,occupants_male_education_years_max,occupants_male_education_years_min,occupants_male_education_years_std,occupants_female_education_years_sum,occupants_female_education_years_mean,occupants_female_education_years_max,occupants_female_education_years_min,...,occupants_yrs_behind_in_school_nan_with_mean_mean,occupants_yrs_behind_in_school_nan_with_mean_max,occupants_yrs_behind_in_school_nan_with_mean_min,occupants_yrs_behind_in_school_nan_with_mean_std,occupants_yrs_behind_in_school_nan_with_max_appearing_nunique,occupants_yrs_behind_in_school_nan_with_max_appearing_sum,occupants_yrs_behind_in_school_nan_with_max_appearing_mean,occupants_yrs_behind_in_school_nan_with_max_appearing_max,occupants_yrs_behind_in_school_nan_with_max_appearing_min,occupants_yrs_behind_in_school_nan_with_max_appearing_std
0,001ff74ca,0,0,0,0,0.0,32,16,16,16,...,0.46,0.46,0.46,0.0,1,0.0,0.0,0.0,0.0,0.0
1,003123ec2,24,6,6,6,0.0,0,0,0,0,...,0.46,0.46,0.46,0.0,1,0.0,0.0,0.0,0.0,0.0
2,004616164,6,3,3,3,0.0,0,0,0,0,...,0.23,0.46,0.0,0.325269,1,0.0,0.0,0.0,0.0,0.0
3,004983866,0,0,0,0,0.0,16,8,8,8,...,1.23,2.0,0.46,1.088944,2,2.0,1.0,2.0,0.0,1.414214
4,005905417,0,0,0,0,0.0,27,9,9,9,...,0.306667,0.46,0.0,0.265581,1,0.0,0.0,0.0,0.0,0.0


In [73]:
# Compare the person-level and household-level frame shapes before merging.
train_data.shape, t.shape

((9557, 143), (2988, 35))

In [80]:
# Attach the household-level aggregates to every person row via the shared idhogar key.
tt = panda.merge(left=train_data, right=t, on='idhogar', how='inner')

In [81]:
# Check the shape of the merged frame.
tt.shape

(9557, 177)

In [82]:
# Keep only the rows flagged as head of household (parentesco1 == 1).
# NOTE(review): the recorded outputs show 2973 head rows versus 2988 unique
# households, so at least 15 households have no head row and are dropped by this
# filter - confirm that this loss is acceptable.
is_household_head = tt['parentesco1'] == 1
aa = tt[is_household_head]
aa.shape

(2973, 177)

In [83]:
# Number of columns in aa that still contain at least one null value.
aa.isnull().any().sum()

0

In [84]:
# Write the head-of-household rows (including the merged aggregates) to disk, omitting the index.
aa.to_csv('data/train_2.csv', index= False)