In [29]:
import pandas as pd 
import os 
import glob 
from collections import Counter

# Merging

In [30]:
glob.glob("*csv")

['all_data.csv',
 'binary_test.csv',
 'binary_train.csv',
 'binary_x_test.csv',
 'binary_x_train.csv',
 'binary_y_test.csv',
 'binary_y_train.csv',
 'data_merged_nocatvars.csv',
 'data_merged_noMissingData.csv',
 'diaries_trx_trunc_loans_start_bal.csv',
 'individual_outcomes_33_150.csv',
 'loan_outcomes_33_150.csv']

In [31]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [32]:
data = pd.read_csv('all_data.csv', dtype = {'m_ids_owner': 'object'})

In [33]:
data.columns

Index(['m_ids_owner', 'owner_score', 'binarize_score', 'hh_ids', 'con_pur_hh_meanptrx', 'con_pur_hh_sdptrx', 'con_pur_hh_minpti', 'con_pur_hh_meanpti', 'con_pur_hh_medpti_x', 'con_pur_hh_maxpti',
       ...
       'hh_inf_electricity_access', 'hh_inf_interiorcond', 'hh_asset_internet_access', 'hh_env_settingdesc', 'hh_env_dilapbuildings', 'hh_ownership_docu', 'hh_otherprop_any', 'hh_otherprop_N', 'hh_othershamba_N', 'add_prop_value'], dtype='object', length=193)

# Formatting: 

In [34]:
data.shape

(136, 193)

## Dealing with the categorical variables: 

In [35]:
data.dtypes[data.dtypes == 'object']

m_ids_owner               object
hh_ids                    object
edu_i_attain              object
dem_i_male                object
dem_i_age_5yrgrp          object
dem_i_relhead             object
dem_i_marstat             object
dem_i_tribe               object
hh_inf_lighting_source    object
dtype: object

#### Dropping age group b/c we have age itself: 

In [36]:
data = data.drop(['dem_i_age_5yrgrp'], axis = 1)

#### Reformatting the edu_i_attain entries: 

In [37]:
data.edu_i_attain.value_counts()

Primary (some or complete)           70
Secondary (some or complete)         50
Post-secondary (some or complete)    11
No education                          3
Nursery/ Kindergarten                 1
Name: edu_i_attain, dtype: int64

In [38]:
new_edu = []
for idx, row in data.iterrows(): 
    if 'Primary' in str(row['edu_i_attain']): 
        new_edu.append(2)
    elif 'Secondary' in str(row['edu_i_attain']): 
        new_edu.append(3)
    elif 'Post-secondary' in str(row['edu_i_attain']): 
        new_edu.append(4)
    elif 'education' in str(row['edu_i_attain']): 
        new_edu.append(0)
    else: 
        new_edu.append(1)

In [39]:
Counter(new_edu)

Counter({3: 50, 4: 11, 2: 70, 0: 3, 1: 2})

In [40]:
data.edu_i_attain = new_edu

In [41]:
data.edu_i_attain.value_counts()

2    70
3    50
4    11
0     3
1     2
Name: edu_i_attain, dtype: int64

#### Reformatting the dem_i_male entries: 

In [42]:
data.dem_i_male.value_counts()

Female    98
Male      37
Name: dem_i_male, dtype: int64

In [43]:
new_male = []
for idx, row in data.iterrows(): 
    if 'Female' in str(row['dem_i_male']): 
        new_male.append(0)
    else: 
        new_male.append(1)

In [44]:
data.dem_i_male = new_male

In [45]:
data.dtypes[data.dtypes == 'object']

m_ids_owner               object
hh_ids                    object
dem_i_relhead             object
dem_i_marstat             object
dem_i_tribe               object
hh_inf_lighting_source    object
dtype: object

#### Reformatting the dem_i_relhead entries: 

In [46]:
data.dem_i_relhead.value_counts()

Household head               89
Husband or wife              42
Son or daughter               3
Parent (mother or father)     1
Name: dem_i_relhead, dtype: int64

In [47]:
new_relhead = []
for idx, row in data.iterrows(): 
    if 'Household' in str(row['dem_i_relhead']): 
        new_relhead.append('hh_head')
    elif 'Husband' in str(row['dem_i_relhead']):
        new_relhead.append('h_or_w')
    elif 'Son' in str(row['dem_i_relhead']):
        new_relhead.append('s_or_d')    
    else: 
        new_relhead.append('parent')
        

In [48]:
data.dem_i_relhead = new_relhead

In [49]:
new_relhead = pd.get_dummies(data.dem_i_relhead, prefix = 'dem_i_relhead')

In [50]:
data = pd.concat([data.reset_index(drop=True), new_relhead], axis=1)

In [51]:
data = data.drop('dem_i_relhead', axis = 1)

In [52]:
# data.columns

In [53]:
data.dtypes[data.dtypes == 'object']

m_ids_owner               object
hh_ids                    object
dem_i_marstat             object
dem_i_tribe               object
hh_inf_lighting_source    object
dtype: object

#### Reformatting the dem_i_marstat entries: 

In [54]:
data.dem_i_marstat.value_counts()

Married/living together         102
Widowed                          17
Never married/lived together      8
Separated/divorced                8
Name: dem_i_marstat, dtype: int64

In [55]:
new_marstat = []
for idx, row in data.iterrows(): 
    if 'Married' in str(row['dem_i_marstat']): 
        new_marstat.append('married')
    elif 'Widowed' in str(row['dem_i_marstat']):
        new_marstat.append('widowed')
    elif 'Separated' in str(row['dem_i_marstat']):
        new_marstat.append('separated')    
    else: 
        new_marstat.append('never_married')
        

In [56]:
data.dem_i_marstat = new_marstat

In [57]:
new_marstat = pd.get_dummies(data.dem_i_marstat)

In [58]:
data = pd.concat([data.reset_index(drop=True), new_marstat], axis=1)

In [59]:
data = data.drop('dem_i_marstat', axis = 1)

In [60]:
data.dtypes[data.dtypes == 'object']

m_ids_owner               object
hh_ids                    object
dem_i_tribe               object
hh_inf_lighting_source    object
dtype: object

#### Reformatting the dem_i_tribe entries: 

In [61]:
data.dem_i_tribe.value_counts()

Luhya                40
Kamba                32
Kalenjin             25
Mijikenda/Swahili    14
Kikuyu               11
Luo                  10
Kisii                 2
Taita/Taveta          1
Name: dem_i_tribe, dtype: int64

In [62]:
new_tribe = pd.get_dummies(data.dem_i_tribe, prefix = 'tribe')

In [63]:
data = pd.concat([data.reset_index(drop=True), new_tribe], axis=1)

In [64]:
data = data.drop('dem_i_tribe', axis = 1)

In [65]:
data.dtypes[data.dtypes == 'object']

m_ids_owner               object
hh_ids                    object
hh_inf_lighting_source    object
dtype: object

In [66]:
data["hh_inf_lighting_source"].value_counts()

4                     75
6                     27
7                      9
5                      1
3                      1
Collected Firewood     1
Name: hh_inf_lighting_source, dtype: int64

In [67]:
data.dtypes[data.dtypes == 'object']

m_ids_owner               object
hh_ids                    object
hh_inf_lighting_source    object
dtype: object

###  deleting "HH" from m_ids_owner

In [68]:
data.loc[data['m_ids_owner']=="HH"]

Unnamed: 0.1,m_ids_owner,owner_score,binarize_score,hh_ids,con_pur_hh_meanptrx,con_pur_hh_sdptrx,con_pur_hh_minpti,con_pur_hh_meanpti,con_pur_hh_medpti_x,con_pur_hh_maxpti,con_pur_hh_meanpti_ALC,con_pur_hh_medpti_ALC,con_pur_hh_meanpti_CLN,con_pur_hh_medpti_CLN,con_pur_hh_meanpti_CLTH,con_pur_hh_medpti_CLTH,con_pur_hh_meanpti_COMM,con_pur_hh_medpti_COMM,con_pur_hh_meanpti_EDU,con_pur_hh_medpti_EDU,con_pur_hh_meanpti_ENRGY,con_pur_hh_medpti_ENRGY,con_pur_hh_meanpti_ENT,con_pur_hh_medpti_ENT,con_pur_hh_meanpti_FOOD,con_pur_hh_medpti_FOOD,con_pur_hh_meanpti_H20,con_pur_hh_medpti_H20,con_pur_hh_meanpti_HSNG,con_pur_hh_medpti_HSNG,con_pur_hh_meanpti_MED,con_pur_hh_medpti_MED,con_pur_hh_meanpti_OTH,con_pur_hh_medpti_OTH,con_pur_hh_meanpti_PERS,con_pur_hh_medpti_PERS,con_pur_hh_meanpti_REL,con_pur_hh_medpti_REL,con_pur_hh_meanpti_TRNSP,con_pur_hh_medpti_TRNSP,Unnamed: 0_x,happiness,economically,relationships,confidence,police,doctor,unsafe,utilities,asset_taken,miss_app,miss_inc,romantic,hungry,school_sent,m_ids_x,edu_i_enrolled,edu_i_attain,edu_i_attain_yrs,edu_hh_child613inschool,edu_hh_child1417inschool,edu_hh_girls613inschool,edu_hh_girls1417inschool,edu_hh_boys613inschool,edu_hh_boys1417inschool,edu_hh_headattain_yrs,m_ids_y,dem_i_male,dem_i_age_yrs,dem_i_age_workage,dem_i_hhead,dem_i_children_tot,dem_i_children_inhh,dem_urban_x,dem_hh_size_x,dem_hh_meanage,dem_hh_ages0_4,dem_hh_ages5_14,dem_hh_ages15_17,dem_hh_ages0_17,dem_hh_ages15plus,dem_hh_ages18plus,dem_hh_ages65plus,dem_hh_ages20_30,dem_hh_workage,dem_hh_dependents,dem_hh_agehead,dem_hh_malehead,dem_pae_oecd_x,dem_pae_kihbs_x,Unnamed: 0_y,hh_income,hh_income_only,inc_i_unique_sources,inc_i_minpti,inc_i_meanpti,inc_i_sdpti,inc_i_maxpti,inc_i_meanpti_AGRI,inc_i_sdpti_AGRI,inc_i_meanpti_CITMPC,inc_i_sdpti_CITMPC,inc_i_meanpti_NEINCG,inc_i_sdpti_NEINCG,inc_i_meanpti_REI,inc_i_sdpti_REI,inc_i_meanpti_RIOI,inc_i_sdpti_RIOI,inc_i_meanpti_SEI,inc_i_sdpti_SEI,dom_inc_AGRI,dom_inc_CITMPC,dom_inc_NEINCG,dom_inc_REI,dom_inc_RIOI,dom_inc_SEI,Unnamed: 0,rem_i_unique_sources,rem_i_meanptrx,rem_i_sdptrx,rem_i_meanptrx_RG,rem_i_sdptrx_RG,rem_i_meanptrx_RR,rem_i_sdptrx_RR,rem_i_count_medianpti,rem_i_abs_medianpti,rem_i_net_medianpti,rem_i_count_medianpti_RG,rem_i_abs_medianpti_RG,rem_i_count_medianpti_RR,rem_i_abs_medianpti_RR,rem_i_count_medianpti_cash,rem_i_abs_medianpti_cash,rem_i_net_medianpti_cash,rem_i_count_medianpti_inkind,rem_i_abs_medianpti_inkind,rem_i_net_medianpti_inkind,rem_i_count_medianpti_mm,rem_i_abs_medianpti_mm,rem_i_net_medianpti_mm,rem_i_count_medianpti_other,rem_i_abs_medianpti_other,rem_i_net_medianpti_other,inc_hh_medianpti,con_pur_hh_medpti_y,con_prod_hh_medpti,dem_urban_y,dem_hh_size_y,dem_pae_oecd_y,dem_pae_kihbs_y,con_hh_medpti,inc_hh_pti_pae_oecd,con_hh_pti_pae_oecd,inc_hh_pti_pae_kihbs,con_hh_pti_pae_kihbs,inc_below_85_kes_oecd,con_below_85_kes_oecd,inc_below_85_kes_kihbs,con_below_85_kes_kihbs,inc_below_170_kes_oecd,con_below_170_kes_oecd,inc_below_170_kes_kihbs,con_below_170_kes_kihbs,inc_below_425_kes_oecd,con_below_425_kes_oecd,inc_below_425_kes_kihbs,con_below_425_kes_kihbs,monthly_left,hh_inf_wall_material,hh_inf_roof_material,hh_inf_floor_material,hh_inf_cookfuel_source,hh_inf_lighting_source,hh_inf_water_source,hh_inf_rooms_habitable,hh_inf_rooms_other,hh_inf_toilet_type,hh_inf_toilet_shared,hh_inf_housing_type,hh_inf_electricity_access,hh_inf_interiorcond,hh_asset_internet_access,hh_env_settingdesc,hh_env_dilapbuildings,hh_ownership_docu,hh_otherprop_any,hh_otherprop_N,hh_othershamba_N,add_prop_value,dem_i_relhead_h_or_w,dem_i_relhead_hh_head,dem_i_relhead_parent,dem_i_relhead_s_or_d,married,never_married,separated,widowed,tribe_Kalenjin,tribe_Kamba,tribe_Kikuyu,tribe_Kisii,tribe_Luhya,tribe_Luo,tribe_Mijikenda/Swahili,tribe_Taita/Taveta
35,HH,0.571429,999,KVIHC16,115.6588,345.8901,1070,5825.0,5470.0,15380,0.0,0.0,119.0909,150.0,50.0,0,72.72727,70.0,1145.455,0.0,234.5455,230.0,0.0,0,3925.909,4040.0,0.0,0.0,0.0,0.0,50.0,0,27.27273,0.0,10.90909,0.0,15.45455,0.0,173.6364,60.0,245,3.6,3.45,4.0,3.95,0.0,0.157895,0.0,0.0,0.0,0.052632,0.052632,0.0,0.105263,0.272727,,,1,,,,,,,,,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0


In [69]:
data = data.drop(35)

In [70]:
data.to_csv('data_merged_nocatvars.csv', index = False)

## I'm only going to take those variables that are complete from here; later once Dinara's ds is fixed we can do more with imputation and stuff: 

In [71]:
data = pd.read_csv('data_merged_nocatvars.csv')

In [72]:
data.describe().loc['count']

m_ids_owner                     135.0
owner_score                     135.0
binarize_score                  135.0
con_pur_hh_meanptrx             135.0
con_pur_hh_sdptrx               135.0
con_pur_hh_minpti               135.0
con_pur_hh_meanpti              135.0
con_pur_hh_medpti_x             135.0
con_pur_hh_maxpti               135.0
con_pur_hh_meanpti_ALC          135.0
con_pur_hh_medpti_ALC           135.0
con_pur_hh_meanpti_CLN          135.0
con_pur_hh_medpti_CLN           135.0
con_pur_hh_meanpti_CLTH         135.0
con_pur_hh_medpti_CLTH          135.0
con_pur_hh_meanpti_COMM         135.0
con_pur_hh_medpti_COMM          135.0
con_pur_hh_meanpti_EDU          135.0
con_pur_hh_medpti_EDU           135.0
con_pur_hh_meanpti_ENRGY        135.0
con_pur_hh_medpti_ENRGY         135.0
con_pur_hh_meanpti_ENT          135.0
con_pur_hh_medpti_ENT           135.0
con_pur_hh_meanpti_FOOD         135.0
con_pur_hh_medpti_FOOD          135.0
con_pur_hh_meanpti_H20          135.0
con_pur_hh_m

In [75]:
data[data["inc_i_meanpti"].isnull()]

Unnamed: 0.1,m_ids_owner,owner_score,binarize_score,hh_ids,con_pur_hh_meanptrx,con_pur_hh_sdptrx,con_pur_hh_minpti,con_pur_hh_meanpti,con_pur_hh_medpti_x,con_pur_hh_maxpti,con_pur_hh_meanpti_ALC,con_pur_hh_medpti_ALC,con_pur_hh_meanpti_CLN,con_pur_hh_medpti_CLN,con_pur_hh_meanpti_CLTH,con_pur_hh_medpti_CLTH,con_pur_hh_meanpti_COMM,con_pur_hh_medpti_COMM,con_pur_hh_meanpti_EDU,con_pur_hh_medpti_EDU,con_pur_hh_meanpti_ENRGY,con_pur_hh_medpti_ENRGY,con_pur_hh_meanpti_ENT,con_pur_hh_medpti_ENT,con_pur_hh_meanpti_FOOD,con_pur_hh_medpti_FOOD,con_pur_hh_meanpti_H20,con_pur_hh_medpti_H20,con_pur_hh_meanpti_HSNG,con_pur_hh_medpti_HSNG,con_pur_hh_meanpti_MED,con_pur_hh_medpti_MED,con_pur_hh_meanpti_OTH,con_pur_hh_medpti_OTH,con_pur_hh_meanpti_PERS,con_pur_hh_medpti_PERS,con_pur_hh_meanpti_REL,con_pur_hh_medpti_REL,con_pur_hh_meanpti_TRNSP,con_pur_hh_medpti_TRNSP,Unnamed: 0_x,happiness,economically,relationships,confidence,police,doctor,unsafe,utilities,asset_taken,miss_app,miss_inc,romantic,hungry,school_sent,m_ids_x,edu_i_enrolled,edu_i_attain,edu_i_attain_yrs,edu_hh_child613inschool,edu_hh_child1417inschool,edu_hh_girls613inschool,edu_hh_girls1417inschool,edu_hh_boys613inschool,edu_hh_boys1417inschool,edu_hh_headattain_yrs,m_ids_y,dem_i_male,dem_i_age_yrs,dem_i_age_workage,dem_i_hhead,dem_i_children_tot,dem_i_children_inhh,dem_urban_x,dem_hh_size_x,dem_hh_meanage,dem_hh_ages0_4,dem_hh_ages5_14,dem_hh_ages15_17,dem_hh_ages0_17,dem_hh_ages15plus,dem_hh_ages18plus,dem_hh_ages65plus,dem_hh_ages20_30,dem_hh_workage,dem_hh_dependents,dem_hh_agehead,dem_hh_malehead,dem_pae_oecd_x,dem_pae_kihbs_x,Unnamed: 0_y,hh_income,hh_income_only,inc_i_unique_sources,inc_i_minpti,inc_i_meanpti,inc_i_sdpti,inc_i_maxpti,inc_i_meanpti_AGRI,inc_i_sdpti_AGRI,inc_i_meanpti_CITMPC,inc_i_sdpti_CITMPC,inc_i_meanpti_NEINCG,inc_i_sdpti_NEINCG,inc_i_meanpti_REI,inc_i_sdpti_REI,inc_i_meanpti_RIOI,inc_i_sdpti_RIOI,inc_i_meanpti_SEI,inc_i_sdpti_SEI,dom_inc_AGRI,dom_inc_CITMPC,dom_inc_NEINCG,dom_inc_REI,dom_inc_RIOI,dom_inc_SEI,Unnamed: 0,rem_i_unique_sources,rem_i_meanptrx,rem_i_sdptrx,rem_i_meanptrx_RG,rem_i_sdptrx_RG,rem_i_meanptrx_RR,rem_i_sdptrx_RR,rem_i_count_medianpti,rem_i_abs_medianpti,rem_i_net_medianpti,rem_i_count_medianpti_RG,rem_i_abs_medianpti_RG,rem_i_count_medianpti_RR,rem_i_abs_medianpti_RR,rem_i_count_medianpti_cash,rem_i_abs_medianpti_cash,rem_i_net_medianpti_cash,rem_i_count_medianpti_inkind,rem_i_abs_medianpti_inkind,rem_i_net_medianpti_inkind,rem_i_count_medianpti_mm,rem_i_abs_medianpti_mm,rem_i_net_medianpti_mm,rem_i_count_medianpti_other,rem_i_abs_medianpti_other,rem_i_net_medianpti_other,inc_hh_medianpti,con_pur_hh_medpti_y,con_prod_hh_medpti,dem_urban_y,dem_hh_size_y,dem_pae_oecd_y,dem_pae_kihbs_y,con_hh_medpti,inc_hh_pti_pae_oecd,con_hh_pti_pae_oecd,inc_hh_pti_pae_kihbs,con_hh_pti_pae_kihbs,inc_below_85_kes_oecd,con_below_85_kes_oecd,inc_below_85_kes_kihbs,con_below_85_kes_kihbs,inc_below_170_kes_oecd,con_below_170_kes_oecd,inc_below_170_kes_kihbs,con_below_170_kes_kihbs,inc_below_425_kes_oecd,con_below_425_kes_oecd,inc_below_425_kes_kihbs,con_below_425_kes_kihbs,monthly_left,hh_inf_wall_material,hh_inf_roof_material,hh_inf_floor_material,hh_inf_cookfuel_source,hh_inf_lighting_source,hh_inf_water_source,hh_inf_rooms_habitable,hh_inf_rooms_other,hh_inf_toilet_type,hh_inf_toilet_shared,hh_inf_housing_type,hh_inf_electricity_access,hh_inf_interiorcond,hh_asset_internet_access,hh_env_settingdesc,hh_env_dilapbuildings,hh_ownership_docu,hh_otherprop_any,hh_otherprop_N,hh_othershamba_N,add_prop_value,dem_i_relhead_h_or_w,dem_i_relhead_hh_head,dem_i_relhead_parent,dem_i_relhead_s_or_d,married,never_married,separated,widowed,tribe_Kalenjin,tribe_Kamba,tribe_Kikuyu,tribe_Kisii,tribe_Luhya,tribe_Luo,tribe_Mijikenda/Swahili,tribe_Taita/Taveta
121,65134382764500000,1.0,1,KMAKL10,122.5856,224.0656,2008,3711.0,3005.0,6545,0.0,0.0,73.63636,70.0,247.2727,0,28.18182,10.0,876.0909,400.0,112.2727,115.0,0.0,0,1595.636,1840.0,0.0,0.0,0.0,0.0,158.6364,150,5.454545,0.0,40.63636,25.0,156.8182,75.0,416.3636,0.0,96,3.090909,3.0,4.181818,4.045455,0.0,0.052632,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.357143,6.513438e+16,1.0,1,0.0,0.666667,1.0,0.5,,1.0,1.0,8.0,6.513438e+16,0,17.0,1.0,0.0,,,0.0,8.0,16.5,1.0,4.0,1.0,6.0,3.0,2.0,0.0,0.0,0.375,0.625,38.0,0.0,4.9,5.84,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0


There seems to be one person that has a loan but has no income information: 65134382764500000.

In [45]:
cols_to_remove = data.describe().count().index[data.describe().loc['count']<135]

In [46]:
data = data.drop(cols_to_remove, axis= 1)

In [47]:
data.describe().loc['count'].value_counts()

135.0    95
Name: count, dtype: int64

In [48]:
data.to_csv('data_merged_noMissingData.csv', index = False)

## Removing unnecessary variables: 

This shows how many columns have 139 entries

In [49]:
data = pd.read_csv('data_merged_nocatvars.csv')

In [50]:
data.shape

(135, 205)

In [51]:
pd.DataFrame(data.describe()).loc['count'].value_counts()

135.0    95
116.0    47
134.0    19
123.0     7
114.0     5
115.0     3
113.0     3
133.0     2
42.0      2
128.0     2
109.0     2
99.0      1
81.0      1
62.0      1
54.0      1
68.0      1
0.0       1
23.0      1
74.0      1
70.0      1
112.0     1
108.0     1
111.0     1
106.0     1
82.0      1
35.0      1
32.0      1
Name: count, dtype: int64

98 cols have 138 entries (e.g. no NA values) 

If we don't have information for over half the population, going to just delete those columns: 

In [52]:
data.describe().loc['count']<69

m_ids_owner                     False
owner_score                     False
binarize_score                  False
con_pur_hh_meanptrx             False
con_pur_hh_sdptrx               False
con_pur_hh_minpti               False
con_pur_hh_meanpti              False
con_pur_hh_medpti_x             False
con_pur_hh_maxpti               False
con_pur_hh_meanpti_ALC          False
con_pur_hh_medpti_ALC           False
con_pur_hh_meanpti_CLN          False
con_pur_hh_medpti_CLN           False
con_pur_hh_meanpti_CLTH         False
con_pur_hh_medpti_CLTH          False
con_pur_hh_meanpti_COMM         False
con_pur_hh_medpti_COMM          False
con_pur_hh_meanpti_EDU          False
con_pur_hh_medpti_EDU           False
con_pur_hh_meanpti_ENRGY        False
con_pur_hh_medpti_ENRGY         False
con_pur_hh_meanpti_ENT          False
con_pur_hh_medpti_ENT           False
con_pur_hh_meanpti_FOOD         False
con_pur_hh_medpti_FOOD          False
con_pur_hh_meanpti_H20          False
con_pur_hh_m

In [53]:
cols_to_remove = data.describe().count().index[data.describe().loc['count']<69]

In [54]:
data = data.drop(cols_to_remove, axis= 1)

In [55]:
data.describe().loc['count'].value_counts()

135.0    95
116.0    47
134.0    19
123.0     7
114.0     5
115.0     3
113.0     3
133.0     2
128.0     2
109.0     2
99.0      1
81.0      1
82.0      1
106.0     1
70.0      1
112.0     1
108.0     1
111.0     1
74.0      1
Name: count, dtype: int64

In [56]:
data.shape

(135, 196)