In [52]:
import pandas as pd 
import os 
import glob 
from collections import Counter

# Merging

In [53]:
glob.glob("*csv")

['loan_outcomes_33_150.csv',
 'individual_outcomes_33_150.csv',
 'binary_train.csv',
 'all_data.csv',
 'binary_test.csv',
 'data_merged_noMissingData.csv',
 'data.csv',
 'diaries_trx_trunc_loans_start_bal.csv',
 'binary_y_train.csv',
 'data_merged_nocatvars.csv',
 'binary_x_train.csv',
 'binary_x_test.csv',
 'binary_y_test.csv']

In [54]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [55]:
data = pd.read_csv('all_data.csv', dtype = {'m_ids_owner': 'object'})

In [56]:
data.columns

Index(['m_ids_owner', 'owner_score', 'binarize_score', 'hh_ids', 'con_pur_hh_meanptrx', 'con_pur_hh_sdptrx', 'con_pur_hh_minpti', 'con_pur_hh_meanpti', 'con_pur_hh_medpti_x', 'con_pur_hh_maxpti',
       ...
       'inc_below_85_kes_kihbs', 'con_below_85_kes_kihbs', 'inc_below_170_kes_oecd', 'con_below_170_kes_oecd', 'inc_below_170_kes_kihbs', 'con_below_170_kes_kihbs', 'inc_below_425_kes_oecd', 'con_below_425_kes_oecd', 'inc_below_425_kes_kihbs', 'con_below_425_kes_kihbs'], dtype='object', length=138)

# Formatting: 

## Dealing with the categorical variables: 

In [57]:
data.dtypes[data.dtypes == 'object']

m_ids_owner         object
hh_ids              object
edu_i_attain        object
dem_i_male          object
dem_i_age_5yrgrp    object
dem_i_relhead       object
dem_i_marstat       object
dem_i_tribe         object
inc_i_dom           object
dtype: object

#### Dropping age group b/c we have age itself: 

In [58]:
data = data.drop(['dem_i_age_5yrgrp'], axis = 1)

#### Reformatting the edu_i_attain entries: 

In [59]:
data.edu_i_attain.value_counts()

Primary (some or complete)           71
Secondary (some or complete)         52
Post-secondary (some or complete)    11
No education                          3
Nursery/ Kindergarten                 1
Name: edu_i_attain, dtype: int64

In [60]:
new_edu = []
for idx, row in data.iterrows(): 
    if 'Primary' in str(row['edu_i_attain']): 
        new_edu.append(2)
    elif 'Secondary' in str(row['edu_i_attain']): 
        new_edu.append(3)
    elif 'Post-secondary' in str(row['edu_i_attain']): 
        new_edu.append(4)
    elif 'education' in str(row['edu_i_attain']): 
        new_edu.append(0)
    else: 
        new_edu.append(1)

In [61]:
Counter(new_edu)

Counter({0: 3, 1: 2, 2: 71, 3: 52, 4: 11})

In [62]:
data.edu_i_attain = new_edu

In [63]:
data.edu_i_attain.value_counts()

2    71
3    52
4    11
0     3
1     2
Name: edu_i_attain, dtype: int64

#### Reformatting the dem_i_male entries: 

In [64]:
data.dem_i_male.value_counts()

Female    100
Male       38
Name: dem_i_male, dtype: int64

In [65]:
new_male = []
for idx, row in data.iterrows(): 
    if 'Female' in str(row['dem_i_male']): 
        new_male.append(0)
    else: 
        new_male.append(1)

In [66]:
data.dem_i_male = new_male

In [67]:
data.dtypes[data.dtypes == 'object']

m_ids_owner      object
hh_ids           object
dem_i_relhead    object
dem_i_marstat    object
dem_i_tribe      object
inc_i_dom        object
dtype: object

#### Reformatting the dem_i_relhead entries: 

In [68]:
data.dem_i_relhead.value_counts()

Household head               92
Husband or wife              42
Son or daughter               3
Parent (mother or father)     1
Name: dem_i_relhead, dtype: int64

In [69]:
new_relhead = []
for idx, row in data.iterrows(): 
    if 'Household' in str(row['dem_i_relhead']): 
        new_relhead.append('hh_head')
    elif 'Husband' in str(row['dem_i_relhead']):
        new_relhead.append('h_or_w')
    elif 'Son' in str(row['dem_i_relhead']):
        new_relhead.append('s_or_d')    
    else: 
        new_relhead.append('parent')
        

In [70]:
data.dem_i_relhead = new_relhead

In [71]:
new_relhead = pd.get_dummies(data.dem_i_relhead, prefix = 'dem_i_relhead')

In [72]:
data = pd.concat([data.reset_index(drop=True), new_relhead], axis=1)

In [73]:
data = data.drop('dem_i_relhead', axis = 1)

In [74]:
# data.columns

In [75]:
data.dtypes[data.dtypes == 'object']

m_ids_owner      object
hh_ids           object
dem_i_marstat    object
dem_i_tribe      object
inc_i_dom        object
dtype: object

#### Reformatting the dem_i_marstat entries: 

In [76]:
data.dem_i_marstat.value_counts()

Married/living together         104
Widowed                          17
Separated/divorced                9
Never married/lived together      8
Name: dem_i_marstat, dtype: int64

In [77]:
new_marstat = []
for idx, row in data.iterrows(): 
    if 'Married' in str(row['dem_i_marstat']): 
        new_marstat.append('married')
    elif 'Widowed' in str(row['dem_i_marstat']):
        new_marstat.append('widowed')
    elif 'Separated' in str(row['dem_i_marstat']):
        new_marstat.append('separated')    
    else: 
        new_marstat.append('never_married')
        

In [78]:
data.dem_i_marstat = new_marstat

In [79]:
new_marstat = pd.get_dummies(data.dem_i_marstat)

In [80]:
data = pd.concat([data.reset_index(drop=True), new_marstat], axis=1)

In [81]:
data = data.drop('dem_i_marstat', axis = 1)

In [82]:
data.dtypes[data.dtypes == 'object']

m_ids_owner    object
hh_ids         object
dem_i_tribe    object
inc_i_dom      object
dtype: object

#### Reformatting the dem_i_tribe entries: 

In [83]:
data.dem_i_tribe.value_counts()

Luhya                40
Kamba                33
Kalenjin             25
Mijikenda/Swahili    15
Luo                  11
Kikuyu               11
Kisii                 2
Taita/Taveta          1
Name: dem_i_tribe, dtype: int64

In [84]:
new_tribe = pd.get_dummies(data.dem_i_tribe, prefix = 'tribe')

In [85]:
data = pd.concat([data.reset_index(drop=True), new_tribe], axis=1)

In [86]:
data = data.drop('dem_i_tribe', axis = 1)

In [87]:
data.dtypes[data.dtypes == 'object']

m_ids_owner    object
hh_ids         object
inc_i_dom      object
dtype: object

#### Reformatting the inc_i_dom entries: 

In [88]:
data.inc_i_dom.value_counts()

SEI       51
CITMPC    27
AGRI      23
REI       21
NEINCG     8
RIOI       6
Name: inc_i_dom, dtype: int64

In [89]:
new_tribe = pd.get_dummies(data.inc_i_dom, prefix = 'inc_i_dom')

In [90]:
data = pd.concat([data.reset_index(drop=True), new_tribe], axis=1)

In [91]:
data = data.drop('inc_i_dom', axis = 1)

In [92]:
data.dtypes[data.dtypes == 'object']

m_ids_owner    object
hh_ids         object
dtype: object

###  deleting "HH" from m_ids_owner

In [93]:
data.loc[data['m_ids_owner']=="HH"]

Unnamed: 0.1,m_ids_owner,owner_score,binarize_score,hh_ids,con_pur_hh_meanptrx,con_pur_hh_sdptrx,con_pur_hh_minpti,con_pur_hh_meanpti,con_pur_hh_medpti_x,con_pur_hh_maxpti,con_pur_hh_meanpti_ALC,con_pur_hh_medpti_ALC,con_pur_hh_meanpti_CLN,con_pur_hh_medpti_CLN,con_pur_hh_meanpti_CLTH,con_pur_hh_medpti_CLTH,con_pur_hh_meanpti_COMM,con_pur_hh_medpti_COMM,con_pur_hh_meanpti_EDU,con_pur_hh_medpti_EDU,con_pur_hh_meanpti_ENRGY,con_pur_hh_medpti_ENRGY,con_pur_hh_meanpti_ENT,con_pur_hh_medpti_ENT,con_pur_hh_meanpti_FOOD,con_pur_hh_medpti_FOOD,con_pur_hh_meanpti_H20,con_pur_hh_medpti_H20,con_pur_hh_meanpti_HSNG,con_pur_hh_medpti_HSNG,con_pur_hh_meanpti_MED,con_pur_hh_medpti_MED,con_pur_hh_meanpti_OTH,con_pur_hh_medpti_OTH,con_pur_hh_meanpti_PERS,con_pur_hh_medpti_PERS,con_pur_hh_meanpti_REL,con_pur_hh_medpti_REL,con_pur_hh_meanpti_TRNSP,con_pur_hh_medpti_TRNSP,happiness,economically,relationships,confidence,police,doctor,unsafe,utilities,asset_taken,miss_app,miss_inc,romantic,hungry,school_sent,m_ids_x,edu_i_enrolled,edu_i_attain,edu_i_attain_yrs,edu_hh_child613inschool,edu_hh_child1417inschool,edu_hh_girls613inschool,edu_hh_girls1417inschool,edu_hh_boys613inschool,edu_hh_boys1417inschool,edu_hh_headattain_yrs,m_ids_y,dem_i_male,dem_i_age_yrs,dem_i_age_workage,dem_i_hhead,dem_i_children_tot,dem_i_children_inhh,dem_urban_x,dem_hh_size_x,dem_hh_meanage,dem_hh_ages0_4,dem_hh_ages5_14,dem_hh_ages15_17,dem_hh_ages0_17,dem_hh_ages15plus,dem_hh_ages18plus,dem_hh_ages65plus,dem_hh_ages20_30,dem_hh_workage,dem_hh_dependents,dem_hh_agehead,dem_hh_malehead,dem_pae_oecd_x,dem_pae_kihbs_x,Unnamed: 0,hh_income,hh_income_only,inc_i_unique_sources,inc_i_minpti,inc_i_meanpti,inc_i_sdpti,inc_i_maxpti,inc_i_meanpti_AGRI,inc_i_sdpti_AGRI,inc_i_meanpti_CITMPC,inc_i_sdpti_CITMPC,inc_i_meanpti_NEINCG,inc_i_sdpti_NEINCG,inc_i_meanpti_REI,inc_i_sdpti_REI,inc_i_meanpti_RIOI,inc_i_sdpti_RIOI,inc_i_meanpti_SEI,inc_i_sdpti_SEI,inc_hh_medianpti,con_pur_hh_medpti_y,con_prod_hh_medpti,dem_urban_y,dem_hh_size_y,dem_pae_oecd_y,dem_pae_kihbs_y,con_hh_medpti,inc_hh_pti_pae_oecd,con_hh_pti_pae_oecd,inc_hh_pti_pae_kihbs,con_hh_pti_pae_kihbs,inc_below_85_kes_oecd,con_below_85_kes_oecd,inc_below_85_kes_kihbs,con_below_85_kes_kihbs,inc_below_170_kes_oecd,con_below_170_kes_oecd,inc_below_170_kes_kihbs,con_below_170_kes_kihbs,inc_below_425_kes_oecd,con_below_425_kes_oecd,inc_below_425_kes_kihbs,con_below_425_kes_kihbs,dem_i_relhead_h_or_w,dem_i_relhead_hh_head,dem_i_relhead_parent,dem_i_relhead_s_or_d,married,never_married,separated,widowed,tribe_Kalenjin,tribe_Kamba,tribe_Kikuyu,tribe_Kisii,tribe_Luhya,tribe_Luo,tribe_Mijikenda/Swahili,tribe_Taita/Taveta,inc_i_dom_AGRI,inc_i_dom_CITMPC,inc_i_dom_NEINCG,inc_i_dom_REI,inc_i_dom_RIOI,inc_i_dom_SEI
35,HH,0.666667,999,KVIHC16,115.6588,345.8901,1070,5825.0,5470.0,15380,0.0,0.0,119.0909,150.0,50.0,0,72.72727,70.0,1145.455,0.0,234.5455,230.0,0.0,0,3925.909,4040.0,0.0,0.0,0.0,0.0,50.0,0,27.27273,0.0,10.90909,0.0,15.45455,0.0,173.6364,60.0,3.6,3.45,4.0,3.95,0.0,0.157895,0.0,0.0,0.0,0.052632,0.052632,0.0,0.105263,0.272727,,,1,,,,,,,,,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5345.0,5470.0,102.8571,0,5,3.2,3.95,5572.857,54.76434,57.09894,44.36605,46.25737,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [94]:
data = data.drop(35)

In [95]:
data.to_csv('data_merged_nocatvars.csv', index = False)

## I'm only going to take those variables that are complete from here; later once Dinara's ds is fixed we can do more with imputation and stuff: 

In [96]:
data = pd.read_csv('data_merged_nocatvars.csv')

In [97]:
data.describe().loc['count']

m_ids_owner                 138.0
owner_score                 138.0
binarize_score              138.0
con_pur_hh_meanptrx         138.0
con_pur_hh_sdptrx           138.0
con_pur_hh_minpti           138.0
con_pur_hh_meanpti          138.0
con_pur_hh_medpti_x         138.0
con_pur_hh_maxpti           138.0
con_pur_hh_meanpti_ALC      138.0
con_pur_hh_medpti_ALC       138.0
con_pur_hh_meanpti_CLN      138.0
con_pur_hh_medpti_CLN       138.0
con_pur_hh_meanpti_CLTH     138.0
con_pur_hh_medpti_CLTH      138.0
con_pur_hh_meanpti_COMM     138.0
con_pur_hh_medpti_COMM      138.0
con_pur_hh_meanpti_EDU      138.0
con_pur_hh_medpti_EDU       138.0
con_pur_hh_meanpti_ENRGY    138.0
con_pur_hh_medpti_ENRGY     138.0
con_pur_hh_meanpti_ENT      138.0
con_pur_hh_medpti_ENT       138.0
con_pur_hh_meanpti_FOOD     138.0
con_pur_hh_medpti_FOOD      138.0
con_pur_hh_meanpti_H20      138.0
con_pur_hh_medpti_H20       138.0
con_pur_hh_meanpti_HSNG     138.0
con_pur_hh_medpti_HSNG      138.0
con_pur_hh_mea

In [98]:
cols_to_remove = data.describe().count().index[data.describe().loc['count']<138]

In [99]:
data = data.drop(cols_to_remove, axis= 1)

In [100]:
data.describe().loc['count'].value_counts()

138.0    124
Name: count, dtype: int64

In [101]:
data.to_csv('data_merged_noMissingData.csv', index = False)

## Removing unnecessary variables: 

This shows how many columns have 139 entries

In [102]:
data = pd.read_csv('data_merged_nocatvars.csv')

In [103]:
data.shape

(138, 155)

In [104]:
pd.DataFrame(data.describe()).loc['count'].value_counts()

138.0    124
136.0     15
125.0      7
131.0      2
43.0       1
70.0       1
54.0       1
63.0       1
82.0       1
101.0      1
Name: count, dtype: int64

98 cols have 138 entries (e.g. no NA values) 

If we don't have information for over half the population, going to just delete those columns: 

In [105]:
data.describe().loc['count']<69

m_ids_owner                 False
owner_score                 False
binarize_score              False
con_pur_hh_meanptrx         False
con_pur_hh_sdptrx           False
con_pur_hh_minpti           False
con_pur_hh_meanpti          False
con_pur_hh_medpti_x         False
con_pur_hh_maxpti           False
con_pur_hh_meanpti_ALC      False
con_pur_hh_medpti_ALC       False
con_pur_hh_meanpti_CLN      False
con_pur_hh_medpti_CLN       False
con_pur_hh_meanpti_CLTH     False
con_pur_hh_medpti_CLTH      False
con_pur_hh_meanpti_COMM     False
con_pur_hh_medpti_COMM      False
con_pur_hh_meanpti_EDU      False
con_pur_hh_medpti_EDU       False
con_pur_hh_meanpti_ENRGY    False
con_pur_hh_medpti_ENRGY     False
con_pur_hh_meanpti_ENT      False
con_pur_hh_medpti_ENT       False
con_pur_hh_meanpti_FOOD     False
con_pur_hh_medpti_FOOD      False
con_pur_hh_meanpti_H20      False
con_pur_hh_medpti_H20       False
con_pur_hh_meanpti_HSNG     False
con_pur_hh_medpti_HSNG      False
con_pur_hh_mea

In [121]:
cols_to_remove = data.describe().count().index[data.describe().loc['count']<69]

In [122]:
data = data.drop(cols_to_remove, axis= 1)

In [123]:
data.describe().loc['count'].value_counts()

138.0    98
121.0    41
136.0    15
125.0     7
119.0     6
118.0     3
131.0     2
114.0     2
120.0     2
113.0     1
78.0      1
74.0      1
117.0     1
70.0      1
82.0      1
101.0     1
116.0     1
Name: count, dtype: int64

In [124]:
data.shape

(138, 185)