In [44]:
import pandas as pd 
import os 
import glob 
from collections import Counter

# Merging

In [3]:
glob.glob('../datasets/*csv')

['../datasets/cons_wb_go_edu_demos.csv',
 '../datasets/edu_hh_ind_merge.csv',
 '../datasets/wellbeing_goingson.csv',
 '../datasets/goingson.csv',
 '../datasets/demos_merged.csv',
 '../datasets/wellbeing.csv',
 '../datasets/consumption_mapped_to_m_ids.csv']

In [4]:
predictors = pd.read_csv('../datasets/cons_wb_go_edu_demos.csv', dtype = {'m_ids_owner': 'object'})

In [5]:
glob.glob('*csv')

['loan_outcomes_33_150.csv',
 'individual_outcomes_33_150.csv',
 'data.csv',
 'diaries_trx_trunc_loans_start_bal.csv',
 'binary_y_train.csv',
 'binary_x_train.csv',
 'binary_x_test.csv',
 'binary_y_test.csv']

In [6]:
outcome = pd.read_csv('individual_outcomes_33_150.csv', dtype = {'m_ids_owner': 'object'})

In [7]:
print(predictors.shape)
print(outcome.shape)

(670, 91)
(139, 4)


In [8]:
predictors.head()

Unnamed: 0,m_ids_owner,hh_ids,con_pur_hh_meanptrx,con_pur_hh_sdptrx,con_pur_hh_minpti,con_pur_hh_meanpti,con_pur_hh_medpti,con_pur_hh_maxpti,con_pur_hh_meanpti_ALC,con_pur_hh_medpti_ALC,...,dem_hh_ages15plus,dem_hh_ages18plus,dem_hh_ages65plus,dem_hh_ages20_30,dem_hh_workage,dem_hh_dependents,dem_hh_agehead,dem_hh_malehead,dem_pae_oecd,dem_pae_kihbs
0,58134383397900000,KELDL02,107.9147,286.7984,3293,14372.27,15640.0,23670,204.5455,0.0,...,2.0,2.0,0.0,2.0,0.5,0.5,27.0,1.0,2.7,2.48
1,58134383535800000,KELDL02,107.9147,286.7984,3293,14372.27,15640.0,23670,204.5455,0.0,...,2.0,2.0,0.0,2.0,0.5,0.5,27.0,1.0,2.7,2.48
2,HH,KVIHC16,115.6588,345.8901,1070,5825.0,5470.0,15380,0.0,0.0,...,,,,,,,,,,
3,59134423954300000,KVIHC16,115.6588,345.8901,1070,5825.0,5470.0,15380,0.0,0.0,...,2.0,1.0,0.0,0.0,0.4,0.6,39.0,0.0,3.2,3.95
4,65134441430300000,KELDK21,1061.19,4257.939,1150,26529.75,17300.0,75020,0.0,0.0,...,4.0,4.0,1.0,1.0,0.75,0.25,56.0,1.0,3.1,4.0


In [9]:
outcome.head()

Unnamed: 0,loans,m_ids_owner,owner_score,binarize_score
0,"['56134804374600000.3', '56134804374600000.1',...",65134441430300000,0.75,1
1,"['60137430710900000.1', '60134978362300000.1']",60134547419200000,0.5,999
2,"['63136740549700000.1', '63136740416600000.2',...",63134425702500000,1.0,1
3,"['89136459180500000.2', '89136459180500000.1']",64134429266300000,1.0,1
4,"['105136540140100000.1', '105136540140100000.2']",65134432186900000,1.0,1


In [10]:
 outcome = outcome.drop(['loans'], axis = 1)

In [11]:
outcome.shape

(139, 3)

In [118]:
data = outcome.merge(predictors, on = 'm_ids_owner', how = 'left')

In [119]:
data.shape

(139, 93)

In [120]:
# data.to_csv('data.csv', index = False)

# Formatting: 

## Dealing with the categorical variables: 

In [121]:
data.dtypes[data.dtypes == 'object']

m_ids_owner         object
hh_ids              object
edu_i_attain        object
dem_i_male          object
dem_i_age_5yrgrp    object
dem_i_relhead       object
dem_i_marstat       object
dem_i_tribe         object
dtype: object

#### Dropping age group b/c we have age itself: 

In [122]:
data = data.drop(['dem_i_age_5yrgrp'], axis = 1)

#### Reformatting the edu_i_attain entries: 

In [123]:
data.edu_i_attain.value_counts()

Primary (some or complete)           71
Secondary (some or complete)         52
Post-secondary (some or complete)    11
No education                          3
Nursery/ Kindergarten                 1
Name: edu_i_attain, dtype: int64

In [124]:
new_edu = []
for idx, row in data.iterrows(): 
    if 'Primary' in str(row['edu_i_attain']): 
        new_edu.append(2)
    elif 'Secondary' in str(row['edu_i_attain']): 
        new_edu.append(3)
    elif 'Post-secondary' in str(row['edu_i_attain']): 
        new_edu.append(4)
    elif 'education' in str(row['edu_i_attain']): 
        new_edu.append(0)
    else: 
        new_edu.append(1)

In [125]:
Counter(new_edu)

Counter({0: 3, 1: 2, 2: 71, 3: 52, 4: 11})

In [126]:
data.edu_i_attain = new_edu

In [127]:
data.edu_i_attain.value_counts()

2    71
3    52
4    11
0     3
1     2
Name: edu_i_attain, dtype: int64

#### Reformatting the dem_i_male entries: 

In [128]:
data.dem_i_male.value_counts()

Female    100
Male       38
Name: dem_i_male, dtype: int64

In [129]:
new_male = []
for idx, row in data.iterrows(): 
    if 'Female' in str(row['dem_i_male']): 
        new_male.append(0)
    else: 
        new_male.append(1)

In [130]:
data.dem_i_male = new_male

In [131]:
data.dtypes[data.dtypes == 'object']

m_ids_owner      object
hh_ids           object
dem_i_relhead    object
dem_i_marstat    object
dem_i_tribe      object
dtype: object

#### Reformatting the dem_i_relhead entries: 

In [132]:
data.dem_i_relhead.value_counts()

Household head               92
Husband or wife              42
Son or daughter               3
Parent (mother or father)     1
Name: dem_i_relhead, dtype: int64

In [133]:
new_relhead = []
for idx, row in data.iterrows(): 
    if 'Household' in str(row['dem_i_relhead']): 
        new_relhead.append('hh_head')
    elif 'Husband' in str(row['dem_i_relhead']):
        new_relhead.append('h_or_w')
    elif 'Son' in str(row['dem_i_relhead']):
        new_relhead.append('s_or_d')    
    else: 
        new_relhead.append('parent')
        

In [134]:
data.dem_i_relhead = new_relhead

In [135]:
new_relhead = pd.get_dummies(data.dem_i_relhead, prefix = 'dem_i_relhead')

In [136]:
data = pd.concat([data.reset_index(drop=True), new_relhead], axis=1)

In [138]:
data = data.drop('dem_i_relhead', axis = 1)

In [140]:
# data.columns

In [141]:
data.dtypes[data.dtypes == 'object']

m_ids_owner      object
hh_ids           object
dem_i_marstat    object
dem_i_tribe      object
dtype: object

#### Reformatting the dem_i_marstat entries: 

In [142]:
data.dem_i_marstat.value_counts()

Married/living together         104
Widowed                          17
Separated/divorced                9
Never married/lived together      8
Name: dem_i_marstat, dtype: int64

In [145]:
new_marstat = []
for idx, row in data.iterrows(): 
    if 'Married' in str(row['dem_i_marstat']): 
        new_marstat.append('married')
    elif 'Widowed' in str(row['dem_i_marstat']):
        new_marstat.append('widowed')
    elif 'Separated' in str(row['dem_i_marstat']):
        new_marstat.append('separated')    
    else: 
        new_marstat.append('never_married')
        

In [146]:
data.dem_i_marstat = new_marstat

In [147]:
new_marstat = pd.get_dummies(data.dem_i_marstat)

In [148]:
data = pd.concat([data.reset_index(drop=True), new_marstat], axis=1)

In [149]:
data = data.drop('dem_i_marstat', axis = 1)

In [150]:
data.dtypes[data.dtypes == 'object']

m_ids_owner    object
hh_ids         object
dem_i_tribe    object
dtype: object

#### Reformatting the dem_i_tribe entries: 

In [151]:
data.dem_i_tribe.value_counts()

Luhya                40
Kamba                33
Kalenjin             25
Mijikenda/Swahili    15
Luo                  11
Kikuyu               11
Kisii                 2
Taita/Taveta          1
Name: dem_i_tribe, dtype: int64

In [152]:
new_tribe = pd.get_dummies(data.dem_i_tribe, prefix = 'tribe')

In [153]:
data = pd.concat([data.reset_index(drop=True), new_tribe], axis=1)

In [154]:
data = data.drop('dem_i_tribe', axis = 1)

In [155]:
data.dtypes[data.dtypes == 'object']

m_ids_owner    object
hh_ids         object
dtype: object

In [156]:
data.to_csv('data_merged_nocatvars.csv', index = False)