### import libraries and data

In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('../data/poverty/train.csv')
test = pd.read_csv('../data/poverty/test.csv')

In [3]:
data_dict = pd.read_table('../data/poverty/data_dict.txt',header = None)

In [4]:
data_dict = data_dict[0].str.split(pat = ',',expand = True)

In [5]:
data_dict.set_index(0,inplace = True)

In [6]:
data_dict.head()

Unnamed: 0_level_0,1,2,3,4
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
v2a1,Monthly rent payment,,,
hacdor,=1 Overcrowding by bedrooms,,,
rooms,number of all rooms in the house,,,
hacapo,=1 Overcrowding by rooms,,,
v14a,=1 has bathroom in the household,,,


In [7]:
train.shape

(9557, 143)

In [8]:
train.set_index('Id',inplace  = True)
test.set_index('Id',inplace  = True)

### analysis of target

In [9]:
train['Target'].value_counts(normalize = True)

4    0.627394
2    0.167103
3    0.126504
1    0.079000
Name: Target, dtype: float64

### how do we want to split up our data types?

In [10]:
data_types = train.dtypes

In [11]:
data_types.value_counts()

int64      130
float64      8
object       4
dtype: int64

#### object data types

In [12]:
data_types[data_types == 'object']

idhogar       object
dependency    object
edjefe        object
edjefa        object
dtype: object

In [13]:
train['edjefe'].value_counts()

no     3762
6      1845
11      751
9       486
3       307
15      285
8       257
7       234
5       222
14      208
17      202
2       194
4       137
16      134
yes     123
12      113
10      111
13      103
21       43
18       19
19       14
20        7
Name: edjefe, dtype: int64

#### float data types

In [14]:
float_index = data_types[data_types == 'float64'].index.values.tolist()

In [15]:
data_dict.loc[float_index,1]

0
v2a1                                            Monthly rent payment
v18q1                               number of tablets household owns
rez_esc                                       Years behind in school
meaneduc                 average years of education for adults (18+)
overcrowding                                      # persons per room
SQBovercrowding                                 overcrowding squared
SQBdependency                                     dependency squared
SQBmeaned           square of the mean years of education of adul...
Name: 1, dtype: object

In [16]:
train[float_index].isnull().sum()

v2a1               6860
v18q1              7342
rez_esc            7928
meaneduc              5
overcrowding          0
SQBovercrowding       0
SQBdependency         0
SQBmeaned             5
dtype: int64

#### integer data types

In [17]:
int_index = data_types[data_types == 'int64'].index.values.tolist()

In [18]:
data_dict.loc[int_index]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,1,2,3,4
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hacdor,=1 Overcrowding by bedrooms,,,
rooms,number of all rooms in the house,,,
hacapo,=1 Overcrowding by rooms,,,
v14a,=1 has bathroom in the household,,,
refrig,=1 if the household has refrigerator,,,
v18q,owns a tablet,,,
r4h1,Males younger than 12 years of age,,,
r4h2,Males 12 years of age and older,,,
r4h3,Total males in the household,,,
r4m1,Females younger than 12 years of age,,,


#### credit to https://www.kaggle.com/willkoehrsen/start-here-a-complete-walkthrough who has determined the data types and how they should be used

In [19]:
id_ = ['idhogar']

target = ['Target']

ind_bool = ['v18q', 'dis', 'male', 'female', 'estadocivil1', 'estadocivil2', 'estadocivil3', 
            'estadocivil4', 'estadocivil5', 'estadocivil6', 'estadocivil7', 
            'parentesco1', 'parentesco2',  'parentesco3', 'parentesco4', 'parentesco5', 
            'parentesco6', 'parentesco7', 'parentesco8',  'parentesco9', 'parentesco10', 
            'parentesco11', 'parentesco12', 'instlevel1', 'instlevel2', 'instlevel3', 
            'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 
            'instlevel9', 'mobilephone']

ind_ordered = ['rez_esc', 'escolari', 'age']

hh_bool = ['hacdor', 'hacapo', 'v14a', 'refrig', 'paredblolad', 'paredzocalo', 
           'paredpreb','pisocemento', 'pareddes', 'paredmad',
           'paredzinc', 'paredfibras', 'paredother', 'pisomoscer', 'pisoother', 
           'pisonatur', 'pisonotiene', 'pisomadera',
           'techozinc', 'techoentrepiso', 'techocane', 'techootro', 'cielorazo', 
           'abastaguadentro', 'abastaguafuera', 'abastaguano',
            'public', 'planpri', 'noelec', 'coopele', 'sanitario1', 
           'sanitario2', 'sanitario3', 'sanitario5',   'sanitario6',
           'energcocinar1', 'energcocinar2', 'energcocinar3', 'energcocinar4', 
           'elimbasu1', 'elimbasu2', 'elimbasu3', 'elimbasu4', 
           'elimbasu5', 'elimbasu6', 'epared1', 'epared2', 'epared3',
           'etecho1', 'etecho2', 'etecho3', 'eviv1', 'eviv2', 'eviv3', 
           'tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5', 
           'computer', 'television', 'lugar1', 'lugar2', 'lugar3',
           'lugar4', 'lugar5', 'lugar6', 'area1', 'area2']

hh_ordered = [ 'rooms', 'r4h1', 'r4h2', 'r4h3', 'r4m1','r4m2','r4m3', 'r4t1',  'r4t2', 
              'r4t3', 'v18q1', 'tamhog','tamviv','hhsize','hogar_nin',
              'hogar_adul','hogar_mayor','hogar_total',  'bedrooms', 'qmobilephone']

hh_cont = ['v2a1', 'dependency', 'edjefe', 'edjefa', 'meaneduc', 'overcrowding']

sqr_ = ['SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe', 
        'SQBhogar_nin', 'SQBovercrowding', 'SQBdependency', 'SQBmeaned', 'agesq']

#### tidy up object data types

In [20]:
data_types[data_types == 'object']

idhogar       object
dependency    object
edjefe        object
edjefa        object
dtype: object

In [21]:
pd.options.display.max_colwidth = 1000

data_dict.loc['dependency']

1                                                                                                                          Dependency rate
2     calculated = (number of members of the household younger than 19 or older than 64)/(number of member of household between 19 and 64)
3                                                                                                                                     None
4                                                                                                                                     None
Name: dependency, dtype: object

In [22]:
mapping = {'yes' : 1, 'no' : 0}

train['dependency'] = train['dependency'].replace(mapping).astype(np.float64)
train['edjefe'] = train['edjefe'].replace(mapping).astype(np.float64)
train['edjefa'] = train['edjefa'].replace(mapping).astype(np.float64)

In [23]:
test['dependency'] = test['dependency'].replace(mapping).astype(np.float64)
test['edjefe'] = test['edjefe'].replace(mapping).astype(np.float64)
test['edjefa'] = test['edjefa'].replace(mapping).astype(np.float64)

#### missing values

In [24]:
miss_vals = train[ind_bool + ind_ordered + hh_bool + hh_ordered + hh_cont + sqr_].isnull().sum()

miss_vals[miss_vals != 0]

rez_esc      7928
v18q1        7342
v2a1         6860
meaneduc        5
SQBmeaned       5
dtype: int64

In [25]:
miss_values_index = miss_vals[miss_vals != 0].index.values.tolist()

pd.options.display.max_colwidth = 1000

data_dict.loc[miss_values_index,:1]

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
rez_esc,Years behind in school
v18q1,number of tablets household owns
v2a1,Monthly rent payment
meaneduc,average years of education for adults (18+)
SQBmeaned,square of the mean years of education of adults (>=18) in the household


In [26]:
train.drop(columns = ['rez_esc','v18q1','v2a1'],inplace = True)
test.drop(columns = ['rez_esc','v18q1','v2a1'],inplace = True)

#remove the values from the indexes that we have previous got rid of

ind_ordered.remove('rez_esc')
hh_ordered.remove('v18q1')
hh_cont.remove('v2a1')

#### grouped data

In [27]:
ind_agged = train.groupby(['idhogar'])[ind_ordered].agg(['mean','min','max','std','count'])

#(thanks to https://www.kaggle.com/willkoehrsen/start-here-a-complete-walkthrough for this snippet)

new_cols = []
for col in ind_agged.columns.levels[0]:
    for stat in ind_agged.columns.levels[1]:
        new_cols.append(f'{col}-{stat}')
        
        
ind_agged.columns = new_cols

In [28]:
train = pd.merge(train,ind_agged,left_on = 'idhogar',right_index = True).copy()

In [29]:
ind_agged_t = test.groupby(['idhogar'])[ind_ordered].agg(['mean','min','max','std','count'])

ind_agged_t.columns = new_cols

In [30]:
ind_agged_t.shape

(7352, 10)

In [31]:
test = pd.merge(test,ind_agged_t,left_on = 'idhogar',right_index = True).copy()

### creating percentage columns

In [32]:
def calculate_percentages(num,denom):
    if denom == 0:
        return 0
    else:
        percentage = num/denom
        return percentage

In [33]:
train['r4h1_r4h3'] = train.apply(lambda x : calculate_percentages(x['r4h1'],x['r4h3']),axis = 1)
train['r4m1_r4m3'] = train.apply(lambda x : calculate_percentages(x['r4m1'],x['r4m3']),axis = 1)
train['r4t1_r4t3'] = train.apply(lambda x : calculate_percentages(x['r4t1'],x['r4t3']),axis = 1)

In [34]:
test['r4h1_r4h3'] = test.apply(lambda x : calculate_percentages(x['r4h1'],x['r4h3']),axis = 1)
test['r4m1_r4m3'] = test.apply(lambda x : calculate_percentages(x['r4m1'],x['r4m3']),axis = 1)
test['r4t1_r4t3'] = test.apply(lambda x : calculate_percentages(x['r4t1'],x['r4t3']),axis = 1)

In [35]:
new_cols.extend(['r4h1_r4h3','r4m1_r4m3','r4t1_r4t3'])

In [36]:
train['nin%'] = train.apply(lambda x : calculate_percentages(x['hogar_nin'],x['hogar_total']),axis = 1)
train['adul%'] = train.apply(lambda x : calculate_percentages(x['hogar_adul'],x['hogar_total']),axis = 1)
train['mayor%'] = train.apply(lambda x : calculate_percentages(x['hogar_mayor'],x['hogar_total']),axis = 1)

In [37]:
test['nin%'] = test.apply(lambda x : calculate_percentages(x['hogar_nin'],x['hogar_total']),axis = 1)
test['adul%'] = test.apply(lambda x : calculate_percentages(x['hogar_adul'],x['hogar_total']),axis = 1)
test['mayor%'] = test.apply(lambda x : calculate_percentages(x['hogar_mayor'],x['hogar_total']),axis = 1)

In [38]:
new_cols.extend(['nin%','adul%','mayor%'])

### create pipeline

In [39]:
train[ind_bool + ind_ordered + hh_bool + hh_ordered + hh_cont + sqr_ + new_cols].dtypes.value_counts()

int64      135
float64     18
dtype: int64

In [40]:
from sklearn.preprocessing import StandardScaler, Imputer, PolynomialFeatures
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV, train_test_split

In [41]:
trfm_pipeline = Pipeline([
    ('imputer',Imputer(strategy = 'median'))
    ,('scaler',StandardScaler())
    #,('polynom',PolynomialFeatures(interaction_only=True))
])

# adding in polynomial features does not improve performance on the test set: my guess is that it is overfitting to train

In [42]:
ind_feats = trfm_pipeline.fit(train[ind_bool + ind_ordered + hh_bool + hh_ordered + hh_cont + sqr_ + new_cols])

#this returns an array

In [43]:
X_train,X_test,y_train,y_test = train_test_split(train[ind_bool + ind_ordered + hh_bool + hh_ordered + hh_cont + sqr_ + new_cols]
                                                ,train['Target']
                                                ,test_size = 0.33)

In [54]:
#very simply just do a random forest for now

clf = RandomForestClassifier(random_state=42)

In [55]:
model = Pipeline([
    ('features',ind_feats)
    ,('classifier',clf)
])

In [56]:
param_grid={}

model_CV = GridSearchCV(model,param_grid,cv=10)

In [57]:
model_CV.fit(X_train,y_train.values.ravel())

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('features', Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('classifier', RandomForestClassifier(bootstrap=True, class_weight=None, criteri...stimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [58]:
from sklearn.metrics import f1_score,confusion_matrix,roc_curve

In [59]:
test_pred = model_CV.predict(X_test)

print(f1_score(y_test,test_pred,average='macro'))
print(confusion_matrix(y_test,test_pred))

0.8464179819497148
[[ 178   30    6   39]
 [   9  439   15   71]
 [   6   27  293   45]
 [   9   31   22 1934]]


### submit first attempt

In [60]:
predictions = model_CV.predict(test.loc[:, test.columns != 'idhogar'])

In [61]:
submission = pd.DataFrame(data = predictions,index = test.index,columns = ['Target'])

In [62]:
submission.to_csv('submission.csv')

In [53]:
submission['Target'].value_counts(normalize = True)

4    0.658954
2    0.204225
1    0.135521
3    0.001299
Name: Target, dtype: float64