# Voting classifier using binary classification
* Binary classification  
model_1 predicts that the household poverty level is 1 or not.  
model_2 ... 2 or not.  
model_3 ... 3 or not.  
model_4 ... 4 or not.  



* Voting classifier  
The predicted household poverty level is the one whose model outputs the highest probability among all models.

In [1]:
# Directories used by the notebook, relative to where it runs:
# the scratch/output directory and the directory holding the input CSVs.
working_dir = '../working/'
input_dir = '../input/'

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, make_scorer
import lightgbm as lgb
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the raw train and test tables.
train = pd.read_csv(os.path.join(input_dir, 'train.csv'))
test = pd.read_csv(os.path.join(input_dir, 'test.csv'))

# Index both tables by the individual's Id (the Id column itself is kept).
for frame in (train, test):
    frame.index = frame['Id'].values

# Keep the labels aside, keyed by Id.
train_target = train['Target']

# Stack train (without its label) on top of test so that every
# preprocessing step below is applied to both at once.
all_data = pd.concat([train.drop('Target', axis=1), test], axis=0)

for frame in (train, test, all_data):
    print(frame.shape)

(9557, 143)
(23856, 142)
(33413, 142)


In [4]:
# Data cleaning.
# Adapted, with changes, from
# https://www.kaggle.com/katacs/data-cleaning-and-random-forest
def data_cleaning(data):
    """Fill missing values and normalise a few raw columns.

    The frame is modified in place and also returned.

    Steps:
      * ``dependency`` is recomputed as the square root of ``SQBdependency``.
      * Missing ``v18q1`` and ``v2a1`` become 0 (``rez_esc`` is left as is).
      * ``edjefa`` / ``edjefe``: 'no' -> 0, 'yes' -> 1, then cast to int.
      * Rows with missing ``meaneduc`` get the mean ``escolari`` of the
        missing-``meaneduc`` rows of the same household (``idhogar``), and
        ``SQBmeaned`` is set to the square of that value.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns named above plus ``idhogar`` and ``escolari``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    data['dependency'] = np.sqrt(data['SQBdependency'])
    data['v18q1'] = data['v18q1'].fillna(0)
    data['v2a1'] = data['v2a1'].fillna(0)

    for col in ('edjefa', 'edjefe'):
        data[col] = data[col].replace({'no': 0, 'yes': 1}).astype('int')

    missing = data['meaneduc'].isnull()
    if missing.any():
        # Household-level mean of `escolari`, computed over the rows that
        # lack `meaneduc` and broadcast back to those rows. `transform`
        # keeps the row order, so the values can be written positionally,
        # which also works when index labels are not unique.
        fill = (data.loc[missing]
                    .groupby('idhogar')['escolari']
                    .transform('mean')
                    .values)
        data.loc[missing, 'meaneduc'] = fill
        data.loc[missing, 'SQBmeaned'] = fill * fill

    return data

In [5]:
# Clean train and test together, on the combined frame.
all_data = data_cleaning(all_data)

In [6]:
# Transform one-hot variables into categorical variables
def onehot2cat(data, cat_col_new, cat_col_olds):
    """Collapse a group of one-hot columns into a single categorical column.

    For every row the new column holds the name of the column among
    ``cat_col_olds`` with the largest value (the first one on ties).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns listed in ``cat_col_olds``.
    cat_col_new : str
        Name of the categorical column to append.
    cat_col_olds : list of str
        One-hot columns to collapse; they are dropped from the result.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.
    """
    labels = data[cat_col_olds].idxmax(axis=1).astype('category')
    labels.name = cat_col_new
    widened = pd.concat([data, labels], axis=1)
    return widened.drop(cat_col_olds, axis=1)

In [7]:
# New categorical column name -> the one-hot columns it replaces.
# For the names in this list the one-hot columns are found by prefix.
cat_col_new_list = [
    'pared', 'piso', 'techo', 'abastagua', 'sanitario',
    'energcocinar', 'elimbasu', 'estadocivil',
    'parentesco', 'tipovivi', 'lugar', 'area',
]
train_columns = train.columns.tolist()
cat_col_dict = {
    prefix: [name for name in train_columns if name.startswith(prefix)]
    for prefix in cat_col_new_list
}

# These two groups are listed explicitly instead of being matched by prefix.
cat_col_dict.update(
    electricity=['public', 'planpri', 'noelec', 'coopele'],
    sex=['male', 'female'],
)

In [8]:
# Replace every one-hot group with its single categorical column,
# printing the columns that are being collapsed.
for new_name, old_names in cat_col_dict.items():
    print(old_names)
    all_data = onehot2cat(all_data, new_name, old_names)

['paredblolad', 'paredzocalo', 'paredpreb', 'pareddes', 'paredmad', 'paredzinc', 'paredfibras', 'paredother']
['pisomoscer', 'pisocemento', 'pisoother', 'pisonatur', 'pisonotiene', 'pisomadera']
['techozinc', 'techoentrepiso', 'techocane', 'techootro']
['abastaguadentro', 'abastaguafuera', 'abastaguano']
['sanitario1', 'sanitario2', 'sanitario3', 'sanitario5', 'sanitario6']
['energcocinar1', 'energcocinar2', 'energcocinar3', 'energcocinar4']
['elimbasu1', 'elimbasu2', 'elimbasu3', 'elimbasu4', 'elimbasu5', 'elimbasu6']
['estadocivil1', 'estadocivil2', 'estadocivil3', 'estadocivil4', 'estadocivil5', 'estadocivil6', 'estadocivil7']
['parentesco1', 'parentesco2', 'parentesco3', 'parentesco4', 'parentesco5', 'parentesco6', 'parentesco7', 'parentesco8', 'parentesco9', 'parentesco10', 'parentesco11', 'parentesco12']
['tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5']
['lugar1', 'lugar2', 'lugar3', 'lugar4', 'lugar5', 'lugar6']
['area1', 'area2']
['public', 'planpri', 'noelec', 

In [9]:
# Encode one-hot variables as a number,
# e.g. (bad, regular, good) -> (0, 1, 2)
def onehot2num(data, status_col_new, status_col_olds):
    """Collapse a group of one-hot columns into a single numeric column.

    For every row the new column holds the position (0-based, following the
    order of ``status_col_olds``) of the column with the largest value; the
    first position wins on ties.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns listed in ``status_col_olds``.
    status_col_new : str
        Name of the numeric column to append.
    status_col_olds : list of str
        One-hot columns to collapse; they are dropped from the result.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.
    """
    positions = {old: pos for pos, old in enumerate(status_col_olds)}
    levels = data[status_col_olds].rename(columns=positions).idxmax(axis=1)
    levels.name = status_col_new
    widened = pd.concat([data, levels], axis=1)
    return widened.drop(status_col_olds, axis=1)

In [10]:
# New numeric column name -> the one-hot columns (matched by prefix) it replaces.
status_col_new_list = ['epared', 'etecho', 'eviv', 'instlevel']
status_col_dict = {
    prefix: [name for name in train.columns.tolist() if name.startswith(prefix)]
    for prefix in status_col_new_list
}

In [11]:
# Replace every one-hot status group with its single numeric column,
# printing the columns that are being collapsed.
for new_name, old_names in status_col_dict.items():
    print(old_names)
    all_data = onehot2num(all_data, new_name, old_names)

['epared1', 'epared2', 'epared3']
['etecho1', 'etecho2', 'etecho3']
['eviv1', 'eviv2', 'eviv3']
['instlevel1', 'instlevel2', 'instlevel3', 'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 'instlevel9']


In [12]:
# Delete needless columns: a fixed list plus every column whose name
# contains 'SQB'.
SQB_cols = [name for name in train.columns.tolist() if 'SQB' in name]
needless_cols = [
    'r4t3', 'tamhog', 'tamviv', 'hhsize', 'v18q', 'v14a', 'agesq',
    'mobilephone',
] + SQB_cols
all_data = all_data.drop(columns=needless_cols)

In [13]:
# Replace the overcrowding flags (hacdor, hacapo) with crowding rates:
# hogar_total divided by the number of bedrooms / rooms.
hogar_total = all_data['hogar_total']
all_data['hacdor_rate'] = hogar_total / all_data['bedrooms']
all_data['hacapo_rate'] = hogar_total / all_data['rooms']
all_data = all_data.drop(columns=['hacdor', 'hacapo'])

In [14]:
# Store the remaining dummy variables with the category dtype.
dummy_col_list = ['cielorazo', 'dis', 'computer', 'television']
all_data = all_data.astype({name: 'category' for name in dummy_col_list})

In [15]:
# Shape of the combined frame after the preprocessing steps above.
all_data.shape

(33413, 52)

## Aggregate household variables

When taking the adult or child averages, the value is left NULL for households that have no adults or no children, respectively.

In [16]:
# Split individuals by age: older than 18 -> adult, 18 or younger -> child.
is_adult = all_data['age'] > 18
is_child = all_data['age'] <= 18
adult_data = all_data[is_adult]
child_data = all_data[is_child]

In [17]:
# Given a DataFrame and a column name, group by idhogar and return the
# column's mean, standard deviation, minimum, etc. in one go.

def agg_hogar(df, df_name, col):
    """Aggregate one column per household.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``idhogar`` column and the column ``col``.
    df_name : str
        Prefix used in the output column names.
    col : str
        Name of the column to aggregate.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``idhogar``, with the columns
        ``<df_name>_<col>_<stat>`` for stat in
        mean, std, min, median, max, range (range = max - min).
    """
    grouped = df.groupby('idhogar')[col]
    lowest = grouped.min()
    highest = grouped.max()

    stats = [
        ('mean', grouped.mean()),
        ('std', grouped.std()),
        ('min', lowest),
        ('median', grouped.median()),
        ('max', highest),
        ('range', highest - lowest),
    ]

    result = pd.concat([series for _, series in stats], axis=1)
    result.columns = [df_name + '_' + col + '_' + label for label, _ in stats]
    return result

In [18]:
# Which columns to aggregate per household, for each subset of individuals.
agg_dt_col_dict = dict(
    all=['age', 'rez_esc'],
    adult=['age', 'escolari', 'instlevel'],
    child=['age', 'rez_esc'],
)

# The frame behind each subset name.
agg_dt_dict = dict(
    all=all_data,
    adult=adult_data,
    child=child_data,
)

In [19]:
# One entry per household: the household id, indexed by itself.
hogar_data = all_data.groupby('idhogar')['idhogar'].head(1)
hogar_data.index = hogar_data

# Collect the per-household aggregates of every configured
# (subset, column) pair and attach them side by side.
household_parts = [hogar_data]
for key, cols_list_to_agg in agg_dt_col_dict.items():
    household_parts.extend(
        agg_hogar(agg_dt_dict[key], key, col) for col in cols_list_to_agg
    )
hogar_data = pd.concat(household_parts, axis=1, sort=False)

In [20]:
# Add the share of male members per household as the feature 'male_ratio'.
all_data_male_ratio = all_data[['Id', 'idhogar', 'sex']].copy()
all_data_male_ratio['is_male'] = all_data_male_ratio['sex'] == 'male'
is_male_g_mean = (all_data_male_ratio
                  .groupby('idhogar')['is_male']
                  .mean()
                  .rename('male_ratio'))
# Attach the renamed series itself; recomputing the group mean here would
# add the column under the name 'is_male' instead of 'male_ratio'.
hogar_data = pd.concat([hogar_data, is_male_g_mean], axis=1, sort=False)

## Rent Prediction
### Extract heads of household and tipovivi==3(Room for rent)

In [21]:
# Heads of household (parentesco1), then those whose tipovivi is tipovivi3.
is_head = all_data['parentesco'] == 'parentesco1'
hh_data = all_data.loc[is_head]
is_rented = hh_data['tipovivi'] == 'tipovivi3'
rent_data = hh_data.loc[is_rented]
rent_data.shape

(1745, 52)

## Pick up columns related to dwellings

In [22]:
# Columns used for the rent model: the target (v2a1), the household id and
# the dwelling-related features.
rent_df_col_list = [
    'v2a1', 'rooms', 'cielorazo', 'idhogar',
    'bedrooms', 'pared', 'piso', 'techo',
    'abastagua', 'sanitario', 'energcocinar',
    'elimbasu', 'lugar', 'area',
    'electricity', 'epared', 'etecho', 'eviv',
]

rent_df = rent_data[rent_df_col_list]

# Household id, regression target, and feature matrix.
rent_idhogar = rent_df['idhogar']
rent_y = rent_df['v2a1']
rent_X = rent_df.drop(columns=['idhogar', 'v2a1'])

In [23]:
# Fit the rent regressor on the rows selected in rent_data, then predict
# a rent for every row of all_data from the same feature columns.
params = {'num_leaves': 9, 'min_data_in_leaf': 4, 'max_depth': 6}
gbm = lgb.LGBMRegressor(boosting_type='dart', objective='regression',
                        random_state=0, **params)
gbm.fit(rent_X, rent_y)

rent_features_all = all_data[rent_df_col_list].drop(['idhogar', 'v2a1'], axis=1)
pred_rent = gbm.predict(rent_features_all)

## LightGBM

### Merge features

In [24]:
# rent prediction
all_data['pred_rent'] = pred_rent

# aggregated household features
# hogar_data is indexed by the household id and also carries it as the
# 'idhogar' column. Recent pandas versions refuse a merge key that names
# both an index level and a column, so the index is dropped before merging.
all_data = pd.merge(all_data, hogar_data.reset_index(drop=True),
                    on='idhogar', how='left')

# The merge resets the index; index by Id again while keeping the column.
all_data = all_data.set_index('Id', drop=False)

In [25]:
# Training set: the original train rows with their label re-attached,
# restricted to heads of household (parentesco1).
train = all_data.loc[train.index].assign(Target=train_target)
is_head = train['parentesco'] == 'parentesco1'
train = train[is_head].drop(columns='parentesco')

# Test set: every original test row.
test = all_data.loc[test.index].drop(columns='parentesco')

In [26]:
# Separate the identifiers, the target and the features.
id_cols = ['Id', 'idhogar']

train_Id = train['Id']            # individual ID
train_idhogar = train['idhogar']  # household ID
train_y = train['Target']         # target value
train_X = train.drop(columns=id_cols + ['Target'])  # features

test_Id = test['Id']              # individual ID
test_idhogar = test['idhogar']    # household ID
test_X = test.drop(columns=id_cols)  # features

## Encode train_y as binary variables
* model_1  
{'num_leaves': 54, 'min_data_in_leaf': 7, 'max_depth': 9, 'learning_rate': 0.09, 'feature_fraction': 0.74}  
  
* model_2  
{'num_leaves': 64, 'min_data_in_leaf': 11, 'max_depth': 9, 'learning_rate': 0.09, 'feature_fraction': 0.74}  
  
* model_3  
{'num_leaves': 22, 'min_data_in_leaf': 11, 'max_depth': 9, 'learning_rate': 0.09, 'feature_fraction': 0.74}  
  
* model_4  
{'num_leaves': 73, 'min_data_in_leaf': 11, 'max_depth': 9, 'learning_rate': 0.09, 'feature_fraction': 0.74}  

In [27]:
# Hyperparameters for model_1 .. model_4, in that order. Only num_leaves and
# min_data_in_leaf differ between the models.
param_dict_lsit = [
    dict(num_leaves=leaves, min_data_in_leaf=min_leaf,
         max_depth=9, learning_rate=0.09, feature_fraction=0.74)
    for leaves, min_leaf in [(54, 7), (64, 11), (22, 11), (73, 11)]
]

In [28]:
# Loop over the models and collect each one's predictions.
def vb_gbm(X_train, y_train, X_test, param_dict_lsit):
    """Fit one binary LightGBM classifier per target level and score X_test.

    The k-th entry of ``param_dict_lsit`` (counting from 1) configures the
    model that separates ``y_train == k`` from everything else.

    Parameters
    ----------
    X_train, y_train
        Training features and multi-class labels.
    X_test
        Features to score.
    param_dict_lsit : list of dict
        One LightGBM parameter dict per model, in level order.

    Returns
    -------
    pandas.DataFrame
        One column per model, named '1', '2', ..., holding the predicted
        probability of the positive class for each row of ``X_test``.
        Rows are numbered 0..n-1; the index of ``X_test`` is not kept.
    """
    probabilities = {}
    for level, param_dict in enumerate(param_dict_lsit, start=1):
        is_level = y_train == level
        model = lgb.LGBMClassifier(objective='binary', boosting_type='dart',
                                   class_weight='balanced', random_state=0)
        model.set_params(**param_dict)
        model.fit(X_train, is_level)
        probabilities[str(level)] = model.predict_proba(X_test)[:, 1]
    return pd.DataFrame(probabilities)

In [29]:
# Hold out 10% of the training rows (fixed seed), fit the binary models on
# the rest and get each model's probability for the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.1, random_state=0)
pred_ys = vb_gbm(X_train, y_train, X_test, param_dict_lsit)
pred_ys.head()

Unnamed: 0,1,2,3,4
0,0.324497,0.452036,0.351618,0.398598
1,0.145019,0.330504,0.643416,0.440768
2,0.046721,0.153461,0.61453,0.683948
3,0.03344,0.06046,0.151099,0.848277
4,0.21443,0.295509,0.422478,0.704719


In [30]:
# pred_ys = (pred_ys)/(pred_ys.mean())
# pred_ys.head()

In [31]:
# Most probable label per row. Only the probability columns are compared,
# so re-running this cell is not disturbed by the 'max'/'pred' columns it adds.
label_cols = [c for c in pred_ys.columns if c not in ('max', 'pred')]
pred_ys['max'] = pred_ys[label_cols].idxmax(axis=1)

# Write labels in the order of the list (a later element has priority):
# every model whose probability exceeds 0.5 claims the row, and later
# entries overwrite earlier ones.
pred_ys['pred'] = np.nan
for i in [3, 2, 1, 4]:
    pred_ys.loc[pred_ys[str(i)] > 0.5, 'pred'] = i

# Rows claimed by no model fall back to the most probable label. The 'max'
# column holds column names (strings), hence the cast.
unassigned = pred_ys['pred'].isnull()
pred_ys.loc[unassigned, 'pred'] = pred_ys.loc[unassigned, 'max'].astype(int)
pred_ys['pred'] = pred_ys['pred'].astype(int)

pred_ys.head()

Unnamed: 0,1,2,3,4,max,pred
0,0.324497,0.452036,0.351618,0.398598,2,2
1,0.145019,0.330504,0.643416,0.440768,3,3
2,0.046721,0.153461,0.61453,0.683948,4,4
3,0.03344,0.06046,0.151099,0.848277,4,4
4,0.21443,0.295509,0.422478,0.704719,4,4


In [32]:
# Predicted labels for the held-out rows, cast to int for scoring.
y_test_pred = pred_ys['pred'].astype('int')

In [33]:
# Confusion matrix and macro F1 on the held-out rows.
# Rows and columns are labelled with the actual target levels (1-4) instead
# of 0-based positions. Passing `labels` fixes the row/column order and
# keeps the matrix 4x4 even if a level is missing from the split.
target_levels = [1, 2, 3, 4]
cm = confusion_matrix(y_test, y_test_pred, labels=target_levels)
cm_df_columns = ['pred_' + str(level) for level in target_levels]
cm_df_index = ['true_' + str(level) for level in target_levels]
cm_df = pd.DataFrame(data=cm, columns=cm_df_columns, index=cm_df_index)
f1 = f1_score(y_test, y_test_pred, average='macro')
print("confusion matrix: \n", cm_df)
print("macro F1 score: \n", f1)

confusion matrix: 
         pred_0  pred_1  pred_2  pred_3
true_0      10       5       1       6
true_1      11      16       9      12
true_2       7       9       6      16
true_3       5      13      16     156
macro F1 score: 
 0.42694147957305856


In [34]:
# Refit the binary models on the full training set and score the test set.
pred_ys_fin = vb_gbm(train_X, train_y, test_X, param_dict_lsit)

# Most probable label per row, compared over the probability columns only.
label_cols = [c for c in pred_ys_fin.columns if c not in ('max', 'pred')]
pred_ys_fin['max'] = pred_ys_fin[label_cols].idxmax(axis=1)

# Write labels in the order of the list (a later element has priority):
# every model whose probability exceeds 0.5 claims the row, and later
# entries overwrite earlier ones.
pred_ys_fin['pred'] = np.nan
for i in [3, 2, 1, 4]:
    pred_ys_fin.loc[pred_ys_fin[str(i)] > 0.5, 'pred'] = i

# Rows claimed by no model fall back to the most probable label. The 'max'
# column holds column names (strings), hence the cast.
unassigned = pred_ys_fin['pred'].isnull()
pred_ys_fin.loc[unassigned, 'pred'] = pred_ys_fin.loc[unassigned, 'max'].astype(int)
pred_ys_fin['pred'] = pred_ys_fin['pred'].astype(int)

pred_ys_fin.head()

Unnamed: 0,1,2,3,4,max,pred
0,0.025761,0.069472,0.025572,0.967908,4,4
1,0.025866,0.033356,0.025572,0.966821,4,4
2,0.025866,0.033356,0.025572,0.966821,4,4
3,0.054122,0.028232,0.030697,0.957797,4,4
4,0.026453,0.052564,0.039217,0.954859,4,4


In [42]:
# Build the submission: one row per test individual with its predicted Target.
# pred_ys_fin is in the row order of test_X, so the predictions are paired
# with test_Id by position.
pred = pred_ys_fin['pred'].values.astype('int')
# Built directly instead of via pd.concat(..., join_axes=...): the
# `join_axes` argument no longer exists in pandas 1.0 and later.
pred = pd.DataFrame({'Id': test_Id.values, 'Target': pred},
                    index=test_Id.index, columns=['Id', 'Target'])
pred.to_csv('20180916_vote_binary.csv', index=False)
pred.head()

Unnamed: 0,Id,Target
ID_2f6873615,ID_2f6873615,4
ID_1c78846d2,ID_1c78846d2,4
ID_e5442cf6a,ID_e5442cf6a,4
ID_a8db26a79,ID_a8db26a79,4
ID_a62966799,ID_a62966799,4
