In [62]:
import pandas as pd
import copy
import matplotlib.pyplot as plt
import math
import numpy as np
import seaborn as sns
import tqdm.notebook as tq
import scipy.cluster.hierarchy as spc
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from sklearn.feature_selection import VarianceThreshold
from tqdm import tqdm
from collections import Counter
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split as tts, cross_val_score as cv, RepeatedStratifiedKFold as rsk
from sklearn.ensemble import RandomForestClassifier as rf, ExtraTreesClassifier as et, BaggingClassifier as bc
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, recall_score, precision_score
from sklearn.utils import class_weight
import lightgbm as lgb

In [82]:
test_df = pd.read_csv("test_df_renamed_new.csv")

In [107]:
column = list((test_df.columns.drop('=1 if household head')))
column.append('=1 if household head')

In [115]:
test_df = test_df[column]
test_df = test_df.drop(columns = ['Household level identifier','Id'])

In [116]:
test_df.head()

Unnamed: 0,Monthly rent payment,=1 Overcrowding by bedrooms,number of all rooms in the house,=1 Overcrowding by rooms,=1 if the household has refrigerator,number of tablets household owns,Males younger than 12 years of age,Males 12 years of age and older,Total males in the household,Females younger than 12 years of age,...,=1 region Huetar AtlÃƒÂ¡ntica,=1 region Huetar Norte,=2 zona rural,Age in years,Years of education of head of household,wall_condition,roof_condition,floor_condition,education_level,=1 if household head
0,0.0,0,5,0,1,0.0,1,1,2,0,...,0,0,0,4,17,3,3,3,1,0
1,0.0,0,5,0,1,0.0,1,1,2,0,...,0,0,0,41,17,3,3,3,8,0
2,0.0,0,5,0,1,0.0,1,1,2,0,...,0,0,0,41,17,3,3,3,9,1
3,0.0,0,14,0,1,1.0,0,1,1,0,...,0,0,0,59,16,3,3,3,8,1
4,175000.0,0,4,0,1,1.0,0,0,0,0,...,0,0,0,18,11,2,3,3,5,1


In [121]:
#need to normalise some of the columns
#need to normalise some of the columns
def prepData(df,test_size = 0.3,normalize = True,include_eval = False):
    unnormal_cols = selectUnnormalised(df)
    x = df.drop("=1 if household head",axis = 1)
    y = df['=1 if household head'] -1
    xtrain, xtest, ytrain, ytest = tts(x,y,test_size = test_size, random_state = 42)
    if include_eval:
        xtest, x_eval, ytest, y_eval = tts(xtest,ytest,test_size = 0.1/0.3, random_state = 42)
        return xtest, xtest, x_eval, ytest, ytest, y_eval
    else:
        if normalize:
            xtrain, xtest = normalise(unnormal_cols, xtrain, xtest)
            ytrain, ytest = ytrain.values.ravel(), ytest.values.ravel()
            return  xtrain, xtest, ytrain, ytest
        return  xtrain, xtest, ytrain, ytest
    
#getting the columns with non-normalised values
def selectUnnormalised(df):
    normCol = df.columns[df.isin([0,1]).all()] #getting the binary columns
    dfCols_list = list(df.columns)
    unnormCols = list(set(dfCols_list)-set(normCol))
    
    #remove target because we shouldn't normalise that, but rather encode
    return unnormCols

#normalising data in testing set
def normalise(unnormCols, xtest, xTest):
    #normalise testing data
    toBeNorm_test = xtest[[i for i in unnormCols]]
    ss = StandardScaler()
    std_scale = ss.fit(toBeNorm_test)
    xtest_norm = std_scale.transform(toBeNorm_test)
    
    #covert numpy array to df
    xtest_normCol = pd.DataFrame(xtest_norm, index = toBeNorm_test.index,
                                 columns = toBeNorm_test.columns)
    xtest.update(xtest_normCol)
    
    #normalise test data using mean and SD of testing set
    toBeNorm_test = xTest[[i for i in unnormCols]]
    xTest_norm = std_scale.transform(toBeNorm_test)
    xTest_normCol = pd.DataFrame(xTest_norm, index = toBeNorm_test.index,
                                columns = toBeNorm_test.columns)
    xTest.update(xTest_normCol)
    
    return xtest, xTest


In [122]:
xtrain, xtest, ytrain, ytest = prepData(test_df,test_size = 0.1)

In [127]:
# build the model
lgb_clf = lgb.LGBMClassifier(max_depth=-1, learning_rate=0.03,
                             random_state=None, silent=True, 
                             n_jobs=4, n_estimators=5000, class_weight='balanced',
                             colsample_bytree =  0.89, min_child_samples = 90, num_leaves = 56, subsample = 0.96)

# fit data into the model and predict the test set
lgb_clf.fit(xtrain.values, ytrain, eval_set=[(xtest.values, ytest)], 
            early_stopping_rounds=400, verbose=100)
y_pred = lgb_clf.predict(xtest)



[100]	valid_0's binary_logloss: 0.0523509
[200]	valid_0's binary_logloss: 0.0231232
[300]	valid_0's binary_logloss: 0.0206546
[400]	valid_0's binary_logloss: 0.0224787
[500]	valid_0's binary_logloss: 0.0250928
[600]	valid_0's binary_logloss: 0.0277075


In [128]:
cm = confusion_matrix(ytest, y_pred)
f1 = f1_score(ytest, y_pred, average='macro')
print("confusion matrix: \n", cm)
# print("macro F1 score: \n", f1)
print("Classification accuracy: {:.2f}".format(lgb_clf.score(xtest, ytest)))
print("F1 score : {:.2f}".format(f1_score(ytest, y_pred, average = 'macro')))

confusion matrix: 
 [[1606   15]
 [   1  764]]
Classification accuracy: 0.99
F1 score : 0.99


# Machine Learning Testing

In [26]:
train_df = pd.read_csv("train_df_renamed_new.csv")
train_df = train_df.drop(columns = ['Household level identifier','Id'])
train_df.head()

Unnamed: 0,Monthly rent payment,=1 Overcrowding by bedrooms,number of all rooms in the house,=1 Overcrowding by rooms,=1 has toilet in the household,=1 if the household has refrigerator,owns a tablet,number of tablets household owns,Males younger than 12 years of age,Males 12 years of age and older,...,age squared,hogar_total squared,edjefe squared,hogar_nin squared,overcrowding squared,dependency squared,meaned squared,Age squared,Years of education of head of household,Target
0,190000.0,0,3,0,1,1,0,0.0,0,1,...,1849,1,100,0,1.0,0.0,100.0,1849,10,4
1,135000.0,0,4,0,1,1,1,1.0,0,1,...,4489,1,144,0,1.0,64.0,144.0,4489,12,4
2,0.0,0,8,0,1,1,0,0.0,0,0,...,8464,1,0,0,0.25,64.0,121.0,8464,11,4
3,180000.0,0,5,0,1,1,1,1.0,0,2,...,289,16,121,4,1.777778,1.0,121.0,289,11,4
4,180000.0,0,5,0,1,1,1,1.0,0,2,...,1369,16,121,4,1.777778,1.0,121.0,1369,11,4


In [27]:
test_df = pd.read_csv("test_df_renamed_new.csv")
identity = test_df.iloc[:, 1:2]
test_df = test_df.drop(columns = ['Household level identifier','Id'])
test_df.head()

Unnamed: 0,Monthly rent payment,=1 Overcrowding by bedrooms,number of all rooms in the house,=1 Overcrowding by rooms,=1 has toilet in the household,=1 if the household has refrigerator,owns a tablet,number of tablets household owns,Males younger than 12 years of age,Males 12 years of age and older,...,escolari squared,age squared,hogar_total squared,edjefe squared,hogar_nin squared,overcrowding squared,dependency squared,meaned squared,Age squared,Years of education of head of household
0,0.0,0,5,0,1,1,0,0.0,1,1,...,0,16,9,0,1,2.25,0.25,272.25,16,17
1,0.0,0,5,0,1,1,0,0.0,1,1,...,256,1681,9,0,1,2.25,0.25,272.25,1681,17
2,0.0,0,5,0,1,1,0,0.0,1,1,...,289,1681,9,0,1,2.25,0.25,272.25,1681,17
3,0.0,0,14,0,1,1,1,1.0,0,1,...,256,3481,1,256,0,1.0,0.0,256.0,3481,16
4,175000.0,0,4,0,1,1,1,1.0,0,0,...,121,324,1,0,1,0.25,64.0,121.0,324,11


## Helper Functions

In [70]:
#need to normalise some of the columns
#need to normalise some of the columns
def prepData(df,test_size = 0.3,normalize = True,include_eval = False):
    unnormal_cols = selectUnnormalised(df)
    x = df.drop("Target",axis = 1)
    y = df['Target'] -1
    xtrain, xtest, ytrain, ytest = tts(x,y,test_size = test_size, random_state = 42)
    if include_eval:
        xtest, x_eval, ytest, y_eval = tts(xtest,ytest,test_size = 0.1/0.3, random_state = 42)
        return xtest, xtest, x_eval, ytest, ytest, y_eval
    else:
        if normalize:
            xtrain, xtest = normalise(unnormal_cols, xtrain, xtest)
            ytrain, ytest = ytrain.values.ravel(), ytest.values.ravel()
            return  xtrain, xtest, ytrain, ytest
        return  xtrain, xtest, ytrain, ytest
    
    
    

#getting the columns with non-normalised values
def selectUnnormalised(df):
    normCol = df.columns[df.isin([0,1]).all()] #getting the binary columns
    dfCols_list = list(df.columns)
    unnormCols = list(set(dfCols_list)-set(normCol))
    
    #remove target because we shouldn't normalise that, but rather encode
    unnormCols.remove('Target')
    return unnormCols

#normalising data in testing set
def normalise(unnormCols, xtest, xTest):
    #normalise testing data
    toBeNorm_test = xtest[[i for i in unnormCols]]
    ss = StandardScaler()
    std_scale = ss.fit(toBeNorm_test)
    xtest_norm = std_scale.transform(toBeNorm_test)
    
    #covert numpy array to df
    xtest_normCol = pd.DataFrame(xtest_norm, index = toBeNorm_test.index,
                                 columns = toBeNorm_test.columns)
    xtest.update(xtest_normCol)
    
    #normalise test data using mean and SD of testing set
    toBeNorm_test = xTest[[i for i in unnormCols]]
    xTest_norm = std_scale.transform(toBeNorm_test)
    xTest_normCol = pd.DataFrame(xTest_norm, index = toBeNorm_test.index,
                                columns = toBeNorm_test.columns)
    xTest.update(xTest_normCol)
    
    return xtest, xTest



In [71]:
# xtest, xtest, ytest, ytest = prepData(test_df,normalize=False)
xtest, xtest, ytest, ytest = prepData(train_df,test_size = 0.1)

## LightGBM

In [67]:
# build the model
lgb_clf = lgb.LGBMClassifier(max_depth=-1, learning_rate=0.03, objective='multiclass',
                             random_state=None, silent=True, metric='multi_logloss', 
                             n_jobs=4, n_estimators=5000, class_weight='balanced',
                             colsample_bytree =  0.89, min_child_samples = 90, num_leaves = 56, subsample = 0.96)

# fit data into the model and predict the test set
lgb_clf.fit(xtest.values, ytest, eval_set=[(xtest.values, ytest)], 
            early_stopping_rounds=400, verbose=100)
y_pred = lgb_clf.predict(xtest)



[100]	valid_0's multi_logloss: 0.811486
[200]	valid_0's multi_logloss: 0.638983
[300]	valid_0's multi_logloss: 0.51657
[400]	valid_0's multi_logloss: 0.422425
[500]	valid_0's multi_logloss: 0.349278
[600]	valid_0's multi_logloss: 0.289432
[700]	valid_0's multi_logloss: 0.240412
[800]	valid_0's multi_logloss: 0.199422
[900]	valid_0's multi_logloss: 0.167122
[1000]	valid_0's multi_logloss: 0.140825
[1100]	valid_0's multi_logloss: 0.118221
[1200]	valid_0's multi_logloss: 0.0988971
[1300]	valid_0's multi_logloss: 0.0833492
[1400]	valid_0's multi_logloss: 0.0703137
[1500]	valid_0's multi_logloss: 0.0594297
[1600]	valid_0's multi_logloss: 0.0496408
[1700]	valid_0's multi_logloss: 0.0420378
[1800]	valid_0's multi_logloss: 0.0358641
[1900]	valid_0's multi_logloss: 0.0305127
[2000]	valid_0's multi_logloss: 0.0258896
[2100]	valid_0's multi_logloss: 0.0220988
[2200]	valid_0's multi_logloss: 0.0187699
[2300]	valid_0's multi_logloss: 0.0160448
[2400]	valid_0's multi_logloss: 0.0137567
[2500]	valid_

In [74]:
cm = confusion_matrix(ytest, y_pred)
f1 = f1_score(ytest, y_pred, average='macro')
print("confusion matrix: \n", cm)
# print("macro F1 score: \n", f1)
print("Classification accuracy: {:.2f}".format(lgb_clf.score(xtest, ytest)))
print("F1 score : {:.2f}".format(f1_score(ytest, y_pred, average = 'macro')))

confusion matrix: 
 [[ 79   0   0   0]
 [  0 156   0   0]
 [  0   0 131   0]
 [  0   0   0 590]]
Classification accuracy: 1.00
F1 score : 1.00


In [878]:
final_df = copy.deepcopy(xtest)
final_df['Target'] = ytest
final_df['Pred'] = y_pred
final_df

Unnamed: 0,Monthly rent payment,=1 Overcrowding by bedrooms,number of all rooms in the house,=1 Overcrowding by rooms,=1 has toilet in the household,=1 if the household has refrigerator,owns a tablet,number of tablets household owns,Males younger than 12 years of age,Males 12 years of age and older,...,hogar_total squared,edjefe squared,hogar_nin squared,overcrowding squared,dependency squared,meaned squared,Age squared,Years of education of head of household,Target,Pred
8503,-0.421354,0,-0.652043,0,1,1,0,-0.467602,-0.571187,-1.506516,...,-0.967108,-0.679713,-0.555631,-0.722963,-0.312299,-0.708219,1.116875,-0.450839,1,1
7752,-0.421354,0,-0.652043,0,1,1,0,-0.467602,-0.571187,0.426580,...,0.894454,-0.565009,1.760820,1.383382,0.007169,1.988735,-0.667071,-1.133406,0,0
9350,-0.421354,0,-0.652043,0,1,1,0,-0.467602,0.901028,0.426580,...,-0.169296,-0.628734,0.023482,-0.355189,-0.232432,-0.961392,-0.897793,-1.360929,0,0
8414,-0.421354,0,0.026590,0,1,1,0,-0.467602,0.901028,-0.539968,...,-0.541608,-0.679713,-0.410853,-0.542420,-0.292333,-0.708219,0.164216,-0.450839,2,2
7671,-0.421354,0,0.026590,0,1,1,0,-0.467602,0.901028,-0.539968,...,0.894454,0.862416,3.063824,0.179756,1.684378,3.609040,-0.886916,0.686773,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,-0.421354,0,1.383856,0,1,1,0,-0.467602,-0.571187,1.393128,...,0.894454,0.594774,-0.410853,0.179756,-0.309105,0.293385,-0.428906,0.459251,3,3
1982,2.215327,0,0.026590,0,1,1,1,3.823125,-0.571187,0.426580,...,-0.169296,1.818282,0.023482,-0.355189,-0.232432,1.149268,-0.815352,1.369341,3,3
4387,0.285800,0,-1.330676,0,1,1,0,-0.467602,0.901028,-0.539968,...,-0.169296,0.135959,0.023482,0.179756,-0.232432,-0.228524,-0.357915,0.004206,3,3
5489,-0.421354,0,0.026590,0,1,1,0,-0.467602,-0.571187,0.426580,...,-0.169296,-0.220898,-0.555631,-0.355189,-0.312299,-0.492356,1.116875,-0.450839,3,3


In [880]:
final_hh_df = final_df[final_df["=1 if household head"]==1]
# print("Classification accuracy: {:.2f}".format(lgb_clf.score(xtest, ytest)))
print("F1 score : {:.2f}".format(f1_score(final_hh_df['Target'].values, final_hh_df['Pred'].values, average = 'macro')))

F1 score : 0.90


### Just household heads

In [75]:
train_hh_df = train_df[train_df["=1 if household head"]==1]

In [76]:
train_hh_df = train_hh_df.drop("=1 if household head",axis =1)

In [77]:
xTrain, xTest, yTrain, yTest = prepData(train_hh_df,test_size = 0.3)

In [78]:
len(yTrain)

892

In [79]:
# build the model
lgb_clf = lgb.LGBMClassifier(max_depth=-1, learning_rate=0.03, objective='multiclass',
                             random_state=None, silent=True, metric='multi_logloss', 
                             n_jobs=4, n_estimators=5000, class_weight='balanced',
                             colsample_bytree =  1.0, min_child_samples = 20, num_leaves = 31, subsample = 1.0)

# fit data into the model and predict the test set
lgb_clf.fit(xTrain.values, yTrain, eval_set=[(xTest.values, yTest)], 
            early_stopping_rounds=400, verbose=100)
yPred = lgb_clf.predict(xTest.values)



[100]	valid_0's multi_logloss: 0.340883
[200]	valid_0's multi_logloss: 0.100829
[300]	valid_0's multi_logloss: 0.0296409
[400]	valid_0's multi_logloss: 0.00933806
[500]	valid_0's multi_logloss: 0.00288358
[600]	valid_0's multi_logloss: 0.000889352
[700]	valid_0's multi_logloss: 0.000284637
[800]	valid_0's multi_logloss: 0.000100593
[900]	valid_0's multi_logloss: 4.87618e-05
[1000]	valid_0's multi_logloss: 3.2923e-05
[1100]	valid_0's multi_logloss: 2.58888e-05
[1200]	valid_0's multi_logloss: 2.18787e-05
[1300]	valid_0's multi_logloss: 1.94626e-05
[1400]	valid_0's multi_logloss: 1.76498e-05
[1500]	valid_0's multi_logloss: 1.62876e-05
[1600]	valid_0's multi_logloss: 1.52037e-05
[1700]	valid_0's multi_logloss: 1.43004e-05
[1800]	valid_0's multi_logloss: 1.35642e-05
[1900]	valid_0's multi_logloss: 1.29325e-05
[2000]	valid_0's multi_logloss: 1.24694e-05
[2100]	valid_0's multi_logloss: 1.20509e-05
[2200]	valid_0's multi_logloss: 1.16664e-05
[2300]	valid_0's multi_logloss: 1.13199e-05
[2400]	v

In [80]:
cm = confusion_matrix(yTest, yPred)
f1 = f1_score(yTest, yPred, average='macro')
print("confusion matrix: \n", cm)
print("macro F1 score: \n", f1)
print('recall: ', recall_score(yTest, yPred, average='macro'))
print('precision: ', precision_score(yTest, yPred, average='macro'))
# print("Classification accuracy: {:.2f}".format(lgb_clf.score(xtest, ytest)))
# print("F1 score : {}".format(f1_score(ytest.values, y_pred, average = 'macro')))

confusion matrix: 
 [[ 68   0   0   0]
 [  0 140   0   0]
 [  0   0 108   0]
 [  0   0   0 576]]
macro F1 score: 
 1.0
recall:  1.0
precision:  1.0


# Submission

In [22]:
submission_df = lgb_clf.predict(test_df)

In [81]:
identity['Target'] = submission_df
identity.head()

Unnamed: 0,Id,Target
0,ID_2f6873615,4
1,ID_1c78846d2,4
2,ID_e5442cf6a,4
3,ID_a8db26a79,4
4,ID_a62966799,4


In [25]:
identity.to_csv('submission.csv',index = False)