In [1]:
import pandas as pd
import copy
import matplotlib.pyplot as plt
import math
import numpy as np
import seaborn as sns
import tqdm.notebook as tq
import scipy.cluster.hierarchy as spc
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from sklearn.feature_selection import VarianceThreshold
from tqdm import tqdm
from collections import Counter
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split as tts, cross_val_score as cv, RepeatedStratifiedKFold as rsk
from sklearn.ensemble import RandomForestClassifier as rf, ExtraTreesClassifier as et, BaggingClassifier as bc
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, recall_score, precision_score
from sklearn.utils import class_weight
import lightgbm as lgb

# Machine Learning Testing

In [2]:
test_path = "../input/updated/test_df_renamed_new.csv"
train_path = "../input/updated/train_df_renamed_new.csv"

In [3]:
train_df = pd.read_csv(train_path)
train_df = train_df.drop(columns = ['Household level identifier','Id'])
train_df.head()

Unnamed: 0,Monthly rent payment,=1 Overcrowding by bedrooms,number of all rooms in the house,=1 Overcrowding by rooms,=1 if the household has refrigerator,number of tablets household owns,Males younger than 12 years of age,Males 12 years of age and older,Total males in the household,Females younger than 12 years of age,...,=1 region Huetar AtlÃƒÂ¡ntica,=1 region Huetar Norte,=2 zona rural,Age in years,Years of education of head of household,wall_condition,roof_condition,floor_condition,education_level,Target
0,190000.0,0,3,0,1,0.0,0,1,1,0,...,0,0,0,43,10,2,1,1,4,4
1,135000.0,0,4,0,1,1.0,0,1,1,0,...,0,0,0,67,12,2,2,2,8,4
2,0.0,0,8,0,1,0.0,0,0,0,0,...,0,0,0,92,11,2,3,3,5,4
3,180000.0,0,5,0,1,1.0,0,2,2,1,...,0,0,0,17,11,3,3,3,4,4
4,180000.0,0,5,0,1,1.0,0,2,2,1,...,0,0,0,37,11,3,3,3,5,4


In [4]:

test_df = pd.read_csv(test_path)
identity = test_df.iloc[:, 1:2]
test_df = test_df.drop(columns = ['Household level identifier','Id'])
test_df.head()

Unnamed: 0,Monthly rent payment,=1 Overcrowding by bedrooms,number of all rooms in the house,=1 Overcrowding by rooms,=1 if the household has refrigerator,number of tablets household owns,Males younger than 12 years of age,Males 12 years of age and older,Total males in the household,Females younger than 12 years of age,...,=1 region Brunca,=1 region Huetar AtlÃƒÂ¡ntica,=1 region Huetar Norte,=2 zona rural,Age in years,Years of education of head of household,wall_condition,roof_condition,floor_condition,education_level
0,0.0,0,5,0,1,0.0,1,1,2,0,...,0,0,0,0,4,17,3,3,3,1
1,0.0,0,5,0,1,0.0,1,1,2,0,...,0,0,0,0,41,17,3,3,3,8
2,0.0,0,5,0,1,0.0,1,1,2,0,...,0,0,0,0,41,17,3,3,3,9
3,0.0,0,14,0,1,1.0,0,1,1,0,...,0,0,0,0,59,16,3,3,3,8
4,175000.0,0,4,0,1,1.0,0,0,0,0,...,0,0,0,0,18,11,2,3,3,5


In [5]:
if list(train_df.columns[:-1]) == list(test_df.columns):
    print("Order is matched")

Order is matched


## Helper Functions

In [6]:
#need to normalise some of the columns
def prepData(df,test_size = 0.3,normalize = True,include_eval = False):
    unnormal_cols = selectUnnormalised(df)
    x = df.drop("Target",axis = 1)
    y = df['Target'] -1
    xtrain, xtest, ytrain, ytest = tts(x,y,test_size = test_size, random_state = 42)
    if include_eval:
        xtest, x_eval, ytest, y_eval = tts(xtest,ytest,test_size = 0.1/0.3, random_state = 42)
        return xtest, xtest, x_eval, ytest, ytest, y_eval
    else:
        if normalize:
            xtrain, xtest = normalise(unnormal_cols, xtrain, xtest)
            ytrain, ytest = ytrain.values.ravel(), ytest.values.ravel()
            return  xtrain, xtest, ytrain, ytest
        return  xtrain, xtest, ytrain, ytest
    
    
    

#getting the columns with non-normalised values
def selectUnnormalised(df):
    normCol = df.columns[df.isin([0,1]).all()] #getting the binary columns
    dfCols_list = list(df.columns)
    unnormCols = list(set(dfCols_list)-set(normCol))
    
    #remove target because we shouldn't normalise that, but rather encode
    unnormCols.remove('Target')
    return unnormCols

#normalising data in testing set
def normalise(unnormCols, xtest, xTest):
    #normalise testing data
    toBeNorm_test = xtest[[i for i in unnormCols]]
    ss = StandardScaler()
    std_scale = ss.fit(toBeNorm_test)
    xtest_norm = std_scale.transform(toBeNorm_test)
    
    #covert numpy array to df
    xtest_normCol = pd.DataFrame(xtest_norm, index = toBeNorm_test.index,
                                 columns = toBeNorm_test.columns)
    xtest.update(xtest_normCol)
    
    #normalise test data using mean and SD of testing set
    toBeNorm_test = xTest[[i for i in unnormCols]]
    xTest_norm = std_scale.transform(toBeNorm_test)
    xTest_normCol = pd.DataFrame(xTest_norm, index = toBeNorm_test.index,
                                columns = toBeNorm_test.columns)
    xTest.update(xTest_normCol)
    
    return xtest, xTest



In [7]:

xtrain, xtest, ytrain, ytest = prepData(train_df,test_size = 0.1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


## LightGBM

In [8]:
# build the model
lgb_clf = lgb.LGBMClassifier(max_depth=-1, learning_rate=0.03, objective='multiclass',
                             random_state=None, silent=True, metric='multi_logloss', 
                             n_jobs=4, n_estimators=5000, class_weight='balanced',
                             colsample_bytree =  0.89, min_child_samples = 90, num_leaves = 56, subsample = 0.96)

# fit data into the model and predict the test set
lgb_clf.fit(xtrain.values, ytrain, eval_set=[(xtest.values, ytest)], 
            early_stopping_rounds=400, verbose=100)
y_pred = lgb_clf.predict(xtest)

Training until validation scores don't improve for 400 rounds
[100]	valid_0's multi_logloss: 0.569822
[200]	valid_0's multi_logloss: 0.35869
[300]	valid_0's multi_logloss: 0.253686
[400]	valid_0's multi_logloss: 0.194504
[500]	valid_0's multi_logloss: 0.16232
[600]	valid_0's multi_logloss: 0.143235
[700]	valid_0's multi_logloss: 0.134964
[800]	valid_0's multi_logloss: 0.132109
[900]	valid_0's multi_logloss: 0.135065
[1000]	valid_0's multi_logloss: 0.141193
[1100]	valid_0's multi_logloss: 0.148449
Early stopping, best iteration is:
[785]	valid_0's multi_logloss: 0.131818


In [9]:
cm = confusion_matrix(ytest, y_pred)
f1 = f1_score(ytest, y_pred, average='macro')
print("confusion matrix: \n ", cm)
print("macro F1 score: ", f1)
print('recall: ', recall_score(ytest, y_pred, average='macro'))
print('precision: ', precision_score(ytest, y_pred, average='macro'))
print("Classification accuracy on train: {:.2f}".format(lgb_clf.score(xtrain, ytrain)))
print("Classification accuracy on test: {:.2f}".format(lgb_clf.score(xtest, ytest)))

confusion matrix: 
  [[ 77   1   0   1]
 [  1 147   1   7]
 [  0   2 120   9]
 [  0   6   5 579]]
macro F1 score:  0.958360695789851
recall:  0.9535944257915061
precision:  0.9633361604838115
Classification accuracy on train: 1.00
Classification accuracy on test: 0.97


### Just household heads

In [10]:
train_hh_df = train_df[train_df["=1 if household head"]==1]
train_hh_df = train_hh_df.drop("=1 if household head",axis =1)

In [11]:
xtrain, xtest, ytrain, ytest = prepData(train_hh_df,test_size = 0.1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


In [12]:
# build the model
lgb_clf_two = lgb.LGBMClassifier(max_depth=-1, learning_rate=0.03, objective='multiclass',
                             random_state=None, silent=True, metric='multi_logloss', 
                             n_jobs=4, n_estimators=5000, class_weight='balanced',
                             colsample_bytree =  0.89, min_child_samples = 90, num_leaves = 56, subsample = 0.96)

# fit data into the model and predict the test set
lgb_clf_two.fit(xtrain.values, ytrain, eval_set=[(xtest.values, ytest)], 
            early_stopping_rounds=400, verbose=100)
y_pred = lgb_clf_two.predict(xtest)


Training until validation scores don't improve for 400 rounds
[100]	valid_0's multi_logloss: 0.949844
[200]	valid_0's multi_logloss: 0.901646
[300]	valid_0's multi_logloss: 0.881381
[400]	valid_0's multi_logloss: 0.880977
[500]	valid_0's multi_logloss: 0.891064
[600]	valid_0's multi_logloss: 0.909164
[700]	valid_0's multi_logloss: 0.931505
Early stopping, best iteration is:
[323]	valid_0's multi_logloss: 0.879556


In [13]:
cm = confusion_matrix(ytest, y_pred)
f1 = f1_score(ytest, y_pred, average='macro')
print("confusion matrix: \n ", cm)
print("macro F1 score: ", f1)
print('recall: ', recall_score(ytest, y_pred, average='macro'))
print('precision: ', precision_score(ytest, y_pred, average='macro'))
print("Classification accuracy on train: {:.2f}".format(lgb_clf_two.score(xtrain, ytrain)))
print("Classification accuracy on test: {:.2f}".format(lgb_clf_two.score(xtest, ytest)))
# print("F1 score : {}".format(f1_score(ytest.values, y_pred, average = 'macro')))

confusion matrix: 
  [[ 16   5   2   2]
 [  6  17  13   8]
 [  3  10  11  14]
 [  7  16  17 151]]
macro F1 score:  0.5069275691141744
recall:  0.5266033092011323
precision:  0.4932094407530454
Classification accuracy on train: 0.89
Classification accuracy on test: 0.65


## Older dataset

In [14]:
old_test_path = "../input/older-dataset/test_df_renamed.csv"
old_train_path = "../input/older-dataset/train_df_renamed.csv"

In [15]:
old_train_df = pd.read_csv(old_train_path)
old_train_df = old_train_df.drop(columns = ['Household level identifier','Id'])
old_train_df.head()

Unnamed: 0,Monthly rent payment,=1 Overcrowding by bedrooms,number of all rooms in the house,=1 Overcrowding by rooms,=1 has toilet in the household,=1 if the household has refrigerator,owns a tablet,number of tablets household owns,Males younger than 12 years of age,Males 12 years of age and older,...,escolari squared,age squared,hogar_total squared,edjefe squared,hogar_nin squared,overcrowding squared,dependency squared,meaned squared,Age squared,Target
0,190000.0,0,3,0,1,1,0,0.0,0,1,...,100,1849,1,100,0,1.0,0.0,100.0,1849,4
1,135000.0,0,4,0,1,1,1,1.0,0,1,...,144,4489,1,144,0,1.0,64.0,144.0,4489,4
2,0.0,0,8,0,1,1,0,0.0,0,0,...,121,8464,1,0,0,0.25,64.0,121.0,8464,4
3,180000.0,0,5,0,1,1,1,1.0,0,2,...,81,289,16,121,4,1.777778,1.0,121.0,289,4
4,180000.0,0,5,0,1,1,1,1.0,0,2,...,121,1369,16,121,4,1.777778,1.0,121.0,1369,4


In [16]:
x_train, x_test, y_train, y_test = prepData(old_train_df,test_size = 0.1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


In [17]:
# build the model
old_lgb_clf = lgb.LGBMClassifier(max_depth=-1, learning_rate=0.03, objective='multiclass',
                             random_state=None, silent=True, metric='multi_logloss', 
                             n_jobs=4, n_estimators=5000, class_weight='balanced',
                             colsample_bytree =  0.89, min_child_samples = 90, num_leaves = 56, subsample = 0.96)

# fit data into the model and predict the test set
old_lgb_clf.fit(x_train.values, y_train, eval_set=[(x_test.values, y_test)], 
            early_stopping_rounds=400, verbose=100)
old_y_pred = old_lgb_clf.predict(x_test)

Training until validation scores don't improve for 400 rounds
[100]	valid_0's multi_logloss: 0.587755
[200]	valid_0's multi_logloss: 0.399928
[300]	valid_0's multi_logloss: 0.310244
[400]	valid_0's multi_logloss: 0.257139
[500]	valid_0's multi_logloss: 0.22763
[600]	valid_0's multi_logloss: 0.211271
[700]	valid_0's multi_logloss: 0.203244
[800]	valid_0's multi_logloss: 0.200364
[900]	valid_0's multi_logloss: 0.201312
[1000]	valid_0's multi_logloss: 0.205225
[1100]	valid_0's multi_logloss: 0.21281
[1200]	valid_0's multi_logloss: 0.221095
Early stopping, best iteration is:
[845]	valid_0's multi_logloss: 0.200268


In [18]:
cm = confusion_matrix(y_test, old_y_pred)
f1 = f1_score(y_test, old_y_pred, average='macro')
print("confusion matrix: \n ", cm)
print("macro F1 score: ", f1)
print('recall: ', recall_score(y_test, old_y_pred, average='macro'))
print('precision: ', precision_score(y_test, old_y_pred, average='macro'))
print("Classification accuracy on train: {:.2f}".format(old_lgb_clf.score(x_train, y_train)))
print("Classification accuracy on test: {:.2f}".format(old_lgb_clf.score(x_test, y_test)))
# print("F1 score : {}".format(f1_score(ytest.values, y_pred, average = 'macro')))

confusion matrix: 
  [[ 70   6   1   2]
 [  2 144   5   6]
 [  0   7 118   7]
 [  0   9   5 574]]
macro F1 score:  0.9245922079128632
recall:  0.9183508179315646
precision:  0.932238472717947
Classification accuracy on train: 1.00
Classification accuracy on test: 0.95


# Submission

In [19]:
#need to prepare special function to get normalised test set
#need to normalise some of the columns
def prepData2(df):
    unnormal_cols = selectUnnormalised2(df)
    x_df = normalise2(unnormal_cols, df)
    return x_df

#getting the columns with non-normalised values
def selectUnnormalised2(df):
    normCol = df.columns[df.isin([0,1]).all()] #getting the binary columns
    dfCols_list = list(df.columns)
    unnormCols = list(set(dfCols_list)-set(normCol))
    
    return unnormCols

#normalising data in training set
def normalise2(unnormCols, df):
    #normalise training data
    toBeNorm = df[[i for i in unnormCols]]
    ss = StandardScaler()
    std_scale = ss.fit(toBeNorm)
    x_norm = std_scale.transform(toBeNorm)
    
    #covert numpy array to df
    x_normCols = pd.DataFrame(x_norm, index = toBeNorm.index,
                                 columns = toBeNorm.columns)
    df.update(x_normCols)
    
    return df

In [20]:
norm_test_df = prepData2(test_df) #generate normalised actual test set
norm_test_df.head()

Unnamed: 0,Monthly rent payment,=1 Overcrowding by bedrooms,number of all rooms in the house,=1 Overcrowding by rooms,=1 if the household has refrigerator,number of tablets household owns,Males younger than 12 years of age,Males 12 years of age and older,Total males in the household,Females younger than 12 years of age,...,=1 region Brunca,=1 region Huetar AtlÃƒÂ¡ntica,=1 region Huetar Norte,=2 zona rural,Age in years,Years of education of head of household,wall_condition,roof_condition,floor_condition,education_level
0,-0.420026,0,0.028722,0,1,-0.484131,0.81804,-0.569577,0.016637,-0.574709,...,0,0,0,0,-1.403691,2.058231,0.795525,0.794371,0.705438,-1.287353
1,-0.420026,0,0.028722,0,1,-0.484131,0.81804,-0.569577,0.016637,-0.574709,...,0,0,0,0,0.301709,2.058231,0.795525,0.794371,0.705438,1.834781
2,-0.420026,0,0.028722,0,1,-0.484131,0.81804,-0.569577,0.016637,-0.574709,...,0,0,0,0,0.301709,2.058231,0.795525,0.794371,0.705438,2.2808
3,-0.420026,0,5.873939,0,1,1.010554,-0.584012,-0.569577,-0.836896,-0.574709,...,0,0,0,0,1.131363,1.829539,0.795525,0.794371,0.705438,1.834781
4,1.133893,0,-0.620747,0,1,1.010554,-0.584012,-1.579524,-1.690428,-0.574709,...,0,0,0,0,-0.758404,0.686077,-0.661858,0.794371,0.705438,0.496724


In [21]:
norm_test_hh_df = norm_test_df.drop("=1 if household head",axis = 1)
norm_test_hh_df.head()

Unnamed: 0,Monthly rent payment,=1 Overcrowding by bedrooms,number of all rooms in the house,=1 Overcrowding by rooms,=1 if the household has refrigerator,number of tablets household owns,Males younger than 12 years of age,Males 12 years of age and older,Total males in the household,Females younger than 12 years of age,...,=1 region Brunca,=1 region Huetar AtlÃƒÂ¡ntica,=1 region Huetar Norte,=2 zona rural,Age in years,Years of education of head of household,wall_condition,roof_condition,floor_condition,education_level
0,-0.420026,0,0.028722,0,1,-0.484131,0.81804,-0.569577,0.016637,-0.574709,...,0,0,0,0,-1.403691,2.058231,0.795525,0.794371,0.705438,-1.287353
1,-0.420026,0,0.028722,0,1,-0.484131,0.81804,-0.569577,0.016637,-0.574709,...,0,0,0,0,0.301709,2.058231,0.795525,0.794371,0.705438,1.834781
2,-0.420026,0,0.028722,0,1,-0.484131,0.81804,-0.569577,0.016637,-0.574709,...,0,0,0,0,0.301709,2.058231,0.795525,0.794371,0.705438,2.2808
3,-0.420026,0,5.873939,0,1,1.010554,-0.584012,-0.569577,-0.836896,-0.574709,...,0,0,0,0,1.131363,1.829539,0.795525,0.794371,0.705438,1.834781
4,1.133893,0,-0.620747,0,1,1.010554,-0.584012,-1.579524,-1.690428,-0.574709,...,0,0,0,0,-0.758404,0.686077,-0.661858,0.794371,0.705438,0.496724


In [22]:
# submission_df = lgb_clf.predict(norm_test_df)
submission_df = lgb_clf_two.predict(norm_test_hh_df)

In [23]:
identity['Target'] = submission_df+1
identity.head()

Unnamed: 0,Id,Target
0,ID_2f6873615,4
1,ID_1c78846d2,4
2,ID_e5442cf6a,4
3,ID_a8db26a79,4
4,ID_a62966799,4


In [24]:
identity.to_csv('submission.csv',index = False)