# LGBM with random split for early stopping

In Latin America, one popular method uses an algorithm to verify income qualification. It’s called the Proxy Means Test (or PMT). With PMT, agencies use a model that considers a family’s observable household attributes like the material of their walls and ceiling, or the assets found in the home to classify them and predict their level of need.

While this is an improvement, accuracy remains a problem as the region’s population grows and poverty declines.

Notes from Original Kernel (edited by EAS):

* This kernel runs training on the heads of housholds only
* It seems to be very important to balance class frequencies.
* This kernel uses macro F1 score to early stopping in training.
* Categoricals are turned into numbers with proper mapping instead of blind label encoding.
* OHE if reversed into label encoding, as it is easier to digest for a tree model.
* idhogar is NOT used in training. 
* There are aggregations done within households and new features are hand-crafted.
* A voting classifier is used to average over several LightGBM models

In [1]:
import numpy as np # linear algebra
import pandas as pd 

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.externals.joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.utils import class_weight

import warnings
warnings.filterwarnings("ignore")



## encode_data(), feature_importance()

In [2]:
from sklearn.preprocessing import LabelEncoder

# this only transforms the idhogar field, the other things this function used to do are done elsewhere
def encode_data(df):
    df['idhogar'] = LabelEncoder().fit_transform(df['idhogar'])

# plot feature importance for sklearn decision trees    
def feature_importance(forest, X_train, display_results=True):
    ranked_list = []
    zero_features = []
    
    importances = forest.feature_importances_

    indices = np.argsort(importances)[::-1]
    
    if display_results:
        # Print the feature ranking
        print("Feature ranking:")

    for f in range(X_train.shape[1]):
        if display_results:
            print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]) + " - " + X_train.columns[indices[f]])
        
        ranked_list.append(X_train.columns[indices[f]])
        
        if importances[indices[f]] == 0.0:
            zero_features.append(X_train.columns[indices[f]])
            
    return ranked_list, zero_features

Label encoder

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

## do_features()

In [3]:
def do_features(df):
    feats_div = [('children_fraction', 'r4t1', 'r4t3'), 
                 ('working_man_fraction', 'r4h2', 'r4t3'),
                 ('all_man_fraction', 'r4h3', 'r4t3'),
                 ('human_density', 'tamviv', 'rooms'),
                 ('human_bed_density', 'tamviv', 'bedrooms'),
                 ('rent_per_person', 'v2a1', 'r4t3'),
                 ('rent_per_room', 'v2a1', 'rooms'),
                 ('mobile_density', 'qmobilephone', 'r4t3'),
                 ('tablet_density', 'v18q1', 'r4t3'),
                 ('mobile_adult_density', 'qmobilephone', 'r4t2'),
                 ('tablet_adult_density', 'v18q1', 'r4t2'),
                ]
    
    feats_sub = [('people_not_living', 'tamhog', 'tamviv'),
                 ('people_weird_stat', 'tamhog', 'r4t3')]
    
    # tamviv, number of persons living in the household

    for f_new, f1, f2 in feats_div:
        df['fe_' + f_new] = (df[f1] / df[f2]).astype(np.float32)       
    for f_new, f1, f2 in feats_sub:
        df['fe_' + f_new] = (df[f1] - df[f2]).astype(np.float32)
    
    # aggregation rules over household
    aggs_num = {'age': ['min', 'max', 'mean'],
                'escolari': ['min', 'max', 'mean']
               }
    
    aggs_cat = {'dis': ['mean']}
    for s_ in ['estadocivil', 'parentesco', 'instlevel']:
        for f_ in [f_ for f_ in df.columns if f_.startswith(s_)]:
            aggs_cat[f_] = ['mean', 'count']

    # aggregation over household
    for name_, df_ in [('18', df.query('age >= 18'))]: # 조건 부합 데이터 추출
        df_agg = df_.groupby('idhogar').agg({**aggs_num, **aggs_cat}).astype(np.float32)
        df_agg.columns = pd.Index(['agg' + name_ + '_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
        df = df.join(df_agg, how='left', on='idhogar')
        del df_agg

    # Drop id's
    df.drop(['Id'], axis=1, inplace=True)
    
    return df

agg 함수

- 모든열에 여러 함수를 매핑 : group객체.agg([함수1,함수2,함수3,…])
- 각 열마다 다른 함수를 매핑 : group객체.agg({‘열1’: 함수1, ‘열2’:함수2, …})

** : https://www.daleseo.com/python-ditonary/

## convert_OHE2LE()

In [4]:
# convert one hot encoded fields to label encoding
def convert_OHE2LE(df):
    tmp_df = df.copy(deep=True)
    for s_ in ['pared', 'piso', 'techo', 'abastagua', 'sanitario', 'energcocinar', 'elimbasu', 
               'epared', 'etecho', 'eviv', 'estadocivil', 'parentesco', 
               'instlevel', 'lugar', 'tipovivi',
               'manual_elec']:
        if 'manual_' not in s_:
            cols_s_ = [f_ for f_ in df.columns if f_.startswith(s_)]
        elif 'elec' in s_:
            cols_s_ = ['public', 'planpri', 'noelec', 'coopele']
        sum_ohe = tmp_df[cols_s_].sum(axis=1).unique()
        #deal with those OHE, where there is a sum over columns == 0
        if 0 in sum_ohe:
            print('The OHE in {} is incomplete. A new column will be added before label encoding'
                  .format(s_))
            # dummy colmn name to be added
            col_dummy = s_+'_dummy'
            # add the column to the dataframe
            tmp_df[col_dummy] = (tmp_df[cols_s_].sum(axis=1) == 0).astype(np.int8)
            # add the name to the list of columns to be label-encoded
            cols_s_.append(col_dummy)
            # proof-check, that now the category is complete
            sum_ohe = tmp_df[cols_s_].sum(axis=1).unique()
            if 0 in sum_ohe:
                 print("The category completion did not work")
        tmp_cat = tmp_df[cols_s_].idxmax(axis=1)
        tmp_df[s_ + '_LE'] = LabelEncoder().fit_transform(tmp_cat).astype(np.int16)
        if 'parentesco1' in cols_s_:
            cols_s_.remove('parentesco1')
        tmp_df.drop(cols_s_, axis=1, inplace=True)
    return tmp_df

# Read in the data and clean it up

In [5]:
train = pd.read_csv('C:/temp/train.csv')
test = pd.read_csv('C:/temp/test.csv')

test_ids = test.Id

In [6]:
def process_df(df_):
    encode_data(df_)
    
    return do_features(df_)

train = process_df(train)
test = process_df(test)

Clean up some missing data and convert objects to numeric.

## 전처리 1

### dependency 채우기

In [7]:
train['dependency'] = np.sqrt(train['SQBdependency'])  # SQBdependency, dependency squared
test['dependency'] = np.sqrt(test['SQBdependency'])

### years of education 변수 "no" -> 0

In [8]:
train.loc[train['edjefa'] == "no", "edjefa"] = 0
train.loc[train['edjefe'] == "no", "edjefe"] = 0
test.loc[test['edjefa'] == "no", "edjefa"] = 0
test.loc[test['edjefe'] == "no", "edjefe"] = 0

 - edjefa, years of education of female head of household, based on the interaction of escolari (years of education), head of household and gender, yes=1 and no=0 meaneduc,average years of education for adults (18+)
 
 여성 가장의 교육년수, yes=1 and no=0 
 
 
  - edjefe, years of education of male head of household, based on the interaction of escolari (years of education), head of household and gender, yes=1 and no=0
  
  남성 가장의 교육년수, yes=1 and no=0

### escolari 변수로 household & education 으로 채우기

In [9]:
train.loc[(train['edjefa'] == "yes") & (train['parentesco1'] == 1)
          , "edjefa"] = train.loc[(train['edjefa'] == "yes")&(train['parentesco1'] == 1)
                                  , "escolari"]
train.loc[(train['edjefe'] == "yes") & (train['parentesco1'] == 1)
          , "edjefe"] = train.loc[(train['edjefe'] == "yes")&(train['parentesco1'] == 1)
                                  , "escolari"]

test.loc[(test['edjefa'] == "yes") & (test['parentesco1'] == 1)
         , "edjefa"] = test.loc[(test['edjefa'] == "yes")&(test['parentesco1']==1)
                                , "escolari"]
test.loc[(test['edjefe'] == "yes") & (test['parentesco1'] == 1)
         , "edjefe"] = test.loc[(test['edjefe'] == "yes")&(test['parentesco1'] == 1)
                                , "escolari"]

 - escolari, years of schooling
 - parentesco1, =1 if household head

### years of education가 "yes" 인 경우는 정확히 몇년인지 모름 -> 4로 채움

왜 4?

In [10]:
train.loc[train['edjefa'] == "yes", "edjefa"] = 4
train.loc[train['edjefe'] == "yes", "edjefe"] = 4

test.loc[test['edjefa'] == "yes", "edjefa"] = 4
test.loc[test['edjefe'] == "yes", "edjefe"] = 4

### 형변환

In [11]:
train['edjefe'] = train['edjefe'].astype("int")
train['edjefa'] = train['edjefa'].astype("int")
test['edjefe'] = test['edjefe'].astype("int")
test['edjefa'] = test['edjefa'].astype("int")

### 변수생성 (가장인 사람의 max years of education)

In [12]:
train['edjef'] = np.max(train[['edjefa', 'edjefe']], axis=1)
test['edjef'] = np.max(test[['edjefa', 'edjefe']], axis=1)

### fill na

In [13]:
train['v2a1'].isnull().sum()
train['v2a1']=train['v2a1'].fillna(0)
test['v2a1']=test['v2a1'].fillna(0)

test['v18q1']=test['v18q1'].fillna(0)
train['v18q1']=train['v18q1'].fillna(0)

train['rez_esc']=train['rez_esc'].fillna(0)
test['rez_esc']=test['rez_esc'].fillna(0)

train['meaneduc'].isnull().sum()
train.loc[train.meaneduc.isnull(), "meaneduc"] = 0
train.loc[train.SQBmeaned.isnull(), "SQBmeaned"] = 0

test.loc[test.meaneduc.isnull(), "meaneduc"] = 0
test.loc[test.SQBmeaned.isnull(), "SQBmeaned"] = 0

### 모순되는 부분 고치기 - 물이 없다고 하면 화장실 없다고 하기

In [14]:
train.loc[(train.v14a ==  1) & (train.sanitario1 ==  1) & (train.abastaguano == 0), "v14a"] = 0
train.loc[(train.v14a ==  1) & (train.sanitario1 ==  1) & (train.abastaguano == 0), "sanitario1"] = 0

test.loc[(test.v14a ==  1) & (test.sanitario1 ==  1) & (test.abastaguano == 0), "v14a"] = 0
test.loc[(test.v14a ==  1) & (test.sanitario1 ==  1) & (test.abastaguano == 0), "sanitario1"] = 0

 - dependency, Dependency rate, calculated = (number of members of the household younger than 19 or older than 64)/(number of member of household between 19 and 64)
 
 의존성 비율, (19세 미만 또는 64세 이상 가구원 수)/(19~64세 가구원 수)
 
 
 - edjefa, years of education of female head of household, based on the interaction of escolari (years of education), head of household and gender, yes=1 and no=0 meaneduc,average years of education for adults (18+)
 
 여성 가장의 교육년수, yes=1 and no=0 
 
 
  - edjefe, years of education of male head of household, based on the interaction of escolari (years of education), head of household and gender, yes=1 and no=0
  
  남성 가장의 교육년수, yes=1 and no=0
  
  
  - escolari, years of schooling
  - parentesco1, =1 if household head
  - v14a, =1 has toilet in the household
  - sanitario1, =1 no toilet in the dwelling
  - abastaguano, =1 if no water provision

## train_test_apply_func()

In [15]:
def train_test_apply_func(train_, test_, func_):
    test_['Target'] = 0
    xx = pd.concat([train_, test_])
    
    xx_func = func_(xx)
    train_ = xx_func.iloc[:train_.shape[0], :]
    test_ = xx_func.iloc[train_.shape[0]:, :].drop('Target', axis=1)
    
    del xx, xx_func
    return train_, test_


## convert_OHE2LE 실행

In [16]:
# 라벨인코딩
train, test = train_test_apply_func(train, test, convert_OHE2LE)

The OHE in techo is incomplete. A new column will be added before label encoding
The OHE in instlevel is incomplete. A new column will be added before label encoding
The OHE in manual_elec is incomplete. A new column will be added before label encoding


# Geo aggregates

In [17]:
cols_2_ohe = ['eviv_LE', 'etecho_LE', 'epared_LE', 'elimbasu_LE', 
              'energcocinar_LE', 'sanitario_LE', 'manual_elec_LE',
              'pared_LE']
cols_nums = ['age', 'meaneduc', 'dependency', 
             'hogar_nin', 'hogar_adul', 'hogar_mayor', 'hogar_total',
             'bedrooms', 'overcrowding']

- age
- meanduc, average years of education for adults (18+)
- hogar_nin, Number of children 0 to 19 in household
- hogar_adul, Number of adults in household
- hogar_mayor, Number of individuals 65+ in the household
- hogar_total, Number of total individuals in the household
- bedrooms, Number of bedrooms
- overcrowding, Number persons per room

## convert_geo2aggs()

In [18]:
def convert_geo2aggs(df_):
    tmp_df = pd.concat([df_[(['lugar_LE', 'idhogar']+cols_nums)]
                        , pd.get_dummies(df_[cols_2_ohe], columns = cols_2_ohe)]
                       ,axis=1)
    geo_agg = tmp_df.groupby(['lugar_LE', 'idhogar']).mean().groupby('lugar_LE').mean().astype(np.float32)
    geo_agg.columns = pd.Index(['geo_' + e for e in geo_agg.columns.tolist()])
    
    del tmp_df
    return df_.join(geo_agg, how='left', on='lugar_LE')

train, test = train_test_apply_func(train, test, convert_geo2aggs)

get_dummies 함수

https://zephyrus1111.tistory.com/91

- idhogar, this is a unique identifier for each household. This can be used to create household-wide features, etc. All rows in a given household will have a matching value for this identifier.

## 'num_over_18'

가정 별 18세 이상의 구성원 수 (?)

In [19]:
train['num_over_18'] = 0
train['num_over_18'] = train[train.age >= 18].groupby('idhogar').transform("count")
train['num_over_18'] = train.groupby("idhogar")["num_over_18"].transform("max")
train['num_over_18'] = train['num_over_18'].fillna(0)

In [20]:
# 2번째 실행 후
train['num_over_18']

0       1.0
1       1.0
2       1.0
3       2.0
4       2.0
       ... 
9552    4.0
9553    4.0
9554    4.0
9555    4.0
9556    4.0
Name: num_over_18, Length: 9557, dtype: float64

In [21]:
# 3번째 실행 후
train['num_over_18']

0       1.0
1       1.0
2       1.0
3       2.0
4       2.0
       ... 
9552    4.0
9553    4.0
9554    4.0
9555    4.0
9556    4.0
Name: num_over_18, Length: 9557, dtype: float64

In [22]:
test['num_over_18'] = 0
test['num_over_18'] = test[test.age >= 18].groupby('idhogar').transform("count")
test['num_over_18'] = test.groupby("idhogar")["num_over_18"].transform("max")
test['num_over_18'] = test['num_over_18'].fillna(0)

## add some extra features

In [23]:
def extract_features(df):
    df['bedrooms_to_rooms'] = df['bedrooms']/df['rooms']
    df['rent_to_rooms'] = df['v2a1']/df['rooms']
    # tamhog - size of the household
    df['tamhog_to_rooms'] = df['tamhog']/df['rooms'] 
    # r4t3 - Total persons in the household
    df['r4t3_to_tamhog'] = df['r4t3']/df['tamhog'] 
    # r4t3 - Total persons in the household
    df['r4t3_to_rooms'] = df['r4t3']/df['rooms'] 
    # rent to people in household
    df['v2a1_to_r4t3'] = df['v2a1']/df['r4t3'] 
    # rent to people under age 12
    df['v2a1_to_r4t3'] = df['v2a1']/(df['r4t3'] - df['r4t1']) 
    # rooms per person
    df['hhsize_to_rooms'] = df['hhsize']/df['rooms'] 
    # rent to household size
    df['rent_to_hhsize'] = df['v2a1']/df['hhsize'] 
    # some households have no one over 18, use the total rent for those
    df['rent_to_over_18'] = df['v2a1']/df['num_over_18']
    
    df.loc[df.num_over_18 == 0, "rent_to_over_18"] = df[df.num_over_18 == 0].v2a1
    
extract_features(train)    
extract_features(test)

- v2a1, Monthly rent payment
- tamhog, size of the household
- r4t3, Total persons in the household
- r4t1, persons younger than 12 years of age
- hhsize, household size

## drop duplicated columns

In [24]:
needless_cols = ['r4t3', 'tamhog', 'tamviv', 'hhsize', 'v18q', 'v14a', 'agesq',
                 'mobilephone', 'female', ]

instlevel_cols = [s for s in train.columns.tolist() if 'instlevel' in s]  # 도대체 무슨소리

needless_cols.extend(instlevel_cols) # 리스트에 추가

train = train.drop(needless_cols, axis=1)
test = test.drop(needless_cols, axis=1)

# Split the data

We split the data by household to avoid leakage, since rows belonging to the same household usually have the same target. 

## split_data()

In [25]:
def split_data(train, y, sample_weight=None, households=None, test_percentage=0.2, seed=None):
    train2 = train.copy()
    
    cv_hhs = np.random.choice(households, size=int(len(households)*test_percentage), replace=False)
    
    cv_idx = np.isin(households, cv_hhs)
    X_test = train2[cv_idx]
    y_test = y[cv_idx]
    
    X_train = train2[~cv_idx] 
    y_train = y[~cv_idx]
    
    if sample_weight is not None:
        y_train_weights = sample_weight[~cv_idx]  # sample_weight 의 역할은?
        return X_train, y_train, X_test, y_test, y_train_weights
    
    return X_train, y_train, X_test, y_test

np.isin()

내가 찾는게 있는지 여부를 각 index 위치에 True, False 형태로  알려줌

In [26]:
X = train.query('parentesco1==1')

# pull out and drop the target variable
y = X['Target'] - 1
X = X.drop(['Target'], axis=1)

np.random.seed(seed=None)

train2 = X.copy()

train_hhs = train2.idhogar

households = train2.idhogar.unique()
cv_hhs = np.random.choice(households, size=int(len(households) * 0.15), replace=False)

cv_idx = np.isin(train2.idhogar, cv_hhs)

X_test = train2[cv_idx]
y_test = y[cv_idx]

X_train = train2[~cv_idx]
y_train = y[~cv_idx]

# train on entire dataset
X_train = train2
y_train = y

train_households = X_train.idhogar

In [27]:
# figure out the class weights for training with unbalanced classes
y_train_weights = class_weight.compute_sample_weight('balanced', y_train, indices=None)

compute sample weight

https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_sample_weight.html

In [28]:
# drop some features which aren't used by the LGBM or have very low importance
extra_drop_features = [
 'agg18_estadocivil1_MEAN',
 'agg18_estadocivil6_COUNT',
 'agg18_estadocivil7_COUNT',
 'agg18_parentesco10_COUNT',
 'agg18_parentesco11_COUNT',
 'agg18_parentesco12_COUNT',
 'agg18_parentesco1_COUNT',
 'agg18_parentesco2_COUNT',
 'agg18_parentesco3_COUNT',
 'agg18_parentesco4_COUNT',
 'agg18_parentesco5_COUNT',
 'agg18_parentesco6_COUNT',
 'agg18_parentesco7_COUNT',
 'agg18_parentesco8_COUNT',
 'agg18_parentesco9_COUNT',
 'geo_elimbasu_LE_4',
 'geo_energcocinar_LE_1',
 'geo_energcocinar_LE_2',
 'geo_epared_LE_0',
 'geo_hogar_mayor',
 'geo_manual_elec_LE_2',
 'geo_pared_LE_3',
 'geo_pared_LE_4',
 'geo_pared_LE_5',
 'geo_pared_LE_6',
 'num_over_18',
 'parentesco_LE',
 'rez_esc']

In [29]:
xgb_drop_cols = extra_drop_features + ["idhogar",  'parentesco1']

# Fit a voting classifier

Vote based on LGBM models with early stopping based on macro F1 and decaying learning rate.

RandomizedSearchCV로 하이퍼파라미터 튜닝
(https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

(https://www.kaggle.com/mlisovyi/lighgbm-hyperoptimisation-with-f1-macro)

Grid Search is good when we work with a small number of hyperparameters. However, if the number of parameters to consider is particularly high and the magnitudes of influence are imbalanced, the better choice is to use the Random Search.

https://towardsdatascience.com/machine-learning-gridsearchcv-randomizedsearchcv-d36b89231b10

In [30]:
# 4
opt_parameters = {'max_depth':35, 'eta':0.1, 'silent':0, 'objective':'multi:softmax', 'min_child_weight': 1, 'num_class': 4, 'gamma': 2.0, 'colsample_bylevel': 0.9, 'subsample': 0.84, 'colsample_bytree': 0.88, 'reg_lambda': 0.40 }
# 5
opt_parameters = {'max_depth':35, 'eta':0.15, 'silent':1, 'objective':'multi:softmax', 'min_child_weight': 2, 'num_class': 4, 'gamma': 2.5, 'colsample_bylevel': 1, 'subsample': 0.95, 'colsample_bytree': 0.85, 'reg_lambda': 0.35 }
# 6
# opt_parameters = {'max_depth':35, 'eta':0.15, 'silent':0, 'objective':'multi:softmax', 'min_child_weight': 2, 'num_class': 4, 'gamma': 2.75, 'colsample_bylevel': 0.95, 'subsample': 0.95, 'colsample_bytree': 0.85, 'reg_lambda': 0.35 }
# # 7
# opt_parameters = {'max_depth':35, 'eta':0.12, 'silent':0, 'objective':'multi:softmax', 'min_child_weight': 2, 'num_class': 4, 'gamma': 3.25, 'colsample_bylevel': 0.95, 'subsample': 0.88, 'colsample_bytree': 0.88, 'reg_lambda': 0.35 }

def evaluate_macroF1_lgb(predictions, truth):  
    # this follows the discussion in https://github.com/Microsoft/LightGBM/issues/1483
    pred_labels = predictions.argmax(axis=1)
    truth = truth.get_label()
    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macroF1', 1-f1) 

fit_params={"early_stopping_rounds":500,
            "eval_metric" : evaluate_macroF1_lgb, 
            "eval_set" : [(X_train,y_train), (X_test,y_test)],
            'verbose': False,
           }

def learning_rate_power_0997(current_iter):
    base_learning_rate = 0.1
    min_learning_rate = 0.02
    lr = base_learning_rate  * np.power(.995, current_iter)
    return max(lr, min_learning_rate)

fit_params['verbose'] = 50


* argmax(axis=0) : 각 열을 따라 가장 높은 값의 인덱스를 제공
(https://www.delftstack.com/ko/api/numpy/python-numpy-argmax/)

* f1_score(y_true, y_pred, average='macro') : truth : 실제값, pred_labels : 예측값
//average를 macro로 두면 각 열에 대한 precision 값을 모두 더한 다음 열의 갯수로 나눈 것입니다.

* np.power(a, b) : a^b

In [31]:
np.random.seed(100)

def _parallel_fit_estimator(estimator1, X, y, sample_weight=None, threshold=True, **fit_params):
    estimator = clone(estimator1)
    
    # randomly split the data so we have a test set for early stopping
    if sample_weight is not None:
        X_train, y_train, X_test, y_test, y_train_weight = split_data(X, y, sample_weight, households=train_households)
    else:
        X_train, y_train, X_test, y_test = split_data(X, y, None, households=train_households)
        
    # update the fit params with our new split
    fit_params["eval_set"] = [(X_test,y_test)]
    
    # fit the estimator
    if sample_weight is not None:
        if isinstance(estimator1, ExtraTreesClassifier) or isinstance(estimator1, RandomForestClassifier):
            estimator.fit(X_train, y_train) # 자료형 확인
        else:
            _ = estimator.fit(X_train, y_train, sample_weight=y_train_weight, **fit_params)
    else:
        if isinstance(estimator1, ExtraTreesClassifier) or isinstance(estimator1, RandomForestClassifier):
            estimator.fit(X_train, y_train)
        else:
            _ = estimator.fit(X_train, y_train, **fit_params)
    
    if not isinstance(estimator1, ExtraTreesClassifier) 
            and not isinstance(estimator1, RandomForestClassifier) 
            and not isinstance(estimator1, xgb.XGBClassifier):
        best_cv_round = np.argmax(estimator.evals_result_['validation_0']['mlogloss']) # mlogloss : Multiclass logloss (손실함수)
        best_cv = np.max(estimator.evals_result_['validation_0']['mlogloss'])
        best_train = estimator.evals_result_['train']['macroF1'][best_cv_round]
        # evals_result : 결괏값 리턴
    else:
        best_train = f1_score(y_train, estimator.predict(X_train), average="macro")
        best_cv = f1_score(y_test, estimator.predict(X_test), average="macro")
        print("Train F1:", best_train)
        print("Test F1:", best_cv)
        
    # reject some estimators based on their performance on train and test sets
    if threshold:
        # if the valid score is very high we'll allow a little more leeway with the train scores
        if ((best_cv > 0.37) and (best_train > 0.75)) or ((best_cv > 0.44) and (best_train > 0.65)):
            return estimator

        # else recurse until we get a better one
        else:
            print("Unacceptable!!! Trying again...")
            return _parallel_fit_estimator(estimator1, X, y, sample_weight=sample_weight, **fit_params)
    
    else:
        return estimator
    
class VotingClassifierLGBM(VotingClassifier):
    '''
    This implements the fit method of the VotingClassifier propagating fit_params
    '''
    def fit(self, X, y, sample_weight=None, threshold=True, **fit_params):
        
        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
            raise NotImplementedError('Multilabel and multi-output'
                                      ' classification is not supported.')

        if self.voting not in ('soft', 'hard'):
            raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
                             % self.voting)

        if self.estimators is None or len(self.estimators) == 0:
            raise AttributeError('Invalid `estimators` attribute, `estimators`'
                                 ' should be a list of (string, estimator)'
                                 ' tuples')

        if (self.weights is not None and
                len(self.weights) != len(self.estimators)):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d estimators'
                             % (len(self.weights), len(self.estimators)))

        names, clfs = zip(*self.estimators)
        self._validate_names(names)

        n_isnone = np.sum([clf is None for _, clf in self.estimators])
        if n_isnone == len(self.estimators):
            raise ValueError('All estimators are None. At least one is '
                             'required to be a classifier!')

        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_
        self.estimators_ = []

        transformed_y = self.le_.transform(y)

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                delayed(_parallel_fit_estimator)(clone(clf), X, transformed_y,
                                                 sample_weight=sample_weight, threshold=threshold, **fit_params)
                for clf in clfs if clf is not None)

        return self

In [32]:
clfs = []
for i in range(15):
    clf = xgb.XGBClassifier(random_state=217+i, n_estimators=300, learning_rate=0.15, n_jobs=4, **opt_parameters)
    
    clfs.append(('xgb{}'.format(i), clf))
    
vc = VotingClassifierLGBM(clfs, voting='soft')
del(clfs)

#Train the final model with learning rate decay
_ = vc.fit(X_train.drop(xgb_drop_cols, axis=1), y_train, sample_weight=y_train_weights, threshold=False, **fit_params)

clf_final = vc.estimators_[0]

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.29894	validation_0-macroF1:0.63874
[50]	validation_0-mlogloss:0.89615	validation_0-macroF1:0.56756
[100]	validation_0-mlogloss:0.89371	validation_0-macroF1:0.55565
[150]	validation_0-mlogloss:0.89062	validation_0-macroF1:0.56537
[200]	validation_0-mlogloss:0.89161	validation_0-macroF1:0.56447
[250]	validation_0-mlogloss:0.89255	validation_0-macroF1:0.56603
[299]	validation_0-mlogloss:0.89179	validation_0-macroF1:0.56650
Train F1: 0.9115660908746297
Test F1: 0.44462641001160175
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip thr

[250]	validation_0-mlogloss:0.90326	validation_0-macroF1:0.58032
[299]	validation_0-mlogloss:0.90476	validation_0-macroF1:0.57784
Train F1: 0.9131085061977701
Test F1: 0.42540312297831107
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.30457	validation_0-macroF1:0.66229
[50]	validation_0-mlogloss:0.92731	validation_0-macroF1:0.62286
[100]	validation_0-mlogloss:0.92147	validation_0-macroF1:0.62384
[150]	validation_0-mlogloss:0.92070	validation_0-macroF1:0.63108
[200]	validation_0-mlogloss:0.91867	validation_0-macroF1:0.63529
[250]	validation_0-mlogloss:0.91730	validation_0-macroF1:0.62598
[299]	validation_0-mlogloss:0.91654	validation_0-macroF1:0.63122
Train F1: 0.9039317978478034
Test F1: 0.3870640568511129
Parameters: { silent 

[100]	validation_0-mlogloss:0.91136	validation_0-macroF1:0.58107
[150]	validation_0-mlogloss:0.90792	validation_0-macroF1:0.56804
[200]	validation_0-mlogloss:0.90564	validation_0-macroF1:0.56718
[250]	validation_0-mlogloss:0.90621	validation_0-macroF1:0.56815
[299]	validation_0-mlogloss:0.90505	validation_0-macroF1:0.57516
Train F1: 0.9216674147144617
Test F1: 0.43368257176436975
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-mlogloss:1.29451	validation_0-macroF1:0.62019
[50]	validation_0-mlogloss:0.87714	validation_0-macroF1:0.57123
[100]	validation_0-mlogloss:0.87027	validation_0-macroF1:0.57247
[150]	validation_0-mlogloss:0.86424	validation_0-macroF1:0.56200
[200]	validation_0-mlogloss:0.86369	validation_0-macroF1:0.55931
[250]	validat

In [34]:
# params 4 - 400 early stop - 15 estimators - l1 used features - weighted
global_score = f1_score(y_test, clf_final.predict(X_test.drop(xgb_drop_cols, axis=1)), average='macro')
vc.voting = 'soft'
global_score_soft = f1_score(y_test, vc.predict(X_test.drop(xgb_drop_cols, axis=1)), average='macro')
vc.voting = 'hard'
global_score_hard = f1_score(y_test, vc.predict(X_test.drop(xgb_drop_cols, axis=1)), average='macro')

print('Validation score of a single LGBM Classifier: {:.4f}'.format(global_score))
print('Validation score of a VotingClassifier on 3 LGBMs with soft voting strategy: {:.4f}'.format(global_score_soft))
print('Validation score of a VotingClassifier on 3 LGBMs with hard voting strategy: {:.4f}'.format(global_score_hard))

Validation score of a single LGBM Classifier: 0.8220
Validation score of a VotingClassifier on 3 LGBMs with soft voting strategy: 0.8876
Validation score of a VotingClassifier on 3 LGBMs with hard voting strategy: 0.8934


In [36]:
# see which features are not used by ANY models
useless_features = []
drop_features = set()
counter = 0
for est in vc.estimators_:
    ranked_features, unused_features = feature_importance(est, X_train.drop(xgb_drop_cols, axis=1), display_results=False)
    useless_features.append(unused_features)
    if counter == 0:
        drop_features = set(unused_features)
    else:
        drop_features = drop_features.intersection(set(unused_features))
    counter += 1
    
drop_features

{'agg18_estadocivil4_COUNT',
 'agg18_estadocivil5_COUNT',
 'geo_energcocinar_LE_0',
 'geo_epared_LE_2'}

In [37]:
ranked_features = feature_importance(clf_final, X_train.drop(xgb_drop_cols, axis=1))

Feature ranking:
1. feature 114 (0.030279) - geo_epared_LE_1
2. feature 42 (0.019682) - fe_children_fraction
3. feature 74 (0.018983) - agg18_parentesco2_MEAN
4. feature 59 (0.017248) - agg18_escolari_MAX
5. feature 133 (0.016442) - geo_pared_LE_1
6. feature 60 (0.014158) - agg18_escolari_MEAN
7. feature 40 (0.013327) - SQBdependency
8. feature 22 (0.012823) - dependency
9. feature 34 (0.012632) - SQBescolari
10. feature 12 (0.011761) - r4t1
11. feature 37 (0.011632) - SQBedjefe
12. feature 112 (0.011575) - geo_etecho_LE_1
13. feature 126 (0.010669) - geo_sanitario_LE_3
14. feature 96 (0.010495) - estadocivil_LE
15. feature 116 (0.010293) - geo_elimbasu_LE_0
16. feature 11 (0.010211) - r4m3
17. feature 100 (0.010105) - geo_age
18. feature 94 (0.010058) - etecho_LE
19. feature 17 (0.009895) - male
20. feature 105 (0.009877) - geo_hogar_total
21. feature 87 (0.009871) - piso_LE
22. feature 39 (0.009789) - SQBovercrowding
23. feature 63 (0.009531) - agg18_estadocivil2_MEAN
24. feature 117

# Random Forest

In [38]:
et_drop_cols = ['agg18_age_MAX', 'agg18_age_MEAN', 'agg18_age_MIN', 'agg18_dis_MEAN',
       'agg18_escolari_MAX', 'agg18_escolari_MEAN', 'agg18_escolari_MIN',
       'agg18_estadocivil1_COUNT', 'agg18_estadocivil1_MEAN',
       'agg18_estadocivil2_COUNT', 'agg18_estadocivil2_MEAN',
       'agg18_estadocivil3_COUNT', 'agg18_estadocivil3_MEAN',
       'agg18_estadocivil4_COUNT', 'agg18_estadocivil4_MEAN',
       'agg18_estadocivil5_COUNT', 'agg18_estadocivil5_MEAN',
       'agg18_estadocivil6_COUNT', 'agg18_estadocivil6_MEAN',
       'agg18_estadocivil7_COUNT', 'agg18_estadocivil7_MEAN',
       'agg18_parentesco10_COUNT', 'agg18_parentesco10_MEAN',
       'agg18_parentesco11_COUNT', 'agg18_parentesco11_MEAN',
       'agg18_parentesco12_COUNT', 'agg18_parentesco12_MEAN',
       'agg18_parentesco1_COUNT', 'agg18_parentesco1_MEAN',
       'agg18_parentesco2_COUNT', 'agg18_parentesco2_MEAN',
       'agg18_parentesco3_COUNT', 'agg18_parentesco3_MEAN',
       'agg18_parentesco4_COUNT', 'agg18_parentesco4_MEAN',
       'agg18_parentesco5_COUNT', 'agg18_parentesco5_MEAN',
       'agg18_parentesco6_COUNT', 'agg18_parentesco6_MEAN',
       'agg18_parentesco7_COUNT', 'agg18_parentesco7_MEAN',
       'agg18_parentesco8_COUNT', 'agg18_parentesco8_MEAN',
       'agg18_parentesco9_COUNT', 'agg18_parentesco9_MEAN'] #+ ['parentesco_LE', 'rez_esc']

et_drop_cols.extend(["idhogar", "parentesco1", 'fe_rent_per_person', 'fe_rent_per_room',
       'fe_tablet_adult_density', 'fe_tablet_density'])

In [39]:
# do the same thing for some extra trees classifiers
ets = []    
for i in range(10):
    rf = RandomForestClassifier(max_depth=None, random_state=217+i, n_jobs=4, n_estimators=700, min_impurity_decrease=1e-3, min_samples_leaf=2, verbose=0, class_weight="balanced")
    ets.append(('rf{}'.format(i), rf))   

vc2 = VotingClassifierLGBM(ets, voting='soft')    
_ = vc2.fit(X_train.drop(et_drop_cols, axis=1), y_train, threshold=False)   

Train F1: 0.8995785797400715
Test F1: 0.41644760797490754
Train F1: 0.8974057100172891
Test F1: 0.4173294959009245
Train F1: 0.8929947876700697
Test F1: 0.380791318982666
Train F1: 0.8897229006999118
Test F1: 0.43258661725401276
Train F1: 0.8894013811159895
Test F1: 0.46517518214144904
Train F1: 0.8963246456113938
Test F1: 0.426575122901441
Train F1: 0.890822628612706
Test F1: 0.41104823629263454
Train F1: 0.8958771082511664
Test F1: 0.46546114892390233
Train F1: 0.9064600301011827
Test F1: 0.4286664637370844
Train F1: 0.8948837246950454
Test F1: 0.41037179469569823


In [40]:
# w/ threshold, extra drop cols
vc2.voting = 'soft'
global_rf_score_soft = f1_score(y_test, vc2.predict(X_test.drop(et_drop_cols, axis=1)), average='macro')
vc2.voting = 'hard'
global_rf_score_hard = f1_score(y_test, vc2.predict(X_test.drop(et_drop_cols, axis=1)), average='macro')

print('Validation score of a VotingClassifier on 3 LGBMs with soft voting strategy: {:.4f}'.format(global_rf_score_soft))
print('Validation score of a VotingClassifier on 3 LGBMs with hard voting strategy: {:.4f}'.format(global_rf_score_hard))

Validation score of a VotingClassifier on 3 LGBMs with soft voting strategy: 0.8379
Validation score of a VotingClassifier on 3 LGBMs with hard voting strategy: 0.8499


In [41]:
# w/o threshold, extra drop cols
vc2.voting = 'soft'
global_rf_score_soft = f1_score(y_test, vc2.predict(X_test.drop(et_drop_cols, axis=1)), average='macro')
vc2.voting = 'hard'
global_rf_score_hard = f1_score(y_test, vc2.predict(X_test.drop(et_drop_cols, axis=1)), average='macro')

print('Validation score of a VotingClassifier on 3 LGBMs with soft voting strategy: {:.4f}'.format(global_rf_score_soft))
print('Validation score of a VotingClassifier on 3 LGBMs with hard voting strategy: {:.4f}'.format(global_rf_score_hard))

Validation score of a VotingClassifier on 3 LGBMs with soft voting strategy: 0.8379
Validation score of a VotingClassifier on 3 LGBMs with hard voting strategy: 0.8499


In [42]:
# see which features are not used by ANY models
useless_features = []
drop_features = set()
counter = 0
for est in vc2.estimators_:
    ranked_features, unused_features = feature_importance(est, X_train.drop(et_drop_cols, axis=1), display_results=False)
    useless_features.append(unused_features)
    if counter == 0:
        drop_features = set(unused_features)
    else:
        drop_features = drop_features.intersection(set(unused_features))
    counter += 1
    
drop_features

{'parentesco_LE', 'rez_esc'}

In [43]:
def combine_voters(data, weights=[0.5, 0.5]):
    # do soft voting with both classifiers
    vc.voting="soft"
    vc1_probs = vc.predict_proba(data.drop(xgb_drop_cols, axis=1))
    vc2.voting="soft"
    vc2_probs = vc2.predict_proba(data.drop(et_drop_cols, axis=1))
    
    final_vote = (vc1_probs * weights[0]) + (vc2_probs * weights[1])
    predictions = np.argmax(final_vote, axis=1)
    
    return predictions

In [44]:
combo_preds = combine_voters(X_test, weights=[0.5, 0.5])
global_combo_score_soft = f1_score(y_test, combo_preds, average='macro')
global_combo_score_soft

0.8902509459276232

In [45]:
combo_preds = combine_voters(X_test, weights=[0.4, 0.6])
global_combo_score_soft= f1_score(y_test, combo_preds, average='macro')
global_combo_score_soft

0.8799250865445447

In [46]:
combo_preds = combine_voters(X_test, weights=[0.6, 0.4])
global_combo_score_soft = f1_score(y_test, combo_preds, average='macro')
global_combo_score_soft

0.8928776032611756

# Prepare submission

In [47]:
y_subm = pd.DataFrame()
y_subm['Id'] = test_ids

In [48]:
vc.voting = 'soft'
y_subm_lgb = y_subm.copy(deep=True)
y_subm_lgb['Target'] = vc.predict(test.drop(xgb_drop_cols, axis=1)) + 1

vc2.voting = 'soft'
y_subm_rf = y_subm.copy(deep=True)
y_subm_rf['Target'] = vc2.predict(test.drop(et_drop_cols, axis=1)) + 1

y_subm_ens = y_subm.copy(deep=True)
y_subm_ens['Target'] = combine_voters(test) + 1

In [50]:
from datetime import datetime
now = datetime.now()

sub_file_lgb = 'submission_soft_XGB_{:.4f}_{}.csv'.format(global_score_soft, str(now.strftime('%Y-%m-%d-%H-%M')))
sub_file_rf = 'submission_soft_RF_{:.4f}_{}.csv'.format(global_rf_score_soft, str(now.strftime('%Y-%m-%d-%H-%M')))
sub_file_ens = 'submission_ens_{:.4f}_{}.csv'.format(global_combo_score_soft, str(now.strftime('%Y-%m-%d-%H-%M')))

y_subm_lgb.to_csv(sub_file_lgb, index=False)
y_subm_rf.to_csv(sub_file_rf, index=False)
y_subm_ens.to_csv(sub_file_ens, index=False)