In [4]:
import numpy as np # linear algebra
import pandas as pd 

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.externals.joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.utils import class_weight

import warnings
warnings.filterwarnings("ignore")

In [3]:
import six
import sys
sys.modules['sklearn.externals.six'] = six
import mlrose
import joblib
sys.modules['sklearn.externals.joblib'] = joblib

In [None]:
pip install mlrose

In [5]:
from sklearn.preprocessing import LabelEncoder

#idhogar label 인코딩
def encode_data(df):
    df['idhogar'] = LabelEncoder().fit_transform(df['idhogar'])

# plot feature importance for sklearn decision trees    
def feature_importance(forest, X_train, display_results=True):
    
    ranked_list = []
    zero_features = []
    
    importances = forest.feature_importances_

    indices = np.argsort(importances)[::-1]
    
    if display_results:
        # Print the feature ranking
        print("Feature ranking:")

    for f in range(X_train.shape[1]):
        if display_results:
            print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]) + " - " + X_train.columns[indices[f]])
                    
        ranked_list.append(X_train.columns[indices[f]])
        
        if importances[indices[f]] == 0.0:
            zero_features.append(X_train.columns[indices[f]])
            
    return ranked_list, zero_features


In [6]:
def do_features(df):
    feats_div = [('children_fraction', 'r4t1', 'r4t3'), 
                 ('working_man_fraction', 'r4h2', 'r4t3'),
                 ('all_man_fraction', 'r4h3', 'r4t3'),
                 ('human_density', 'tamviv', 'rooms'),
                 ('human_bed_density', 'tamviv', 'bedrooms'),
                 ('rent_per_person', 'v2a1', 'r4t3'),
                 ('rent_per_room', 'v2a1', 'rooms'),
                 ('mobile_density', 'qmobilephone', 'r4t3'),
                 ('tablet_density', 'v18q1', 'r4t3'),
                 ('mobile_adult_density', 'qmobilephone', 'r4t2'),
                 ('tablet_adult_density', 'v18q1', 'r4t2'),
                ]
    
    feats_sub = [('people_not_living', 'tamhog', 'tamviv'),
                 ('people_weird_stat', 'tamhog', 'r4t3')]

    for f_new, f1, f2 in feats_div:
        df['fe_' + f_new] = (df[f1] / df[f2]).astype(np.float32)       
    for f_new, f1, f2 in feats_sub:
        df['fe_' + f_new] = (df[f1] - df[f2]).astype(np.float32)
    
    # aggregation rules over household
    aggs_num = {'age': ['min', 'max', 'mean'],
                'escolari': ['min', 'max', 'mean']
               }
    aggs_cat = {'dis': ['mean']}
    for s_ in ['estadocivil', 'parentesco', 'instlevel']:
        for f_ in [f_ for f_ in df.columns if f_.startswith(s_)]:
            aggs_cat[f_] = ['mean', 'count']

    # aggregation over household
    for name_, df_ in [('18', df.query('age >= 18'))]:
        df_agg = df_.groupby('idhogar').agg({**aggs_num, **aggs_cat}).astype(np.float32)
        df_agg.columns = pd.Index(['agg' + name_ + '_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])
        df = df.join(df_agg, how='left', on='idhogar')
        del df_agg

    # Drop id's
    df.drop(['Id'], axis=1, inplace=True)
    
    return df


In [7]:
# convert one hot encoded fields to label encoding ; 원핫인코딩을 레이블 인코딩으로 변경
def convert_OHE2LE(df):
    tmp_df = df.copy(deep=True)
    
    # 범주형 피처의 접두사
    for s_ in ['pared', 'piso', 'techo', 'abastagua', 'sanitario', 'energcocinar', 'elimbasu', 
               'epared', 'etecho', 'eviv', 'estadocivil', 'parentesco', 
               'instlevel', 'lugar', 'tipovivi',
               'manual_elec']:
        
        # manual이 범주형 피처 접두사에 없다면 : manual_elec 만 제외됨
        if 'manual_' not in s_:
             #cols_s_리스트에 추가
            cols_s_ = [f_ for f_ in df.columns if f_.startswith(s_)]    
            
        # 범주형 피처 접두사에 elec이 있다면 : manual_elec이라면
        elif 'elec' in s_:
            # eleccity 공급처에 대한 리스트 
            cols_s_ = ['public', 'planpri', 'noelec', 'coopele']
            
        
        # 각 컬럼의 합계 
        sum_ohe = tmp_df[cols_s_].sum(axis=1).unique()
        
        # 만약 각 컬럼의 합계 리스트 중 0 있다면 그 값은 의미가 없음
        if 0 in sum_ohe:
            print('The OHE in {} is incomplete. A new column will be added before label encoding'
                  .format(s_))
            # 접두사_dummy 컬럼 생성
            col_dummy = s_+'_dummy'
            tmp_df[col_dummy] = (tmp_df[cols_s_].sum(axis=1) == 0).astype(np.int8)            # add the name to the list of columns to be label-encoded
            cols_s_.append(col_dummy)
            # proof-check, that now the category is complete
            sum_ohe = tmp_df[cols_s_].sum(axis=1).unique()            
            if 0 in sum_ohe:
                 print("The category completion did not work")
                    
        # 한 접두사의 컬럼을 가지는 데이터 더 큰값을 가지는 값의 인덱스(컬럼명)을 가져옴
        tmp_cat = tmp_df[cols_s_].idxmax(axis=1)
        
        # 그 컬럼에 _labelencoding 첨자를 붙여 새로운 컬럼 생성
        tmp_df[s_ + '_LE'] = LabelEncoder().fit_transform(tmp_cat).astype(np.int16)
        
        if 'parentesco1' in cols_s_:
            cols_s_.remove('parentesco1')
        tmp_df.drop(cols_s_, axis=1, inplace=True)
    return tmp_df


In [8]:
def convert_OHE2LE(df):
    tmp_df = df.copy(deep=True)
    for s_ in ['pared', 'piso', 'techo', 'abastagua', 'sanitario', 'energcocinar', 'elimbasu', 
               'epared', 'etecho', 'eviv', 'estadocivil', 'parentesco', 
               'instlevel', 'lugar', 'tipovivi',
               'manual_elec']:
        if 'manual_' not in s_:
            cols_s_ = [f_ for f_ in df.columns if f_.startswith(s_)]
        elif 'elec' in s_:
            cols_s_ = ['public', 'planpri', 'noelec', 'coopele']
        sum_ohe = tmp_df[cols_s_].sum(axis=1).unique()
        #deal with those OHE, where there is a sum over columns == 0
        if 0 in sum_ohe:
            print('The OHE in {} is incomplete. A new column will be added before label encoding'
                  .format(s_))
            # dummy colmn name to be added
            col_dummy = s_+'_dummy'
            # add the column to the dataframe
            tmp_df[col_dummy] = (tmp_df[cols_s_].sum(axis=1) == 0).astype(np.int8)
            # add the name to the list of columns to be label-encoded
            cols_s_.append(col_dummy)
            # proof-check, that now the category is complete
            sum_ohe = tmp_df[cols_s_].sum(axis=1).unique()
            if 0 in sum_ohe:
                 print("The category completion did not work")
        tmp_cat = tmp_df[cols_s_].idxmax(axis=1)
        tmp_df[s_ + '_LE'] = LabelEncoder().fit_transform(tmp_cat).astype(np.int16)
        if 'parentesco1' in cols_s_:
            cols_s_.remove('parentesco1')
        tmp_df.drop(cols_s_, axis=1, inplace=True)
        
    return tmp_df

In [9]:
train[['paredblolad','paredzocalo']].idxmax(axis=1)

NameError: name 'train' is not defined

**범주형 변수 리스트**
- pared 
paredblolad, =1 if predominant material on the outside wall is block or brick
paredzocalo, "=1 if predominant material on the outside wall is socket (wood,  zinc or absbesto"
paredpreb, =1 if predominant material on the outside wall is prefabricated or cement
pareddes, =1 if predominant material on the outside wall is waste material
paredmad, =1 if predominant material on the outside wall is wood
paredzinc, =1 if predominant material on the outside wall is zink
paredfibras, =1 if predominant material on the outside wall is natural fibers
paredother, =1 if predominant material on the outside wall is other
- piso
pisomoscer, "=1 if predominant material on the floor is mosaic,  ceramic,  terrazo"
pisocemento, =1 if predominant material on the floor is cement
pisoother, =1 if predominant material on the floor is other
pisonatur, =1 if predominant material on the floor is  natural material
pisonotiene, =1 if no floor at the household
pisomadera, =1 if predominant material on the floor is wood
- techo
techozinc, =1 if predominant material on the roof is metal foil or zink
techoentrepiso, "=1 if predominant material on the roof is fiber cement,  mezzanine "
techocane, =1 if predominant material on the roof is natural fibers
techootro, =1 if predominant material on the roof is other
- abastagua
abastaguadentro, =1 if water provision inside the dwelling
abastaguafuera, =1 if water provision outside the dwelling
abastaguano, =1 if no water provision
- sanitario
sanitario1, =1 no toilet in the dwelling
sanitario2, =1 toilet connected to sewer or cesspool
sanitario3, =1 toilet connected to  septic tank
sanitario5, =1 toilet connected to black hole or letrine
sanitario6, =1 toilet connected to other system
- 'energcocinar'
(no kitchen)
energcocinar2, =1 main source of energy used for cooking electricity
energcocinar3, =1 main source of energy used for cooking gas
energcocinar4, =1 main source of energy used for cooking wood charcoal
- 'elimbasu'
elimbasu1, =1 if rubbish disposal mainly by tanker truck
elimbasu2, =1 if rubbish disposal mainly by botan hollow or buried
elimbasu3, =1 if rubbish disposal mainly by burning
elimbasu4, =1 if rubbish disposal mainly by throwing in an unoccupied space
elimbasu5, "=1 if rubbish disposal mainly by throwing in river,  creek or sea"
elimbasu6, =1 if rubbish disposal mainly other
- 'epared'
epared1, =1 if walls are bad
epared2, =1 if walls are regular
epared3, =1 if walls are good
- 'etecho', 
etecho1, =1 if roof are bad
etecho2, =1 if roof are regular
etecho3, =1 if roof are good
- 'eviv', 
eviv1, =1 if floor are bad
eviv2, =1 if floor are regular
eviv3, =1 if floor are good
- 'estadocivil'
estadocivil1, =1 if less than 10 years old
estadocivil2, =1 if free or coupled uunion
estadocivil3, =1 if married
estadocivil4, =1 if divorced
estadocivil5, =1 if separated
estadocivil6, =1 if widow/er
estadocivil7, =1 if single
- 'parentesco'
parentesco1, =1 if household head
parentesco2, =1 if spouse/partner
parentesco3, =1 if son/doughter
parentesco4, =1 if stepson/doughter
parentesco5, =1 if son/doughter in law
parentesco6, =1 if grandson/doughter
parentesco7, =1 if mother/father
parentesco8, =1 if father/mother in law
parentesco9, =1 if brother/sister
parentesco10, =1 if brother/sister in law
parentesco11, =1 if other family member
parentesco12, =1 if other non family member
- 'instlevel'
instlevel1, =1 no level of education
instlevel2, =1 incomplete primary
instlevel3, =1 complete primary
instlevel4, =1 incomplete academic secondary level
instlevel5, =1 complete academic secondary level
instlevel6, =1 incomplete technical secondary level
instlevel7, =1 complete technical secondary level
instlevel8, =1 undergraduate and higher education
instlevel9, =1 postgraduate higher education
- 'lugar',
lugar1, =1 region Central
lugar2, =1 region Chorotega
lugar3, =1 region PacÃƒÂ­fico central
lugar4, =1 region Brunca
lugar5, =1 region Huetar AtlÃƒÂ¡ntica
lugar6, =1 region Huetar Norte
- 'tipovivi',
tipovivi1, =1 own and fully paid house
tipovivi2, "=1 own,  paying in installments"
tipovivi3, =1 rented
tipovivi4, =1 precarious
tipovivi5, "=1 other(assigned,  borrowed)"
- 'manual_elec'


# Read in the data and clean it up

In [10]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

test_ids = test.Id

In [11]:
train.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
0,ID_279628684,190000.0,0,3,0,1,1,0,,0,...,100,1849,1,100,0,1.0,0.0,100.0,1849,4
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,144,4489,1,144,0,1.0,64.0,144.0,4489,4
2,ID_68de51c94,,0,8,0,1,1,0,,0,...,121,8464,1,0,0,0.25,64.0,121.0,8464,4
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,81,289,16,121,4,1.777778,1.0,121.0,289,4
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,121,1369,16,121,4,1.777778,1.0,121.0,1369,4


In [12]:
def process_df(df_):
    encode_data(df_)
    return do_features(df_)

In [13]:
train = process_df(train)
test = process_df(test)

- 결측값 처리, object형 변수 숫자형으로 변환

In [14]:
# dependency 컬럼을 제곱 컬럼으로 변경
train['dependency'] = np.sqrt(train['SQBdependency'])
test['dependency'] = np.sqrt(test['SQBdependency'])

# no 인 값들을 0으로 변경
train.loc[train['edjefa'] == 'no', 'edjefa'] = 0
train.loc[train['edjefe'] == 'no', 'edjefe'] = 0
test.loc[test['edjefa'] == 'no', 'edjefa'] = 0
test.loc[test['edjefe'] == 'no', 'edjefe'] = 0

# 세대주이고 education에 값이 있으면, 세대주이고 교육기간 컬럼인 escolari에 해당 값 채우기
train.loc[(train['edjefa'] == 'yes') & (train['parentesco1'] == 1), 'edjefa'] = train.loc[(train['edjefa'] == 'yes') & (train['parentesco1'] == 1), 'escolari']
train.loc[(train['edjefe'] == 'yes') & (train['parentesco1'] == 1), 'edjefe'] = train.loc[(train['edjefe'] == 'yes') & (train['parentesco1'] == 1), 'escolari']

test.loc[(test['edjefa'] == 'yes') & (test['parentesco1'] == 1), 'edjefa'] = test.loc[(test['edjefa'] == 'yes') & (test['parentesco1'] == 1), 'escolari']
test.loc[(test['edjefe'] == 'yes') & (test['parentesco1'] == 1), 'edjefe'] = test.loc[(test['edjefe'] == 'yes') & (test['parentesco1'] == 1), 'escolari']

# gender 와 escolari가 연관되어있음. 'yes'는 확실하지 않으니 4를 채우기
train.loc[train['edjefa'] == 'yes', 'edjefa'] = 4
train.loc[train['edjefe'] == 'yes', 'edjefe'] = 4

test.loc[test['edjefa'] == 'yes', 'edjefa'] = 4
test.loc[test['edjefe'] == 'yes', 'edjefe'] = 4

# 데이터 타입 변경
train['edjefe'] = train['edjefe'].astype("int")
train['edjefa'] = train['edjefa'].astype("int")
test['edjefe'] = test['edjefe'].astype("int")
test['edjefa'] = test['edjefa'].astype("int")

# 새로운 컬럼 생성, 새대주의 max education 
train['edjef'] = np.max(train[['edjefa','edjefe']], axis=1)
test['edjef'] = np.max(test[['edjefa','edjefe']], axis=1)

# 결측값 처리
train['v2a1']=train['v2a1'].fillna(0)
test['v2a1']=test['v2a1'].fillna(0)

test['v18q1']=test['v18q1'].fillna(0)
train['v18q1']=train['v18q1'].fillna(0)

train['rez_esc']=train['rez_esc'].fillna(0)
test['rez_esc']=test['rez_esc'].fillna(0)

train.loc[train.meaneduc.isnull(), "meaneduc"] = 0
train.loc[train.SQBmeaned.isnull(), "SQBmeaned"] = 0

test.loc[test.meaneduc.isnull(), "meaneduc"] = 0
test.loc[test.SQBmeaned.isnull(), "SQBmeaned"] = 0

# 데이터 정합성이 안맞는 데이터
train.loc[(train.v14a ==  1) & (train.sanitario1 ==  1) & (train.abastaguano == 0), "v14a"] = 0
train.loc[(train.v14a ==  1) & (train.sanitario1 ==  1) & (train.abastaguano == 0), "sanitario1"] = 0

test.loc[(test.v14a ==  1) & (test.sanitario1 ==  1) & (test.abastaguano == 0), "v14a"] = 0
test.loc[(test.v14a ==  1) & (test.sanitario1 ==  1) & (test.abastaguano == 0), "sanitario1"] = 0


In [15]:
def train_test_apply_func(train_, test_, func_):
    #test df target을 0으로 지정
    test_['Target'] = 0
    
    # 전체 데이터로 병합
    xx = pd.concat([train_, test_])
    
    # 특정 함수에 적용
    xx_func = func_(xx)
    
    #train, test 적용, test 데이터의 임의 타겟 드롭
    train_ = xx_func.iloc[:train_.shape[0], :]
    test_ = xx_func.iloc[train_.shape[0]:, :].drop('Target', axis=1)
    
    del xx, xx_func
    return train_, test_

In [16]:
# 원핫 인코딩 적용
train, test = train_test_apply_func(train, test, convert_OHE2LE)

The OHE in techo is incomplete. A new column will be added before label encoding
The OHE in instlevel is incomplete. A new column will be added before label encoding
The OHE in manual_elec is incomplete. A new column will be added before label encoding


In [17]:
train.shape

(9557, 148)

In [18]:
train['lugar_LE']

0       0
1       0
2       0
3       0
4       0
       ..
9552    5
9553    5
9554    5
9555    5
9556    5
Name: lugar_LE, Length: 9557, dtype: int16

# Geo aggregates

In [19]:
# 레이블 인코딩한 컬럼의 일부
cols_2_ohe = ['eviv_LE', 'etecho_LE', 'epared_LE', 'elimbasu_LE', 
              'energcocinar_LE', 'sanitario_LE', 'manual_elec_LE',
              'pared_LE']
# 숫자형 컬럼
cols_nums = ['age', 'meaneduc', 'dependency', 
             'hogar_nin', 'hogar_adul', 'hogar_mayor', 'hogar_total',
             'bedrooms', 'overcrowding']

# 어그리게이션 함수 생성
def convert_geo2aggs(df_):
    # 지역 레이블인코딩 컬럼과, 가구 키값, 숫자형 컬럼 + 레이블 인코딩d을 다시 원핫 인코딩한 컬럼을 concat
    tmp_df = pd.concat([df_[(['lugar_LE', 'idhogar']+cols_nums)],
                       pd.get_dummies(df_[cols_2_ohe],
                                     columns=cols_2_ohe)], axis=1)
    # 지역별, 가구의 키값별 평균을 구하고 다시 지역별로 평균
    geo_agg = tmp_df.groupby(['lugar_LE', 'idhogar']).mean().groupby('lugar_LE').mean().astype(np.float32)
    geo_agg.columns = pd.Index(['geo_' + e for e in geo_agg.columns.tolist()])
    
    del tmp_df
    return df_.join(geo_agg, how='left', on='lugar_LE')


train, test = train_test_apply_func(train, test, convert_geo2aggs)

In [20]:
# pd.get_dummies(train[cols_2_ohe], columns=cols_2_ohe)

In [21]:
train.shape

(9557, 194)

In [22]:
train[train.age >= 18].groupby('idhogar').transform('count')

Unnamed: 0,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,r4h2,...,geo_manual_elec_LE_3,geo_manual_elec_LE_4,geo_pared_LE_0,geo_pared_LE_1,geo_pared_LE_2,geo_pared_LE_3,geo_pared_LE_4,geo_pared_LE_5,geo_pared_LE_6,geo_pared_LE_7
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
5,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9551,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
9552,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
9554,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
9555,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4


In [23]:
# 가구의 18살 이상이 몇명 있는지 컬럼 생성
train['num_over_18'] = 0
train['num_over_18'] = train[train['age'] >= 18].groupby('idhogar').transform("count")
train['num_over_18'] = train.groupby("idhogar")["num_over_18"].transform("max")
train['num_over_18'] = train['num_over_18'].fillna(0)

test['num_over_18'] = 0
test['num_over_18'] = test[test.age >= 18].groupby('idhogar').transform("count")
test['num_over_18'] = test.groupby("idhogar")["num_over_18"].transform("max")
test['num_over_18'] = test['num_over_18'].fillna(0)


# 추가 피처 생성
def extract_features(df):
    df['bedrooms_to_rooms'] = df['bedrooms']/df['rooms']
    df['rent_to_rooms'] = df['v2a1']/df['rooms']
    df['tamhog_to_rooms'] = df['tamhog']/df['rooms'] # tamhog - size of the household
    df['r4t3_to_tamhog'] = df['r4t3']/df['tamhog'] # r4t3 - Total persons in the household
    df['r4t3_to_rooms'] = df['r4t3']/df['rooms'] # r4t3 - Total persons in the household
    df['v2a1_to_r4t3'] = df['v2a1']/df['r4t3'] # rent to people in household
    df['v2a1_to_r4t3'] = df['v2a1']/(df['r4t3'] - df['r4t1']) # rent to people under age 12
    df['hhsize_to_rooms'] = df['hhsize']/df['rooms'] # rooms per person
    df['rent_to_hhsize'] = df['v2a1']/df['hhsize'] # rent to household size
    df['rent_to_over_18'] = df['v2a1']/df['num_over_18']
    # some households have no one over 18, use the total rent for those
    df.loc[df.num_over_18 == 0, "rent_to_over_18"] = df[df.num_over_18 == 0].v2a1
    
extract_features(train)    
extract_features(test)

ValueError: Columns must be same length as key

In [24]:
needless_cols = ['r4t3', 'tamhog', 'tamviv', 'hhsize', 'v18q', 'v14a', 'agesq',
                 'mobilephone', 'female', ]

instlevel_cols = [s for s in train.columns.tolist() if 'instlevel' in s]

needless_cols.extend(instlevel_cols)

train = train.drop(needless_cols, axis=1)
test = test.drop(needless_cols, axis=1)


## Split the data
- 같은 가구라면 같은 target을 가지므로 household data와 individual data를 분리
- 개인 데이터가 꼭 필요한 건 아니지만 개인데이터도 포함하여 학습하도록 세팅
- 가구 데이터가 정리된 이후 개인 데이터에 덮어 씌움

**같은 가구있으록 같은 target을 가질 것이므로 가구의 키값인 idhogar를 기준으로 데이터를 분리하고 분리된 idhogar값을 전체 데이터에 적용하여 가구별로 data가 split되도록함**

In [25]:
def split_data(train, y, sample_weight=None, households=None, test_percentage=0.20, seed=None):

    train2 = train.copy()
    
    # pick some random households to use for the test data
    cv_hhs = np.random.choice(households, 
                              size=int(len(households) * test_percentage),
                              replace=False)
    
    # select households which are in the random selection
    cv_idx = np.isin(households, cv_hhs)
    X_test = train2[cv_idx]
    y_test = y[cv_idx]

    X_train = train2[~cv_idx]
    y_train = y[~cv_idx]
    
    if sample_weight is not None:
        y_train_weights = sample_weight[~cv_idx]
        return X_train, y_train, X_test, y_test, y_train_weights
    
    return X_train, y_train, X_test, y_test

In [26]:
# 세대주인 data 만 분리
X = train.query('parentesco1==1')
# X = train.copy()

# train데이터에서 타겟값과 피처 데이터 분리
y = X['Target'] - 1
X = X.drop(['Target'], axis=1)

np.random.seed(seed=None)

# X 데이터 카피
train2 = X.copy()

# idhogar 키값 저장
train_hhs = train2.idhogar

# 유니크 값만 household 변수에 저장
households = train2.idhogar.unique()

# 총 idhogar의 15%를 랜덤 추출 
cv_hhs = np.random.choice(households, size=int(len(households) * 0.15), replace=False)

# 추출한 idhogar 값을 기준으로 전체 데이터에 있는지 없는지 확인 
cv_idx = np.isin(train2.idhogar, cv_hhs)

# 
X_test = train2[cv_idx]
y_test = y[cv_idx]

X_train = train2[~cv_idx]
y_train = y[~cv_idx]

# train on entire dataset
X_train = train2
y_train = y

train_households = X_train.idhogar



In [32]:
len(cv_idx)

2973

In [28]:
train['idhogar']

0        401
1        171
2        522
3        513
4        513
        ... 
9552    2499
9553    2499
9554    2499
9555    2499
9556    2499
Name: idhogar, Length: 9557, dtype: int64

In [37]:
# figure out the class weights for training with unbalanced classes
y_train_weights = class_weight.compute_sample_weight('balanced', y_train, indices=None)

In [38]:
y_train_weights

array([0.38037359, 0.38037359, 0.38037359, ..., 0.38037359, 1.68156109,
       1.68156109])

In [39]:
X_train, y_train, X_test, y_test = split_data(X, y, None, households=train_households)



In [46]:
X_train['idhogar']

2        522
12      2172
13      2295
18      1226
20      2443
        ... 
9535    1802
9541    2687
9545    1964
9551     395
9552    2499
Name: idhogar, Length: 2379, dtype: int64