In [2]:
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
data_dir = os.getenv('HOME')+'/aiffel/kaggle_study/data'
train_data_path = join(data_dir, 'train2.csv')
test_data_path = join(data_dir, 'test1.csv')

train = pd.read_csv(train_data_path)
test_x = pd.read_csv(test_data_path)
train_x = train.drop(['SalePrice'], axis=1)
train_y = train['SalePrice']
print(train_y)

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64


In [3]:
train_x_saved = train_x.copy()
test_x_saved = test_x.copy()


In [4]:
def load_data():
    train_x, test_x = train_x_saved.copy(), test_x_saved.copy()
    return train_x, test_x

In [5]:
cat_cols = ['LotArea', 'BldgType','PoolArea']  # 픽쳐들을 뽑아내자 
#(LotArea : 평방 피트 단위의 부지 크기,BldgType : 주거 유형,OverallQual : 전체 재질과 마감 품질,OverallCond : 전체 상태 등급
          #  ,PoolArea : 평방 피트의 수영장 면적)

In [6]:
# -----------------------------------
# one-hot encoding
# -----------------------------------
# 데이터 읽어오기
train_x, test_x = load_data()
# -----------------------------------

# 학습 데이터와 테스트 데이터를 결합하여 get_dummies를 통한 원-핫 인코딩을 수행
all_x = pd.concat([train_x, test_x])
all_x = pd.get_dummies(all_x, columns=cat_cols)

# Series.str.get_dummies
# 시리즈를 더미 코드로 변환합니다.  다시말해 train안에는 많은 변수들을 가지고 있는데 그중에 집갑의 영향이 있을 법한 특징들을 뽑아내어 원-핫 인코딩을 진행함

In [7]:
# 학습 데이터와 테스트 데이터의 재분할
train_x = all_x.iloc[:train_x.shape[0], :].reset_index(drop=True)
test_x = all_x.iloc[train_x.shape[0]:, :].reset_index(drop=True)

In [8]:
# -----------------------------------
# 데이터 읽어오기
train_x, test_x = load_data()
# -----------------------------------
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder로 인코딩
ohe = OneHotEncoder(sparse=False, categories='auto')    # 'auto' : 훈련 데이터에서 자동으로 범주를 결정합니다.
ohe.fit(train_x[cat_cols])


OneHotEncoder(sparse=False)

In [9]:
# 가변수의 컬럼명 생성
columns = []
for i, c in enumerate(cat_cols):
    columns += [f'{c}_{v}' for v in ohe.categories_[i]]

In [10]:
# 생성된 가변수를 데이터 프레임으로 변환
dummy_vals_train = pd.DataFrame(ohe.transform(train_x[cat_cols]), columns=columns)
dummy_vals_test = pd.DataFrame(ohe.transform(test_x[cat_cols]), columns=columns)


ValueError: Found unknown categories [1470, 1476, 1484, 1488, 1495, 1504, 1612, 1700, 1733, 1782, 1879, 1894, 1900, 2058, 2104, 2179, 2205, 2304, 2349, 2364, 2394, 2403, 2529, 2760, 2880, 2938, 2980, 2998, 3068, 3087, 3153, 3203, 3215, 3242, 3300, 3435, 3480, 3515, 3523, 3606, 3621, 3628, 3672, 3701, 3710, 3768, 3784, 3811, 3830, 3843, 3869, 3901, 3903, 3907, 3940, 3960, 4000, 4054, 4080, 4084, 4109, 4113, 4217, 4235, 4330, 4347, 4379, 4385, 4447, 4480, 4484, 4485, 4538, 4740, 4761, 4765, 4835, 4853, 4882, 4899, 4960, 5005, 5070, 5118, 5122, 5142, 5150, 5160, 5175, 5190, 5220, 5280, 5470, 5568, 5633, 5680, 5707, 5747, 5748, 5775, 5805, 5830, 5852, 5858, 5870, 5911, 5914, 5940, 5950, 5976, 6001, 6012, 6048, 6125, 6191, 6221, 6264, 6285, 6289, 6291, 6360, 6373, 6390, 6400, 6406, 6410, 6430, 6449, 6451, 6472, 6488, 6490, 6500, 6565, 6615, 6710, 6718, 6720, 6723, 6756, 6760, 6821, 6845, 6854, 6860, 6876, 6904, 6950, 6953, 6956, 6978, 7006, 7007, 7008, 7010, 7020, 7023, 7030, 7038, 7040, 7111, 7130, 7132, 7155, 7176, 7223, 7230, 7242, 7250, 7288, 7290, 7296, 7308, 7311, 7315, 7321, 7328, 7360, 7379, 7380, 7392, 7400, 7404, 7424, 7425, 7436, 7440, 7441, 7450, 7476, 7480, 7488, 7506, 7514, 7518, 7550, 7570, 7584, 7609, 7614, 7626, 7627, 7628, 7632, 7635, 7655, 7669, 7689, 7692, 7697, 7703, 7706, 7713, 7733, 7741, 7745, 7755, 7777, 7785, 7791, 7793, 7801, 7810, 7822, 7830, 7832, 7840, 7841, 7848, 7890, 7898, 7903, 7915, 7936, 7939, 7942, 7976, 7980, 7984, 7993, 8010, 8013, 8014, 8020, 8035, 8050, 8076, 8078, 8088, 8092, 8094, 8098, 8118, 8127, 8128, 8139, 8145, 8146, 8147, 8154, 8155, 8169, 8170, 8174, 8220, 8229, 8232, 8238, 8239, 8240, 8243, 8251, 8267, 8285, 8286, 8298, 8300, 8304, 8308, 8333, 8334, 8340, 8368, 8375, 8382, 8390, 8396, 8398, 8413, 8425, 8428, 8433, 8453, 8510, 8512, 8513, 8516, 8534, 8540, 8550, 8574, 8577, 8581, 8604, 8626, 8637, 8638, 8660, 8665, 8668, 8674, 8680, 8685, 8696, 8702, 8707, 8723, 8726, 8736, 8772, 8778, 8789, 8803, 8810, 8826, 8838, 8842, 8847, 8856, 8857, 8883, 8917, 8918, 8927, 8940, 8965, 8969, 8970, 8982, 8987, 8991, 8998, 9019, 9024, 9044, 9045, 9073, 9079, 9085, 9109, 9116, 9129, 9139, 9140, 9143, 9157, 9171, 9178, 9184, 9205, 9215, 9216, 9233, 9239, 9254, 9259, 9260, 9272, 9278, 9280, 9308, 9316, 9345, 9370, 9373, 9392, 9399, 9428, 9434, 9457, 9462, 9466, 9468, 9482, 9487, 9488, 9510, 9512, 9532, 9533, 9535, 9543, 9545, 9547, 9555, 9556, 9576, 9605, 9610, 9620, 9627, 9639, 9649, 9656, 9658, 9660, 9662, 9670, 9671, 9680, 9709, 9720, 9723, 9724, 9729, 9734, 9736, 9743, 9757, 9759, 9768, 9770, 9771, 9780, 9783, 9801, 9802, 9836, 9839, 9840, 9856, 9863, 9888, 9892, 9910, 9926, 9927, 9928, 9930, 9942, 9978, 9981, 9990, 10019, 10021, 10032, 10037, 10042, 10044, 10050, 10090, 10110, 10114, 10122, 10126, 10130, 10143, 10147, 10164, 10170, 10172, 10176, 10179, 10180, 10184, 10230, 10235, 10236, 10260, 10265, 10266, 10274, 10284, 10295, 10307, 10324, 10331, 10337, 10366, 10367, 10368, 10385, 10389, 10411, 10425, 10429, 10441, 10447, 10454, 10456, 10457, 10464, 10481, 10512, 10519, 10532, 10533, 10541, 10544, 10547, 10557, 10560, 10566, 10573, 10600, 10612, 10616, 10632, 10646, 10672, 10682, 10710, 10725, 10732, 10738, 10739, 10750, 10751, 10759, 10773, 10790, 10816, 10818, 10820, 10825, 10836, 10845, 10858, 10872, 10890, 10895, 10899, 10905, 10914, 10926, 10928, 10933, 10936, 10943, 10950, 10960, 10984, 10990, 11002, 11024, 11025, 11027, 11058, 11060, 11064, 11067, 11069, 11072, 11080, 11084, 11088, 11104, 11105, 11128, 11134, 11136, 11166, 11202, 11227, 11305, 11327, 11332, 11339, 11354, 11355, 11358, 11375, 11380, 11382, 11388, 11419, 11422, 11447, 11449, 11454, 11479, 11492, 11515, 11520, 11556, 11563, 11577, 11578, 11613, 11622, 11631, 11646, 11650, 11660, 11672, 11675, 11677, 11679, 11690, 11692, 11727, 11737, 11750, 11762, 11765, 11778, 11792, 11800, 11824, 11825, 11830, 11836, 11850, 11855, 11875, 11896, 11920, 11927, 11929, 11949, 11950, 11980, 12018, 12048, 12102, 12104, 12128, 12151, 12172, 12191, 12192, 12198, 12216, 12217, 12228, 12250, 12285, 12291, 12292, 12299, 12304, 12320, 12334, 12350, 12352, 12361, 12366, 12375, 12388, 12392, 12395, 12436, 12447, 12450, 12460, 12469, 12511, 12518, 12539, 12568, 12585, 12606, 12633, 12671, 12680, 12700, 12712, 12720, 12728, 12732, 12760, 12774, 12778, 12782, 12852, 12853, 12858, 12864, 12867, 12878, 12883, 12887, 12888, 12890, 12891, 12929, 12961, 13001, 13008, 13014, 13015, 13041, 13050, 13052, 13068, 13069, 13070, 13108, 13110, 13128, 13142, 13143, 13162, 13200, 13204, 13215, 13250, 13253, 13260, 13262, 13339, 13355, 13377, 13383, 13384, 13426, 13438, 13440, 13495, 13514, 13543, 13587, 13607, 13615, 13618, 13641, 13654, 13751, 13770, 13774, 13830, 13870, 13975, 14067, 14082, 14122, 14137, 14149, 14171, 14190, 14210, 14217, 14235, 14250, 14257, 14267, 14277, 14299, 14300, 14311, 14330, 14333, 14357, 14418, 14419, 14463, 14565, 14584, 14670, 14680, 14694, 14695, 14753, 14780, 14828, 14836, 14860, 15038, 15218, 15240, 15262, 15263, 15274, 15295, 15300, 15312, 15387, 15400, 15401, 15410, 15417, 15428, 15564, 15584, 15600, 15635, 15676, 15783, 15810, 15896, 15922, 15957, 16012, 16023, 16052, 16133, 16163, 16219, 16269, 16280, 16287, 16300, 16387, 16451, 16500, 16561, 16698, 16779, 16870, 16900, 17082, 17169, 17199, 17227, 17242, 17360, 17433, 17485, 17529, 17541, 17597, 17778, 17808, 17979, 18044, 18062, 18160, 18261, 18265, 18275, 18494, 18559, 18600, 18837, 19255, 19508, 19522, 19550, 19645, 19800, 19950, 19958, 20000, 20062, 20064, 20270, 20355, 20693, 21281, 21299, 21370, 21533, 21579, 22002, 22136, 22692, 23303, 23580, 23730, 23920, 24572, 25485, 26073, 26400, 27697, 31220, 31250, 33120, 33983, 39290, 39384, 41600, 43500, 47007, 47280, 50102, 51974, 56600] in column 0 during transform

In [11]:
train_x, test_x = load_data()
# -----------------------------------
from sklearn.preprocessing import LabelEncoder

# 범주형 변수를 for문 루프하여 반복적으로 레이블 인코딩 수행
for c in cat_cols:
    # 학습 데이터에 근거하여 정의한 후에 데이터 변환
    le = LabelEncoder()
    le.fit(train_x[c])
    train_x[c] = le.transform(train_x[c])
    test_x[c] = le.transform(test_x[c])

ValueError: y contains previously unseen labels: [1470, 1476, 1484, 1488, 1495, 1504, 1612, 1700, 1733, 1782, 1879, 1894, 1900, 2058, 2104, 2179, 2205, 2304, 2349, 2364, 2394, 2403, 2529, 2760, 2880, 2938, 2980, 2998, 3068, 3087, 3153, 3203, 3215, 3242, 3300, 3435, 3480, 3515, 3523, 3606, 3621, 3628, 3672, 3701, 3710, 3768, 3784, 3811, 3830, 3843, 3869, 3901, 3903, 3907, 3940, 3960, 4000, 4054, 4080, 4084, 4109, 4113, 4217, 4235, 4330, 4347, 4379, 4385, 4447, 4480, 4484, 4485, 4538, 4740, 4761, 4765, 4835, 4853, 4882, 4899, 4960, 5005, 5070, 5118, 5122, 5142, 5150, 5160, 5175, 5190, 5220, 5280, 5470, 5568, 5633, 5680, 5707, 5747, 5748, 5775, 5805, 5830, 5852, 5858, 5870, 5911, 5914, 5940, 5950, 5976, 6001, 6012, 6048, 6125, 6191, 6221, 6264, 6285, 6289, 6291, 6360, 6373, 6390, 6400, 6406, 6410, 6430, 6449, 6451, 6472, 6488, 6490, 6500, 6565, 6615, 6710, 6718, 6720, 6723, 6756, 6760, 6821, 6845, 6854, 6860, 6876, 6904, 6950, 6953, 6956, 6978, 7006, 7007, 7008, 7010, 7020, 7023, 7030, 7038, 7040, 7111, 7130, 7132, 7155, 7176, 7223, 7230, 7242, 7250, 7288, 7290, 7296, 7308, 7311, 7315, 7321, 7328, 7360, 7379, 7380, 7392, 7400, 7404, 7424, 7425, 7436, 7440, 7441, 7450, 7476, 7480, 7488, 7506, 7514, 7518, 7550, 7570, 7584, 7609, 7614, 7626, 7627, 7628, 7632, 7635, 7655, 7669, 7689, 7692, 7697, 7703, 7706, 7713, 7733, 7741, 7745, 7755, 7777, 7785, 7791, 7793, 7801, 7810, 7822, 7830, 7832, 7840, 7841, 7848, 7890, 7898, 7903, 7915, 7936, 7939, 7942, 7976, 7980, 7984, 7993, 8010, 8013, 8014, 8020, 8035, 8050, 8076, 8078, 8088, 8092, 8094, 8098, 8118, 8127, 8128, 8139, 8145, 8146, 8147, 8154, 8155, 8169, 8170, 8174, 8220, 8229, 8232, 8238, 8239, 8240, 8243, 8251, 8267, 8285, 8286, 8298, 8300, 8304, 8308, 8333, 8334, 8340, 8368, 8375, 8382, 8390, 8396, 8398, 8413, 8425, 8428, 8433, 8453, 8510, 8512, 8513, 8516, 8534, 8540, 8550, 8574, 8577, 8581, 8604, 8626, 8637, 8638, 8660, 8665, 8668, 8674, 8680, 8685, 8696, 8702, 8707, 8723, 8726, 8736, 8772, 8778, 8789, 8803, 8810, 8826, 8838, 8842, 8847, 8856, 8857, 8883, 8917, 8918, 8927, 8940, 8965, 8969, 8970, 8982, 8987, 8991, 8998, 9019, 9024, 9044, 9045, 9073, 9079, 9085, 9109, 9116, 9129, 9139, 9140, 9143, 9157, 9171, 9178, 9184, 9205, 9215, 9216, 9233, 9239, 9254, 9259, 9260, 9272, 9278, 9280, 9308, 9316, 9345, 9370, 9373, 9392, 9399, 9428, 9434, 9457, 9462, 9466, 9468, 9482, 9487, 9488, 9510, 9512, 9532, 9533, 9535, 9543, 9545, 9547, 9555, 9556, 9576, 9605, 9610, 9620, 9627, 9639, 9649, 9656, 9658, 9660, 9662, 9670, 9671, 9680, 9709, 9720, 9723, 9724, 9729, 9734, 9736, 9743, 9757, 9759, 9768, 9770, 9771, 9780, 9783, 9801, 9802, 9836, 9839, 9840, 9856, 9863, 9888, 9892, 9910, 9926, 9927, 9928, 9930, 9942, 9978, 9981, 9990, 10019, 10021, 10032, 10037, 10042, 10044, 10050, 10090, 10110, 10114, 10122, 10126, 10130, 10143, 10147, 10164, 10170, 10172, 10176, 10179, 10180, 10184, 10230, 10235, 10236, 10260, 10265, 10266, 10274, 10284, 10295, 10307, 10324, 10331, 10337, 10366, 10367, 10368, 10385, 10389, 10411, 10425, 10429, 10441, 10447, 10454, 10456, 10457, 10464, 10481, 10512, 10519, 10532, 10533, 10541, 10544, 10547, 10557, 10560, 10566, 10573, 10600, 10612, 10616, 10632, 10646, 10672, 10682, 10710, 10725, 10732, 10738, 10739, 10750, 10751, 10759, 10773, 10790, 10816, 10818, 10820, 10825, 10836, 10845, 10858, 10872, 10890, 10895, 10899, 10905, 10914, 10926, 10928, 10933, 10936, 10943, 10950, 10960, 10984, 10990, 11002, 11024, 11025, 11027, 11058, 11060, 11064, 11067, 11069, 11072, 11080, 11084, 11088, 11104, 11105, 11128, 11134, 11136, 11166, 11202, 11227, 11305, 11327, 11332, 11339, 11354, 11355, 11358, 11375, 11380, 11382, 11388, 11419, 11422, 11447, 11449, 11454, 11479, 11492, 11515, 11520, 11556, 11563, 11577, 11578, 11613, 11622, 11631, 11646, 11650, 11660, 11672, 11675, 11677, 11679, 11690, 11692, 11727, 11737, 11750, 11762, 11765, 11778, 11792, 11800, 11824, 11825, 11830, 11836, 11850, 11855, 11875, 11896, 11920, 11927, 11929, 11949, 11950, 11980, 12018, 12048, 12102, 12104, 12128, 12151, 12172, 12191, 12192, 12198, 12216, 12217, 12228, 12250, 12285, 12291, 12292, 12299, 12304, 12320, 12334, 12350, 12352, 12361, 12366, 12375, 12388, 12392, 12395, 12436, 12447, 12450, 12460, 12469, 12511, 12518, 12539, 12568, 12585, 12606, 12633, 12671, 12680, 12700, 12712, 12720, 12728, 12732, 12760, 12774, 12778, 12782, 12852, 12853, 12858, 12864, 12867, 12878, 12883, 12887, 12888, 12890, 12891, 12929, 12961, 13001, 13008, 13014, 13015, 13041, 13050, 13052, 13068, 13069, 13070, 13108, 13110, 13128, 13142, 13143, 13162, 13200, 13204, 13215, 13250, 13253, 13260, 13262, 13339, 13355, 13377, 13383, 13384, 13426, 13438, 13440, 13495, 13514, 13543, 13587, 13607, 13615, 13618, 13641, 13654, 13751, 13770, 13774, 13830, 13870, 13975, 14067, 14082, 14122, 14137, 14149, 14171, 14190, 14210, 14217, 14235, 14250, 14257, 14267, 14277, 14299, 14300, 14311, 14330, 14333, 14357, 14418, 14419, 14463, 14565, 14584, 14670, 14680, 14694, 14695, 14753, 14780, 14828, 14836, 14860, 15038, 15218, 15240, 15262, 15263, 15274, 15295, 15300, 15312, 15387, 15400, 15401, 15410, 15417, 15428, 15564, 15584, 15600, 15635, 15676, 15783, 15810, 15896, 15922, 15957, 16012, 16023, 16052, 16133, 16163, 16219, 16269, 16280, 16287, 16300, 16387, 16451, 16500, 16561, 16698, 16779, 16870, 16900, 17082, 17169, 17199, 17227, 17242, 17360, 17433, 17485, 17529, 17541, 17597, 17778, 17808, 17979, 18044, 18062, 18160, 18261, 18265, 18275, 18494, 18559, 18600, 18837, 19255, 19508, 19522, 19550, 19645, 19800, 19950, 19958, 20000, 20062, 20064, 20270, 20355, 20693, 21281, 21299, 21370, 21533, 21579, 22002, 22136, 22692, 23303, 23580, 23730, 23920, 24572, 25485, 26073, 26400, 27697, 31220, 31250, 33120, 33983, 39290, 39384, 41600, 43500, 47007, 47280, 50102, 51974, 56600]

프리퀀시 인코딩 : 출연 빈도순으로 나열하는 인덱스를 만들기 위해 사용할 수 있습니다. (다만 동률의 값이 발생할 경우 주의)

In [12]:
# -----------------------------------
# frequency encoding
# -----------------------------------
# 데이터 읽어오기
train_x, test_x = load_data()
# -----------------------------------
# for문을 이용한 변수를 반복하여 프리퀀시 인코딩 수행
for c in cat_cols:
    freq = train_x[c].value_counts()
    # 카테고리 출현 횟수로 치환
    train_x[c] = train_x[c].map(freq)
    test_x[c] = test_x[c].map(freq)
print(freq)


0      1453
480       1
512       1
519       1
555       1
576       1
648       1
738       1
Name: PoolArea, dtype: int64


## 타킷 인코딩

- 목적변수를 이용하여 범주형 변수를 수치형 변수로 변환하는 방법입니다.

범주형 변수의 특정 레벨만 목적변수에 영향을 줄 때도 분기를 반복함으로써 예측값에 반영할 수 있으므로 학습에 활용할 수 있습니다.
특히 GBDT 모델에서 레이블 인코딩은 범주형 변수를 변환하는 기본적인 방법 (순서형 인코딩)

In [13]:
from sklearn.model_selection import KFold

# 교차 검증 폴드마다 타깃 인코딩 다시 적용
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):

    # 학습 데이터에서 학습 데이터와 검증 데이터 구분
    tr_x, va_x = train_x.iloc[tr_idx].copy(), train_x.iloc[va_idx].copy()
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    # 변수를 반복하여 타깃 인코딩 수행
    for c in cat_cols:
        # 학습 데이터 전체에서 각 범주별 타깃 평균을 계산
        data_tmp = pd.DataFrame({c: tr_x[c], 'target': tr_y})
        target_mean = data_tmp.groupby(c)['target'].mean()
        # 검증 데이터의 카테고리 치환
        va_x.loc[:, c] = va_x[c].map(target_mean)

        # 학습 데이터 변환 후 값을 저장하는 배열 준비
        tmp = np.repeat(np.nan, tr_x.shape[0])
        kf_encoding = KFold(n_splits=4, shuffle=True, random_state=72)
        for idx_1, idx_2 in kf_encoding.split(tr_x):
            # 아웃 오브 폴드에서 각 범주별 목적변수 평균 계산
            target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
            # 변환 후 값을 날짜 배열에 저장
            tmp[idx_2] = tr_x[c].iloc[idx_2].map(target_mean)

        tr_x.loc[:, c] = tmp

In [14]:
print(cat_cols)

['LotArea', 'BldgType', 'PoolArea']


In [15]:
# -----------------------------------
# target encoding - 교차 검증의 fold와 target encoding의 fold 분할을 맞추는 경우
# -----------------------------------
# 데이터 읽어오기
train_x, test_x = load_data()
# -----------------------------------
from sklearn.model_selection import KFold

# 교차 검증의 폴드를 정의
kf = KFold(n_splits=4, shuffle=True, random_state=71)

# 변수를 루프하여 타깃 인코딩 수행
for c in cat_cols:

    # 타깃을 추가
    data_tmp = pd.DataFrame({c: train_x[c], 'target': train_y})
    # 변환 후 값을 저장하는 배열을 준비
    tmp = np.repeat(np.nan, train_x.shape[0])

    # 학습 데이터에서 검증 데이터를 나누기
    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        # 학습 데이터에 대해 각 범주별 목적변수 평균 계산
        target_mean = data_tmp.iloc[tr_idx].groupby(c)['target'].mean()
        # 검증 데이터에 대해 변환 후 값을 날짜 배열에 저장
        tmp[va_idx] = train_x[c].iloc[va_idx].map(target_mean)

    # 변환 후의 데이터로 원래의 변수를 변경
    train_x[c] = tmp

-----------------------

캐글 공통 데이터

In [6]:
data_dir = os.getenv('HOME')+'/aiffel/kaggle_study/data'
train_data_path = join(data_dir, 'train3.csv')
test_data_path = join(data_dir, 'test3.csv')

train = pd.read_csv(train_data_path)
test_x = pd.read_csv(test_data_path)
train_x = train.drop(['target'], axis=1)
train_y = train['target']
print(train_y)

0       0
1       0
2       1
3       0
4       1
       ..
9995    0
9996    0
9997    0
9998    0
9999    0
Name: target, Length: 10000, dtype: int64


In [7]:
# 설명용으로 학습 데이터와 테스트 데이터의 원래 상태를 복제해 두기
train_x_saved = train_x.copy()
test_x_saved = test_x.copy()


In [8]:

# 학습 데이터와 테스트 데이터를 반환하는 함수
def load_data():
    train_x, test_x = train_x_saved.copy(), test_x_saved.copy()
    return train_x, test_x

# 변환할 수치 변수를 목록에 저장
cat_cols = ['sex', 'product', 'medical_info_b2', 'medical_info_b3']
print(train_x)

      age     sex      height     weight product   amount        date  \
0      50    Male  166.445608  65.016732      D1  7000000    2015/2/3   
1      68  Female  164.334615  56.544217      A1  7000000    2015/5/9   
2      77    Male  167.462917  54.242267      A3  6000000   2016/2/13   
3      17    Male  177.097725  71.147762      B1  8000000    2015/7/6   
4      62  Female  158.165788  65.240697      A2  9000000   2016/9/17   
...   ...     ...         ...        ...     ...      ...         ...   
9995   61    Male  182.729800  73.393777      A2  2000000  2015/10/21   
9996   33  Female  167.701136  75.006529      C3     9000   2015/5/28   
9997   44  Female  145.609998  47.739397      C3     1000   2016/2/29   
9998   34  Female  165.796017  57.567695      C1     5000   2016/2/27   
9999   31    Male  180.301762  71.425135      B2  1000000    2015/7/1   

      medical_info_a1  medical_info_a2  medical_info_a3  ...  \
0                 134              202                1  ..

In [9]:
# -----------------------------------
# one-hot encoding
# -----------------------------------
# 데이터 읽어오기
train_x, test_x = load_data()
# -----------------------------------

# 학습 데이터와 테스트 데이터를 결합하여 get_dummies를 통한 원-핫 인코딩을 수행
all_x = pd.concat([train_x, test_x])
all_x = pd.get_dummies(all_x, columns=cat_cols)

# 학습 데이터와 테스트 데이터의 재분할
train_x = all_x.iloc[:train_x.shape[0], :].reset_index(drop=True)
test_x = all_x.iloc[train_x.shape[0]:, :].reset_index(drop=True)
print(all_x)

      age      height     weight   amount       date  medical_info_a1  \
0      50  166.445608  65.016732  7000000   2015/2/3              134   
1      68  164.334615  56.544217  7000000   2015/5/9              438   
2      77  167.462917  54.242267  6000000  2016/2/13              313   
3      17  177.097725  71.147762  8000000   2015/7/6              342   
4      62  158.165788  65.240697  9000000  2016/9/17              327   
...   ...         ...        ...      ...        ...              ...   
9995   21  185.174944  62.893499     3000  2015/3/11              277   
9996   34  157.581442  58.889901     2000  2016/3/27              184   
9997   36  177.676066  85.277018  6000000  2016/3/16              443   
9998   18  166.757782  64.254215  6000000  2015/6/17              267   
9999   42  158.656634  53.299956  3000000  2016/8/15              317   

      medical_info_a2  medical_info_a3  medical_info_b1  medical_info_c1  ...  \
0                 202                1    

In [10]:
# -----------------------------------
# 데이터 읽어오기
train_x, test_x = load_data()
# -----------------------------------
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder로 인코딩
ohe = OneHotEncoder(sparse=False, categories='auto')
ohe.fit(train_x[cat_cols])


OneHotEncoder(sparse=False)

In [11]:
# 가변수의 컬럼명 생성
columns = []
for i, c in enumerate(cat_cols):
    columns += [f'{c}_{v}' for v in ohe.categories_[i]]
print(columns)

['sex_Female', 'sex_Male', 'product_A1', 'product_A2', 'product_A3', 'product_B1', 'product_B2', 'product_B3', 'product_C1', 'product_C2', 'product_C3', 'product_D1', 'product_E1', 'medical_info_b2_1', 'medical_info_b2_2', 'medical_info_b2_3', 'medical_info_b2_9', 'medical_info_b3_1', 'medical_info_b3_2', 'medical_info_b3_3', 'medical_info_b3_4', 'medical_info_b3_A', 'medical_info_b3_B', 'medical_info_b3_C', 'medical_info_b3_D', 'medical_info_b3_E', 'medical_info_b3_F', 'medical_info_b3_G', 'medical_info_b3_H', 'medical_info_b3_a', 'medical_info_b3_b', 'medical_info_b3_c', 'medical_info_b3_d', 'medical_info_b3_e']


In [12]:
# 생성된 가변수를 데이터 프레임으로 변환
dummy_vals_train = pd.DataFrame(ohe.transform(train_x[cat_cols]), columns=columns)
dummy_vals_test = pd.DataFrame(ohe.transform(test_x[cat_cols]), columns=columns)


In [13]:
# 나머지 변수와의 결합
train_x = pd.concat([train_x.drop(cat_cols, axis=1), dummy_vals_train], axis=1)
test_x = pd.concat([test_x.drop(cat_cols, axis=1), dummy_vals_test], axis=1)


In [14]:
print(test_x)

      age      height     weight   amount       date  medical_info_a1  \
0      49  187.431987  81.008363  1000000  2016/12/6              302   
1      79  171.632630  71.067812     2000   2016/9/3              197   
2      78  163.543983  64.032098  4000000  2015/4/10              247   
3      26  150.391858  52.322910  1000000  2016/4/17              108   
4      14  165.835167  67.008154  4000000  2015/1/26              181   
...   ...         ...        ...      ...        ...              ...   
9995   21  185.174944  62.893499     3000  2015/3/11              277   
9996   34  157.581442  58.889901     2000  2016/3/27              184   
9997   36  177.676066  85.277018  6000000  2016/3/16              443   
9998   18  166.757782  64.254215  6000000  2015/6/17              267   
9999   42  158.656634  53.299956  3000000  2016/8/15              317   

      medical_info_a2  medical_info_a3  medical_info_b1  medical_info_c1  ...  \
0                 212                1    

In [15]:
# -----------------------------------
# label encoding
# -----------------------------------
# 데이터 읽어오기
train_x, test_x = load_data()
# -----------------------------------
from sklearn.preprocessing import LabelEncoder

# 범주형 변수를 for문 루프하여 반복적으로 레이블 인코딩 수행
for c in cat_cols:
    # 학습 데이터에 근거하여 정의한 후에 데이터 변환
    le = LabelEncoder()
    le.fit(train_x[c])
    train_x[c] = le.transform(train_x[c])
    test_x[c] = le.transform(test_x[c])


In [16]:
print(test_x[c])

0        8
1        4
2       16
3        1
4       10
        ..
9995     8
9996     1
9997     9
9998     5
9999     8
Name: medical_info_b3, Length: 10000, dtype: int64


In [17]:
# -----------------------------------
# feature hashing
# -----------------------------------
# 데이터 읽어오기
train_x, test_x = load_data()
# -----------------------------------
from sklearn.feature_extraction import FeatureHasher

# 범주형 변수를 반복적으로 특징 해싱 처리
for c in cat_cols:
    # FeatureHasher의 사용법은 다른 encoder와 조금 달라짐
    fh = FeatureHasher(n_features=5, input_type='string')

    # 변수를 문자열로 변환한 후 FeatureHasher 적용
    hash_train = fh.transform(train_x[[c]].astype(str).values)
    hash_test = fh.transform(test_x[[c]].astype(str).values)

    # 데이터 프레임으로 변환
    hash_train = pd.DataFrame(hash_train.todense(), columns=[f'{c}_{i}' for i in range(5)])
    hash_test = pd.DataFrame(hash_test.todense(), columns=[f'{c}_{i}' for i in range(5)])

    # 원래의 데이터 프레임과 결합
    train_x = pd.concat([train_x, hash_train], axis=1)
    test_x = pd.concat([test_x, hash_test], axis=1)
hash_test

Unnamed: 0,medical_info_b3_0,medical_info_b3_1,medical_info_b3_2,medical_info_b3_3,medical_info_b3_4
0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...
9995,0.0,0.0,1.0,0.0,0.0
9996,0.0,1.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,-1.0,0.0
9998,0.0,0.0,-1.0,0.0,0.0


- hash_test 를 보아하니 해당되는 데이터만 1을 가리키고 대부분의 데이터는 0을 가리킴 (확실한 변수 설정에만 진행하는거 같음)


In [18]:
# -----------------------------------
# target encoding
# -----------------------------------
# 데이터 읽어오기
train_x, test_x = load_data()
# -----------------------------------
from sklearn.model_selection import KFold

# for문을 이용한 변수를 반복하여 타깃 인코딩 수행
for c in cat_cols:
    # 학습 데이터 전체에서 각 범주별 타깃 평균을 계산
    data_tmp = pd.DataFrame({c: train_x[c], 'target': train_y})
    target_mean = data_tmp.groupby(c)['target'].mean()

    # 테스트 데이터의 카테고리 변경
    test_x[c] = test_x[c].map(target_mean)

    # 학습 데이터 변환 후 값을 저장하는 배열을 준비
    tmp = np.repeat(np.nan, train_x.shape[0])

    # 학습 데이터 분할
    kf = KFold(n_splits=4, shuffle=True, random_state=72)
    for idx_1, idx_2 in kf.split(train_x):
        # 아웃 오브 폴드로 각 범주형 목적변수 평균 계산
        target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
        # 변환 후의 값을 날짜 배열에 저장
        tmp[idx_2] = train_x[c].iloc[idx_2].map(target_mean)

    # 변환 후의 데이터로 원래의 변수를 변경
    train_x[c] = tmp


In [19]:
# -----------------------------------
# target encoding - 교차 검증의 각 fold의 경우
# -----------------------------------
# 데이터 읽어오기
train_x, test_x = load_data()
# -----------------------------------
from sklearn.model_selection import KFold


In [20]:
# 교차 검증 폴드마다 타깃 인코딩 다시 적용
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):

    # 학습 데이터에서 학습 데이터와 검증 데이터 구분
    tr_x, va_x = train_x.iloc[tr_idx].copy(), train_x.iloc[va_idx].copy()
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
 # 변수를 반복하여 타깃 인코딩 수행
    for c in cat_cols:
        # 학습 데이터 전체에서 각 범주별 타깃 평균을 계산
        data_tmp = pd.DataFrame({c: tr_x[c], 'target': tr_y})
        target_mean = data_tmp.groupby(c)['target'].mean()
        # 검증 데이터의 카테고리 치환
        va_x.loc[:, c] = va_x[c].map(target_mean)

        # 학습 데이터 변환 후 값을 저장하는 배열 준비
        tmp = np.repeat(np.nan, tr_x.shape[0])
        kf_encoding = KFold(n_splits=4, shuffle=True, random_state=72)
        for idx_1, idx_2 in kf_encoding.split(tr_x):
            # 아웃 오브 폴드에서 각 범주별 목적변수 평균 계산
            target_mean = data_tmp.iloc[idx_1].groupby(c)['target'].mean()
            # 변환 후 값을 날짜 배열에 저장
            tmp[idx_2] = tr_x[c].iloc[idx_2].map(target_mean)

        tr_x.loc[:, c] = tmp

    # 필요에 따라 encode된 특징을 저장하고 나중에 읽을 수 있도록 해둠.

In [35]:
train_x, test_x = load_data()
# -----------------------------------
from sklearn.model_selection import KFold

# 교차 검증의 폴드를 정의
kf = KFold(n_splits=4, shuffle=True, random_state=71)

# 변수를 루프하여 타깃 인코딩 수행
for c in cat_cols:

    # 타깃을 추가
    data_tmp = pd.DataFrame({c: train_x[c], 'target': train_y})
    # 변환 후 값을 저장하는 배열을 준비
    tmp = np.repeat(np.nan, train_x.shape[0])

    # 학습 데이터에서 검증 데이터를 나누기
    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        # 학습 데이터에 대해 각 범주별 목적변수 평균 계산
        target_mean = data_tmp.iloc[tr_idx].groupby(c)['target'].mean()
        # 검증 데이터에 대해 변환 후 값을 날짜 배열에 저장
        tmp[va_idx] = train_x[c].iloc[va_idx].map(target_mean)

    # 변환 후의 데이터로 원래의 변수를 변경
    train_x[c] = tmp
train_x[c]

0       0.190255
1       0.188209
2       0.197959
3       0.182609
4       0.163043
          ...   
9995    0.215385
9996    0.168293
9997    0.193103
9998    0.180617
9999    0.190255
Name: medical_info_b3, Length: 10000, dtype: float64

------------------table

In [28]:
import numpy as np
import pandas as pd

data_dir = os.getenv('HOME')+'/aiffel/kaggle_study/data'
train = join(data_dir, 'multi_table_train.csv')
product_master = join(data_dir, 'multi_table_product.csv')
user_log = join(data_dir, 'multi_table_log.csv')



In [29]:
# -----------------------------------
# 앞에서 설명한 그림 형식의 데이터 프레임이 있다고 가정
# train: 학습 데이터(사용자 ID, 상품 ID, 목적변수 등의 열이 있음)
# product_master: 상품 마스터(상품 ID와 상품의 정보를 나타내는 열이 있음)
# user_log: 사용자 행동의 로그 데이터(사용자 ID와 각 행동의 정보를 나타내는 열이 있음)

# 학습 데이터와 상품 마스터 데이터의 결합
train = train.merge(product_master, on='product_id', how='left')

# 로그 데이터의 사용자별 행의 수를 구하여, 학습 데이터와 결합
user_log_agg = user_log.groupby('user_id').size().reset_index().rename(columns={0: 'user_count'})
train = train.merge(user_log_agg, on='user_id', how='left')


AttributeError: 'str' object has no attribute 'merge'