In [1]:
import numpy as np
import pandas as pd

In [2]:
import traceback
import pickle
import numpy as np
from utils.data_handling import load_data, save_model_to_db
from utils.explainer import generate_shap_explainer
from preprocessing.preprocessing import label_target, impute_null_features, label_features, scale_features, select_features
from classification.param_search import optimize_hyperparameters_with_random_search
from classification.evaluation import evaluate_model
from classification.train import train_model
from loggers import logger

2021-09-07 22:22:29 [MainThread  ] [DEBUG] [loggers.py      : 11 ] ********** logger initiated


In [3]:
# Run configuration consumed by the preprocessing cell below.
source_type = "csv"          # forwarded to load_data(source_type=...)
input_path = "./train.csv"   # forwarded to load_data(input_path=...)
target_col = "Survived"      # column handed to label_target(target_col=...)
model_name = "lgb"           # decides whether feature scaling is applied

In [6]:
# Load the data from the configured source.
data = load_data(source_type=source_type, input_path=input_path)

# Label the target column. The rest of this cell expects the labelled
# column to be available under the name 'target'.
data, target_dict = label_target(data=data, target_col=target_col)

# Drop columns that are not needed.
# TODO: changes per dataset
# Reassign instead of using inplace=True so the step does not mutate an
# object that another name may still reference.
data = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

# Impute null values.
data, null_converter = impute_null_features(data=data)

# Create derived variables.
# TODO: changes per dataset
logger.info('## Create derived variables')
data["Fsize"] = data["SibSp"] + data["Parch"] + 1
# Drop the source columns now that the derived variable exists.
data = data.drop(columns=["SibSp", "Parch"])
# Reorder the frame: feature columns (sorted by Index.difference) first,
# 'target' last. The feature list is computed once and reused for logging.
feature_cols = list(data.columns.difference(['target']))
cols = feature_cols + ['target']
data = data[cols]
logger.info('data: \n %s' % feature_cols)

# Identify categorical features by dtype, before they are label-encoded.
# TODO: specify manually depending on the dataset
categorical_feats_name = [col for col in data.columns if data[col].dtype == 'object']

# Label-encode the feature columns.
# Applied to everything because lightgbm only works once categorical
# features have been converted to int/float as well.
data, label_encoder = label_features(data)

# Scaling is skipped for the tree-based models listed here.
if model_name not in ('rf', 'xgb', 'lgb', 'catboost'):
    data = scale_features(data)

# Feature selection.
features_selected = select_features(data=data, categorical_feats_name=categorical_feats_name)

# TODO: specify manually depending on the dataset
# Keep only the selected features, with 'target' as the last column.
cols = features_selected + ['target']
data = data[cols]
# Positional indices of the categorical columns. 'target' is last, so these
# indices are also valid for a matrix built from data[features_selected].
categorical_feats = [idx for idx, col in enumerate(data.columns) if col in categorical_feats_name]

2021-09-07 22:24:22 [MainThread  ] [INFO ] [data_handling.py: 28 ] ## Load data
2021-09-07 22:24:22 [MainThread  ] [INFO ] [data_handling.py: 44 ] raw_data: 
    PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2 

In [7]:
# Show the positional indices of the categorical feature columns.
categorical_feats

[4, 5]

In [8]:
# Split the preprocessed frame into a feature matrix and a target vector.
# to_numpy() is the accessor pandas recommends over the older .values attribute.
X = data[features_selected].to_numpy()
y = data['target'].to_numpy()

In [11]:
# Display the preprocessed frame (selected features plus 'target').
data

Unnamed: 0,Age,Fare,Fsize,Pclass,Embarked,Sex,target
0,22.0,7.2500,2,3,2.0,1.0,0
1,38.0,71.2833,2,1,0.0,0.0,1
2,26.0,7.9250,1,3,2.0,0.0,1
3,35.0,53.1000,2,1,2.0,0.0,1
4,35.0,8.0500,1,3,2.0,1.0,0
...,...,...,...,...,...,...,...
886,27.0,13.0000,1,2,2.0,1.0,0
887,19.0,30.0000,1,1,2.0,0.0,1
888,24.0,23.4500,4,3,2.0,0.0,0
889,26.0,30.0000,1,1,0.0,1.0,1


In [9]:
# Display the feature matrix built from data[features_selected].
X

array([[22.    ,  7.25  ,  2.    ,  3.    ,  2.    ,  1.    ],
       [38.    , 71.2833,  2.    ,  1.    ,  0.    ,  0.    ],
       [26.    ,  7.925 ,  1.    ,  3.    ,  2.    ,  0.    ],
       ...,
       [24.    , 23.45  ,  4.    ,  3.    ,  2.    ,  0.    ],
       [26.    , 30.    ,  1.    ,  1.    ,  0.    ,  1.    ],
       [32.    ,  7.75  ,  1.    ,  3.    ,  1.    ,  1.    ]])

In [10]:
# Display the target vector taken from data['target'].
y

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [None]:
# NOTE(review): `vif` is not assigned in any visible cell, so this raises
# NameError on a fresh kernel. Confirm where it is meant to come from.
vif

In [None]:
# Scratch check: positional indices of the categorical feature columns.
categorical_feats

In [None]:
# Column order with every non-target column first (sorted by
# Index.difference) and 'target' last.
cols = [*data.columns.difference(['target']), 'target']

In [None]:
# Scratch check: display the current frame.
data

In [None]:
# Scratch check: view the frame with its columns ordered as in `cols`.
data[cols]

In [None]:
# Scratch check: display the frame again after the column selection above.
data

In [None]:
# Scratch check: display the feature matrix.
X

In [None]:
# Scratch check: list the column labels of the frame.
data.columns

In [None]:
# Scratch check: all rows of the frame's columns at positions 1 and 4.
data.iloc[:, [1,4]]

In [None]:
# Scratch check: all rows of the feature matrix's columns at positions 1 and 4.
X[:, [1,4]]

In [None]:
# Inspect the element type of the feature matrix. NumPy arrays expose this
# as `.dtype`; they have no `.type` attribute, so `X.type` raised AttributeError.
X.dtype

#### CatBoost example

In [None]:
from catboost import CatBoostRegressor

# Toy dataset: two rows of six columns; the columns at positions 0, 1 and 2
# are declared categorical.
cat_features = [0, 1, 2]
train_data = [
    ["a", "b", 1, 4, 5, 6],
    ["a", "b", 4, 5, 6, 7],
]
train_labels = [10, 20]

# Deliberately tiny model so the example fits instantly.
model = CatBoostRegressor(iterations=2, learning_rate=1, depth=2)

# Pass the categorical column indices by keyword rather than by position.
model.fit(train_data, train_labels, cat_features=cat_features)

In [None]:
# Predict on the same toy rows the model was fitted on.
model.predict(train_data)