### import

In [None]:
import numpy as np 
import pandas as pd 
import sys 
import os 
import logzero 
import wandb 
import pickle 
import seaborn as sns 
import matplotlib.pyplot as plt 
sns.set()

### config

In [None]:
from src.utils import noglobal, pickle_load, pickle_save, HydraConfig

In [None]:
class Config():
    """Experiment settings, held as class attributes and read through the instance ``c``."""
    version = '001'
    comment = 'test'
    input_dir = '/home/user/work/input/wherethereiscodethereisbug'
    # Every artifact of this experiment version goes under this directory.
    output_dir = f'/home/user/work/output/{version}' 
    seed = 42
    target_col = 'label'
    # Keyword arguments that train_pipeline unpacks into wandb.init(...).
    wandb_init = {
        "project": "debug",
        "entity": "kuto5046",
        "group": f"exp{version}",
        "dir": output_dir,
        "tags": [],
        "mode": "disabled",  # NOTE(review): per the wandb docs "disabled" makes logging a no-op; switch it to record runs
    }
    n_splits = 5
    use_fold = [0]  # [0] to stop after a single fold, [0,1,2,3,4] to run every fold

    # Load the model settings
    model_config_name = 'lgb_binary'  # change according to the task and the model used
    model_config = HydraConfig.get_cnf(config_path='/home/user/work/configs/model/', config_name=model_config_name)
    num_boost_round = model_config['num_boost_round']
    model_name = model_config.name
    # Copied into a plain dict so downstream code does not hold the config object itself.
    model_params = dict(model_config['params'])

    

# Instantiate the settings, make sure the output directory exists, and
# create a logger that also writes to <output_dir>/result.log.
c = Config()
# c = HydraConfig.get_cnf(config_path='/home/user/work/configs/', config_name='config.yaml')
os.makedirs(c.output_dir, exist_ok=True)
# level=10 is the numeric value of logging.DEBUG.
logger = logzero.setup_logger(name='main', logfile=f'{c.output_dir}/result.log', level=10)

In [None]:
# Sanity check, just in case: display the model settings that were loaded
c.model_name, c.num_boost_round, c.model_params

### read data

In [None]:
# Load the raw train/test tables from the input directory and display their shapes.
train = pd.read_csv(f'{c.input_dir}/train.csv')
test = pd.read_csv(f'{c.input_dir}/test.csv')
train.shape, test.shape

### preprocess

In [None]:
from src.features.base import Feature, generate_features, get_categorical_col, get_numerical_col, load_datasets
from src.features.encoder import count_encoder, ordinal_encoder, pp_for_categorical_encoding, target_encoder
from src.features.nlp import count_lda_vectorize, tfidf_svd_vectorize, UniversalSentenceEncoder, BertSequenceVectorizer, Sentence2Vec, SCDVEmbedder, get_embedding_model

In [None]:
# Stack train and test so the same preprocessing is applied to both.
whole = pd.concat([train, test]).reset_index(drop=True)

# One column per line of the 'code' text: code_0, code_1, ...
code_lines = whole['code'].str.split('\n', expand=True).add_prefix('code_')
whole = pd.concat([whole, code_lines], axis=1)

# Strip surrounding whitespace from the first five code-line columns.
for line_col in [f'code_{i}' for i in range(5)]:
    whole[line_col] = whole[line_col].str.strip()

# Detect column types, leaving out the identifier and the target.
cat_cols = get_categorical_col(whole, skip_cols=['id', c.target_col])
numerical_cols = get_numerical_col(whole, skip_cols=['id', c.target_col])

In [None]:
# Display the columns detected as categorical.
cat_cols

In [None]:
# Display the columns detected as numerical.
numerical_cols

In [None]:
# Rows with a target value are the training set; rows without one are the test set.
has_label = whole[c.target_col].notna()
train = whole[has_label].reset_index(drop=True)
test = whole[~has_label].reset_index(drop=True)

In [None]:
# Preprocess the categorical columns of both frames before encoding.
# NOTE(review): the exact transformation lives in src.features.encoder and is not visible here.
train, test = pp_for_categorical_encoding(train, test, cat_cols)

### feature engineering

In [None]:
# Called for its side effect only (the return value is discarded).
# NOTE(review): presumably fetches/prepares the embedding model files used below — confirm in src.features.nlp.
get_embedding_model()

In [None]:
# class TfidfSvdCode2(Feature):
#     def create_features(self):
#         col = 'code_2'
#         self.train = tfidf_svd_vectorize(train, col=col)
#         self.test = tfidf_svd_vectorize(test, col=col)

# class CoundLDACode2(Feature):
#     def create_features(self):
#         col = 'code_2'
#         self.train = count_lda_vectorize(train, col=col)
#         self.test = count_lda_vectorize(test, col=col)


# class OrdinalEncode(Feature):
#     def create_features(self):
#         self.train, self.test = ordinal_encoder(train, test, cat_cols)


# class BertVecCode2(Feature):
#     def create_features(self):
#         bert = BertSequenceVectorizer()
#         col = 'code_2'
#         self.train = bert.vectorize_to_df(train, col)
#         self.test = bert.vectorize_to_df(test, col)


# class USEncodeCode2(Feature):
#     def create_features(self):
#         col = 'code_2'
#         usencoder = UniversalSentenceEncoder()
#         self.train = usencoder.vectorize(train, col)
#         self.test = usencoder.vectorize(test, col)


class Sentence2VecCode2(Feature):
    """Sentence2Vec embedding features computed from the ``code_2`` column."""

    def create_features(self):
        """Vectorize ``code_2`` of the global train/test frames into 160-dim features."""
        source_col = 'code_2'
        embedding_dim = 160
        vectorizer = Sentence2Vec(
            model_file='/home/user/work/input/resource/160/wikipedia-160.txt'
        )
        # Same column and dimensionality for both splits so the features line up.
        self.train = vectorizer.vectorize_to_df(train, source_col, embedding_dim)
        self.test = vectorizer.vectorize_to_df(test, source_col, embedding_dim)

In [None]:
# File extension used for the feature files, both when generating and when loading.
ext = 'pickle'
# Hands this notebook's global namespace to generate_features.
# NOTE(review): presumably it discovers the Feature subclasses defined above and
# skips ones already on disk when overwrite=False — confirm in src.features.base.
generate_features(globals(), ext=ext, overwrite=False)

In [None]:
# Names of the feature sets to load; comment entries in or out to change the model inputs.
feats = [
    # 'TfidfSvdCode2',
    # 'OrdinalEncode',
    # 'CoundLDACode2',
    # 'USEncodeCode2',
    # 'BertVecCode2',
    'Sentence2VecCode2'
]
# Load the selected features for train and test using the same extension as above.
train_data, test_data = load_datasets(feats, ext=ext)

In [None]:
# Recompute the categorical columns on the loaded feature frame (this overwrites
# the earlier cat_cols computed on the raw data) and display them.
cat_cols = get_categorical_col(train_data)
cat_cols 

In [None]:
# Features and label in use: print one column name per line
for column_name in train_data.columns:
    print(column_name)

In [None]:
# Attach the target as a plain array so the assignment is positional, not index-aligned.
# NOTE(review): assumes train_data rows are in the same order as train — confirm.
train_data[c.target_col] = train[c.target_col].to_numpy()

### cv

In [None]:
from src.cv import get_kfold, get_stratifiedkfold, get_groupkfold
# Build folds stratified on the target. The fold count is read from the config
# (c.n_splits) instead of being repeated here as a literal, so the two cannot drift apart.
cv = get_stratifiedkfold(train_data, c.target_col, n_splits=c.n_splits)
cv

### callback

In [None]:
from src.models.gbdt import get_callbacks
# Fetch the training callbacks for the configured model name and display them.
callbacks = get_callbacks(c.model_name)
callbacks

### model

In [None]:
from src.models.gbdt import get_model # , LGBModel, XGBModel, CBModel 

# Build the model wrapper from the configured name, parameters, boosting rounds,
# categorical columns, output directory and callbacks.
model = get_model(c.model_name, c.model_params, c.num_boost_round, cat_cols, c.output_dir, callbacks)

### train

In [None]:
from sklearn.metrics import f1_score, roc_auc_score
def calc_score(true, pred):
    """Return the ROC AUC of the scores ``pred`` against the labels ``true``."""
    auc = roc_auc_score(y_true=true, y_score=pred)
    return auc

In [None]:
def train_pipeline(model, train, test, cv, config, cat_cols, target_col):
    """Train ``model`` on the selected CV folds and save OOF and test predictions.

    For each selected fold the model is trained, saved, scored on the
    validation part, and used to predict ``test``. Test predictions are
    written to ``<config.output_dir>/pred_test_<fold>.npy``; the out-of-fold
    predictions of all processed folds are written to
    ``<config.output_dir>/oof.npy``.

    Args:
        model: Object exposing ``train(X_train, y_train, X_valid, y_valid)``,
            ``save(fold)`` and ``predict(X)``.
        train: DataFrame holding the features and ``target_col``; rows are
            selected with ``.loc`` using the indices yielded by ``cv``.
        test: Feature frame passed to ``model.predict``.
        cv: Sized iterable of ``(idx_train, idx_valid)`` pairs.
        config: Settings object providing ``wandb_init``, ``version`` and
            ``output_dir``, and optionally ``use_fold`` (folds to run).
        cat_cols: Categorical column names. Currently unused; it is only
            referenced by the disabled target-encoding step.
        target_col: Name of the target column in ``train``.

    Returns:
        Tuple ``(model, oof)`` where ``oof`` is an array of the validation
        predictions of the processed folds, ordered by row index.

    Raises:
        ValueError: If no fold was processed (``pd.concat`` of an empty list).
    """
    # Folds to run. Fall back to the first fold only (the previous hard-coded
    # behaviour) when the config does not define ``use_fold``.
    use_fold = getattr(config, 'use_fold', None)
    if use_fold is None:
        use_fold = [0]
    last_fold = len(cv) - 1

    oofs = []
    for i, (idx_train, idx_valid) in enumerate(cv):
        # Decide whether to skip *before* opening a wandb run, so skipped
        # folds never leave a dangling run behind. ``continue`` (not
        # ``break``) keeps non-contiguous selections such as [0, 2] working.
        if i not in use_fold:
            continue

        wandb.init(**config.wandb_init, name=f'exp{config.version}-fold{i}')

        logger.info("############")
        logger.info(f"fold {i}")
        logger.info("############")

        _train = train.loc[idx_train].reset_index(drop=True)
        _valid = train.loc[idx_valid].reset_index(drop=True)

        # target encoding (disabled)
        # for col in cat_cols:
        #     _train, _valid = target_encoder(_train, _valid, col, target_col)
        #     _, test = target_encoder(train, test, col, target_col)

        X_train = _train.drop(target_col, axis=1)
        y_train = _train[target_col]
        X_valid = _valid.drop(target_col, axis=1)
        y_valid = _valid[target_col]

        model.train(X_train, y_train, X_valid, y_valid)
        model.save(i)
        pred = model.predict(X_valid)

        # evaluate
        score = calc_score(y_valid, pred)
        logger.info(f'fold-{i} score: {score}')
        wandb.log({'CV': score})

        # Keep the validation predictions keyed by their original row index
        # so the folds can be stitched back together in order.
        oofs.append(pd.DataFrame(pred, index=idx_valid))

        # Predict the test set and persist it per fold; write to the
        # directory of the config that was passed in, not a global.
        pred_test = model.predict(test)
        np.save(f"{config.output_dir}/pred_test_{i}", pred_test)

        # The run of the final CV fold is deliberately left open.
        if i != last_fold:
            wandb.finish()

    # Save the OOF predictions
    oof = np.array(pd.concat(oofs).sort_index())
    np.save(f"{config.output_dir}/oof", oof)
    return model, oof

In [None]:
# Run training on the feature frames; returns the trained model wrapper and the OOF predictions.
model, oof = train_pipeline(model, train_data, test_data, cv, c, cat_cols, c.target_col)

In [None]:
from src.visualize import plot_importance
# CatBoost is not supported
plot_importance(model.models, output_dir=c.output_dir)

### inference

In [None]:
# Average the saved test predictions over the folds selected for training.
# pred_test_{i}.npy is only written for folds that were actually trained, so
# looping over every CV fold would hit a missing file (or silently pick up a
# stale one from an earlier run) whenever c.use_fold is a subset of the folds.
preds = [np.load(f'{c.output_dir}/pred_test_{i}.npy') for i in c.use_fold]
pred_test = np.mean(preds, axis=0)

In [None]:
# Overlay the distribution of the training labels and of the averaged test predictions.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# histplot/kdeplot once the installed seaborn version is confirmed.
sns.distplot(train[c.target_col], label='train')
sns.distplot(pred_test, label='test')
plt.legend();

### submission

In [None]:
# Fill the sample submission with the averaged test predictions and write it
# to the output directory, tagged with the experiment version.
sub = pd.read_csv(f'{c.input_dir}/sample_submission.csv')
sub['label'] = pred_test
sub.to_csv(f'{c.output_dir}/submission_exp{c.version}.csv', index=False)