## import + seed + path

In [None]:
!pip install -q catboost category_encoders xfeat texthero

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from scipy.optimize import minimize
import pickle

from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

import re
import os
import string
import torch
import random
import warnings

import category_encoders as ce
import xfeat
import texthero as hero
from lightgbm import LGBMModel, LGBMClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import QuantileTransformer

warnings.filterwarnings('ignore')

In [None]:
def seed_all(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and requests deterministic cuDNN kernels.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so exporting it
    # here only influences processes spawned later, not the running kernel.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Previously misspelled as "daterministic", which silently created an
    # unused attribute and left cuDNN non-deterministic.
    torch.backends.cudnn.deterministic = True


SEED = 42
seed_all(SEED)

In [None]:
# Number of CV folds and the prefix used in every pretrained model file name.
FOLDS = 5
NAME = "baseline001"

# One artifact directory per pretrained model family.
BASE_DIR = "./pretrained_models/selected_num_100_add_remove_urls_2/"
BASE_DIR2 = "./pretrained_models/selected_num_300_add_remove_urls_and_unique_diff/"
BASE_DIR_CB = "./pretrained_models/catboost_default_selected_num_100/"
BASE_DIR_XGB = "./pretrained_models/xgb_default_selected_num_100_2/"
BASE_DIR_MLP = "./pretrained_models/mlp/"
BASE_DIR_MLP2 = "./pretrained_models/mlp_2/"
BASE_DIR_BAGGING = "./pretrained_models/bagging/"
BASE_DIR_GB = "./pretrained_models/gradientboosting/"
BASE_DIR_ADA = "./pretrained_models/adaboost/"
BASE_DIR_LR = "./pretrained_models/logistic_regression/"
BASE_DIR_RF = "./pretrained_models/random_forest/"
BASE_DIR_ET = "./pretrained_models/extra_trees/"

# Each family keeps pickled models under "models/" and features under "data/".
MODEL_DIR = f"{BASE_DIR}models/"
DATA_DIR = f"{BASE_DIR}data/"
MODEL_DIR2 = f"{BASE_DIR2}models/"
DATA_DIR2 = f"{BASE_DIR2}data/"
MODEL_DIR_CB = f"{BASE_DIR_CB}models/"
DATA_DIR_CB = f"{BASE_DIR_CB}data/"
MODEL_DIR_XGB = f"{BASE_DIR_XGB}models/"
DATA_DIR_XGB = f"{BASE_DIR_XGB}data/"
# NOTE(review): MLP models come from BASE_DIR_MLP2 while MLP data comes from
# BASE_DIR_MLP -- confirm this mix is intended.
MODEL_DIR_MLP = f"{BASE_DIR_MLP2}models/"
DATA_DIR_MLP = f"{BASE_DIR_MLP}data/"
MODEL_DIR_BAGGING = f"{BASE_DIR_BAGGING}models/"
MODEL_DIR_GB = f"{BASE_DIR_GB}models/"
MODEL_DIR_ADA = f"{BASE_DIR_ADA}models/"
MODEL_DIR_LR = f"{BASE_DIR_LR}models/"
MODEL_DIR_RF = f"{BASE_DIR_RF}models/"
MODEL_DIR_ET = f"{BASE_DIR_ET}models/"

# Pickled, ordered column lists used to pick the top-N features per family.
cols_path = f"{DATA_DIR}cols.pkl"
cols_path2 = f"{DATA_DIR2}cols.pkl"
cols_path_xgb = f"{DATA_DIR_XGB}cols.pkl"
cols_path_cb = f"{DATA_DIR_CB}cols.pkl"

## loading data

In [None]:
# Raw competition tables; the target is the "state" column of the train table.
DATA_PATH = "./data/"
train, test = (pd.read_csv(DATA_PATH + csv_name) for csv_name in ('train.csv', 'test.csv'))
train_y = train["state"]

In [None]:
def _load_pickle(path):
    """Unpickle the object stored at *path* and close the file afterwards."""
    # NOTE: pickle can execute arbitrary code; only load trusted artifacts.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Feature matrices for the two LightGBM variants and for the MLP/LR models.
train_x = _load_pickle(DATA_DIR + "train_x.pkl")
test_x = _load_pickle(DATA_DIR + "test_x.pkl")

train_x2 = _load_pickle(DATA_DIR2 + "train_x.pkl")
test_x2 = _load_pickle(DATA_DIR2 + "test_x.pkl")

train_x_mlp = _load_pickle(DATA_DIR_MLP + "train_x.pkl")
test_x_mlp = _load_pickle(DATA_DIR_MLP + "test_x.pkl")

# Zero-filled copies of the first feature set (passed to StackingModel below).
train_x_rf = train_x.fillna(0)
test_x_rf = test_x.fillna(0)

## single

In [None]:
def make_skf(train_x, train_y, random_state=2021):
    """Return the stratified K-fold split as a list of index pairs.

    Args:
        train_x: Feature table to split.
        train_y: Target used for stratification.
        random_state: Seed for the shuffled split.

    Returns:
        A list of ``(train_idx, valid_idx)`` tuples with ``FOLDS`` entries.
    """
    splitter = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=random_state)
    return list(splitter.split(train_x, train_y))

In [None]:
def threshold_optimization(y_true, y_pred, metrics=None):
    """Search for the decision threshold that maximizes ``metrics``.

    Runs Nelder-Mead from a starting threshold of 0.5 on the negated score of
    the binarized predictions ``y_pred >= threshold``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Continuous predictions to be thresholded.
        metrics: Callable ``metrics(y_true, y_pred_binary)`` returning a score
            where larger is better.

    Returns:
        The threshold found by the optimizer, as a Python float.

    Raises:
        NotImplementedError: If ``metrics`` is None.
    """
    if metrics is None:
        raise NotImplementedError

    def negative_score(threshold):
        # minimize() minimizes, so flip the sign of the score.
        return -metrics(y_true, y_pred >= threshold)

    fit = minimize(negative_score, x0=np.array([0.5]), method='Nelder-Mead')
    return fit.x.item()

def optimized_f1(y_true, y_pred):
    """Return the F1 score of ``y_pred`` binarized at its optimized threshold.

    The threshold is obtained from ``threshold_optimization`` using
    ``f1_score`` on the same ``y_true`` / ``y_pred`` pair.
    """
    cutoff = threshold_optimization(y_true, y_pred, metrics=f1_score)
    return f1_score(y_true, y_pred >= cutoff)

In [None]:
class MyLGBMModel:
    """Seed-averaged K-fold wrapper around ``lightgbm.LGBMModel``.

    For every seed in ``seeds`` the splitter ``fold`` is called to obtain
    ``(train_idx, valid_idx)`` pairs. One model per fold is trained (or loaded
    from disk), out-of-fold predictions are collected, and the per-seed
    results are averaged.

    Args:
        name: Prefix used in the per-fold model file names.
        params: Keyword arguments forwarded to ``LGBMModel``.
        fold: Callable ``fold(train_x, train_y, random_state=...)`` returning
            an iterable of ``(train_idx, valid_idx)`` index arrays.
        train_x: Training features (DataFrame; ``.values``/``.columns`` used).
        train_y: Training target (Series; ``.values`` used).
        test_x: Test features (DataFrame; ``.values`` used).
        metrics: Callable ``metrics(y_true, y_pred)`` returning a score.
        seeds: Seeds handed to ``fold``; defaults to ``[2020]``.
    """

    def __init__(self, name=None, params=None, fold=None, train_x=None, train_y=None, test_x=None, metrics=None, seeds=None):
        self.train_x = train_x
        self.train_y = train_y
        self.test_x = test_x
        self.name = name
        self.params = params
        self.metrics = metrics
        self.kfold = fold
        self.oof = None
        self.preds = None
        self.seeds = seeds if seeds is not None else [2020]
        # Fold models keyed by file name; filled by predict_cv.
        self.models = {}
        # Number of folds seen by predict_cv; inference falls back to FOLDS.
        self.n_folds = None

    def build_model(self):
        """Return a fresh, unfitted ``LGBMModel`` built from ``self.params``."""
        return LGBMModel(**self.params)

    def _model_name(self, seed, cv_num):
        """Return the file name of the model for one seed/fold pair."""
        return f"{self.name}_SEED{seed}_FOLD{cv_num}_model.pkl"

    @staticmethod
    def _load_model(path):
        """Unpickle a model from *path*, closing the file afterwards."""
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    @staticmethod
    def _save_model(model, path):
        """Pickle *model* to *path*, creating the parent directory if needed."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, 'wb') as fh:
            pickle.dump(model, fh)

    def predict_cv(self, pretrained=False, model_dir="./"):
        """Train or load one model per seed/fold and return averaged OOF predictions.

        Args:
            pretrained: When true, models are unpickled from ``model_dir``;
                otherwise they are trained and pickled into ``model_dir``.
            model_dir: Path prefix concatenated with the model file name.

        Returns:
            Out-of-fold predictions in the original row order, averaged over
            seeds. Also stored in ``self.oof``.
        """
        train_x = self.train_x.values
        train_y = self.train_y.values
        oof_seeds = []
        for seed in self.seeds:
            oof = []
            va_idxes = []
            fold_idx = list(self.kfold(self.train_x, self.train_y, random_state=seed))
            self.n_folds = len(fold_idx)

            for cv_num, (tr_idx, va_idx) in enumerate(fold_idx):
                tr_x, va_x = train_x[tr_idx], train_x[va_idx]
                tr_y, va_y = train_y[tr_idx], train_y[va_idx]
                va_idxes.append(va_idx)
                model_name = self._model_name(seed, cv_num)
                model_path = model_dir + model_name

                if not pretrained:
                    model = self.build_model()
                    # NOTE(review): early_stopping_rounds/verbose as fit()
                    # keywords are not accepted by recent LightGBM releases
                    # (callbacks are used instead) -- confirm pinned version.
                    model.fit(tr_x, tr_y,
                              eval_set=[[va_x, va_y]],
                              early_stopping_rounds=100,
                              verbose=False)
                    # Save where the pretrained branch will look for it; the
                    # file used to be written to the bare name, ignoring
                    # model_dir.
                    self._save_model(model, model_path)
                else:
                    model = self._load_model(model_path)

                self.models[model_name] = model

                pred = model.predict(va_x)
                oof.append(pred)

                score = self.get_score(va_y, pred)
                print(f"SEED:{seed}, FOLD:{cv_num} =====> val_score:{score}")

            # Put the concatenated fold predictions back into row order.
            order = np.argsort(np.concatenate(va_idxes))
            oof_seeds.append(np.concatenate(oof)[order])

        oof = np.mean(oof_seeds, axis=0)
        self.oof = oof
        print(f"model:{self.name} score:{self.get_score(self.train_y, oof)}\n")
        return oof

    def inference(self, pretrained=False, model_dir="./"):
        """Predict ``self.test_x`` with every seed/fold model and average.

        Args:
            pretrained: When true, models are unpickled from ``model_dir``;
                otherwise the models cached by ``predict_cv`` are used.
            model_dir: Path prefix concatenated with the model file name.

        Returns:
            Test predictions averaged over folds, then over seeds. Also
            stored in ``self.preds``.
        """
        test_x = self.test_x.values
        # Use the fold count predict_cv actually saw; fall back to the
        # notebook-level FOLDS when predict_cv has not been run.
        n_folds = self.n_folds if self.n_folds is not None else FOLDS
        preds_seeds = []
        for seed in self.seeds:
            preds = []
            for cv_num in range(n_folds):
                print(f"-INFERENCE- SEED:{seed}, FOLD:{cv_num}")

                model_name = self._model_name(seed, cv_num)
                if not pretrained:
                    model = self.models[model_name]
                else:
                    model = self._load_model(model_dir + model_name)

                preds.append(model.predict(test_x))
            preds_seeds.append(np.mean(preds, axis=0))
        preds = np.mean(preds_seeds, axis=0)
        self.preds = preds
        return preds

    def tree_importance(self):
        """Refit one model per fold and plot the top-50 feature importances.

        The splitter is called without ``random_state`` (its default is used).

        Returns:
            ``(fig, feature_importance_df)`` where the frame has one row per
            feature and fold.
        """
        feature_importance_df = pd.DataFrame()
        for i, (tr_idx, va_idx) in enumerate(self.kfold(self.train_x, self.train_y)):
            tr_x, va_x = self.train_x.values[tr_idx], self.train_x.values[va_idx]
            tr_y, va_y = self.train_y.values[tr_idx], self.train_y.values[va_idx]
            model = self.build_model()
            model.fit(tr_x, tr_y,
                      eval_set=[[va_x, va_y]],
                      early_stopping_rounds=100,
                      verbose=False)
            _df = pd.DataFrame()
            _df['feature_importance'] = model.feature_importances_
            _df['column'] = self.train_x.columns
            _df['fold'] = i + 1
            feature_importance_df = pd.concat([feature_importance_df, _df], axis=0, ignore_index=True)
        # Rank features by importance summed over folds and keep the top 50.
        order = feature_importance_df.groupby('column') \
                    .sum()[['feature_importance']] \
                    .sort_values('feature_importance', ascending=False).index[:50]
        fig, ax = plt.subplots(figsize=(12, max(4, len(order) * .2)))
        sns.boxenplot(data=feature_importance_df, y='column', x='feature_importance', order=order, ax=ax,
                      palette='viridis')
        fig.tight_layout()
        ax.grid()
        ax.set_title('feature importance')
        fig.tight_layout()
        plt.show()
        return fig, feature_importance_df

    def get_score(self, y_true, y_pred):
        """Return ``self.metrics(y_true, y_pred)``."""
        return self.metrics(y_true, y_pred)


In [None]:
# LightGBM variant 1: top-100 ranked columns of the first feature set.
model_params = {
    "n_estimators": 10000,
    "objective": 'binary',
    "learning_rate": 0.01,
    "num_leaves": 31,
    "random_state": 2021,
    "n_jobs": -1,
    "importance_type": "gain",
    'colsample_bytree': .5,
    "reg_lambda": 5,
}

model = MyLGBMModel(name=NAME,
                    params=model_params,
                    fold=make_skf,
                    train_x=train_x,
                    train_y=train_y,
                    test_x=test_x,
                    metrics=optimized_f1,
                    seeds=[0, 1, 2]
                   )

selected_num = 100

# Close the column-list file once it is read (it used to be left open).
with open(cols_path, "rb") as _fh:
    cols = pickle.load(_fh)
selected_cols = cols[:selected_num]

model.train_x = model.train_x[selected_cols]
model.test_x = model.test_x[selected_cols]

# predict_cv loads the pickled fold models and caches them on the instance;
# inference() then reuses that cache instead of reading the files again.
oof_lgb = model.predict_cv(pretrained=True, model_dir=MODEL_DIR)
preds_lgb = model.inference()

In [None]:
# LightGBM variant 2: top-300 ranked columns of the second feature set.
model_params = {
    "n_estimators": 10000,
    "objective": 'binary',
    "learning_rate": 0.01,
    "num_leaves": 31,
    "random_state": 2021,
    "n_jobs": -1,
    "importance_type": "gain",
    'colsample_bytree': .5,
    "reg_lambda": 5,
}

model = MyLGBMModel(name=NAME,
                    params=model_params,
                    fold=make_skf,
                    train_x=train_x2,
                    train_y=train_y,
                    test_x=test_x2,
                    metrics=optimized_f1,
                    seeds=[0, 1, 2]
                   )


selected_num = 300

# Close the column-list file once it is read (it used to be left open).
with open(cols_path2, "rb") as _fh:
    cols = pickle.load(_fh)
selected_cols = cols[:selected_num]

model.train_x = model.train_x[selected_cols]
model.test_x = model.test_x[selected_cols]

# predict_cv loads the pickled fold models and caches them on the instance;
# inference() then reuses that cache instead of reading the files again.
oof_lgb2 = model.predict_cv(pretrained=True, model_dir=MODEL_DIR2)
preds_lgb2 = model.inference()

## stacking

In [None]:
def add_mean(df, model_cols):
    """Return a copy of ``df`` extended with row-wise mean features.

    Added columns, in order:
      * ``mean``: mean over all original columns;
      * ``{model}_mean``: mean over the original columns whose name starts
        with ``"{model}_"``, for each entry of ``model_cols``;
      * ``fold{i}_mean`` for i in 0..2: mean over the original columns whose
        name ends with the digit ``i`` (in this notebook the prediction
        columns are named ``{model}_{seed}``, so these group by seed).

    Args:
        df: Frame of per-model prediction columns; it is not modified.
        model_cols: Model name prefixes to aggregate.

    Returns:
        A new DataFrame containing the original and the added columns.
    """
    source_cols = list(df.columns.values)
    enriched = df.copy()
    enriched["mean"] = df.mean(axis=1)

    for model in model_cols:
        members = [name for name in source_cols if name.startswith(f"{model}_")]
        enriched[f"{model}_mean"] = df[members].mean(axis=1)

    for i in (0, 1, 2):
        members = [name for name in source_cols if name.endswith(f"{i}")]
        enriched[f"fold{i}_mean"] = df[members].mean(axis=1)

    return enriched

In [None]:
class StackingModel:
    """Produce level-1 stacking features from a fixed zoo of classifiers.

    For every model key and seed, one estimator per fold is either unpickled
    (when the key appears in ``model_path_dict``) or fitted on the training
    part of the fold. ``predict_cv`` returns out-of-fold predictions and
    ``inference`` returns fold-averaged test predictions, both as one
    ``"{key}_{seed}"`` column per model/seed.

    Args:
        name: Prefix used in the per-fold model file names.
        params_dict: Mapping of model key to constructor keyword arguments.
        fold: Callable ``fold(train_x, train_y, random_state=...)`` returning
            an iterable of ``(train_idx, valid_idx)`` index arrays.
        train_x: Training features for the non-MLP/LR models (DataFrame).
        train_y: Training target (Series).
        test_x: Test features for the non-MLP/LR models (DataFrame).
        train_x_mlp: Training features used by the "mlp" and "lr" keys.
        test_x_mlp: Test features used by the "mlp" and "lr" keys.
        metrics: Callable ``metrics(y_true, y_pred)`` returning a score.
        seeds: Seeds handed to ``fold``; defaults to ``[2020]``.
        model_path_dict: Mapping of model key to the path prefix holding its
            pickled fold models; keys that are absent are fitted instead.
        cols_dict: Mapping of model key to its ordered feature-column list.
    """

    # Number of leading entries of a ranked column list that are used.
    SELECTED_NUM = 100

    def __init__(self,
                 name=None,
                 params_dict=None,
                 fold=None,
                 train_x=None,
                 train_y=None,
                 test_x=None,
                 train_x_mlp=None,
                 test_x_mlp=None,
                 metrics=None,
                 seeds=None,
                 model_path_dict=None,
                 cols_dict=None):
        self.train_x = train_x
        self.train_y = train_y
        self.test_x = test_x
        self.train_x_mlp = train_x_mlp
        self.test_x_mlp = test_x_mlp
        self.name = name
        self.params_dict = params_dict
        self.metrics = metrics
        self.kfold = fold
        self.oof = None
        self.preds = None
        self.seeds = seeds if seeds is not None else [2020]
        self.model_path_dict = model_path_dict
        # Fold models keyed by "{key}{file name}"; filled by predict_cv.
        self.models_inference = {}
        self.cols_dict = cols_dict
        # Number of folds seen by predict_cv; inference falls back to FOLDS.
        self.n_folds = None

    def build_models(self):
        """Return a dict of fresh, unfitted estimators keyed by model name."""
        models = {}
        models["lgb"] = LGBMModel(**self.params_dict["lgb"])
        models["xgb"] = XGBClassifier(**self.params_dict["xgb"])
        models["catboost"] = CatBoostClassifier(**self.params_dict["catboost"])
        models["rf"] = RandomForestClassifier(**self.params_dict["rf"])
        models["et"] = ExtraTreesClassifier(**self.params_dict["et"])
        models["bagging"] = BaggingClassifier(**self.params_dict["bagging"])
        models["gb"] = GradientBoostingClassifier(**self.params_dict["gb"])
        models["ada"] = AdaBoostClassifier(**self.params_dict["ada"])
        models["lr"] = LogisticRegression(**self.params_dict["lr"])
        models["mlp"] = MLPClassifier(**self.params_dict["mlp"])
        return models

    def _model_name(self, seed, cv_num):
        """Return the file name of the model for one seed/fold pair."""
        return f"{self.name}_SEED{seed}_FOLD{cv_num}_model.pkl"

    def _select_features(self, key, frame, frame_mlp):
        """Return the feature matrix model ``key`` expects as a NumPy array."""
        if key in ("mlp", "lr"):
            return frame_mlp.values
        if key == "gb":
            # Gradient boosting uses every column of the frame.
            return frame.values
        selected_cols = self.cols_dict[key][:self.SELECTED_NUM]
        return frame[selected_cols].values

    @staticmethod
    def _positive_score(key, model, x):
        """Return the positive-class score of ``model`` for the rows of ``x``."""
        if key == "lgb":
            # The "lgb" model is an LGBMModel, queried through predict().
            return model.predict(x)
        return model.predict_proba(x)[:, 1]

    def predict_cv(self):
        """Collect out-of-fold predictions for every model and seed.

        Returns:
            DataFrame with one ``"{key}_{seed}"`` column of out-of-fold
            predictions in the original row order. ``self.oof`` holds the
            seed-averaged predictions of the last model processed.
        """
        import copy

        df_ = pd.DataFrame()
        pretrained_dirs = self.model_path_dict or {}
        train_y = self.train_y.values
        for key, template in self.build_models().items():
            print(key)
            train_x = self._select_features(key, self.train_x, self.train_x_mlp)
            oof_seeds = []
            for seed in self.seeds:
                oof = []
                va_idxes = []
                fold_idx = list(self.kfold(self.train_x, self.train_y, random_state=seed))
                self.n_folds = len(fold_idx)

                for cv_num, (tr_idx, va_idx) in enumerate(fold_idx):
                    tr_x, va_x = train_x[tr_idx], train_x[va_idx]
                    tr_y, va_y = train_y[tr_idx], train_y[va_idx]
                    va_idxes.append(va_idx)

                    model_name = self._model_name(seed, cv_num)

                    if key in pretrained_dirs:
                        # NOTE: pickle can execute arbitrary code; only load
                        # trusted files.
                        with open(pretrained_dirs[key] + model_name, 'rb') as fh:
                            model = pickle.load(fh)
                    else:
                        # Fit a fresh copy per fold. Re-fitting the single
                        # template made every cached entry point at the same,
                        # last-fitted estimator.
                        model = copy.deepcopy(template)
                        model.fit(tr_x, tr_y)
                    self.models_inference[key + model_name] = model

                    pred = self._positive_score(key, model, va_x)
                    oof.append(pred)

                    score = self.get_score(va_y, pred)
                    print(f"SEED:{seed}, FOLD:{cv_num} =====> val_score:{score}")

                # Put the concatenated fold predictions back into row order.
                order = np.argsort(np.concatenate(va_idxes))
                oof = np.concatenate(oof)[order]
                oof_seeds.append(oof)
                df_[f'{key}_{seed}'] = oof

            oof = np.mean(oof_seeds, axis=0)
            self.oof = oof
            print(f"model:{key} score:{self.get_score(self.train_y, oof)}\n")

        return df_

    def inference(self):
        """Predict the test set with the models cached by ``predict_cv``.

        Returns:
            DataFrame with one ``"{key}_{seed}"`` column of fold-averaged test
            predictions. ``self.preds`` holds the seed-averaged predictions of
            the last model processed.

        Raises:
            KeyError: If a required fold model has not been cached.
        """
        df_ = pd.DataFrame()
        # Use the fold count predict_cv actually saw; fall back to the
        # notebook-level FOLDS otherwise.
        n_folds = self.n_folds if self.n_folds is not None else FOLDS
        # Only the keys are needed here; the fitted models come from the cache.
        for key in self.build_models():
            print(key)
            test_x = self._select_features(key, self.test_x, self.test_x_mlp)
            preds_seeds = []
            for seed in self.seeds:
                preds = []
                for cv_num in range(n_folds):
                    print(f"-INFERENCE- SEED:{seed}, FOLD:{cv_num}")
                    model = self.models_inference[key + self._model_name(seed, cv_num)]
                    preds.append(self._positive_score(key, model, test_x))
                preds = np.mean(preds, axis=0)
                preds_seeds.append(preds)
                df_[f'{key}_{seed}'] = preds
            self.preds = np.mean(preds_seeds, axis=0)
        return df_

    def get_score(self, y_true, y_pred):
        """Return ``self.metrics(y_true, y_pred)``."""
        return self.metrics(y_true, y_pred)

In [None]:
# Constructor keyword arguments for each level-1 model built by
# StackingModel.build_models. Every estimator is seeded with random_state 2021.
# NOTE(review): several keys below ('base_estimator', 'loss': 'deviance',
# 'criterion': 'mse', 'min_impurity_split') look tied to an older scikit-learn
# API -- confirm they are accepted by the pinned version before upgrading.
params_lgb = {
    "n_estimators": 10000,
    "objective": 'binary',
    "learning_rate": 0.01,
    "num_leaves": 31,
    "random_state": 2021,
    "n_jobs": -1,
    "importance_type": "gain",
    'colsample_bytree': .5,
    "reg_lambda": 5,
}
params_xgb = {
    'n_estimators': 500,
    'random_state': 2021,
    'objective': 'binary:logistic',
    'n_jobs': -1,
    'importance_type': 'gain',    
}
params_cb = {
    "verbose": False,
    "random_state": 2021
}
params_rf = {
    'random_state': 2021, 
}
params_et = {
    'random_state': 2021, 
}    
params_bagging = {
    'base_estimator': None,
    'n_estimators': 100,
    'max_samples': 1.0,
    'max_features': 1.0,
    'bootstrap': True,
    'bootstrap_features': False,
    'oob_score': False,
    'warm_start': False,
    'n_jobs': -1,
    'random_state': 2021,
    'verbose': 0
}
params_gb = {
    'loss': 'deviance',
    'learning_rate': 0.1,
    'n_estimators': 100,
    'subsample': 1.0,
    'criterion': 'mse',
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_depth': 3,
    'min_impurity_decrease': 0.0,
    'min_impurity_split': None,
    'init': None,
    'random_state': 2021,
    'max_features': None,
    'verbose': False,
    'max_leaf_nodes': None,
    'warm_start': False,
    'validation_fraction': 0.1,
    'n_iter_no_change': None,
    'tol': 1e-4,
    'ccp_alpha': 0.0
}
params_ada = {
    'base_estimator': None,
    'n_estimators': 50,
    'learning_rate': 1,
    'algorithm': 'SAMME.R',
    'random_state': 2021
}
params_lr = {
    'penalty': 'l2',
    'dual': False,
    'tol': 1e-4,
    'C': 1.0,
    'fit_intercept': True,
    'intercept_scaling': 1,
    'class_weight': None,
    'random_state': 2021,
    'solver': 'lbfgs',
    'max_iter': 100,
    'multi_class': 'auto',
    'verbose': 0,
    'warm_start': False,
    'n_jobs': -1, 
    'l1_ratio': None
}
params_mlp = {
    'hidden_layer_sizes': (300,),
    'activation': 'relu',
    'solver': 'adam', 
    'alpha': 0.0001,
    'batch_size': 'auto',
    'learning_rate': 'constant',
    'learning_rate_init': 0.001,
    'power_t': 0.5,
    'max_iter': 500,
    'shuffle': True,
    'random_state': 2021,
    'tol': 1e-4,
    'verbose': False,
    'warm_start': False,
    'momentum': 0.9,
    'nesterovs_momentum': True,
    'early_stopping': True,
    'validation_fraction': 0.1,
    'beta_1': 0.9,
    'beta_2': 0.999,
    'epsilon': 1e-8,
    'n_iter_no_change': 10,
    'max_fun': 15000
}

# Keys must match the model names used inside StackingModel.build_models.
params_dict = {
    "lgb": params_lgb,
    "xgb": params_xgb,
    "catboost": params_cb,
    "rf": params_rf,
    "et": params_et,
    "bagging": params_bagging,
    "gb": params_gb,
    "ada": params_ada,
    "lr": params_lr,
    "mlp": params_mlp,
}

# Path prefix of the pickled fold models for each level-1 model. Every key is
# listed, so StackingModel loads all models instead of fitting them.
model_path_dict = {
               "lgb": MODEL_DIR,
               "xgb": MODEL_DIR_XGB,
               "catboost": MODEL_DIR_CB,
               "rf": MODEL_DIR_RF,
               "et": MODEL_DIR_ET,
               "bagging": MODEL_DIR_BAGGING,
               "gb": MODEL_DIR_GB,
               "ada": MODEL_DIR_ADA,
               "lr": MODEL_DIR_LR,
               "mlp": MODEL_DIR_MLP,
}

# Ranked column lists; each file is closed once read (they used to be left
# open).
with open(cols_path, "rb") as _fh:
    cols = pickle.load(_fh)
with open(cols_path_xgb, "rb") as _fh:
    cols_xgb = pickle.load(_fh)
with open(cols_path_cb, "rb") as _fh:
    cols_cb = pickle.load(_fh)

# StackingModel routes "mlp" and "lr" to the MLP feature frames and gives
# "gb" every column, so the "lr" entry here is not read and "gb"/"mlp" need
# no entry.
cols_dict = {
    "lgb": cols,
    "xgb": cols_xgb,
    "catboost": cols_cb,
    "rf": cols,
    "et": cols,
    "lr": cols,
    "bagging": cols,
    "ada": cols
}

model = StackingModel(name=NAME,
                    params_dict=params_dict,
                    fold=make_skf,
                    train_x=train_x_rf,
                    train_y=train_y,
                    test_x=test_x_rf,
                    train_x_mlp=train_x_mlp,
                    test_x_mlp=test_x_mlp,
                    metrics=optimized_f1,
                    seeds=[0, 1, 2],
                    model_path_dict=model_path_dict,
                    cols_dict=cols_dict
                   )

# Level-1 features: out-of-fold predictions for train, fold averages for test.
train_preds_df = model.predict_cv()
test_preds_df = model.inference()

In [None]:
# Model prefixes in first-seen column order. dict.fromkeys de-duplicates while
# keeping order; the previous list(set(...)) made the order of the added
# "{model}_mean" columns depend on string hashing, i.e. vary between runs.
model_names = list(dict.fromkeys(col.split("_")[0] for col in train_preds_df.columns))
train_preds_df = add_mean(train_preds_df, model_names)
test_preds_df = add_mean(test_preds_df, model_names)

In [None]:
# Level-2 LightGBM trained on the stacked level-1 predictions. Unlike the
# level-1 settings, every column is sampled and L2 regularization is off.
params_lgb = {
    "n_estimators": 10000,
    "objective": 'binary',
    "learning_rate": 0.01,
    "num_leaves": 31,
    "random_state": 2021,
    "n_jobs": -1,
    "importance_type": "gain",
    'colsample_bytree': 1.0,
    "reg_lambda": 0.0,
}
model = MyLGBMModel(
    name=NAME,
    params=params_lgb,
    fold=make_skf,
    train_x=train_preds_df,
    train_y=train_y,
    test_x=test_preds_df,
    metrics=optimized_f1,
    seeds=[0, 1, 2],
)

# Default arguments: the fold models are trained here, not loaded from disk.
oof_l2 = model.predict_cv()
preds_l2 = model.inference()

## blending

In [None]:
# Equal-weight blend of the two LightGBM variants and the level-2 stacker.
oof_lst = [oof_lgb, oof_lgb2, oof_l2]
preds_lst = [preds_lgb, preds_lgb2, preds_l2]

oof_avg = np.mean(oof_lst, axis=0)
preds_avg = np.mean(preds_lst, axis=0)

# The threshold is tuned on out-of-fold predictions, then applied to test.
best_threshold = threshold_optimization(y_true=train_y, y_pred=oof_avg, metrics=f1_score)
print(f"best_threshold is {best_threshold}\n")

labels_avg = preds_avg >= best_threshold

In [None]:
# Weighted blend: 0.4 for each LightGBM variant, 0.2 for the level-2 stacker.
weights = [0.4, 0.4, 0.2]
w_lgb, w_lgb2, w_l2 = weights
oof_avg2 = w_lgb * oof_lgb + w_lgb2 * oof_lgb2 + w_l2 * oof_l2
preds_avg2 = w_lgb * preds_lgb + w_lgb2 * preds_lgb2 + w_l2 * preds_l2

# The threshold is tuned on out-of-fold predictions, then applied to test.
best_threshold = threshold_optimization(y_true=train_y, y_pred=oof_avg2, metrics=f1_score)
print(f"best_threshold is {best_threshold}\n")

labels_avg2 = preds_avg2 >= best_threshold

In [None]:
# Final blend: mean of the equal-weight and the weighted blends. This
# overwrites oof_avg, preds_avg, best_threshold and labels_avg from above.
oof_lst = [oof_avg, oof_avg2]
preds_lst = [preds_avg, preds_avg2]

oof_avg = np.mean(oof_lst, axis=0)
preds_avg = np.mean(preds_lst, axis=0)

best_threshold = threshold_optimization(y_true=train_y, y_pred=oof_avg, metrics=f1_score)
print(f"best_threshold is {best_threshold}\n")

labels_avg = preds_avg >= best_threshold

## submission

In [None]:
# Load the submission template; it is read without a header row, so its
# columns are addressed by position (0, 1, ...).
sub = pd.read_csv(DATA_PATH + 'sample_submit.csv', header=None)
sub.head()

In [None]:
# Column 1 receives the boolean labels of the final blend; astype(int) then
# casts the whole frame to integers (True/False become 1/0).
sub[1] = labels_avg
sub = sub.astype(int)
sub.head()

In [None]:
# Write the submission without index or header, matching how the template was read.
sub.to_csv('submission.csv', index=False, header=False)