In [None]:
!pip install -q category_encoders xfeat texthero

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from scipy.optimize import minimize
import pickle

from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

import re
import os
import torch
import string
import random
import warnings

import category_encoders as ce
import xfeat
import texthero as hero
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.preprocessing import QuantileTransformer

warnings.filterwarnings('ignore')

In [None]:
def seed_all(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED``, and asks cuDNN
    for deterministic behavior.

    Parameters
    ----------
    seed : int
        Seed value applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Hash randomization is fixed at interpreter start-up, so this only takes
    # effect in Python processes launched after this point.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # The original spelled this attribute "daterministic", which silently set
    # an unused attribute and left cuDNN non-deterministic.
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False


SEED = 42
seed_all(SEED)

In [None]:
# Directory holding the competition CSVs. The trailing slash keeps plain string
# concatenation (DATA_PATH + 'file.csv') valid; the original value "./data"
# produced "./datatrain.csv".
DATA_PATH = "./data/"
# os.path.join gives the right path whether or not DATA_PATH ends with a separator.
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))

In [None]:
# --- Experiment configuration -------------------------------------------------
# Number of cross-validation folds and the prefix used in saved model file names.
FOLDS = 5
NAME = "baseline001"

# Row count of the training CSV loaded above.
TRAIN_LEN = len(train)

# Root of the precomputed artifacts; models and feature pickles live in sub-folders.
BASE_DIR = "./pretrained_models/selected_num_100_add_remove_urls_2/"
DATA_DIR = f"{BASE_DIR}data/"
MODEL_DIR = f"{BASE_DIR}models/"

In [None]:
# Load the precomputed feature matrices. Context managers close the files
# deterministically (the original left both handles open).
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open(DATA_DIR + "train_x.pkl", 'rb') as fh:
    train_x = pickle.load(fh)
with open(DATA_DIR + "test_x.pkl", 'rb') as fh:
    test_x = pickle.load(fh)

# Target column of the training CSV.
train_y = train["state"]

# Features and target must describe the same rows, otherwise fold indexing breaks later.
assert len(train_x) == len(train_y), "train_x and train_y have different row counts"

In [None]:
# Replace every missing feature value with 0 in both the train and test matrices.
train_x, test_x = (frame.fillna(0) for frame in (train_x, test_x))

In [None]:
def make_skf(train_x, train_y, random_state=2021):
    """Build shuffled stratified K-fold splits.

    Parameters
    ----------
    train_x, train_y
        Features and target passed to ``StratifiedKFold.split``.
    random_state : int
        Seed for the shuffle.

    Returns
    -------
    list
        ``FOLDS`` pairs of (train_indices, validation_indices).
    """
    splitter = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=random_state)
    return list(splitter.split(train_x, train_y))


def threshold_optimization(y_true, y_pred, metrics=None):
    """Search for the decision threshold that maximizes ``metrics``.

    Runs a Nelder-Mead search starting at 0.5 over the negated metric of
    ``y_true`` against ``y_pred >= threshold``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Scores to be thresholded.
    metrics : callable
        ``metrics(y_true, y_pred_binary)`` -> value to maximize.

    Returns
    -------
    float
        The threshold found by the optimizer.

    Raises
    ------
    NotImplementedError
        If ``metrics`` is None.
    """
    def negated_metric(threshold):
        # Guard clause: there is no built-in fallback metric.
        if metrics is None:
            raise NotImplementedError
        # minimize() minimizes, so flip the sign to maximize the metric.
        return -metrics(y_true, y_pred >= threshold)

    start = np.array([0.5])
    search = minimize(negated_metric, x0=start, method='Nelder-Mead')
    return search['x'].item()


def optimized_f1(y_true, y_pred):
    """Return the F1 score at the threshold tuned on these same labels and scores.

    The threshold comes from ``threshold_optimization`` with ``f1_score`` as
    the metric, and is then applied to ``y_pred`` to compute the score.
    """
    cutoff = threshold_optimization(y_true, y_pred, metrics=f1_score)
    return f1_score(y_true, y_pred >= cutoff)

In [None]:
class MyGradientBoostingModel:
    """Seed-averaged K-fold wrapper around ``GradientBoostingClassifier``.

    For every seed in ``seeds`` the ``fold`` callable produces
    (train_idx, valid_idx) pairs; one model is fitted (or loaded from disk)
    per pair. Out-of-fold and test predictions are averaged over seeds.

    Note that the seed is only handed to ``fold``; the models themselves are
    built from ``params`` unchanged, including any ``random_state`` in it.

    Parameters
    ----------
    name : str
        Prefix used in the pickle file name of every fold model.
    params : dict
        Keyword arguments forwarded to ``GradientBoostingClassifier``.
    fold : callable
        Called as ``fold(train_x, train_y, random_state=seed)``; must return
        an iterable of (train_idx, valid_idx) pairs.
    train_x, train_y, test_x
        Training features, training target and test features. ``.values`` is
        read from each of them.
    metrics : callable
        ``metrics(y_true, y_pred)`` -> score, used for fold and overall scores.
    seeds : list, optional
        Seeds handed to ``fold``; defaults to ``[2020]``.
    """

    def __init__(self, name=None, params=None, fold=None, train_x=None, train_y=None, test_x=None, metrics=None, seeds=None):
        self.train_x = train_x
        self.train_y = train_y
        self.test_x = test_x
        self.name = name
        self.params = params
        self.metrics = metrics
        self.kfold = fold
        self.oof = None
        self.preds = None
        self.seeds = seeds if seeds is not None else [2020]
        # Fitted/loaded models keyed by their pickle file name.
        self.models = {}
        # Number of folds seen by the last predict_cv() call (None until then).
        self.n_folds = None

    def build_model(self):
        """Return a fresh, unfitted classifier configured with ``self.params``."""
        return GradientBoostingClassifier(**(self.params or {}))

    def _model_name(self, seed, cv_num):
        """File name under which the model of one (seed, fold) pair is stored."""
        return f"{self.name}_SEED{seed}_FOLD{cv_num}_model.pkl"

    @staticmethod
    def _save_model(model, path):
        """Pickle ``model`` to ``path``, closing the file afterwards."""
        with open(path, 'wb') as fh:
            pickle.dump(model, fh)

    @staticmethod
    def _load_model(path):
        """Unpickle a model from ``path``.

        pickle can execute arbitrary code on load; only use files you trust.
        """
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    def predict_cv(self, pretrained=False, model_dir=MODEL_DIR):
        """Fit (or load) one model per seed and fold and return OOF predictions.

        Parameters
        ----------
        pretrained : bool
            If true, load each fold model from ``model_dir`` instead of fitting.
            Otherwise fit it and pickle it into ``model_dir``.
        model_dir : str
            Directory the fold models are written to / read from.

        Returns
        -------
        numpy.ndarray
            Out-of-fold values of ``predict_proba(...)[:, 1]`` in training-row
            order, averaged over seeds. Also stored in ``self.oof``.
        """
        train_x = self.train_x.values
        train_y = self.train_y.values

        if not pretrained and model_dir:
            os.makedirs(model_dir, exist_ok=True)

        oof_seeds = []
        for seed in self.seeds:
            oof = []
            va_idxes = []
            # Materialize so the fold count can be remembered for inference().
            fold_idx = list(self.kfold(self.train_x, self.train_y, random_state=seed))
            self.n_folds = len(fold_idx)

            for cv_num, (tr_idx, va_idx) in enumerate(fold_idx):
                tr_x, va_x = train_x[tr_idx], train_x[va_idx]
                tr_y, va_y = train_y[tr_idx], train_y[va_idx]
                va_idxes.append(va_idx)

                model_name = self._model_name(seed, cv_num)
                model_path = os.path.join(model_dir, model_name)

                if pretrained:
                    model = self._load_model(model_path)
                else:
                    model = self.build_model()
                    model.fit(tr_x, tr_y)
                    # Save under model_dir — the same location pretrained=True
                    # reads from. The original wrote to the working directory.
                    self._save_model(model, model_path)

                self.models[model_name] = model

                pred = model.predict_proba(va_x)[:, 1]
                oof.append(pred)

                score = self.get_score(va_y, pred)
                print(f"SEED:{seed}, FOLD:{cv_num} =====> val_score:{score}")

            # Put the concatenated fold predictions back into training-row order.
            va_idxes = np.concatenate(va_idxes)
            oof = np.concatenate(oof)
            order = np.argsort(va_idxes)
            oof_seeds.append(oof[order])

        oof = np.mean(oof_seeds, axis=0)
        self.oof = oof
        print(f"model:{self.name} score:{self.get_score(self.train_y, oof)}\n")
        return oof

    def inference(self, pretrained=False, model_dir=MODEL_DIR):
        """Predict on ``test_x`` with every fold model and average the results.

        Parameters
        ----------
        pretrained : bool
            If true, load the fold models from ``model_dir``; otherwise use the
            models kept in ``self.models`` by ``predict_cv``.
        model_dir : str
            Directory the fold models are read from when ``pretrained`` is true.

        Returns
        -------
        numpy.ndarray
            ``predict_proba(...)[:, 1]`` averaged over folds, then over seeds.
            Also stored in ``self.preds``.
        """
        test_x = self.test_x.values
        # Use the fold count seen during predict_cv(); fall back to the global
        # FOLDS when only pretrained models are used.
        n_folds = self.n_folds if self.n_folds is not None else FOLDS

        preds_seeds = []
        for seed in self.seeds:
            preds = []
            for cv_num in range(n_folds):
                print(f"-INFERENCE- SEED:{seed}, FOLD:{cv_num}")
                model_name = self._model_name(seed, cv_num)
                if pretrained:
                    model = self._load_model(os.path.join(model_dir, model_name))
                else:
                    model = self.models[model_name]
                preds.append(model.predict_proba(test_x)[:, 1])
            preds_seeds.append(np.mean(preds, axis=0))

        preds = np.mean(preds_seeds, axis=0)
        self.preds = preds
        return preds

    def get_score(self, y_true, y_pred):
        """Evaluate ``self.metrics`` on the given labels and predictions."""
        return self.metrics(y_true, y_pred)


In [None]:
# Keyword arguments forwarded verbatim to GradientBoostingClassifier by
# MyGradientBoostingModel.build_model().
# NOTE(review): 'loss': 'deviance', 'criterion': 'mse' and 'min_impurity_split'
# appear to have been renamed/removed in later scikit-learn releases — confirm
# against the installed version before running.
params = {
    'loss': 'deviance',
    'learning_rate': 0.1,
    'n_estimators': 100,
    'subsample': 1.0,
    'criterion': 'mse',
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'min_weight_fraction_leaf': 0.0,
    'max_depth': 3,
    'min_impurity_decrease': 0.0,
    'min_impurity_split': None,
    'init': None,
    # The same random_state is used for every model; the `seeds` below are only
    # passed to the fold splitter.
    'random_state': 2021,
    'max_features': None,
    'verbose': False,
    'max_leaf_nodes': None,
    'warm_start': False,
    'validation_fraction': 0.1,
    'n_iter_no_change': None,
    'tol': 1e-4,
    'ccp_alpha': 0.0
}

# Cross-validated model: stratified folds from make_skf, scored with the
# threshold-optimized F1, repeated for three fold seeds.
model = MyGradientBoostingModel(name=NAME, 
                    params=params,
                    fold=make_skf,
                    train_x=train_x,
                    train_y=train_y,
                    test_x=test_x,
                    metrics=optimized_f1, 
                    seeds=[0, 1, 2]
                   )

# Train with default arguments (pretrained=False), then predict on the test set
# with the models kept in memory by predict_cv().
oof = model.predict_cv() 
preds_bagging = model.inference()

# Tune the decision threshold on the out-of-fold predictions of the training set.
best_threshold = threshold_optimization(y_true=train_y, y_pred=oof, metrics=f1_score) 
print(f"best_threshold is {best_threshold}\n")

# Boolean test labels: True where the averaged test score reaches the threshold.
labels = preds_bagging >= best_threshold