## Libraries

In [1]:
import sys

import numpy as np
import pandas as pd

import lightgbm as lgb

from pathlib import Path

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Add several relative directories to the import search path, in this order.
# Presumably one of them lets the `script` package imported further down be
# resolved -- which one depends on the kernel's working directory (confirm).
sys.path.extend(["../../..", "../..", "../", "./", "../script/"])

In [3]:
from script.trainer.adversarial_trainer import AdversarialTrainer
from script.utils.logging import get_logger

## Data Loading

In [4]:
# Directory that holds the pre-computed feature tables.
features_dir = Path("../../features/katayama/")

# Read the train and test feature tables (train first, then test).
train_df, test_df = (
    pd.read_csv(features_dir / csv_name)
    for csv_name in (
        "train_features_denoised_50000.csv",
        "test_features_denoised.csv",
    )
)

# Preview the first rows of the training features.
train_df.head()

Unnamed: 0.1,Unnamed: 0,Hann_window_mean_150,Hann_window_mean_1500,Hann_window_mean_15000,Hann_window_mean_50,Hilbert_mean,abs_max,abs_max_roll_mean_10,abs_max_roll_mean_100,abs_max_roll_mean_1000,...,std_roll_std_10000,std_roll_std_50,std_roll_std_500,sum,time_rev_asym_stat_1,time_rev_asym_stat_10,time_rev_asym_stat_100,time_rev_asym_stat_5,time_rev_asym_stat_50,trend
0,0,-0.001664,-0.001636,-0.001576,-0.001692,1.571311,103.719287,65.848239,5.124389,0.514386,...,1.51415,3.613,3.381701,-246.545064,-0.09642,-24.507658,3.706442,-6.521118,-0.00914,-6.00564e-08
1,1,0.000369,0.000462,0.000463,0.000217,1.647351,178.994821,154.684428,9.753921,0.994938,...,2.82276,4.082913,3.869233,72.88652,0.181659,96.858716,103.994552,44.335567,55.526124,1.566934e-08
2,2,0.001567,0.00156,0.001476,0.001568,1.787125,180.156173,155.364527,9.979735,1.011858,...,3.603752,4.938187,4.748707,235.178154,-0.385474,81.294718,139.348472,67.572556,65.423807,5.049579e-08
3,3,-0.000538,-0.000535,-0.000507,-0.000538,2.056154,179.071607,154.927313,10.235463,1.02566,...,3.455554,5.382792,5.158599,-80.714161,0.2582,88.924023,143.980044,84.043279,68.351271,-2.005114e-08
4,4,-0.000121,-0.000165,-0.000194,-0.000208,1.92583,148.083397,77.805596,7.182646,0.548552,...,2.349992,4.901521,4.659875,-29.744275,-0.62364,-39.077055,24.278237,29.943744,4.467025,4.934947e-10


In [5]:
# Discard the first column of both tables (selected by position, not by name).
# NOTE(review): the same names are rebound, so every re-run of this cell
# removes one more column -- reload the data before running it again.
train_df, test_df = (frame.iloc[:, 1:] for frame in (train_df, test_df))

## Prepare target

In [6]:
# Binary labels marking where each row comes from:
# 0.0 for every training row, 1.0 for every test row.
y_train = np.zeros(train_df.shape[0])
y_test = np.ones(test_df.shape[0])

# Single label vector, training labels first and test labels after them.
y = np.concatenate((y_train, y_test))

## Configuration

In [None]:
logger = get_logger(log_dir="log/", name="Main", tag="adversarial-validation")

# Settings passed to the trainer through its `kwargs` argument.
# NOTE(review): the keys look like LightGBM hyper-parameters; how they are
# consumed is decided inside AdversarialTrainer -- confirm there.
trainer_kwargs = {
    "num_leaves": 255,
    "learning_rate": 0.01,
    "min_child_weight": 1e-3,
    "subsample": 0.8,
    "subsample_freq": 5,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.01,
    "reg_lambda": 0.01,
    "n_jobs": 4,
}

trainer = AdversarialTrainer(logger, kwargs=trainer_kwargs)

## Training

In [8]:
# Stack the training rows on top of the test rows (row-wise is pandas'
# default axis) and renumber the index from 0.
X = pd.concat((train_df, test_df), ignore_index=True)

# Fit on the combined table against the train/test labels in `y`.
trainer.fit(X, y, n_epochs=5000)

2019-05-16 00:12:27,583 Main INFO Fold 1
2019-05-16 00:12:50,056 Main INFO [50]	valid_0's binary_logloss: 0.524788
2019-05-16 00:13:09,589 Main INFO [100]	valid_0's binary_logloss: 0.51528
2019-05-16 00:13:29,865 Main INFO [150]	valid_0's binary_logloss: 0.510021
2019-05-16 00:13:49,615 Main INFO [200]	valid_0's binary_logloss: 0.510047
2019-05-16 00:14:09,735 Main INFO [250]	valid_0's binary_logloss: 0.511792
2019-05-16 00:14:19,916 Main INFO Fold 2
2019-05-16 00:14:41,832 Main INFO [50]	valid_0's binary_logloss: 0.526533
2019-05-16 00:15:01,086 Main INFO [100]	valid_0's binary_logloss: 0.514714
2019-05-16 00:15:21,357 Main INFO [150]	valid_0's binary_logloss: 0.509
2019-05-16 00:15:40,937 Main INFO [200]	valid_0's binary_logloss: 0.507879
2019-05-16 00:16:00,680 Main INFO [250]	valid_0's binary_logloss: 0.510464
2019-05-16 00:16:13,662 Main INFO Fold 3
2019-05-16 00:16:34,318 Main INFO [50]	valid_0's binary_logloss: 0.538045
2019-05-16 00:16:52,922 Main INFO [100]	valid_0's binary_lo

## Feature importances

In [11]:
# `feature_importances_` of the first five entries of `trainer.trees`,
# keyed by the column each one fills ("values0" .. "values4").
per_tree_values = {
    f"values{tree_idx}": trainer.trees[tree_idx].feature_importances_
    for tree_idx in range(5)
}

# One row per column of X: the feature name followed by its five scores.
importances = pd.DataFrame({"names": X.columns, **per_tree_values})

importances.head()

Unnamed: 0,names,values0,values1,values2,values3,values4
0,Hann_window_mean_150,12,6,2,7,14
1,Hann_window_mean_1500,2,1,4,0,2
2,Hann_window_mean_15000,1,2,5,4,3
3,Hann_window_mean_50,3,1,2,1,1
4,Hilbert_mean,2,0,0,1,2


In [15]:
# Mean of the five per-tree score columns, stored in a new "values" column.
value_columns = [f"values{tree_idx}" for tree_idx in range(5)]
importances["values"] = sum(importances[col] for col in value_columns) / 5

# Keep only name and mean score, highest mean first.
importances_df = (
    importances
    .loc[:, ["names", "values"]]
    .sort_values("values", ascending=False)
)

In [16]:
# Write the ranked feature table to CSV; the row index is left out of the file.
importances_df.to_csv(
    path_or_buf="adversarial_validation_results.csv",
    index=False,
)