In [None]:
import gc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
import torch
from torch import nn
from torch.nn import functional as F
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.model_selection import StratifiedKFold


def MacroF1MetricClassification(preds, dtrain):
    labels = dtrain.get_label()
    preds = np.argmax(preds).astype(np.int16)
    score = f1_score(labels, preds, average='macro')
    return ('MacroF1Metric', score, True)


def MacroF1MetricRegression(preds, dtrain):
    labels = dtrain.get_label()
    preds = np.round(np.clip(preds, 0, 10)).astype(np.int16)
    score = f1_score(labels, preds, average='macro')
    return ('MacroF1Metric', score, True)


def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=True).sum() / 1024**2  # just added
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(
                        np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(
                        np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(
                        np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(
                        np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(
                        np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
                        np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    percent = 100 * (start_mem - end_mem) / start_mem
    print(
        'Mem. usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'
        .format(start_mem, end_mem, percent))
    return df


feval = {
    'classification': MacroF1MetricClassification,
    'regression': MacroF1MetricRegression
}

plt.style.use('fivethirtyeight')
%matplotlib inline

In [2]:
train = pd.read_csv('liverpool-ion-switching/train.csv',
                    dtype={
                        'time': np.float32,
                        'signal': np.float32,
                        'open_channels': np.int32
                    })
test = pd.read_csv('liverpool-ion-switching/test.csv',
                    dtype={
                        'time': np.float32,
                        'signal': np.float32,
                    })
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 2 columns):
 #   Column  Dtype  
---  ------  -----  
 0   time    float32
 1   signal  float32
dtypes: float32(2)
memory usage: 15.3 MB


## Features

In [3]:
def feature_eng(df: pd.DataFrame, train: bool = False):
    # 500k
    df['batch_500k'] = np.int32(df.index // 500_000)
    df['batch_500k_max'] = df.groupby(['batch_500k'
                                       ])['signal'].transform(np.max)
    df['batch_500k_min'] = df.groupby(['batch_500k'
                                       ])['signal'].transform(np.min)
    df['batch_500k_mean'] = df.groupby(['batch_500k'
                                        ])['signal'].transform(np.mean)
    df['batch_500k_std'] = df.groupby(['batch_500k'
                                       ])['signal'].transform(np.std)
    df['batch_500k_median'] = df.groupby(['batch_500k'
                                          ])['signal'].transform(np.median)
    df['batch_500k_diff_max'] = df.groupby(
        ['batch_500k'])['signal'].transform(lambda x: np.max(np.diff(x)))
    df['batch_500k_diff_min'] = df.groupby(
        ['batch_500k'])['signal'].transform(lambda x: np.min(np.diff(x)))
    df['batch_500k_range'] = np.abs(df['batch_500k_max'] -
                                    df['batch_500k_min'])
    df['batch_500k_maxtomin'] = np.abs(
        (df['batch_500k_max'] + 1e-6) / (df['batch_500k_min'] + 1e-6))
    df['batch_500k_shift_1'] = df.groupby(['batch_500k']).shift(1)['signal']
    df['batch_500k_shift_-1'] = df.groupby(['batch_500k']).shift(-1)['signal']
    df['batch_500k_shift_2'] = df.groupby(['batch_500k']).shift(2)['signal']
    df['batch_500k_shift_-2'] = df.groupby(['batch_500k']).shift(-2)['signal']

    # 50k
    df['batch_50k'] = np.int32(df.index // 50_000)
    df['batch_50k_max'] = df.groupby(['batch_50k'])['signal'].transform(np.max)
    df['batch_50k_min'] = df.groupby(['batch_50k'])['signal'].transform(np.min)
    df['batch_50k_mean'] = df.groupby(['batch_50k'
                                       ])['signal'].transform(np.mean)
    df['batch_50k_std'] = df.groupby(['batch_50k'])['signal'].transform(np.std)
    df['batch_50k_median'] = df.groupby(['batch_50k'
                                         ])['signal'].transform(np.median)
    df['batch_50k_diff'] = df.groupby(
        ['batch_50k'])['signal'].transform(lambda x: np.max(np.diff(x)))
    df['batch_50k_range'] = np.abs(df['batch_50k_max'] - df['batch_50k_min'])
    df['batch_50k_maxtomin'] = np.abs(
        (df['batch_50k_max'] + 1e-6) / (df['batch_50k_min'] + 1e-6))
    df['batch_50k_shift_1'] = df.groupby(['batch_50k']).shift(1)['signal']
    df['batch_50k_shift_-1'] = df.groupby(['batch_50k']).shift(-1)['signal']
    df['batch_50k_shift_2'] = df.groupby(['batch_50k']).shift(2)['signal']
    df['batch_50k_shift_-2'] = df.groupby(['batch_50k']).shift(-2)['signal']

    # 5k
    df['batch_5k'] = np.int32(df.index // 5_000)
    df['batch_5k_max'] = df.groupby(['batch_5k'])['signal'].transform(np.max)
    df['batch_5k_min'] = df.groupby(['batch_5k'])['signal'].transform(np.min)
    df['batch_5k_mean'] = df.groupby(['batch_5k'])['signal'].transform(np.mean)
    df['batch_5k_std'] = df.groupby(['batch_5k'])['signal'].transform(np.std)
    df['batch_5k_median'] = df.groupby(['batch_5k'
                                        ])['signal'].transform(np.median)
    df['batch_5k_diff'] = df.groupby(
        ['batch_5k'])['signal'].transform(lambda x: np.max(np.diff(x)))
    df['batch_5k_range'] = np.abs(df['batch_5k_max'] - df['batch_5k_min'])
    df['batch_5k_maxtomin'] = np.abs(
        (df['batch_5k_max'] + 1e-6) / (df['batch_5k_min'] + 1e-6))
    df['batch_5k_shift_1'] = df.groupby(['batch_5k']).shift(1)['signal']
    df['batch_5k_shift_-1'] = df.groupby(['batch_5k']).shift(-1)['signal']
    df['batch_5k_shift_2'] = df.groupby(['batch_5k']).shift(2)['signal']
    df['batch_5k_shift_-2'] = df.groupby(['batch_5k']).shift(-2)['signal']

    # 1k
    df['batch_1k'] = np.int32(df.index // 1_000)
    df['batch_1k_max'] = df.groupby(['batch_1k'])['signal'].transform(np.max)
    df['batch_1k_min'] = df.groupby(['batch_1k'])['signal'].transform(np.min)
    df['batch_1k_mean'] = df.groupby(['batch_1k'])['signal'].transform(np.mean)
    df['batch_1k_std'] = df.groupby(['batch_1k'])['signal'].transform(np.std)
    df['batch_1k_median'] = df.groupby(['batch_1k'
                                        ])['signal'].transform(np.median)
    df['batch_1k_diff'] = df.groupby(
        ['batch_1k'])['signal'].transform(lambda x: np.max(np.diff(x)))
    df['batch_1k_range'] = np.abs(df['batch_1k_max'] - df['batch_1k_min'])
    df['batch_1k_maxtomin'] = np.abs(
        (df['batch_1k_max'] + 1e-6) / (df['batch_1k_min'] + 1e-6))
    df['batch_1k_shift_1'] = df.groupby(['batch_1k']).shift(1)['signal']
    df['batch_1k_shift_-1'] = df.groupby(['batch_1k']).shift(-1)['signal']
    df['batch_1k_shift_2'] = df.groupby(['batch_1k']).shift(2)['signal']
    df['batch_1k_shift_-2'] = df.groupby(['batch_1k']).shift(-2)['signal']

    if train:
        feats = df.drop(
            ['time', 'open_channels', 'batch_500k', 'batch_50k', 'batch_5k'],
            axis=1).columns.to_list()
    else:
        feats = df.drop(['time', 'batch_500k', 'batch_50k', 'batch_5k'],
                        axis=1).columns.to_list()
    return df, feats

In [4]:
train, features = feature_eng(train, True)
test, _ = feature_eng(test)

## Signal Analysis

In [5]:
from scipy import signal as sps
from numpy import fft
import pywt
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
output_notebook()

signal = train.signal.values
time = train.time.values


def moving_average(a, n=3):
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    return ret[n - 1:] / n

In [None]:
p = figure(plot_width=800, plot_height=400)

p.line(train.time.values[::10000], signal[::10000])
show(p)

## Light GBM

In [6]:
sub = pd.read_csv('liverpool-ion-switching/sample_submission.csv')

In [7]:
TARGET = 'open_channels'

RANDOM_SEED = 42
MODEL_TYPE = 'LGBM'
LEARNING_RATE = 0.009
NUM_BOOST_ROUND = 500_000
EARLY_STOPPING_ROUNDS = 50
N_THREADS = -1
OBJECTIVE = 'regression'
NUM_CLASS = 1 if OBJECTIVE=='regression' else 11
METRIC = 'rmse'
NUM_LEAVES = 2**8+1
MAX_DEPTH = -1
FEATURE_FRACTION = 1
BAGGING_FRACTION = 1
BAGGING_FREQ = 0

params_lgb = {'learning_rate': LEARNING_RATE,
          'max_depth': MAX_DEPTH,
          'num_leaves': NUM_LEAVES,
          'feature_fraction': FEATURE_FRACTION,
          'bagging_fraction': BAGGING_FRACTION,
          'bagging_freq': BAGGING_FREQ,
          'n_jobs': N_THREADS,
          'seed': RANDOM_SEED,
          'metric': METRIC,
          'objective': OBJECTIVE,
          'num_class': NUM_CLASS}


In [19]:
def cross_validate(params: dict,
                   model_type: str = 'lgb',
                   feval: dict = feval,
                   objective=OBJECTIVE):
    kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    X = train[features]
    X_test = test[features]
    y = train['open_channels']
    oof_df = train[['time', 'open_channels']].copy()
    feat_importance_df = pd.DataFrame(index=features)
    fold = 0
    if model_type == 'lgb':
        for train_id, valid_id in kfold.split(X, y):
            fold += 1
            x_train, y_train = X.iloc[train_id, :], y[train_id]
            x_val, y_val = X.iloc[valid_id, :], y[valid_id]

            train_set = lgb.Dataset(x_train, y_train)
            valid_set = lgb.Dataset(x_val, y_val)

            model = lgb.train(params=params,
                              feval=feval[objective],
                              train_set=train_set,
                              num_boost_round=NUM_BOOST_ROUND,
                              early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                              valid_sets=[train_set, valid_set],
                              verbose_eval=1000)
            if objective == 'regression':
                pred = model.predict(x_val, num_iteration=model.best_iteration)
                pred = np.round(np.clip(pred, 0, 10)).astype(np.int16)
                test_preds = model.predict(X_test,
                                           num_iteration=model.best_iteration)
                test_preds = np.round(np.clip(test_preds, 0,
                                              10)).astype(np.int16)
            elif objective == 'classification':
                pred = model.predict(x_val, num_iteration=model.best_iteration)
                pred = np.argmax(pred).astype(np.int16)
                test_preds = model.predict(X_test,
                                           num_iteration=model.best_iteration)
                test_preds = np.argmax(test_preds, 0, 10).astype(np.int16)

            oof_df.loc[oof_df.iloc[valid_id].index, 'oof'] = pred
            sub[f'{model_type}_open_channels_fold_{fold}'] = test_preds

            f1 = f1_score(
                oof_df.loc[oof_df.iloc[valid_id].index]['open_channels'],
                oof_df.loc[oof_df.iloc[valid_id].index]['oof'],
                average='macro')
            rmse = np.sqrt(
                mean_squared_error(
                    oof_df.loc[oof_df.index.isin(valid_id)]['open_channels'],
                    oof_df.loc[oof_df.index.isin(valid_id)]['oof']))
#             print(
#                 f"\n***\nFold: {fold}/{kfold.n_splits},\n\tOOF F1 score: {f1:.2f},\n\tOOF RMSE: {rmse:.2f}\n***\n"
#             )

            feat_importance_df[
                f'{model_type}_importance_{fold}'] = model.feature_importance()

        oof_f1 = f1_score(oof_df['open_channels'],
                          oof_df['oof'],
                          average='macro')
        oof_rmse = np.sqrt(
            mean_squared_error(oof_df['open_channels'], oof_df['oof']))
    elif model_type == 'xgb':
        for train_id, valid_id in kfold.split(X, y):
            fold += 1
            x_train, y_train = X.iloc[train_id, :], y[train_id]
            x_val, y_val = X.iloc[valid_id, :], y[valid_id]

            train_set = xgb.DMatrix(x_train, y_train)
            valid_set = xgb.DMatrix(x_val, y_val)

            model = xgb.train(params=params,
                              dtrain=train_set,
                              feval=feval[objective],
                              maximize=True,
                              num_boost_round=NUM_BOOST_ROUND,
                              early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                              evals=((train_set, 'train'), (valid_set, 'val')),
                              verbose_eval=1000)
            if objective == 'regression':
                pred = model.predict(x_val, ntree_limit=model.best_ntree_limit)
                pred = np.round(np.clip(pred, 0, 10)).astype(np.int16)
                test_preds = model.predict(X_test,
                                           ntree_limit=model.best_ntree_limit)
                test_preds = np.round(np.clip(test_preds, 0,
                                              10)).astype(np.int16)
            elif objective == 'classification':
                pred = model.predict(x_val, ntree_limit=model.best_ntree_limit)
                pred = np.argmax(pred).astype(np.int16)
                test_preds = model.predict(X_test,
                                           ntree_limit=model.best_ntree_limit)
                test_preds = np.argmax(test_preds).astype(np.int16)

            oof_df.loc[oof_df.iloc[valid_id].index, 'oof'] = pred
            sub[f'{model_type}_open_channels_fold_{fold}'] = test_preds

            f1 = f1_score(
                oof_df.loc[oof_df.iloc[valid_id].index]['open_channels'],
                oof_df.loc[oof_df.iloc[valid_id].index]['oof'],
                average='macro')
            rmse = np.sqrt(
                mean_squared_error(
                    oof_df.loc[oof_df.index.isin(valid_id)]['open_channels'],
                    oof_df.loc[oof_df.index.isin(valid_id)]['oof'])).astype(np.float32)
#             print(
#                 f"\n***\nFold: {fold}/{kfold.n_splits},\n\tOOF F1 score: {f1:.2f},\n\tOOF RMSE: {rmse:.2f}\n***\n"
#             )

            feat_importance_df[
                f'{model_type}_importance_{fold}'] = model.feature_importances_

        oof_f1 = f1_score(oof_df['open_channels'],
                          oof_df['oof'],
                          average='macro')
        oof_rmse = np.sqrt(
            mean_squared_error(oof_df['open_channels'], oof_df['oof']))
    elif model_type=='sklearn':
        pass
    return oof_df.copy(), feat_importance_df.copy(), sub.copy()


In [13]:
oof_df_lgb, feat_importance_df_lgb, sub_lgb = cross_validate(
    params_lgb,
    feval=feval,
    model_type='lgb',
    objective=OBJECTIVE,
)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[947]	training's rmse: 0.16443	training's MacroF1Metric: 0.929394	valid_1's rmse: 0.166628	valid_1's MacroF1Metric: 0.928888
Training until validation scores don't improve for 50 rounds
[1000]	training's rmse: 0.163996	training's MacroF1Metric: 0.929733	valid_1's rmse: 0.16699	valid_1's MacroF1Metric: 0.928086
Early stopping, best iteration is:
[1256]	training's rmse: 0.163	training's MacroF1Metric: 0.930483	valid_1's rmse: 0.166604	valid_1's MacroF1Metric: 0.928432
Training until validation scores don't improve for 50 rounds
[1000]	training's rmse: 0.164024	training's MacroF1Metric: 0.929503	valid_1's rmse: 0.16682	valid_1's MacroF1Metric: 0.928063
Early stopping, best iteration is:
[976]	training's rmse: 0.164143	training's MacroF1Metric: 0.929461	valid_1's rmse: 0.166876	valid_1's MacroF1Metric: 0.928086
Training until validation scores don't improve for 50 rounds
[1000]	training's rmse: 

In [26]:
oof_df_lgb.to_csv('oof_df_lgb.csv', index=False)
feat_importance_df_lgb.to_csv('feat_importance_df_lgb.csv', index=False)
sub_lgb.to_csv('sub_lgb_with_folds.csv', index=False)


## XGBoost

In [20]:
params_xgb = {'colsample_bytree': 0.375,'learning_rate': 0.1,'max_depth': 10, 'subsample': 1, 'objective':'reg:squarederror',
          'eval_metric':'rmse', 'n_estimators':22222,   'tree_method':'gpu_hist',}
oof_df_xgb, feat_importance_df_xgb, sub_xgb = cross_validate(
    params_xgb,
    feval=feval,
    model_type='xgb',
    objective=OBJECTIVE,
)

XGBoostError: [11:13:26] src/learner.cc:180: XGBoost version not compiled with GPU support.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000001a22c28319 dmlc::LogMessageFatal::~LogMessageFatal() + 57
  [bt] (1) 2   libxgboost.dylib                    0x0000001a22c2bcec xgboost::LearnerImpl::ConfigureUpdaters() + 2156
  [bt] (2) 3   libxgboost.dylib                    0x0000001a22c23904 xgboost::LearnerImpl::Configure(std::__1::vector<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > > const&) + 3700
  [bt] (3) 4   libxgboost.dylib                    0x0000001a22c4305b XGBoosterUpdateOneIter + 139
  [bt] (4) 5   libffi.6.dylib                      0x000000010e2b8884 ffi_call_unix64 + 76
  [bt] (5) 6   ???                                 0x00007ffee31aa750 0x0 + 140732708595536



## PyTorch ?

## Combine and Submit

In [None]:
s_cols = [s for s in sub.columns if 'open_channels' in s]

sub['open_channels'] = sub[s_cols].median(axis=1).astype(int)

sub.head()