In [None]:
import pandas as pd
import numpy as np

from scipy import stats
import math
import time

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

import xgboost as xgb

In [2]:
# Read the training and test tables (in that order) from their CSV files.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/DataScience/liverpool-ion-switching/train.csv',
        '/DataScience/liverpool-ion-switching/test.csv',
    )
)

In [3]:
# Label every consecutive run of 500,000 rows with a batch id (0, 1, 2, ...).
# The id is derived from the row position and written to the column by name,
# so it no longer depends on 'batch' sitting at a fixed column position
# (the previous `.iloc[..., 3]` would overwrite another column if the frame
# layout ever changed).
train_df['batch'] = np.arange(len(train_df)) // 500000

In [4]:
# Label every consecutive run of 500,000 rows with a batch id (0, 1, 2, ...).
# The id is derived from the row position and written to the column by name,
# so it no longer depends on 'batch' sitting at a fixed column position
# (the previous `.iloc[..., 2]` would overwrite another column if the frame
# layout ever changed).
test_df['batch'] = np.arange(len(test_df)) // 500000

In [5]:
# Lagged-signal features. For lag k, column '<k>_prev' holds
# (signal k rows earlier in the same batch) - (current signal).
# The first k rows of a batch have no such predecessor; they are compared
# with their own value, which yields 0 for finite signals.
train_signal = train_df['signal']
train_by_batch = train_df.groupby('batch', sort=False)['signal']
# 0-based position of every row inside its own batch.
train_row_in_batch = train_by_batch.cumcount()
for i in range(1, 9):
    # shift() works per batch, so a lag never reaches across a batch boundary,
    # and the result stays aligned to the frame's index (no Python lists and
    # no repeated boolean masks per batch).
    lagged = train_by_batch.shift(i).where(train_row_in_batch >= i, train_signal)
    train_df['{}_prev'.format(i)] = lagged - train_signal

In [6]:
# Lagged-signal features. For lag k, column '<k>_prev' holds
# (signal k rows earlier in the same batch) - (current signal).
# The first k rows of a batch have no such predecessor; they are compared
# with their own value, which yields 0 for finite signals.
test_signal = test_df['signal']
test_by_batch = test_df.groupby('batch', sort=False)['signal']
# 0-based position of every row inside its own batch.
test_row_in_batch = test_by_batch.cumcount()
for i in range(1, 9):
    # shift() works per batch, so a lag never reaches across a batch boundary,
    # and the result stays aligned to the frame's index (no Python lists and
    # no repeated boolean masks per batch).
    lagged = test_by_batch.shift(i).where(test_row_in_batch >= i, test_signal)
    test_df['{}_prev'.format(i)] = lagged - test_signal

In [24]:
def add_recent_max_feature(win_size, df):
    """Trailing-window maximum of ``signal``, computed separately per batch.

    For each row the window covers that row plus up to ``win_size`` preceding
    rows of the same batch (at most ``win_size + 1`` samples). Windows are
    shortened at the start of a batch and never reach into another batch.

    Args:
        win_size: Number of preceding rows included alongside the current row.
        df: DataFrame with ``batch`` and ``signal`` columns.

    Returns:
        list of float: One value per row, with batches concatenated in the
        order they first appear in ``df.batch`` (this equals row order when
        every batch is a contiguous run of rows).
    """
    total_max = []
    for batch in df.batch.unique():
        batch_signal = df.loc[df.batch == batch, 'signal']
        # rolling() runs in compiled code; the earlier per-row iloc slice plus
        # builtin max() cost O(rows * win_size) in pure Python.
        # min_periods=1 keeps the shortened windows at the start of the batch.
        rolled = batch_signal.rolling(win_size + 1, min_periods=1).max()
        total_max.extend(rolled.tolist())
    return total_max

def add_recent_min_feature(win_size, df):
    """Trailing-window minimum of ``signal``, computed separately per batch.

    For each row the window covers that row plus up to ``win_size`` preceding
    rows of the same batch (at most ``win_size + 1`` samples). Windows are
    shortened at the start of a batch and never reach into another batch.

    Args:
        win_size: Number of preceding rows included alongside the current row.
        df: DataFrame with ``batch`` and ``signal`` columns.

    Returns:
        list of float: One value per row, with batches concatenated in the
        order they first appear in ``df.batch`` (this equals row order when
        every batch is a contiguous run of rows).
    """
    total_min = []
    for batch in df.batch.unique():
        batch_signal = df.loc[df.batch == batch, 'signal']
        # rolling() runs in compiled code; the earlier per-row iloc slice plus
        # builtin min() cost O(rows * win_size) in pure Python.
        # min_periods=1 keeps the shortened windows at the start of the batch.
        rolled = batch_signal.rolling(win_size + 1, min_periods=1).min()
        total_min.extend(rolled.tolist())
    return total_min

def add_recent_mean_feature(win_size, df):
    """Trailing-window mean of ``signal``, computed separately per batch.

    For each row the window covers that row plus up to ``win_size`` preceding
    rows of the same batch (at most ``win_size + 1`` samples). Windows are
    shortened at the start of a batch and never reach into another batch.

    Args:
        win_size: Number of preceding rows included alongside the current row.
        df: DataFrame with ``batch`` and ``signal`` columns.

    Returns:
        list of float: One value per row, with batches concatenated in the
        order they first appear in ``df.batch`` (this equals row order when
        every batch is a contiguous run of rows). Values can differ from a
        per-window ``np.mean`` by floating-point rounding only.
    """
    total_mean = []
    for batch in df.batch.unique():
        batch_signal = df.loc[df.batch == batch, 'signal']
        # rolling() runs in compiled code; the earlier per-row iloc slice plus
        # np.mean() cost O(rows * win_size) in pure Python.
        # min_periods=1 keeps the shortened windows at the start of the batch.
        rolled = batch_signal.rolling(win_size + 1, min_periods=1).mean()
        total_mean.extend(rolled.tolist())
    return total_mean

def add_recent_jitter_feature(win_size, df):
    """Trailing-window sum of ``|1_prev|``, computed separately per batch.

    For each row the window covers that row plus up to ``win_size`` preceding
    rows of the same batch (at most ``win_size + 1`` samples). Windows are
    shortened at the start of a batch and never reach into another batch.

    Args:
        win_size: Number of preceding rows included alongside the current row.
        df: DataFrame with ``batch`` and ``1_prev`` columns.

    Returns:
        list of float: One value per row, with batches concatenated in the
        order they first appear in ``df.batch`` (this equals row order when
        every batch is a contiguous run of rows). Values can differ from a
        per-window Python ``sum`` by floating-point rounding only.
    """
    total_jit = []
    for batch in df.batch.unique():
        batch_abs_step = df.loc[df.batch == batch, '1_prev'].abs()
        # rolling() runs in compiled code; the earlier per-row iloc slice plus
        # Python-level sum of abs() cost O(rows * win_size).
        # min_periods=1 keeps the shortened windows at the start of the batch.
        rolled = batch_abs_step.rolling(win_size + 1, min_periods=1).sum()
        total_jit.extend(rolled.tolist())
    return total_jit

In [12]:
# Add one trailing-window maximum column per window length to both frames.
for window in (5, 50, 500, 5000):
    max_col = 'LocalMax{}'.format(window)
    for frame in (train_df, test_df):
        frame[max_col] = add_recent_max_feature(window, frame)

In [18]:
# Add trailing-window minimum and mean columns per window length to both
# frames; for each window the minimum is written before the mean.
rolling_features = (
    ('LocalMin{}', add_recent_min_feature),
    ('LocalMean{}', add_recent_mean_feature),
)
for window in (5, 50, 500, 5000):
    for name_template, feature_fn in rolling_features:
        feature_col = name_template.format(window)
        for frame in (train_df, test_df):
            frame[feature_col] = feature_fn(window, frame)

In [25]:
# Add one trailing-window jitter column per window length to both frames.
for window in (2, 3, 5, 8, 50, 100, 500, 10000):
    jitter_col = 'LocalJit{}'.format(window)
    for frame in (train_df, test_df):
        frame[jitter_col] = add_recent_jitter_feature(window, frame)

In [30]:
# gap<N> = LocalMax<N> - LocalMin<N>: the signal range over each trailing window.
# Each row lists (output column, maximum column, minimum column).
gap_specs = (
    ('gap5', 'LocalMax5', 'LocalMin5'),
    ('gap50', 'LocalMax50', 'LocalMin50'),
    ('gap500', 'LocalMax500', 'LocalMin500'),
    ('gap5000', 'LocalMax5000', 'LocalMin5000'),
)
for frame in (train_df, test_df):
    for gap_col, max_col, min_col in gap_specs:
        frame[gap_col] = frame[max_col] - frame[min_col]

In [31]:
# Display the test frame's column labels to check the engineered features.
test_df.columns

Index(['time', 'signal', 'batch', '1_prev', '2_prev', '3_prev', '4_prev',
       '5_prev', '6_prev', '7_prev', '8_prev', 'LocalMax5', 'LocalMax50',
       'LocalMax500', 'LocalMax5000', 'LocalMin5', 'LocalMean5', 'LocalMin50',
       'LocalMean50', 'LocalMin500', 'LocalMean500', 'LocalMin5000',
       'LocalMean5000', 'LocalJit2', 'LocalJit3', 'LocalJit5', 'LocalJit8',
       'LocalJit50', 'LocalJit100', 'LocalJit500', 'LocalJit10000', 'gap5',
       'gap50', 'gap500', 'gap5000'],
      dtype='object')

In [32]:
# Display the training frame's column labels to check the engineered features.
train_df.columns

Index(['time', 'signal', 'open_channels', 'batch', '1_prev', '2_prev',
       '3_prev', '4_prev', '5_prev', '6_prev', '7_prev', '8_prev', 'LocalMax5',
       'LocalMax50', 'LocalMax500', 'LocalMax5000', 'LocalMin5', 'LocalMean5',
       'LocalMin50', 'LocalMean50', 'LocalMin500', 'LocalMean500',
       'LocalMin5000', 'LocalMean5000', 'LocalJit2', 'LocalJit3', 'LocalJit5',
       'LocalJit8', 'LocalJit50', 'LocalJit100', 'LocalJit500',
       'LocalJit10000', 'gap5', 'gap50', 'gap500', 'gap5000'],
      dtype='object')

In [None]:
# Feature matrices: every column except the target and the bookkeeping
# columns ('time', 'batch'). `columns=` replaces the positional axis argument,
# which pandas 2.x no longer accepts.
X = train_df.drop(columns=['open_channels', 'time', 'batch'])
Y = train_df['open_channels']
# Previously undefined: predict_proba(X_test) below raised NameError.
X_test = test_df.drop(columns=['time', 'batch'])

# Native-API matrices. The scikit-learn style classifier below does not use
# them; they are kept so any later code that refers to them still works.
dtrain = xgb.DMatrix(X)
dtest = xgb.DMatrix(X_test)

# `num_classes` was dropped: it is not an XGBoost parameter (the native name
# is `num_class`), and the scikit-learn wrapper derives the class count from
# the labels passed to fit().
classifier = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                  colsample_bynode=1, colsample_bytree=1,
                  learning_rate=0.07, max_delta_step=0, max_depth=6,
                  min_child_weight=1, missing=None, n_estimators=250, n_jobs=-1,
                  nthread=None, objective='multi:softmax', random_state=0,
                  reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                  silent=None, subsample=1, verbosity=1)
classifier.fit(X, Y)

# Per-class probabilities for every test row (one column per class).
y_pred = classifier.predict_proba(X_test)


