In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


import xgboost 
import lightgbm

In [None]:
# Read the three competition files in one pass: training data, test data
# and the sample submission.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/liverpool-ion-switching/train.csv',
        '../input/liverpool-ion-switching/test.csv',
        '../input/liverpool-ion-switching/sample_submission.csv',
    )
)
# Quick size check of the two modelling frames.
print(train.shape, test.shape)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the training frame.
train.info()

In [None]:
# Number of columns containing at least one missing value, for train then test.
print(*(frame.isna().any().sum() for frame in (train, test)))

Cool, we have no missing values, and all columns are of a numeric type.

In [None]:
# Distribution of the target: per-class counts (left) and density (right).
fig, ax = plt.subplots(1, 2, figsize=(20, 8))
# Pass the variable by keyword: from seaborn 0.12 on, the only positional
# argument is `data`, so a positional Series is no longer interpreted as `x`.
sns.countplot(x=train.open_channels, ax=ax[0])
# NOTE: `distplot` is deprecated in recent seaborn releases; it is kept here so
# the cell still runs on the older environment this notebook targets.
sns.distplot(train.open_channels, ax=ax[1])

This is a classification problem. Higher numbers of open channels are less frequent. Let's check how open_channels varies with time.

In [None]:
# Target value against time over the whole training recording.
plt.figure(figsize=(20, 8))
sns.scatterplot(data=train, x='time', y='open_channels')

Insights:
1. Values 0 and 1 for open_channels are much more frequent than the other values.
2. Values >= 6 occur between 200s and 250s and then between 450s and 500s. Seasonality? Maybe
3. Values 2 and 3 occur between 150s and 300s and then between 350s and 500s. 

In [None]:
# Signal trace for time < 100 (bound exclusive), coloured by open_channels.
plt.figure(figsize=(20, 8))
sns.lineplot(
    data=train.loc[train['time'].lt(100)],
    x='time',
    y='signal',
    hue='open_channels',
)

Cool. So, we can say that a single channel is opened only when the value of signal > -2.

In [None]:
# Signal trace for 100 < time < 200 (both bounds exclusive), coloured by open_channels.
plt.figure(figsize=(20, 8))
sns.lineplot(
    data=train.loc[train['time'].gt(100) & train['time'].lt(200)],
    x='time',
    y='signal',
    hue='open_channels',
)

Clearly, the number of open channels increases as the signal increases. This graph is consistent with our time vs. open_channels graph.

In [None]:
# Signal trace for 200 < time < 300 (both bounds exclusive), coloured by open_channels.
plt.figure(figsize=(20, 8))
sns.lineplot(
    data=train.loc[train['time'].gt(200) & train['time'].lt(300)],
    x='time',
    y='signal',
    hue='open_channels',
)

In [None]:
# Signal trace for 300 < time < 400 (both bounds exclusive), coloured by open_channels.
plt.figure(figsize=(20, 8))
sns.lineplot(
    data=train.loc[train['time'].gt(300) & train['time'].lt(400)],
    x='time',
    y='signal',
    hue='open_channels',
)

In [None]:
# Signal trace for time > 400 (bound exclusive), coloured by open_channels.
plt.figure(figsize=(20, 8))
sns.lineplot(
    data=train.loc[train['time'].gt(400)],
    x='time',
    y='signal',
    hue='open_channels',
)

In [None]:
# Summary statistics for every numeric column of the training frame.
train.describe()

## Feature Engineering

In [None]:
# Mark each row's origin (1 = train, 0 = test) so the two frames can be
# concatenated for feature engineering and separated again afterwards.
train['train'] = 1
test['train'] = 0

In [None]:
# Stack train on top of test, remove the target column and renumber the rows
# so that the index runs 0..N-1 across the combined frame.
all_data = (
    pd.concat([train, test])
    .drop(columns=['open_channels'])
    .reset_index(drop=True)
)
all_data

In [None]:
# Indicator features: for every unit-wide interval (lower, lower + 1) from
# (-6, -5) up to (13, 14), flag rows whose `signal` lies strictly inside it.
# Column names keep the 'bw<upper>and<lower>' convention, e.g. 'bw-5and-6',
# 'bw1and0', 'bw14and13', and are created in ascending order of the interval.
#
# Vectorised comparisons replace twenty row-wise `.apply(lambda ...)` passes
# over the full frame; the resulting values are the same: both bounds are
# exclusive (a signal exactly on an integer boundary is 0 in every column),
# NaN compares False and therefore maps to 0, and the dtype is int64.
for lower in range(-6, 14):
    upper = lower + 1
    inside = (all_data['signal'] > lower) & (all_data['signal'] < upper)
    all_data['bw{}and{}'.format(upper, lower)] = inside.astype('int64')

In [None]:
# Coarser indicator features: for every two-unit-wide interval
# (lower, lower + 2) from (-6, -4) up to (12, 14), flag rows whose `signal`
# lies strictly inside it. Column names keep the 'bw<upper>and<lower>'
# convention, e.g. 'bw-4and-6', 'bw2and0', 'bw14and12'.
#
# Vectorised comparisons replace ten row-wise `.apply(lambda ...)` passes;
# the values are the same: both bounds are exclusive, NaN maps to 0, and the
# dtype is int64.
for lower in range(-6, 14, 2):
    upper = lower + 2
    inside = (all_data['signal'] > lower) & (all_data['signal'] < upper)
    all_data['bw{}and{}'.format(upper, lower)] = inside.astype('int64')

In [None]:
# Display the combined frame with the newly added indicator columns.
all_data

## Modeling

In [None]:
# Take the training rows back out of the combined frame and re-attach the
# target; the join aligns on the row index.
training_data = all_data[all_data['train'] == 1].join(train['open_channels'])

In [None]:
# Pairwise correlation between all columns of the training frame,
# including the target `open_channels`.
data_corr = training_data.corr()
data_corr

In [None]:
# Horizontal bars: correlation of each column with the target.
plt.figure(figsize=(20, 10))
sns.barplot(
    x=data_corr['open_channels'].values,
    y=data_corr['open_channels'].index,
)

In [None]:
# Cross-validation scheme: KFOLDS shuffled folds with a fixed seed so the
# splits are the same on every run.
KFOLDS = 5
cv = KFold(n_splits=KFOLDS, shuffle=True, random_state=108)

In [None]:
# Hyper-parameter grid for the search. Every entry holds a single candidate,
# so the "grid" currently evaluates exactly one configuration.
# (An XGBoost estimator was considered earlier:
#  xgboost.XGBClassifier(tree_method='hist', objective='multi:softmax'))
# NOTE(review): LightGBM appears to apply `subsample` only when
# `subsample_freq` > 0 -- confirm whether row subsampling is actually active.
param_grid = dict(
    min_child_weight=[25],
    subsample=[0.7],
    colsample_bytree=[1],
    max_depth=[7],
    learning_rate=[0.01],
    n_estimators=[100],
)

In [None]:
# Base estimator for the search: LightGBM classifier with a multiclass objective.
lgb = lightgbm.LGBMClassifier(objective='multiclass')

In [None]:
# Grid search over `param_grid`, scored by macro-averaged F1 on the folds
# defined by `cv`.
#
# refit=True: after the search, the best configuration is refitted on the
#   full training data. This is required by the later cells, which read
#   `clf.best_estimator_` and call `clf.predict`; neither is available when
#   refit is disabled.
# The `iid` argument is intentionally not passed: it was deprecated in
#   scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
clf = GridSearchCV(
    estimator=lgb,
    param_grid=param_grid,
    cv=cv,
    scoring='f1_macro',
    return_train_score=True,
    refit=True,
    verbose=100,
)

In [None]:
# Feature matrix: everything except the target, the origin flag and the timestamp.
X = training_data.drop(columns=['open_channels', 'train', 'time'])
# Target vector.
y = training_data.loc[:, 'open_channels']

In [None]:
# Display the feature matrix that is passed to the search.
X

In [None]:
# Run the cross-validated grid search on the training features and target.
clf.fit(X,y)

In [None]:
# Feature importances of the best estimator found by the search, one bar per feature.
plt.figure(figsize=(20, 6))
sns.barplot(
    y=X.columns,
    x=clf.best_estimator_.feature_importances_,
)

In [None]:
# Test-set feature matrix. Select exactly the columns the model was trained
# on, in the same order: the unfiltered test rows would still carry `time`
# and `train`, which were dropped from X, so the feature set would not match
# at prediction time.
X_test = all_data.loc[all_data['train'] == 0, X.columns]
X_test

In [None]:
# Predict a class label for every test row with the fitted search object,
# then display the resulting array.
predictions = clf.predict(X_test)
predictions

In [None]:
# Class balance of the predicted labels.
plt.figure(figsize=(8, 6))
# Pass the values by keyword: from seaborn 0.12 on, the only positional
# argument is `data`, so a positional array is no longer interpreted as `x`.
sns.countplot(x=predictions)

In [None]:
# Store the predicted labels on the test frame as the `open_channels` column.
test['open_channels'] = predictions

In [None]:
# Keep only the two submission columns and write them to CSV without the
# index; float_format='%.4f' writes floating-point values with four decimals.
test = test[['time', 'open_channels']]
test.to_csv('submission.csv', index=False, float_format='%.4f')