# Tabular Playground Series - Apr 2022

In this competition, you'll classify 60-second sequences of sensor data, indicating whether a subject was in either of two activity states for the duration of the sequence.

### Setup

In [None]:
%%capture

from typing import Tuple

from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
from scipy.stats import kurtosis
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
sns.set(rc = {'figure.figsize':(10, 6)})

from ml_utils.preprocess.pipeline import preprocessing_pipeline

### Load data

In [None]:
pipeline = {}

# Sensor readings plus one label row per sequence; the labels are attached to
# every reading of their sequence via the shared "sequence" key.
train = pd.read_csv("Data/train.csv")
train_labels = pd.read_csv("Data/train_labels.csv")
train = train.merge(train_labels, on="sequence")

# The test set is read as-is; no labels are merged into it.
test = pd.read_csv("Data/test.csv")

print(f"Train shape {train.shape}")
print(f"Test shape {test.shape}")

In [None]:
# Preview the first rows of the merged training frame.
train.head()

There is no missing data

### Feature distributions

In [None]:
# Every column whose name contains 'sensor' is treated as a sensor channel.
sensor_cols = [column for column in train.columns if 'sensor' in column]

# Long format (one row per reading, tagged with its column name) so that
# seaborn can draw one histogram facet per sensor.
sensor_values_long = train[sensor_cols].melt()

# Facets get independent axes and independent bin edges.
sns.displot(
    data=sensor_values_long,
    x="value",
    col="variable",
    col_wrap=5,
    bins=100,
    common_bins=False,
    facet_kws={"sharex": False, "sharey": False},
)

### Feature engineering

Add summary-statistic columns for each sensor, grouped by sequence (each sequence maps to a single activity state).

In [None]:
def get_aggregate_features(df, sensor_columns=None):
    """
    Build summary-statistic features for each sensor, grouped by sequence.

    Every statistic is computed once per (sequence, sensor) pair and then
    broadcast back onto the rows of ``df``, so the result can be placed
    side by side with ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``sequence`` column and the sensor columns.
    sensor_columns : iterable of str, optional
        Columns to aggregate. Defaults to every column of ``df`` whose name
        contains ``'sensor'``.

    Returns
    -------
    pd.DataFrame
        Same index as ``df``; one column per statistic and sensor, named
        ``"{statistic}_{sensor}"``. Statistics: sum, mean, std (population,
        ``ddof=0``, i.e. ``np.std``), kurtosis (``scipy.stats.kurtosis``
        defaults), median, min, max.
    """
    if sensor_columns is None:
        sensor_columns = [col for col in df.columns if 'sensor' in col]
    else:
        sensor_columns = list(sensor_columns)

    # Nothing to aggregate: return an empty frame that still lines up with df.
    if not sensor_columns:
        return pd.DataFrame(index=df.index)

    # String names use pandas' built-in group reductions; std and kurtosis
    # are spelled out so they keep the NumPy / SciPy definitions rather than
    # pandas' sample (ddof=1) variants.
    summary_statistics = {
        "sum": "sum",
        "mean": "mean",
        "std": lambda values: np.std(values.to_numpy()),
        "kurtosis": lambda values: kurtosis(values.to_numpy()),
        'median': "median",
        'min': "min",
        'max': "max",
    }

    grouped = df.groupby('sequence')[sensor_columns]
    per_sequence_stats = []

    for agg_name, agg_func in summary_statistics.items():
        print(f"Aggregating {agg_name}...")
        stats = grouped.agg(agg_func)  # one row per sequence
        stats.columns = [f"{agg_name}_{x}" for x in stats.columns]
        per_sequence_stats.append(stats)

    # Statistics sit side by side (axis=1); joining on the sequence key then
    # repeats each sequence's values on every one of its rows while keeping
    # df's original index.
    per_sequence = pd.concat(per_sequence_stats, axis=1)
    return (
        df[['sequence']]
        .join(per_sequence, on='sequence')
        .drop(columns='sequence')
    )

# Compute the aggregate features for both data sets.
# NOTE(review): neither result is referenced again in the visible notebook —
# confirm whether they were meant to be merged into train/test before modelling.
print("Aggregating train...")
agg_features_train = get_aggregate_features(train)
print("Aggregating test...")
agg_features_test = get_aggregate_features(test)

### Base model

In [None]:
# Single LightGBM classifier instance (max_depth=10); the same object is
# re-fitted by the later cells that call model.fit.
model = LGBMClassifier(max_depth=10)

def stratify_train_test(df) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split rows into train/test parts so that no subject appears in both.

    Unique subjects are shuffled with NumPy's global RNG; the first 80%
    (rounded down) go to the train part and the rest to the test part.
    The 'subject' column is removed from both returned frames.
    """
    shuffled = np.random.permutation(df['subject'].unique())
    n_train = int(len(shuffled) * 0.8)

    # Row masks for the two disjoint subject groups.
    in_train = df['subject'].isin(shuffled[:n_train])
    in_test = df['subject'].isin(shuffled[n_train:])

    return (
        df[in_train].drop(columns='subject'),
        df[in_test].drop(columns='subject'),
    )

# Subject-level hold-out split; the 'subject' column is already removed here.
stratified_train, stratified_test = stratify_train_test(train)

# Separate the target ('state') from the remaining columns.
# NOTE(review): only 'subject' and 'state' are dropped, so the 'sequence'
# identifier stays in the feature matrix — confirm this is intended.
X_train = stratified_train.drop(columns='state')
y_train = stratified_train['state']
X_test = stratified_test.drop(columns='state')
y_test = stratified_test['state']

In [None]:
%%time

# Fit on the training subjects, then score the held-out subjects using the
# second column of predict_proba (presumably P(state == 1) — TODO confirm
# the label encoding).
model.fit(X_train,y_train)
y_hat = model.predict_proba(X_test)[:, 1]

In [None]:
# ROC AUC of the base model on the held-out subjects.
roc_auc_score(y_test, y_hat)

In [None]:
from ml_utils.visualise.features import plot_importances

# Project helper; presumably plots the 20 most important features of the
# fitted model — confirm the meaning of the third argument in ml_utils.
plot_importances(model, X_train.columns, 20)

### Train final model

In [None]:
N_CV = 5

kfold = KFold(n_splits=N_CV)

# Per-fold ROC AUC on the training / validation parts, and per-fold
# predictions for the competition test set.
train_score = []
valid_score = []
test_set_preds = []

X, y = train.drop('state', axis=1), train['state']

# Folds are built over subjects (not rows) so that a subject never appears
# in both the training and validation part of a fold.
subjects_shuffled = np.random.permutation(train['subject'].unique())

for fold, (train_idx, valid_idx) in enumerate(kfold.split(subjects_shuffled)):
    print(f"Running fold: {fold}...")

    # KFold.split yields positional indices into subjects_shuffled, not the
    # subject IDs themselves; map them back to IDs before filtering rows.
    train_subj = subjects_shuffled[train_idx]
    valid_subj = subjects_shuffled[valid_idx]

    train_fold = train[train['subject'].isin(train_subj)].drop(['subject'], axis=1)
    valid_fold = train[train['subject'].isin(valid_subj)].drop(['subject'], axis=1)

    X_train, y_train = train_fold.drop('state', axis=1), train_fold['state']
    X_valid, y_valid = valid_fold.drop('state', axis=1), valid_fold['state']

    model.fit(X_train, y_train)

    y_hat_train = model.predict_proba(X_train)[:, 1]
    y_hat_test = model.predict_proba(X_valid)[:, 1]

    train_score.append(roc_auc_score(y_train, y_hat_train))
    valid_score.append(roc_auc_score(y_valid, y_hat_test))

    # Fold prediction on test set. Select exactly the training columns, in
    # the training order, so the model sees the same features it was fitted
    # on (dropping only 'sequence' would leave 'subject' in its place).
    # NOTE(review): the 'sequence' identifier is still a model feature here,
    # as it is during fitting — confirm this is intended.
    y_hat = model.predict_proba(test[X_train.columns])[:, 1]
    test_set_preds.append(y_hat)

In [None]:
# Per-fold scores in long format for plotting. The values are ROC AUC
# (computed with roc_auc_score), so they are labelled as such rather than
# as accuracy.
results_df = pd.DataFrame({
    "fold": list(range(N_CV)),
    "train": train_score,
    "valid": valid_score
})
results_df = pd.melt(results_df, id_vars="fold", var_name="set", value_name="roc_auc")
print(f"Mean train score {np.mean(train_score)}")
# This is the mean over the validation folds, not a score on the test set.
print(f"Mean valid score {np.mean(valid_score)}")
sns.barplot(data=results_df, x="fold", y="roc_auc", hue="set")

### Submission

In [None]:
# One row per test reading: its sequence id and the fold-averaged probability.
submission = pd.DataFrame({
    'sequence': test['sequence'].astype(int),
    'state': np.mean(test_set_preds, axis=0),
})

# Collapse to one row per sequence, keeping the highest per-reading
# probability within each sequence.
submission = submission.groupby('sequence', as_index=False)['state'].max()
submission.to_csv('Data/test_preds.csv', index=False)