In [2]:
import pickle
import numpy as np

import random
import pandas as pd

# One seed shared by every source of randomness in the notebook: it is fed to
# both global RNGs here and reused as `random_state` by the cells below.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [3]:
from sklearn.model_selection import GroupShuffleSplit, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [53]:
def get_data(col: str):
    """Load the pickled samples and return one attribute per usable sample.

    Samples whose ``bezier_features`` attribute is ``None`` are dropped,
    whichever ``col`` is requested, so every column yields the same rows.

    Args:
        col: Name of the sample attribute to return as the feature list.

    Returns:
        A tuple ``(X, y, groups)`` of equally long lists: the ``col``
        attribute, the ``label`` attribute and the ``user_id`` attribute of
        each kept sample, in file order.
    """
    # NOTE: unpickling can execute arbitrary code; only use a trusted data.pickle.
    with open('data.pickle', 'rb') as f:
        samples = pickle.load(f)

    X, y, groups = [], [], []
    for sample in samples:
        if sample.bezier_features is None:
            continue
        groups.append(sample.user_id)
        X.append(getattr(sample, col))
        y.append(sample.label)

    return X, y, groups

def split(X, y, groups):
    """Make one train/test split that is grouped by ``groups``.

    A single ``GroupShuffleSplit`` draw is used with ``train_size=0.7`` and
    ``random_state=SEED``.

    Args:
        X: Indexable collection of samples.
        y: Indexable collection of labels, aligned with ``X``.
        groups: Group id of each sample, aligned with ``X``.

    Returns:
        ``(trainX, testX, trainY, testY)``, each a Python list built by
        indexing ``X`` / ``y`` with the drawn train and test indices.
    """
    splitter = GroupShuffleSplit(n_splits=1, train_size=0.7, random_state=SEED)
    train_idx, test_idx = next(splitter.split(X, y, groups))

    def pick(values, indices):
        # Gather the selected positions into a plain list.
        return [values[i] for i in indices]

    return pick(X, train_idx), pick(X, test_idx), pick(y, train_idx), pick(y, test_idx)

# Baseline

In [56]:
# Baseline: a random forest on the `data` attribute of each sample, with every
# sample zero-padded to a common length and flattened into one vector.
X, y, groups = get_data('data')

# Pad along the first axis up to the longest sample, then join the rows of the
# padded 2-D array into a single 1-D vector.
max_len = max(arr.shape[0] for arr in X)
X = np.asarray([
    np.concatenate(
        np.pad(arr, ((0, max_len - arr.shape[0]), (0, 0)), 'constant', constant_values=0)
    )
    for arr in X
])

print(X.shape)
trainX, testX, trainY, testY = split(X, y, groups)

clf = RandomForestClassifier(n_estimators=60, min_samples_split=4, random_state=SEED)
clf.fit(trainX, trainY)

train_accuracy = accuracy_score(trainY, clf.predict(trainX))
print("Train accuracy:", train_accuracy)
test_accuracy = accuracy_score(testY, clf.predict(testX))
print("Test accuracy:", test_accuracy)

(959, 29802)
Train accuracy: 1.0
Test accuracy: 0.9512195121951219


# Bezier

In [64]:
# Bezier: same pipeline as the baseline cell, but on the `bezier_features`
# attribute, after discarding every row that contains a NaN.
X, y, groups = get_data('bezier_features')
X = [feats[~np.isnan(feats).any(axis=1)] for feats in X]

# Pad along the first axis up to the longest sample, then join the rows of the
# padded 2-D array into a single 1-D vector.
max_len = max(arr.shape[0] for arr in X)
X = np.asarray([
    np.concatenate(
        np.pad(arr, ((0, max_len - arr.shape[0]), (0, 0)), 'constant', constant_values=0)
    )
    for arr in X
])

print(X.shape)
trainX, testX, trainY, testY = split(X, y, groups)

clf = RandomForestClassifier(n_estimators=60, min_samples_split=4, random_state=SEED)
clf.fit(trainX, trainY)

train_accuracy = accuracy_score(trainY, clf.predict(trainX))
print("Train accuracy:", train_accuracy)
test_accuracy = accuracy_score(testY, clf.predict(testX))
print("Test accuracy:", test_accuracy)

(959, 1316)
Train accuracy: 1.0
Test accuracy: 0.9442508710801394


In [9]:
# Hyper-parameter search for the random forest.
#
# This cell does not build its own data: it reuses trainX / trainY / testX /
# testY from whichever feature cell was executed last, so run the intended
# feature cell immediately before it.
#
# NOTE(review): no `cv` and no `groups` are passed, so the inner folds are not
# grouped by user_id, unlike the outer GroupShuffleSplit made by split(). The
# same user can therefore appear in both the fitting and the validation part of
# a fold. Making the search group-aware needs the train-side groups, which
# split() does not return.
param_grid = {
    'n_estimators': [10, 100, 250, 500],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 25],
    # Floats are interpreted by scikit-learn as a fraction of the samples.
    'min_samples_split': [0.1, 0.3, 0.7, 1.0],
    'max_features': ['sqrt', 'log2']
}

# Seed the forests like every other estimator in the notebook so that the
# selected parameters are repeatable regardless of cell execution order.
gs = GridSearchCV(RandomForestClassifier(random_state=SEED), param_grid)

gs.fit(trainX, trainY)
print("Train accuracy:", accuracy_score(trainY, gs.predict(trainX)))
print("Test accuracy:", accuracy_score(testY, gs.predict(testX)))

Train accuracy: 0.96875
Test accuracy: 0.9442508710801394
Train accuracy: 0.9568452380952381
Test accuracy: 0.9512195121951219


## Late fusion

In [108]:
# Late fusion: fit one random forest per column of `bezier_features`, then
# average the per-column class probabilities and predict the most probable class.
X, y, groups = get_data('bezier_features')
X = [x[~np.isnan(x).any(axis=1)] for x in X]

trainX, testX, trainY, testY = split(X, y, groups)

# Every column of a sample has the same number of rows, so the padded length is
# the same for all columns and is computed once (X holds train and test samples).
max_len = max(arr.shape[0] for arr in X)

fusion = []
results = []

for col_id in range(X[0].shape[1]):
    # One zero-padded 1-D vector per sample for this column.
    trainX_ = [np.pad(x[:, col_id].astype('float64'), (0, max_len - x.shape[0])) for x in trainX]
    testX_ = [np.pad(x[:, col_id].astype('float64'), (0, max_len - x.shape[0])) for x in testX]

    clf = RandomForestClassifier(n_estimators=40, min_samples_split=4, random_state=SEED)
    clf.fit(trainX_, trainY)

    results.append({
        'train_accuracy': np.round(accuracy_score(trainY, clf.predict(trainX_)), 2),
        'test_accuracy': np.round(accuracy_score(testY, clf.predict(testX_)), 2)
    })

    fusion.append(clf.predict_proba(testX_))
    # All forests are fitted on the same trainY, so they share one class order.
    class_labels = clf.classes_

# predict_proba columns are ordered like clf.classes_, so argmax yields a column
# index; translate it back to the actual label before scoring against testY.
mean_proba = np.mean(np.asarray(fusion), axis=0)
fusion_preds = class_labels[np.argmax(mean_proba, axis=1)]
print(pd.DataFrame(results))
print('\nTotal test accuracy:', np.round(accuracy_score(testY, fusion_preds), 2))

   train_accuracy  test_accuracy
0             1.0           0.60
1             1.0           0.59
2             1.0           0.51
3             1.0           0.54
4             1.0           0.60
5             1.0           0.60
6             1.0           0.97

Total test accuracy: 0.84


## Per feature (one model per `type_` value)

In [110]:
# Train and evaluate a separate random forest for each distinct `type_` value,
# using only the samples of that type.
X, y, groups = get_data('bezier_features')
classes, _, _ = get_data('type_')
X = [x[~np.isnan(x).any(axis=1)] for x in X]

# get_data returns a plain list; make it an array so the equality test below is
# an explicit element-wise comparison.
classes = np.asarray(classes)

results = []

for type_value in np.unique(classes):
    row_idx = np.where(classes == type_value)[0]

    X_ = [X[j] for j in row_idx]
    y_ = [y[j] for j in row_idx]
    groups_ = [groups[j] for j in row_idx]

    # Pad to the longest sample of this type only, then flatten each sample.
    max_len = max([arr.shape[0] for arr in X_])
    X_ = [np.pad(arr, ((0, max_len - arr.shape[0]), (0, 0)), 'constant', constant_values=0) for arr in X_]
    X_ = [np.concatenate(x) for x in X_]
    X_ = np.asarray(X_)

    trainX, testX, trainY, testY = split(X_, y_, groups_)

    clf = RandomForestClassifier(n_estimators=60, min_samples_split=4, random_state=SEED)

    clf.fit(trainX, trainY)
    results.append({
        'shape': X_.shape,
        'train_accuracy': np.round(accuracy_score(trainY, clf.predict(trainX)), 2),
        'test_accuracy': np.round(accuracy_score(testY, clf.predict(testX)), 2)
    })

print(pd.DataFrame(results))

         shape  train_accuracy  test_accuracy
0   (120, 322)             1.0           0.97
1   (119, 350)             1.0           0.94
2   (120, 434)             1.0           0.97
3   (120, 378)             1.0           0.92
4   (120, 623)             1.0           0.97
5  (120, 1043)             1.0           0.97
6  (120, 1316)             1.0           0.94
7   (120, 735)             1.0           0.97
