### Imports

In [46]:
from __future__ import print_function

# For number crunching
import numpy as np
import pandas as pd
from collections import OrderedDict
from collections import Counter

# For visualisation
import matplotlib.pyplot as plt 
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns 

# For prediction 
import keras
from keras.utils import np_utils 
from keras.models import Sequential 
from keras.layers import Dense, Activation 
from keras import metrics

import networkx as nx
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import explained_variance_score
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Misc
import itertools as it
from itertools import cycle
import json 
import os

# Formatting
% matplotlib inline

sns.set_context('poster')
sns.set_style('darkgrid')

current_palette = cycle(sns.color_palette())

In [68]:
# Load the combined feature/label table. 'participant' is read as object
# (string) so that zero-padded IDs such as '01' are kept verbatim and can be
# matched against the string IDs used for the participant split further down.
df_xy = pd.read_csv('df_xy_with_loc.csv', dtype={'participant': object})
df_xy.head(3)

Unnamed: 0,Kitchen_AP_mean,Lounge_AP_mean,Study_AP_mean,Upstairs_AP_mean,bath,bed1,bed2,hall,index,kitchen,...,p_stand,t_bend,t_kneel_stand,t_lie_sit,t_sit_lie,t_sit_stand,t_stand_kneel,t_stand_sit,t_straighten,t_turn
0,-73.4,-85.2,-500.0,-76.75,0.0,0.0,0.0,0.593548,3.0,0.329032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-90.384615,-75.45,-500.0,-90.8,0.215734,0.046293,0.438427,0.032375,5.0,0.002421,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-500.0,-68.9,-500.0,-93.0,0.215734,0.046293,0.438427,0.032375,5.0,0.002421,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [69]:
# List every column so the feature, label and bookkeeping groups used when
# building X/y below can be read off.
df_xy.columns

Index(['Kitchen_AP_mean', 'Lounge_AP_mean', 'Study_AP_mean',
       'Upstairs_AP_mean', 'bath', 'bed1', 'bed2', 'hall', 'index', 'kitchen',
       'living', 'name', 'participant', 'pir_index', 'pir_name', 'stairs',
       'start', 'study', 'toilet', 'x_max', 'x_mean', 'x_median', 'x_min',
       'x_std', 'y_max', 'y_mean', 'y_median', 'y_min', 'y_std', 'z_max',
       'z_mean', 'z_median', 'z_min', 'z_std', 'end', 'a_ascend', 'a_descend',
       'a_jump', 'a_loadwalk', 'a_walk', 'p_bent', 'p_kneel', 'p_lie', 'p_sit',
       'p_squat', 'p_stand', 't_bend', 't_kneel_stand', 't_lie_sit',
       't_sit_lie', 't_sit_stand', 't_stand_kneel', 't_stand_sit',
       't_straighten', 't_turn'],
      dtype='object')

### Train/Val/Test Split

In [71]:
# Participant IDs are zero-padded strings: '01' ... '10'
participants = ['{:02d}'.format(pid) for pid in range(1, 11)]

np.random.seed(42)

# Hold out 20% of all participants as the test set; the rest is train + val.
test_participants = np.random.choice(
    participants,
    size=int(len(participants) * .20),
    replace=False,
)
train_participants = np.setdiff1d(participants, test_participants)

# From the remaining participants draw the validation set. Its size is again
# 20% of the *full* participant list, not of train_participants.
val_participants = np.random.choice(
    train_participants,
    size=int(len(participants) * .20),
    replace=False,
)
train6_participants = np.setdiff1d(train_participants, val_participants)

#### Populate X_train / y_train and X_val / y_val

In [72]:
# Column groups used to carve the table into features (X) and targets (y).
# label_columns: the 20 annotation columns that are predicted.
# non_feature_columns: the labels plus bookkeeping columns dropped from X.
label_columns = [
    'a_ascend', 'a_descend', 'a_jump', 'a_loadwalk', 'a_walk',
    'p_bent', 'p_kneel', 'p_lie', 'p_sit', 'p_squat', 'p_stand',
    't_bend', 't_kneel_stand', 't_lie_sit', 't_sit_lie', 't_sit_stand',
    't_stand_kneel', 't_stand_sit', 't_straighten', 't_turn',
]
non_feature_columns = label_columns + [
    'name', 'index', 'participant', 'pir_name', 'start', 'end',
]

# Rows of held-out test participants go to df_xy_test; everything else is
# available for training and validation.
is_test = df_xy['participant'].isin(test_participants)
df_xy_train_val = df_xy[~is_test]
df_xy_test = df_xy[is_test]

# Within the remaining rows, separate validation participants from training ones.
is_val = df_xy_train_val['participant'].isin(val_participants)
df_xy_train = df_xy_train_val[~is_val]
df_xy_val = df_xy_train_val[is_val]

# Features are whatever is left after removing labels and bookkeeping columns;
# targets are the label columns in the order listed above.
X_train = df_xy_train.drop(non_feature_columns, axis=1)
y_train = df_xy_train[label_columns]

X_val = df_xy_val.drop(non_feature_columns, axis=1)
y_val = df_xy_val[label_columns]

In [73]:
# Sanity check: X and y must have matching row counts within each split.
for split in (X_train, y_train, X_val, y_val):
    print(split.shape)

(9448, 29)
(9448, 20)
(2468, 29)
(2468, 20)


### Evaluation Metrics

In [43]:
def simple_brier_score(given, predicted):
    """Unweighted Brier score.

    The squared difference between ``given`` and ``predicted`` is summed
    across axis 1 (the label columns of each row) and the per-row sums are
    then averaged.

    Parameters
    ----------
    given : 2-D array or DataFrame of reference label values.
    predicted : 2-D array of predictions, same shape as ``given``.

    Returns
    -------
    The mean over rows of the summed squared error (lower is better).
    """
    squared_error = np.power(given - predicted, 2.0)
    per_row_error = squared_error.sum(axis=1)
    return per_row_error.mean()

In [44]:
def brier_score(given, predicted, weights):
    """Class-weighted Brier score.

    For every row the squared difference between ``given`` and ``predicted``
    is multiplied by the per-class weight and summed over the classes; the
    per-row values are then averaged.

    Parameters
    ----------
    given : 2-D array or DataFrame of reference label values,
        shape (n_rows, n_classes).
    predicted : 2-D array of predictions, same shape as ``given``.
    weights : sequence of n_classes weights, or a dict mapping class index to
        weight. A dict is converted to a vector ordered by ascending key so
        that position i holds the weight of class i.

    Returns
    -------
    float : mean over rows of the weighted squared error (lower is better).
    """
    # Use the argument itself rather than a notebook-level global, so the
    # function works on a fresh kernel and honours whatever the caller passes.
    if isinstance(weights, dict):
        weights = [weights[key] for key in sorted(weights)]
    weights = np.asarray(weights, dtype=float)

    # Subtract first (keeps pandas alignment if DataFrames are passed), then
    # drop to a plain array for the weighted sum.
    squared_error = np.asarray(np.power(given - predicted, 2.0), dtype=float)
    return squared_error.dot(weights).mean()

### Keras Sequential Model

In [74]:
# Training hyper-parameters for the baseline network
batch_size = 128
epochs = 12

# One output unit per label column
output_dim = nb_classes = y_train.shape[1]

# A single dense softmax layer mapping the feature columns directly onto the
# label columns (no hidden layers).
model = Sequential()
model.add(Dense(output_dim, input_dim=X_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='sgd')

# Fit on the training split, reporting loss on the validation split each epoch
history = model.fit(
    X_train.values, y_train.values,
    validation_data=(X_val.values, y_val.values),
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
)

Train on 9448 samples, validate on 2468 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [75]:
# Predictions of the unweighted model on the validation features, and the
# matching reference labels as a plain array.
y_pred = model.predict(X_val.values)
y_true = y_val.values

In [76]:
# Unweighted Brier score of the baseline model on the validation split.
unweighted_sequential_score = simple_brier_score(y_true, y_pred)
unweighted_sequential_score

1.6550117774839868

#### Class Weights

In [77]:
# Per-class weights keyed by class index (0-19, one per label column).
# NOTE(review): how these values were derived is not shown in this notebook -
# confirm their provenance.
class_weights = {
    0: 1.352985,
    1: 1.386846,
    2: 1.595874,
    3: 1.353187,
    4: 0.347784,
    5: 0.661082,
    6: 1.047236,
    7: 0.398865,
    8: 0.207586,
    9: 1.505783,
    10: 0.110181,
    11: 1.078033,
    12: 1.365604,
    13: 1.170241,
    14: 1.193364,
    15: 1.180370,
    16: 1.344149,
    17: 1.116838,
    18: 1.080839,
    19: 0.503152,
}

# The same weights as a flat list where position i is the weight of class i.
# Built by explicit key order so the result never depends on dict iteration
# order.
weight_vector = [class_weights[idx] for idx in sorted(class_weights)]

In [85]:
# Same single-layer softmax network as before, this time trained with
# per-class weights.
batch_size = 128
epochs = 12

# One output unit per label column
output_dim = nb_classes = y_train.shape[1]

wmodel = Sequential()
wmodel.add(Dense(output_dim, input_dim=X_train.shape[1], activation='softmax'))
wmodel.compile(loss='categorical_crossentropy', optimizer='sgd')

# Note: this rebinds `history`, replacing the unweighted run's training log.
history = wmodel.fit(
    X_train.values, y_train.values,
    validation_data=(X_val.values, y_val.values),
    class_weight=class_weights,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
)

Train on 9448 samples, validate on 2468 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [86]:
# Reference labels and the class-weighted model's predictions on the
# validation split. Both names are rebound from the unweighted evaluation.
y_true = y_val.values
y_pred = wmodel.predict(X_val.values)

In [87]:
# Class-weighted Brier score of the weighted model on the validation split.
# This uses brier_score, whereas unweighted_sequential_score was computed with
# simple_brier_score, so the two numbers are on different scales.
weighted_sequential_score = brier_score(y_true, y_pred, weight_vector)
weighted_sequential_score

1.2754997380880049

### Random Forest

In [81]:
# Random forest wrapped in MultiOutputRegressor, which fits one separate
# forest per label column. Depth is capped and the seed fixed for repeatability.
max_depth = 8
multirf = MultiOutputRegressor(RandomForestRegressor(max_depth=max_depth, random_state=42))
multirf.fit(X_train, y_train)

MultiOutputRegressor(estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
           n_jobs=1)

In [82]:
# Unweighted Brier score of the per-label forests on the validation split.
y_multirf = multirf.predict(X_val)
simple_brier_score(y_val, y_multirf)

0.6016969967758322

In [83]:
# A single RandomForestRegressor fitted on the whole label matrix at once
# (no MultiOutputRegressor wrapper), reusing max_depth from the cell above.
rf = RandomForestRegressor(max_depth=max_depth, random_state=42)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [84]:
# Unweighted Brier score of the single multi-output forest on the validation split.
y_rf = rf.predict(X_val)
simple_brier_score(y_val, y_rf)

0.5824893349757798

In [88]:
from sklearn.neighbors import NearestNeighbors

"""
Define a simple class that inherits from sklearn.neighbors.NearestNeighbors. 
We will adjust the fit/predict as necessary
"""
class ProbabilisticKNN(NearestNeighbors):
    """Nearest-neighbour predictor that returns averaged training labels.

    ``NearestNeighbors`` only indexes feature vectors. This subclass also
    keeps the training label matrix, so that a prediction for a query row can
    be formed as the mean label vector of its ``n_neighbors`` closest training
    rows.
    """

    def __init__(self, n_neighbors):
        # Pass by keyword: current scikit-learn releases make the estimator's
        # constructor arguments keyword-only.
        super(ProbabilisticKNN, self).__init__(n_neighbors=n_neighbors)

        # Training labels; populated by fit().
        self.y_train = None

    def fit(self, X_train, y_train):
        """Index ``X_train`` and remember the matching labels.

        Unlike the parent class, both features and labels are required: the
        labels are what ``predict_proba`` averages.

        Parameters
        ----------
        X_train : training features, one row per instance.
        y_train : training labels, row-aligned with ``X_train``.

        Returns
        -------
        self, following the scikit-learn ``fit`` convention.
        """
        # np.copy yields an independent ndarray (also for DataFrame input), so
        # the rows can be addressed by the positional indices that
        # kneighbors() returns.
        self.y_train = np.copy(y_train)

        super(ProbabilisticKNN, self).fit(X_train)
        return self

    def predict_proba(self, X_val):
        """Predict by averaging the labels of the nearest training rows.

        Parameters
        ----------
        X_val : query features, one row per instance.

        Returns
        -------
        ndarray with one row per query instance, holding the mean of the
        stored training labels over that instance's nearest neighbours.
        """
        # Positional indices of the nearest training rows,
        # shape (n_queries, n_neighbors).
        neighbor_idx = self.kneighbors(X_val, return_distance=False)

        # Fancy-index the labels to (n_queries, n_neighbors, ...) and average
        # over the neighbour axis in a single vectorised step.
        return self.y_train[neighbor_idx].mean(axis=1)

# Learn the KNN model
nn = ProbabilisticKNN(n_neighbors=11)
nn.fit(X_train, y_train)

# Predict on the validation instances
y_predicted = nn.predict_proba(X_val)

# Pass the weights as the flat per-class vector, matching the weighted
# evaluation of the Keras model, rather than the class_weights dict.
knn_brier_score = brier_score(y_val, y_predicted, weight_vector)

# NOTE: the message says "test set", but the score is computed on the
# validation split (X_val / y_val).
print("Brier score on test set with the KNN model: ", knn_brier_score)

Brier score on test set with the KNN model:  0.2947101824329915


In [89]:
# Weighted Brier score on the validation split for each neighbourhood size,
# in the same order as k_range.
brier_scores = []

# Sweep k over powers of two: 1, 2, 4, ..., 128
k_range = np.power(2, np.arange(8))
for k in k_range:
    # No trailing comma after print(): that was the Python 2 way to suppress
    # the newline and is a no-op with print_function.
    print("Learning model for k={:3d}".format(k))

    nn = ProbabilisticKNN(n_neighbors=k)
    nn.fit(X_train, y_train)

    predicted = nn.predict_proba(X_val)

    # Flat per-class weight vector, as in the other weighted evaluations.
    brier_scores.append(brier_score(y_val, predicted, weight_vector))

    print("score={:.5f}".format(brier_scores[-1]))

Learning model for k=  1
score=0.48900
Learning model for k=  2
score=0.39094
Learning model for k=  4
score=0.33355
Learning model for k=  8
score=0.30239
Learning model for k= 16
score=0.28820
Learning model for k= 32
score=0.28351
Learning model for k= 64
score=0.28112
Learning model for k=128
score=0.28015
