# Imports

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sliding_window import SlidingWindow

# Globals

In [2]:
df = pd.read_csv('full_data.gz', compression='gzip')
df = df.drop(['Unnamed: 0'], axis=1).set_index('time')

In [3]:
num_experiments = 16
num_participants = 24
exclude = 10
analytic_functions_list = ['mean', 'sum', 'median', 'min', 'max', 'std']
labels_dict = {'wlk': 0, 'sit': 1, "std": 2, "ups": 3, "jog": 4, "dws": 5}

# Helper Functions

In [4]:
def data_allocation(df):
    # Define X,Y
    df = df.sample(frac=1).reset_index(drop=True)
    X, y = df.drop(["action"], axis=1), df["action"]
    y = y.replace(labels_dict)
    
    # Divide to training, validation and test set 70%, 10%, 20%
    num_training = int(df.shape[0] * 0.7)
    num_validation = int(df.shape[0] * 0.1)
    X_train, y_train = X[:num_training], y[:num_training]
    X_vald, y_vald = X[num_training:num_training + num_validation], y[num_training:num_training + num_validation]
    X_test, y_test = X[num_training + num_validation:], y[num_training + num_validation:]
    
    return X_train, y_train, X_vald, y_vald, X_test, y_test

In [5]:
def data_allocation_train_dev(df):
    # Define X,Y
    df = df.sample(frac=1).reset_index(drop=True)
    X, y = df.drop(["action"], axis=1), df["action"]
    y = y.replace(labels_dict)
    
    # Divide to training, and validation 90%, 10%
    num_training = int(df.shape[0] * 0.9)
    X_train, y_train = X[:num_training], y[:num_training]
    X_vald, y_vald = X[num_training:], y[num_training:]
    
    return X_train, y_train, X_vald, y_vald

In [6]:
def data_allocation_only_train(df):
    # Define X,Y
    df = df.sample(frac=1).reset_index(drop=True)
    X, y = df.drop(["action"], axis=1), df["action"]
    y = y.replace(labels_dict)
    
    return X, y

In [7]:
from sklearn.metrics import classification_report, confusion_matrix

def create_classes(labels_dict):
    classes_indexs = labels_dict.items()
    classes_indexs = sorted(classes_indexs, key=lambda x: x[1])
    classes_names = [label for label, index in classes_indexs]
    return classes_names

def evaluate_results(y_true, y_pred, classes):
        print("---- Printing classification report ----")
        print(classification_report(y_true, y_pred, target_names=classes))
        
#         accuracy = sum([1 for i in range(len(y_true)) if y_true.tolist()[i] == y_pred[i]]) * 1.0
#         accuracy /= len(y_true)
#         return accuracy

# Model Evaluation

## Prepare Dataframe to Classify

In [8]:
window_size = 20
window = SlidingWindow(df, window_size, num_experiments, num_participants, exclude, analytic_functions_list)
sld_df = window.df
X_train, y_train, X_vald, y_vald = data_allocation_train_dev(sld_df)
classes_names = create_classes(labels_dict)

## Random Forest Classifier

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_jobs=-1, verbose=1)
rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.9min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

### Evaluate Results

In [10]:
prediction = rf.predict(X_vald)
evaluate_results(y_vald, prediction, classes_names)

---- Printing classification report ----
             precision    recall  f1-score   support

        wlk       0.99      1.00      0.99     34390
        sit       1.00      1.00      1.00     33794
        std       1.00      1.00      1.00     30738
        ups       0.99      0.99      0.99     15476
        jog       1.00      1.00      1.00     13214
        dws       1.00      0.98      0.99     12955

avg / total       1.00      1.00      1.00    140567



[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished


### Train over the entire training set (evaluation on the out of time data)

In [11]:
window_size = 20
window = SlidingWindow(df, window_size, num_experiments, num_participants, exclude, analytic_functions_list)
sld_df = window.df
X_train, y_train = data_allocation_only_train(sld_df)
classes_names = create_classes(labels_dict)

In [12]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_jobs=-1, verbose=1)
rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.0min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

### Save trained object as pickle

In [13]:
from sklearn.externals import joblib
joblib.dump(rf, 'rf_trained.pkl')

['rf_trained.pkl']