# MotionSense Dataset
### Problem definition: predict a user's activity based on smartphone sensor data

---
## Part 3:

* Extracting Real World Data 
* Evaluation on Real World Data 
* Neural Models - Training and Evaluation on Real World Data
* Applicative Predictions Smoothing

---
## Extracting Real World Data


* We used the [Core Motion Framework for iOS devices](https://developer.apple.com/documentation/coremotion/cmdevicemotion) to extract sensor data from our phones
* More details on the app we built can be found in our final document
* We recorded sensor data while performing different activities and extracted labeled data samples
* In the next section we will load our data and use it as a test set to evaluate the performance of our Random Forest model

In [1]:
import numpy as np
import pandas as pd
import os

class NewDataLoader():
    """Load labeled sensor recordings stored as one CSV file per experiment.

    Every ``.csv`` file directly inside ``folder_path`` is treated as one
    recording. The first three characters of the file name are used as the
    activity label of all rows in that file.
    """

    def __init__(self, folder_path):
        self.data_path = folder_path

    def load_all_expirements(self):
        """Load every ``.csv`` file of the data folder into one DataFrame.

        Files are visited in sorted name order, so the ``action_file_index``
        given to each recording (1, 2, ...) is the same on every run;
        ``os.listdir`` alone returns entries in arbitrary order.

        Returns:
            The row-wise concatenation of all recordings (each recording
            keeps its own row index), or ``None`` when the folder contains
            no ``.csv`` file.
        """
        frames = []
        for filename in sorted(os.listdir(self.data_path)):
            file_path = os.path.join(self.data_path, filename)
            if os.path.splitext(file_path)[1] != '.csv':
                continue
            # experiment indices are 1-based and count only the CSV files
            frames.append(self.load_single_test_expirement(file_path, len(frames) + 1))

        if not frames:
            return None
        # a single concat replaces repeated DataFrame.append, which copied the
        # accumulated frame on every file and no longer exists in pandas 2.x
        return pd.concat(frames)

    def load_single_test_expirement(self, path_to_file, exp_index, partc_id=1):
        """Read one recording and attach its bookkeeping columns.

        Args:
            path_to_file: path of the CSV file to read.
            exp_index: value stored in the ``action_file_index`` column.
            partc_id: value stored in the ``partc`` column (default 1).

        Returns:
            The recording without the time and magnetic-field columns, plus
            the ``partc``, ``action`` and ``action_file_index`` columns.

        Raises:
            KeyError: if one of the columns to drop is missing from the file.
        """
        cols_to_drop = ["timestamp", "timeIntervalSince1970", 'magneticField.x',
                        'magneticField.y', 'magneticField.z', 'magneticField.accuracy']
        # splitext keeps working when the base name itself contains dots
        name = os.path.splitext(os.path.basename(path_to_file))[0]
        action = name[:3]
        exp_df = pd.read_csv(path_to_file)
        exp_df = exp_df.drop(cols_to_drop, axis=1)
        exp_df["partc"] = partc_id
        exp_df["action"] = action
        exp_df["action_file_index"] = exp_index
        return exp_df

In [2]:
# Locate the 'real-data' folder under the current working directory and load
# every CSV recording found there into a single DataFrame.
PROJECT_MAIN_DIR = os.getcwd()
path = os.path.join(PROJECT_MAIN_DIR, 'real-data')
data_loader = NewDataLoader(path)
real_test_df = data_loader.load_all_expirements()  # None when the folder holds no .csv file

We will also load our original dataset and use it as training data

In [3]:
# Read the gzip-compressed CSV 'full_data.gz' from the project directory
train_df = pd.read_csv(os.path.join(PROJECT_MAIN_DIR,'full_data.gz'), compression='gzip') # we will load our data saved as a compressed csv file
# Drop the 'Unnamed: 0' column (presumably a row index written when the file was saved - TODO confirm)
# and use the 'time' column as the row index
train_df = train_df.drop(['Unnamed: 0'], axis=1).set_index('time')

---
## Evaluation on Real World Data

Now, we will encode both samples with our Sliding Window encoding, train our Random Forest model over the entire old data and evaluate its performance on the real world data

In [4]:
class SlidingWindow:
    """Encode sensor recordings as rolling-window statistics.

    The encoded frame is available as ``self.df`` and the window length that
    produced it as ``self.window_size``.
    """

    def __init__(self, orig_df, window_size, num_experiments, num_participants, exclude, fnlist):
        # experiment ids run from 1 to num_experiments, minus the excluded id;
        # participant ids run from 1 to num_participants
        experiment_ids = [idx for idx in range(1, num_experiments + 1) if idx != exclude]
        participant_ids = list(range(1, num_participants + 1))
        encoded = self.create_sliding_df(orig_df, window_size, fnlist, experiment_ids, participant_ids)
        self.window_size = window_size
        self.df = encoded

    def create_sld_df_single_exp(self, orig_df, window_size, analytic_functions_list):
        """Encode one recording.

        For every name in ``analytic_functions_list`` the rolling statistic of
        that name is computed over all columns except 'action'; the first
        ``window_size`` rows are dropped and each column is renamed to
        ``<column>_sld_<name>``. The 'action' column, cut the same way, is
        appended as the last column.
        """
        features = orig_df.drop(columns='action')

        def rolled(stat_name):
            # look the statistic up by name on the rolling object and apply it
            stats = getattr(features.rolling(window=window_size), stat_name)()
            stats = stats[window_size:]
            suffix = "_sld_" + stat_name
            stats.columns = [column + suffix for column in stats.columns]
            return stats

        parts = [rolled(stat_name) for stat_name in analytic_functions_list]
        # double brackets keep the label as a DataFrame rather than a Series
        parts.append(orig_df[['action']][window_size:])
        return pd.concat(parts, axis=1)

    def create_sliding_df(self, orig_df, window_size, analytic_functions_list, expirements, participants):
        """Encode every (experiment, participant) recording and stack the results.

        Recordings are processed experiment by experiment and, within an
        experiment, participant by participant. The returned frame has a
        fresh 0..n-1 row index.
        """
        bookkeeping_cols = ['partc', 'action_file_index']

        def encode(exp_id, part_id):
            selector = (orig_df['partc'] == part_id) & (orig_df['action_file_index'] == exp_id)
            recording = orig_df[selector].drop(bookkeeping_cols, axis=1)
            return self.create_sld_df_single_exp(recording, window_size, analytic_functions_list)

        encoded_recordings = [encode(exp_id, part_id)
                              for exp_id in expirements
                              for part_id in participants]
        return pd.concat(encoded_recordings, axis=0, ignore_index=True)

In [5]:
# defining variables for the sliding window data frame creation
num_experiments = 16  # SlidingWindow iterates experiment ids 1..16
num_participants = 24  # SlidingWindow iterates participant ids 1..24
exclude = 10  # this experiment id is skipped by SlidingWindow
analytic_functions_list = ['mean', 'sum', 'median', 'min', 'max', 'std']  # rolling statistics computed per column
WINDOW_SIZE = 10  # rolling window length, in rows

# create the sliding window data frame
train_win_df = SlidingWindow(train_df, WINDOW_SIZE, num_experiments, num_participants, exclude, analytic_functions_list)
train_win_df = train_win_df.df  # keep only the encoded DataFrame

In [6]:
# Same sliding-window encoding as for the training data, applied to the real-world recordings
num_experiments = 18  # experiment ids 1..18 are looked up; presumably one per recorded CSV file - TODO confirm
num_participants = 1
exclude = 0  # experiment ids start at 1, so no experiment is skipped
analytic_functions_list = ['mean', 'sum', 'median', 'min', 'max', 'std']
WINDOW_SIZE = 10

real_test_df["partc"] = 1  # every real-world row belongs to participant 1
test_win_df = SlidingWindow(real_test_df, WINDOW_SIZE, num_experiments, num_participants, exclude, analytic_functions_list)
test_win_df = test_win_df.df  # keep only the encoded DataFrame

In [7]:
from sklearn.metrics import classification_report, confusion_matrix

class DataProcessingEval():
    """Turn an encoded DataFrame into shuffled train/validation/test samples
    and print classification reports for predictions.

    ``labels_dict`` maps each activity code found in the 'action' column to
    its integer class id.
    """

    def __init__(self, origin_df, labels_dict):
        self.labels_dict = labels_dict
        self.classes_names = self.create_classes(labels_dict)
        self.df = origin_df

    def create_samples(self, division_ratio=(0.7, 0.1, 0.2), random_state=None):
        """Shuffle the data and split it into train, validation and test sets.

        Args:
            division_ratio: sequence whose first two entries are the fractions
                of rows used for training and validation; all remaining rows
                form the test set.
            random_state: optional seed forwarded to ``DataFrame.sample`` so
                the shuffle can be reproduced; ``None`` keeps it random.

        Returns:
            ``X_train, y_train, X_vald, y_vald, X_test, y_test`` where every
            ``X`` holds the feature columns and every ``y`` the class ids.
        """
        # Define X, y
        df = self.df.sample(frac=1, random_state=random_state).reset_index(drop=True)
        X, y = df.drop(["action"], axis=1), df["action"]
        # Map activity codes to class ids; codes missing from labels_dict are
        # left unchanged. Mapping (rather than Series.replace) yields an
        # integer column without relying on replace's deprecated downcasting.
        y = y.map(lambda label: self.labels_dict.get(label, label))

        # Divide to training, validation and test set
        train_ratio, dev_ratio = division_ratio[0], division_ratio[1]
        num_training = int(df.shape[0] * train_ratio)
        num_validation = int(df.shape[0] * dev_ratio)
        train_end = num_training
        vald_end = num_training + num_validation

        X_train, y_train = X.iloc[:train_end], y.iloc[:train_end]
        X_vald, y_vald = X.iloc[train_end:vald_end], y.iloc[train_end:vald_end]
        X_test, y_test = X.iloc[vald_end:], y.iloc[vald_end:]

        return X_train, y_train, X_vald, y_vald, X_test, y_test

    def create_classes(self, labels_dict):
        """Return the activity codes ordered by their class id."""
        ordered = sorted(labels_dict.items(), key=lambda item: item[1])
        return [label for label, _ in ordered]

    def evaluate_results(self, y_true, y_pred):
        """Print a classification report with one row per known class."""
        print("---- Printing classification report ----")
        # Pass the class ids explicitly: without `labels`, the report only
        # covers classes present in y_true/y_pred and rejects target_names
        # whenever one of the known classes is absent from both.
        class_ids = sorted(self.labels_dict.values())
        print(classification_report(y_true, y_pred, labels=class_ids,
                                    target_names=self.classes_names))

In [8]:
# activity code -> integer class id
labels = {'wlk': 0, 'sit': 1, "std": 2, "ups": 3, "jog": 4, "dws": 5}

# A [1.0, 0, 0] split puts every (shuffled) row into the first returned pair;
# the validation and test parts are empty and discarded.
win_train_processor = DataProcessingEval(train_win_df, labels_dict=labels)
X_train, y_train, _, _, _, _  = win_train_processor.create_samples([1.0, 0, 0])

win_test_processor = DataProcessingEval(test_win_df, labels_dict=labels)
X_test_real, y_test_real, _, _, _, _ = win_test_processor.create_samples([1.0, 0, 0])

Training over the entire original data and evaluating on new test data

In [9]:
from sklearn.ensemble import RandomForestClassifier
# 10-tree random forest; n_jobs=-1 and verbose=1 are scikit-learn options
# (presumably: use all cores and log progress - see the [Parallel(...)] lines in the output)
rf = RandomForestClassifier(n_estimators=10, n_jobs=-1, verbose=1)
rf.fit(X_train, y_train)  # train on the whole windowed original data set
rf_test_predictions = rf.predict(X_test_real)  # predict the real-world windows
win_test_processor.evaluate_results(y_test_real, rf_test_predictions)  # prints the classification report

[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   55.2s remaining:   36.8s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.5min finished


---- Printing classification report ----
             precision    recall  f1-score   support

        wlk       0.79      0.48      0.60     52652
        sit       0.98      0.70      0.82     35225
        std       0.97      0.97      0.97     36561
        ups       0.44      0.75      0.55     21800
        jog       0.00      0.00      0.00         0
        dws       0.29      0.48      0.36     19547

avg / total       0.76      0.67      0.69    165785



[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.2s finished
  'recall', 'true', average, warn_for)


**Conclusions so far:**

* We excluded the "jogging" activity because we didn't perform this activity in the data we created from our app
* As predicted, the results on real world data are much worse compared to results over our original test set
* We are still predicting "sit" and "stand" activities quite well, but our current model is having a hard time identifying "upstairs" and "downstairs"
* Next, we will try stronger, neural models, hoping that they will help us increase our performance over the real test data

---
## Neural Models - Training and Evaluation

**Encoding for Neural Models**

* The first model we will try is a simple feed forward network with one hidden layer
* Feed forward nets, like classic ML models, cannot use sequence as input so we will have to use one of our previous encodings 
* We will choose our sliding window encoding first, since it out-performed our raw history encoding
* We hope that our model can create a better representation of the data in its hidden layer and thus increase the generalization ability of the model

### Feed Forward Neural Network

In [10]:
train_win_df.shape  # (rows, columns) of the encoded training frame; the columns include 'action'

(1409265, 73)

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

dropout_rate = 0.5  # also reused by the two-hidden-layer model below

# Feed-forward classifier: one hidden layer, dropout, softmax output
ff_model = Sequential()
ff_model.add(Dense(32, input_shape=(X_train.shape[1],), activation='relu'))  # hidden layer size is 32; input width = number of feature columns
ff_model.add(Dropout(dropout_rate))  # adding dropout layer
ff_model.add(Dense(6, activation='softmax'))  # output layer: softmax over the 6 activities
ff_model.compile(loss='categorical_crossentropy',optimizer='adam')  # cross-entropy loss with the Adam optimizer

Using TensorFlow backend.


In [12]:
from keras.utils import to_categorical

num_activities = 6

# transform y to one-hot encoding: one row per sample, one column per activity
# (we have 6 activities). A single vectorized call replaces the original
# per-label Python loop and produces the same matrix.
y_train_one_hot = to_categorical(np.asarray(y_train), num_classes=num_activities)

ff_model.fit(X_train, y_train_one_hot, batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x11e117518>

In [13]:
# one-hot version of the real-world labels (not needed by the report printed below)
y_test_real_one_hot = to_categorical(np.asarray(y_test_real), num_classes=num_activities)
ff_predictions = ff_model.predict(X_test_real)
# the predicted class of each row is the index of its largest softmax output
ff_test_predictions = np.argmax(ff_predictions, axis=1)
win_test_processor.evaluate_results(y_test_real, ff_test_predictions)

---- Printing classification report ----
             precision    recall  f1-score   support

        wlk       0.79      0.56      0.66     52652
        sit       0.96      0.97      0.96     35225
        std       0.90      0.97      0.93     36561
        ups       0.36      0.72      0.48     21800
        jog       0.00      0.00      0.00         0
        dws       0.42      0.20      0.27     19547

avg / total       0.75      0.72      0.71    165785



  'recall', 'true', average, warn_for)


**Evaluate results:**
* Results for Feed Forward neural network look like the Random Forest ones
* Try to add another hidden layer and see if significant improvement occurs

### Two Hidden Layers

In [14]:
# Same classifier as before with a second 32-unit hidden layer
ff2_model = Sequential()
ff2_model.add(Dense(32, input_shape=(X_train.shape[1],), activation='relu'))  # first hidden layer size is 32
# only the first layer declares the input shape; later layers take theirs
# from the previous layer's output
ff2_model.add(Dense(32, activation='relu'))  # second hidden layer size is 32
ff2_model.add(Dropout(dropout_rate))  # adding dropout layer
ff2_model.add(Dense(6, activation='softmax'))  # output layer: softmax over the 6 activities
ff2_model.compile(loss='categorical_crossentropy',optimizer='adam')  # cross-entropy loss with the Adam optimizer
ff2_model.fit(X_train, y_train_one_hot, batch_size=32, epochs=5)

ff2_predictions = ff2_model.predict(X_test_real)
# the predicted class of each row is the index of its largest softmax output
ff2_test_predictions = np.argmax(ff2_predictions, axis=1)
win_test_processor.evaluate_results(y_test_real, ff2_test_predictions)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
---- Printing classification report ----
             precision    recall  f1-score   support

        wlk       0.84      0.52      0.64     52652
        sit       0.93      0.98      0.95     35225
        std       0.95      0.97      0.96     36561
        ups       0.39      0.74      0.51     21800
        jog       0.00      0.00      0.00         0
        dws       0.41      0.35      0.38     19547

avg / total       0.78      0.72      0.73    165785



  'recall', 'true', average, warn_for)


**Intermediate neural conclusions:**
* Normal feed forward does not perform much better than random forest
* Adding another layer barely improved the F1-score (0.71 → 0.73)
* This might be because our sliding window encoding is not an ideal input for a time-series neural network
* Possibly, other architectures such as RNNs would work better, but we do not focus on them here

---
## Applicative Predictions Smoothing

* Here we present a simple concept which works well in practice
* Instead of adding complexity to our model, we smooth the predictions
* We assume that real world activities last at least **3 seconds**
* Thus, we use a majority vote smoothing technique, with a factor of **30 predictions**

Reload real data experiments

In [None]:
# Reload the real-world recordings from the 'real-data' folder under the
# current working directory.
PROJECT_MAIN_DIR = os.getcwd()
path = os.path.join(PROJECT_MAIN_DIR, 'real-data')
data_loader = NewDataLoader(path)
real_test_df = data_loader.load_all_expirements()  # None when the folder holds no .csv file

In [None]:
# Re-encode the real-world recordings with the sliding-window statistics
num_experiments = 18  # experiment ids 1..18 are looked up; presumably one per recorded CSV file - TODO confirm
num_participants = 1
exclude = 0  # experiment ids start at 1, so no experiment is skipped
analytic_functions_list = ['mean', 'sum', 'median', 'min', 'max', 'std']
WINDOW_SIZE = 10

real_test_df["partc"] = 1  # every real-world row belongs to participant 1
test_win_df = SlidingWindow(real_test_df, WINDOW_SIZE, num_experiments, num_participants, exclude, analytic_functions_list)
test_win_df = test_win_df.df  # keep only the encoded DataFrame

The new preprocessing must not shuffle the samples, since smoothing relies on consecutive predictions

In [None]:
class SmoothDataEval:
    """Prepare unshuffled samples, smooth predictions by majority vote and
    print classification reports.

    ``labels_dict`` maps each activity code found in the 'action' column to
    its integer class id; ``smooth_factor`` is stored on the instance.
    """

    def __init__(self, origin_df, labels_dict, smooth_factor):
        self.labels_dict = labels_dict
        self.classes_names = self.create_classes(labels_dict)
        self.df = origin_df
        self.smooth_factor = smooth_factor

    def create_samples(self):
        """Return ``(X, y)`` in the original row order (no shuffling).

        ``X`` holds every column except 'action'; ``y`` holds the class ids.
        """
        # Define X, y
        X, y = self.df.drop(["action"], axis=1), self.df["action"]
        # Map activity codes to class ids; codes missing from labels_dict are
        # left unchanged. Mapping (rather than Series.replace) yields an
        # integer column without relying on replace's deprecated downcasting.
        y = y.map(lambda label: self.labels_dict.get(label, label))
        return X, y

    def create_classes(self, labels_dict):
        """Return the activity codes ordered by their class id."""
        ordered = sorted(labels_dict.items(), key=lambda item: item[1])
        return [label for label, _ in ordered]

    @staticmethod
    def smooth_predictions(predictions, smooth_factor):
        """Replace each run of consecutive predictions by its majority vote.

        The predictions are cut into ``len(predictions) // smooth_factor``
        consecutive batches (at least one). Leftover predictions are spread
        one by one over the leading batches, so every batch holds at least
        ``smooth_factor`` items whenever that many predictions exist. Inside
        a batch every prediction is replaced by the most frequent value; ties
        go to the smallest value.

        Args:
            predictions: 1-D sequence or array of predicted class ids.
            smooth_factor: minimal number of predictions per batch (>= 1).

        Returns:
            A list with the same length as ``predictions``.

        Raises:
            ValueError: if ``smooth_factor`` is smaller than 1.
        """
        if smooth_factor < 1:
            raise ValueError("smooth_factor must be a positive integer")

        values = np.asarray(predictions).tolist()
        total = len(values)
        if total == 0:
            return []

        # Deterministic batch sizes: the batch count is fixed first, then the
        # remainder is distributed, which also covers inputs shorter than
        # smooth_factor and remainders larger than the batch count.
        num_groups = max(total // smooth_factor, 1)
        base_size, extra = divmod(total, num_groups)

        new_predictions = []
        start = 0
        for group in range(num_groups):
            size = base_size + (1 if group < extra else 0)
            batch = values[start:start + size]
            # sorted() makes the tie-break independent of set iteration order
            most_common = max(sorted(set(batch)), key=batch.count)
            new_predictions.extend([most_common] * size)
            start += size

        return new_predictions

    def evaluate_results(self, y_true, y_pred):
        """Print a classification report with one row per known class."""
        print("---- Printing classification report ----")
        # Pass the class ids explicitly: without `labels`, the report only
        # covers classes present in y_true/y_pred and rejects target_names
        # whenever one of the known classes is absent from both.
        class_ids = sorted(self.labels_dict.values())
        print(classification_report(y_true, y_pred, labels=class_ids,
                                    target_names=self.classes_names))

Define our smooth factor and smooth RF predictions, then re-evaluate

In [None]:
SMOOTH_FACTOR = 30  # number of consecutive predictions merged by one majority vote
labels = {'wlk': 0, 'sit': 1, "std": 2, "ups": 3, "jog": 4, "dws": 5}  # activity code -> integer class id
smooth_data_processor = SmoothDataEval(test_win_df, labels, SMOOTH_FACTOR)
X, y = smooth_data_processor.create_samples()  # features and integer labels, in the original row order

In [None]:
rf_predictions = rf.predict(X)  # one prediction per window, in the original row order
# smooth_predictions takes no `self` argument, so it is called through the class
smooth_rf_predictions = SmoothDataEval.smooth_predictions(rf_predictions, SMOOTH_FACTOR)
smooth_data_processor.evaluate_results(y ,smooth_rf_predictions)  # prints the classification report

**Conclusion:**
* The results now look better than the naïve Random Forest.
* We used a very simple smoothing technique with a constant factor
* It's reasonable to assume that more sophisticated filters would yield even better results
* The applicative solution works well in practice
* We now provide the user with a prediction every 3 seconds instead of every 0.1 seconds