# MotiononSense Dataset
### Problem definition: predict user's activity based on smartphone sensors data

---
## Part 3:

* Extracting Real World Data 
* Evaluation on Real World Data 
* Neural Models - Training and Evaluation on Real World Data
* Applicative Predictions Smoothing
* Final Results, Conculsions & Application

---
## Extracting Real World Data


* We used the [Core Motion Framework for iOS devices](https://developer.apple.com/documentation/coremotion/cmdevicemotion) to extract sensors data from our phones
* More details on the app we built can be found in our final document
* We recorded sensors data while performing different activties and extracted labeled data samples 
* On the next section we will load our data and use it as a test set to evaluate the performace of our Random Forest model

In [1]:
import numpy as np
import pandas as pd
import os

class NewDataLoader():
    
    def __init__(self, folder_path):
        self.data_path = folder_path
    
    def load_all_expirements(self):
        df = None
        exp_index = 1
        for filename in os.listdir(self.data_path):
            file_path = os.path.join(self.data_path, filename)
            extension = os.path.splitext(file_path)[1]
            if extension == '.csv':
                current_df = self.load_single_test_expirement(file_path, exp_index)
                exp_index += 1
                if df is None:
                    df = current_df
                else:
                    df = df.append(current_df)
        return df

    def load_single_test_expirement(self, path_to_file, exp_index, partc_id=1):
        cols_to_drop = ["timestamp", "timeIntervalSince1970", 'magneticField.x', 
                        'magneticField.y', 'magneticField.z', 'magneticField.accuracy']
        file_name = path_to_file.split(os.sep)[-1]
        name, file_type = file_name.split('.')
        action = name[:3]
        exp_df = pd.read_csv(path_to_file)
        exp_df = exp_df.drop(cols_to_drop, axis=1)
        exp_df["partc"] = partc_id
        exp_df["action"] = action
        exp_df["action_file_index"] = exp_index
        return exp_df

In [2]:
PROJECT_MAIN_DIR = os.getcwd()
path = os.path.join(PROJECT_MAIN_DIR, 'real-data')
data_loader = NewDataLoader(path)
real_test_df = data_loader.load_all_expirements()

We will load also our original data set and use it as a training data 

In [3]:
train_df = pd.read_csv(os.path.join(PROJECT_MAIN_DIR,'full_data.gz'), compression='gzip') # we will load our data saved as a compressed csv file
train_df = train_df.drop(['Unnamed: 0'], axis=1).set_index('time')

---
## Evaluation on Real World Data

Now, we will encode both samples with our Sliding Window encoding, train our Random Forest model over the entire old data and evaluate it's performance on the real world data

In [4]:
class SlidingWindow:
    
    def __init__(self, orig_df, window_size, num_experiments, num_participants, exclude, fnlist):
        exps = [i for i in range(1,num_experiments + 1) if i != exclude]
        parts = [i for i in range(1,num_participants + 1)]
        smp_df = self.create_sliding_df(orig_df, window_size, fnlist, exps, parts)
        self.window_size = window_size
        self.df = smp_df

    def create_sld_df_single_exp(self, orig_df, window_size, analytic_functions_list):
        dfs_to_concate = []
        base_df = orig_df.drop('action', axis=1)
        for func in analytic_functions_list:
            method_to_call = getattr(base_df.rolling(window=window_size), func)
            analytic_df = method_to_call()
            analytic_df = analytic_df[window_size:]
            analytic_df.columns = [col + "_sld_" + func for col in analytic_df.columns]
            dfs_to_concate.append(analytic_df)

        action_df = orig_df[['action']][window_size:] # [[]] syntax to return DataFrame and not Series
        dfs_to_concate.append(action_df)
        return pd.concat(dfs_to_concate,axis=1)

    def create_sliding_df(self, orig_df, window_size, analytic_functions_list, expirements, participants):
        dfs_to_concate = []
        cols_to_drop = ['partc', 'action_file_index']
        for e in expirements:
            for p in participants:
                exp_df = orig_df[(orig_df['partc'] == p) & (orig_df['action_file_index'] == e)]
                exp_df = exp_df.drop(cols_to_drop, axis=1)
                exp_roll_df = self.create_sld_df_single_exp(exp_df, window_size, analytic_functions_list)

                dfs_to_concate.append(exp_roll_df)
        return pd.concat(dfs_to_concate, axis=0, ignore_index=True)

In [5]:
# defining variables for the sliding window data frame creation
num_experiments = 16
num_participants = 24
exclude = 10
analytic_functions_list = ['mean', 'sum', 'median', 'min', 'max', 'std']
WINDOW_SIZE = 10

# create the sliding window data frame
train_win_df = SlidingWindow(train_df, WINDOW_SIZE, num_experiments, num_participants, exclude, analytic_functions_list)
train_win_df = train_win_df.df

In [6]:
num_experiments = 18
num_participants = 1
exclude = 0
analytic_functions_list = ['mean', 'sum', 'median', 'min', 'max', 'std']
WINDOW_SIZE = 10

real_test_df["partc"] = 1
test_win_df = SlidingWindow(real_test_df, WINDOW_SIZE, num_experiments, num_participants, exclude, analytic_functions_list)
test_win_df = test_win_df.df

In [7]:
from sklearn.metrics import classification_report, confusion_matrix

class DataProcessingEval():
    
    def __init__(self, origin_df, labels_dict):
        self.labels_dict = labels_dict
        self.classes_names = self.create_classes(labels_dict)
        self.df = origin_df
    
    def create_samples(self, division_ratio=[0.7, 0.1, 0.2]):
        # Define X, y
        df = self.df.sample(frac=1).reset_index(drop=True)
        X, y = df.drop(["action"], axis=1), df["action"]
        y = y.replace(self.labels_dict)

        # Divide to training, validation and test set
        train_ratio, dev_ratio = division_ratio[0], division_ratio[1]
        num_training = int(df.shape[0] * train_ratio)
        num_validation = int(df.shape[0] * dev_ratio)
        
        X_train, y_train = X[:num_training], y[:num_training]
        X_vald, y_vald = X[num_training:num_training + num_validation], y[num_training:num_training + num_validation]
        X_test, y_test = X[num_training + num_validation:], y[num_training + num_validation:]

        return X_train, y_train, X_vald, y_vald, X_test, y_test

    def create_classes(self, labels_dict):
        classes_indexs = labels_dict.items()
        classes_indexs = sorted(classes_indexs, key=lambda x: x[1])
        classes_names = [label for label, index in classes_indexs]
        return classes_names

    def evaluate_results(self, y_true, y_pred):
            print("---- Printing classification report ----")
            print(classification_report(y_true, y_pred, target_names=self.classes_names))

In [8]:
labels = {'wlk': 0, 'sit': 1, "std": 2, "ups": 3, "jog": 4, "dws": 5}

win_train_processor = DataProcessingEval(train_win_df, labels_dict=labels)
X_train, y_train, _, _, _, _  = win_train_processor.create_samples([1.0, 0, 0])

win_test_processor = DataProcessingEval(test_win_df, labels_dict=labels)
X_test_real, y_test_real, _, _, _, _ = win_test_processor.create_samples([1.0, 0, 0])

Training over the entire original data and evaluating on new test data

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10, n_jobs=-1, verbose=1)
rf.fit(X_train, y_train)
rf_test_predictions = rf.predict(X_test_real)
win_test_processor.evaluate_results(y_test_real, rf_test_predictions)

[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   48.1s remaining:   32.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.3min finished


---- Printing classification report ----
             precision    recall  f1-score   support

        wlk       0.65      0.48      0.55     52652
        sit       0.98      0.69      0.81     35225
        std       0.96      0.96      0.96     36561
        ups       0.40      0.74      0.52     21800
        jog       0.00      0.00      0.00         0
        dws       0.40      0.45      0.42     19547

avg / total       0.72      0.66      0.68    165785



[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.1s finished
  'recall', 'true', average, warn_for)


**Conclusions so far:**

* We excluded the "jogging" activity because we didn't perform this activity in the data we created from our app
* As predicted, the results on real world data are much worse compared to results over our original test set
* We are still predicting "sit" and "stand" activities quite well but our current model is having hard time identifying "upstairs" and "down stairs"
* Next, we will try to use a stronger, neural models, hoping that it will help us increasing our performance over the real test data

---
## Neural Models - Training and Evaluation

**Encoding for Neural Models**

* The first model we will try is a simple feed forward network with one hidden layer
* Feed forward nets, like classic ML models, cannot use sequence as input so we will have to use one of our previous encodings 
* We will choose our sliding window encoding first, since it out-performed our raw history encoding
* We hope that our model can create a better representation of the data in it's hidden layer and thus increase the generalization ability of the model

### Feed Forward Neural Network

In [10]:
train_win_df.shape

(1409265, 73)

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

dropout_rate = 0.5

ff_model = Sequential()
ff_model.add(Dense(32, input_shape=(X_train.shape[1],), activation='relu'))  # hidden layer size is 32
ff_model.add(Dropout(dropout_rate))  # adding dropout layer
ff_model.add(Dense(6, activation='softmax'))  # applying softmax and cross entorpy loss
ff_model.compile(loss='categorical_crossentropy',optimizer='adam')

Using TensorFlow backend.


In [12]:
from keras.utils import to_categorical

num_activities = 6
y_train_one_hot = np.array([to_categorical(t, num_activities) for t in y_train])

# tranform y to one hot encoding vector of length 6 (we have 6 activities)
ff_model.fit(X_train, y_train_one_hot, batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x11e5bcda0>

In [13]:
y_test_real_one_hot = np.array([to_categorical(t, num_activities) for t in y_test_real])
ff_predictions = ff_model.predict(X_test_real)
ff_test_predictions = np.array([np.argmax(prediction) for prediction in ff_predictions])
win_test_processor.evaluate_results(y_test_real, ff_test_predictions)

---- Printing classification report ----
             precision    recall  f1-score   support

        wlk       0.80      0.52      0.63     52652
        sit       0.88      0.97      0.92     35225
        std       0.89      0.97      0.93     36561
        ups       0.32      0.66      0.43     21800
        jog       0.00      0.00      0.00         0
        dws       0.39      0.16      0.23     19547

avg / total       0.73      0.69      0.68    165785



  'recall', 'true', average, warn_for)


**Evaluate results:**
* Results for Feed Forward neural network look like the Random Forest ones
* Try to add another hidden layer and see if significant improvement occurs

In [14]:
ff2_model = Sequential()
ff2_model.add(Dense(32, input_shape=(X_train.shape[1],), activation='relu'))  # first hidden layer size is 32
ff2_model.add(Dense(32, input_shape=(X_train.shape[1],), activation='relu'))  # second hidden layer size is 32
ff2_model.add(Dropout(dropout_rate))  # adding dropout layer
ff2_model.add(Dense(6, activation='softmax'))  # applying softmax and cross entorpy loss
ff2_model.compile(loss='categorical_crossentropy',optimizer='adam')
ff2_model.fit(X_train, y_train_one_hot, batch_size=32, epochs=5)

ff2_predictions = ff2_model.predict(X_test_real)
ff2_test_predictions = np.array([np.argmax(prediction) for prediction in ff2_predictions])
win_test_processor.evaluate_results(y_test_real, ff2_test_predictions)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
---- Printing classification report ----
             precision    recall  f1-score   support

        wlk       0.84      0.46      0.59     52652
        sit       0.94      0.97      0.96     35225
        std       0.95      0.97      0.96     36561
        ups       0.42      0.74      0.54     21800
        jog       0.00      0.00      0.00         0
        dws       0.35      0.44      0.39     19547

avg / total       0.77      0.71      0.72    165785



  'recall', 'true', average, warn_for)


**Intermediate neural conclusions:**
* Normal feed forward does not perform much better than random forest
* Adding another layer did not improve at all the F1-score
* This might be because our sliding window is not an ideal input for time series neural network
* Try a recurrent neural network such as LSTM instead

### LSTM Recurrent Neural Network
* We would need to use raw data, and not summarized like Sliding Window
* Results may differ between vector sizes of **raw history** encoding
* Ideally we would predict each experiment separately but our data only contains dozens of experiments
* This means that another discreteziation method is required
* We will again use a **predefined vector size** (i.e. 10) in our raw history encoding
* Thus, We do not expect significant improvement comparing to the Feed Forward models

In [15]:
class RawHistory:
    
    def __init__(self, origin_df, history_length, num_experiments, num_participants, exclude):
        exps = [i for i in range(1,num_experiments + 1) if i != exclude]
        parts = [i for i in range(1,num_participants + 1)]
        smp_df = self.create_history_encoded_df(origin_df, history_length, expirements=exps, participants=parts)
        self.history_length = history_length
        self.df = smp_df

    def create_history_encoded_single_exp(self, orig_df, history_length):
        hist_df = orig_df.copy(deep=True) # later operations are "in place" so we need to avoid changing original dataframe
        columns_to_shift = hist_df.columns[:-1] # omit the action column, we don't want to duplicate it
        for i in range(1,history_length + 1):
            shift_df = orig_df.shift(i)
            for col_name in columns_to_shift:
                new_col_name = "prev_{0}_".format(i) + col_name
                hist_df[new_col_name] = shift_df[col_name] # add shifted column, aka history, as a column to orignal dataframe

        hist_df = hist_df[history_length:] # we don't return the first "history_length" sample - they have missing history data
        return hist_df

    def create_history_encoded_df(self, orig_df, history_length, expirements, participants):
        dfs_to_concate = []
        cols_to_drop = ['partc', 'action_file_index']
        for e in expirements:
            for p in participants:
                exp_df = orig_df[(orig_df['partc'] == p) & (orig_df['action_file_index'] == e)]
                exp_df = exp_df.drop(cols_to_drop, axis=1)
                exp_histoy_df = self.create_history_encoded_single_exp(exp_df, history_length)
                dfs_to_concate.append(exp_histoy_df)
        return pd.concat(dfs_to_concate, axis=0, ignore_index=True)

In [16]:
# defining variables for the raw history data frame creation
num_experiments = 16
num_participants = 24
exclude = 10
HISTORY_LEN = 10

# create the raw history data frame for training
train_hist_df = RawHistory(train_df, HISTORY_LEN, num_experiments, num_participants, exclude)
train_hist_df = train_hist_df.df

In [17]:
# create the raw history data frame for testing
num_experiments = 18
num_participants = 1
exclude = 0
HISTORY_LEN = 10

test_hist_df = RawHistory(real_test_df, HISTORY_LEN, num_experiments, num_participants, exclude)
test_hist_df = test_hist_df.df

In [18]:
labels = {'wlk': 0, 'sit': 1, "std": 2, "ups": 3, "jog": 4, "dws": 5}

hist_train_processor = DataProcessingEval(train_hist_df, labels_dict=labels)
X_train, y_train, _, _, _, _  = hist_train_processor.create_samples([1.0, 0, 0])

hist_test_processor = DataProcessingEval(test_hist_df, labels_dict=labels)
X_test_real, y_test_real, _, _, _, _ = hist_test_processor.create_samples([1.0, 0, 0])

* The LSTM can deal with sequence of inputs, thus we don't need to explicitly encode our raw history as one long vectort as before.
* We will pass to the LSTM model a sequence of 11 data points (10 history and actual data point)
* Each data point is a vector of length 12 (our 12 origianl features)
* Thus, we need to transform our raw encoding long vector to a sequence of data points

In [27]:
NUM_FEATURES = 12
X_train = np.array(X_train)
X_train = np.flip(X_train, axis=1)  # reverse the sequence from past to present
X_train = X_train.reshape(-1, HISTORY_LEN + 1, NUM_FEATURES)  # reshape for LSTM sequence input (num_samples, 11, 12)

X_test_real = np.array(X_test_real)
X_test_real = np.flip(X_test_real, axis=1)
X_test_real = X_test_real.reshape(-1, HISTORY_LEN + 1, NUM_FEATURES)

In [25]:
from keras.layers import LSTM, Embedding

dropout_rate = 0.5
max_input_len,  data_point_dim = X_train.shape[1], X_train.shape[2]

lstm_model = Sequential()
lstm_model.add(LSTM(32, return_sequences=False, dropout=dropout_rate, input_shape=(max_input_len, data_point_dim, )))
lstm_model.add(Dense(6, activation='softmax'))  # applying softmax and cross entorpy loss
lstm_model.compile(loss='categorical_crossentropy',optimizer='adam')

In [26]:
from keras.utils import to_categorical

num_activities = 6
y_train_one_hot = np.array([to_categorical(t, num_activities) for t in y_train])

# tranform y to one hot encoding vector of length 6 (we have 6 activities)
lstm_model.fit(X_train, y_train_one_hot, batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x11eabec88>

Evaluating the LSTM model results on the real test data

In [63]:
lstm_model_predictions = lstm_model.predict(X_test_real)
lstm_model_test_predictions = np.argmax(lstm_model_predictions, axis=1)
hist_test_processor.evaluate_results(y_test_real, lstm_model_test_predictions)

---- Printing classification report ----
             precision    recall  f1-score   support

        wlk       0.81      0.48      0.60     52652
        sit       0.48      0.97      0.64     35225
        std       0.13      0.00      0.00     36561
        ups       0.39      0.81      0.52     21800
        jog       0.00      0.00      0.00         0
        dws       0.43      0.38      0.40     19547

avg / total       0.49      0.51      0.44    165785



  'recall', 'true', average, warn_for)


---
## Applicative Predictions Smoothing

In [20]:
PROJECT_MAIN_DIR = os.getcwd()
path = os.path.join(PROJECT_MAIN_DIR, 'real-data')
data_loader = NewDataLoader(path)
real_test_df = data_loader.load_all_expirements()

In [21]:
real_test_df

Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,partc,action,action_file_index
0,-0.167633,0.179202,0.030154,-0.164177,-0.178245,-0.970193,-0.613843,1.339701,0.479796,0.118248,0.071449,-0.070273,1,ups,1
1,-0.154363,0.171377,0.034900,-0.151499,-0.170540,-0.973635,-0.738818,1.409082,0.606134,0.117502,0.042686,0.038011,1,ups,1
2,-0.141519,0.160889,0.039637,-0.139226,-0.160196,-0.977217,-1.048560,1.327002,0.602302,0.103520,0.058405,0.177809,1,ups,1
3,-0.130109,0.146934,0.043551,-0.128345,-0.146405,-0.980863,-1.415643,1.152735,0.554232,0.072650,0.022306,0.264677,1,ups,1
4,-0.120020,0.128703,0.046674,-0.118741,-0.128348,-0.984595,-1.875311,1.009978,0.522600,-0.014910,-0.006723,0.293112,1,ups,1
5,-0.110287,0.106239,0.048990,-0.109443,-0.106040,-0.988321,-2.280296,1.012389,0.464374,-0.050790,0.031104,0.224405,1,ups,1
6,-0.097879,0.081424,0.050295,-0.097399,-0.081334,-0.991916,-2.466738,1.343851,0.356050,-0.032926,0.112645,0.153538,1,ups,1
7,-0.081616,0.056942,0.051152,-0.081394,-0.056911,-0.995056,-2.372062,1.697859,0.293400,0.027988,0.159221,0.006546,1,ups,1
8,-0.063591,0.035518,0.052560,-0.063509,-0.035510,-0.997349,-2.003520,1.807552,0.296102,0.098329,0.103732,-0.231075,1,ups,1
9,-0.045582,0.019461,0.054475,-0.045557,-0.019460,-0.998772,-1.429766,1.786230,0.263532,0.114100,0.047566,-0.373146,1,ups,1


In [14]:
def majority_smooth(predictions):
    most_common = max(set(predictions), key=predictions.count)
    return [most_common] * len(predictions)