In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# Read the gzip-compressed CSV, discard the 'Unnamed: 0' column and use 'time' as the row index.
df = (
    pd.read_csv('full_data.gz', compression='gzip')
    .drop(['Unnamed: 0'], axis=1)
    .set_index('time')
)

<h5> Creating rolling windows data frame <br> <br>
Perform separately for each experiment and participant </h5>

In [11]:
def create_sld_df_single_exp(orig_df, window_size, analytic_functions_list):
    """Build rolling-window summary features for a single recording.

    Every name in ``analytic_functions_list`` is looked up as a method on
    ``rolling(window=window_size)`` of the non-label columns (for example
    ``'mean'`` or ``'std'``) and applied to all of them. Each resulting column
    is renamed to ``<column>_sld_<function>``.

    Parameters
    ----------
    orig_df : pandas.DataFrame
        Frame with the columns to summarise plus an ``action`` label column.
    window_size : int
        Number of consecutive rows in each rolling window. It is also the
        number of leading rows removed from the result.
    analytic_functions_list : list of str
        Names of aggregation methods available on the pandas rolling object.

    Returns
    -------
    pandas.DataFrame
        One group of columns per requested function, followed by the
        ``action`` column, with the first ``window_size`` rows removed.
    """
    feature_df = orig_df.drop('action', axis=1)
    # The rolling object holds no aggregation state, so one instance can serve every function.
    roller = feature_df.rolling(window=window_size)

    def summarize(func):
        stats_df = getattr(roller, func)()
        stats_df.columns = [col + "_sld_" + func for col in stats_df.columns]
        # .iloc makes the cut explicitly positional. Plain [] slicing with
        # integers has been interpreted as label-based on float indexes by
        # some pandas versions, which would trim the wrong rows for a
        # float-valued index.
        #
        # NOTE(review): with the default min_periods the first complete window
        # sits at position window_size - 1, so this cut also discards one fully
        # populated row. It is kept because the notebook's row-count sanity
        # check assumes exactly window_size rows are dropped per recording.
        return stats_df.iloc[window_size:]

    frames = [summarize(func) for func in analytic_functions_list]
    # [['action']] keeps the labels as a DataFrame (not a Series) for concat.
    frames.append(orig_df[['action']].iloc[window_size:])
    return pd.concat(frames, axis=1)

In [12]:
def create_sliding_df(orig_df, window_size, analytic_functions_list, expirements, participants):
    """Compute rolling-window features separately for every recording and stack them.

    A recording is the set of rows sharing one ``action_file_index`` value
    (taken from ``expirements``) and one ``partc`` value (taken from
    ``participants``). Each recording is windowed on its own by
    ``create_sld_df_single_exp``.

    Parameters
    ----------
    orig_df : pandas.DataFrame
        Full data set containing ``partc``, ``action_file_index`` and
        ``action`` columns next to the columns to summarise.
    window_size : int
        Rolling window length passed through to ``create_sld_df_single_exp``.
    analytic_functions_list : list of str
        Rolling aggregation names passed through to ``create_sld_df_single_exp``.
    expirements : iterable
        ``action_file_index`` values to process (outer iteration order).
    participants : iterable
        ``partc`` values to process (inner iteration order).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all per-recording results; the original
        index is discarded and replaced by a fresh one (``ignore_index=True``).
    """
    id_columns = ['partc', 'action_file_index']

    def roll_single(exp_id, part_id):
        # Isolate one participant's rows for one experiment, then remove the id columns.
        selector = (orig_df['partc'] == part_id) & (orig_df['action_file_index'] == exp_id)
        recording = orig_df[selector].drop(id_columns, axis=1)
        return create_sld_df_single_exp(recording, window_size, analytic_functions_list)

    rolled_frames = [
        roll_single(exp_id, part_id)
        for exp_id in expirements
        for part_id in participants
    ]
    return pd.concat(rolled_frames, axis=0, ignore_index=True)

In [13]:
# Rolling aggregations applied to every feature column.
analytic_functions_list = ['mean', 'sum', 'median', 'min', 'max', 'std']
# list(...) keeps the concatenation valid on Python 3, where range objects
# do not support `+`; on Python 2 the result is unchanged.
exps = list(range(1, 9 + 1)) + list(range(11, 16 + 1))  # no experiment 10
parts = list(range(1, 24 + 1))  # 24 participants
# Window of 5 rows per recording.
smp_df = create_sliding_df(df, 5, analytic_functions_list, exps, parts)

#### Sanity check

There are 15 experiments and 24 participants in each experiment <br>
for a sliding window of 5 samples we are losing 5 data samples of each experiment and participant <br>
that sums up to 15 \* 24 \* 5 = 1800 <br>
and indeed the new data set has exactly 1800 rows fewer than the original data set <br>
and on the other hand exactly 12 \* {num_analytical_functions} + 1 label column = 73 columns

In [14]:
# Compare shapes and label distributions of the windowed and the original data.
# Single-argument print(...) calls produce the same output on Python 2 and Python 3.
print(smp_df.shape)
print(df.shape)
print(smp_df["action"].value_counts())
print(df["action"].value_counts())

(1411065, 73)
(1412865, 15)
wlk    343928
sit    338538
std    306187
ups    156925
jog    133991
dws    131496
Name: action, dtype: int64
wlk    344288
sit    338778
std    306427
ups    157285
jog    134231
dws    131856
Name: action, dtype: int64


In [15]:
# Preview the first ten rows of the windowed feature table.
smp_df.head(n=10)

Unnamed: 0,attitude.roll_sld_mean,attitude.pitch_sld_mean,attitude.yaw_sld_mean,gravity.x_sld_mean,gravity.y_sld_mean,gravity.z_sld_mean,rotationRate.x_sld_mean,rotationRate.y_sld_mean,rotationRate.z_sld_mean,userAcceleration.x_sld_mean,...,gravity.x_sld_std,gravity.y_sld_std,gravity.z_sld_std,rotationRate.x_sld_std,rotationRate.y_sld_std,rotationRate.z_sld_std,userAcceleration.x_sld_std,userAcceleration.y_sld_std,userAcceleration.z_sld_std,action
0,1.508554,-0.706678,0.673377,0.758879,0.649301,-0.04735,0.196176,-0.468312,0.259279,0.117882,...,0.003272,0.004684,0.017458,0.382297,0.632768,0.250362,0.10831,0.116636,0.114529,dws
1,1.493986,-0.702162,0.670343,0.760919,0.645864,-0.058632,0.096389,-0.713965,0.186258,0.089878,...,0.001428,0.003722,0.022704,0.186595,0.392238,0.138518,0.092437,0.135215,0.121979,dws
2,1.476773,-0.698904,0.667445,0.761873,0.643369,-0.071936,0.159258,-0.740368,0.128828,0.101607,...,0.001863,0.004684,0.0232,0.13311,0.336004,0.114425,0.083133,0.139628,0.110567,dws
3,1.462108,-0.69635,0.662537,0.76249,0.641413,-0.083269,0.219216,-0.441307,0.095103,0.117863,...,0.001892,0.004412,0.017546,0.08985,0.574062,0.163152,0.061337,0.141907,0.077013,dws
4,1.451934,-0.694174,0.655578,0.763086,0.639746,-0.091161,0.3231,-0.195455,0.062607,0.097644,...,0.001318,0.003033,0.011454,0.186474,0.584735,0.189152,0.041021,0.072323,0.088291,dws
5,1.44351,-0.692718,0.645077,0.76327,0.63863,-0.097683,0.458693,-0.006869,-0.00869,0.060476,...,0.000953,0.001109,0.005775,0.284283,0.501848,0.179057,0.045057,0.081946,0.054787,dws
6,1.434988,-0.692222,0.631006,0.762689,0.638249,-0.104221,0.592232,0.021458,-0.067833,0.026447,...,0.001598,0.000847,0.009983,0.362475,0.44657,0.084216,0.078813,0.094388,0.054921,dws
7,1.419934,-0.691447,0.612276,0.761246,0.637651,-0.115761,0.803665,-0.310815,-0.06123,0.008124,...,0.00262,0.002061,0.02493,0.367425,0.955397,0.096754,0.07697,0.082838,0.063028,dws
8,1.390893,-0.688407,0.588771,0.758659,0.635295,-0.138175,0.985352,-0.958218,0.029531,-0.007472,...,0.004881,0.005469,0.046122,0.244593,1.335131,0.18525,0.067937,0.039975,0.135531,dws
9,1.346832,-0.681855,0.563726,0.754544,0.630187,-0.17245,1.087661,-1.708406,0.159823,-0.01535,...,0.007962,0.010706,0.067513,0.149034,1.485272,0.268468,0.053306,0.029768,0.163452,dws


##### Shuffle the data and split it into train and test sets

In [16]:
# smp_df = smp_df.sample(frac=1).reset_index(drop=True) # shuffle the dataset
# X, y = smp_df.drop(["action"], axis=1), smp_df["action"]

In [17]:
# num_training = int(smp_df.shape[0] * 0.8)
# # use 80% for training and 20% for test. If parameter tuning is needed, use cross-validation, not the test data!
# X_train, y_train = X[:num_training], y[:num_training]
# X_test, y_test = X[num_training:], y[num_training:]

##### Save as pickle file

In [18]:
# Write the windowed data set to a pickle file in the working directory.
smp_df.to_pickle(path="sliding_window_5.pkl")