In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# Read the gzip-compressed CSV, discard the leftover 'Unnamed: 0' column
# and use the 'time' column as the index of the frame.
df = (
    pd.read_csv('full_data.gz', compression='gzip')
    .drop(['Unnamed: 0'], axis=1)
    .set_index('time')
)

In [11]:
def create_history_encoded_single_exp(orig_df, history_length):
    """Append the previous `history_length` samples of every feature as extra columns.

    All columns of `orig_df` except the last one are treated as features (the
    last column is expected to be the label, e.g. ``action``, and is not
    duplicated). For every lag ``i`` in ``1..history_length`` and every feature
    column ``c`` a new column named ``prev_{i}_{c}`` is added that holds the
    value of ``c`` from ``i`` rows earlier.

    Parameters
    ----------
    orig_df : pandas.DataFrame
        Samples of a single recording in chronological row order. It is not
        modified.
    history_length : int
        Number of previous samples to encode; must be non-negative.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the lagged columns (grouped by lag),
        without the first `history_length` rows, whose history is incomplete.

    Raises
    ------
    ValueError
        If `history_length` is negative.
    """
    if history_length < 0:
        raise ValueError("history_length must be non-negative, got {0}".format(history_length))

    features = orig_df[orig_df.columns[:-1]]  # omit the label column, we don't want to duplicate it
    lagged_blocks = [
        features.shift(lag).add_prefix("prev_{0}_".format(lag))
        for lag in range(1, history_length + 1)
    ]
    # A single concat builds a new frame (orig_df stays untouched) and avoids
    # inserting the history columns one at a time.
    hist_df = pd.concat([orig_df] + lagged_blocks, axis=1)

    # Use .iloc so the first rows are dropped by position; plain [] slicing may be
    # interpreted as label-based on a float index in some pandas versions.
    return hist_df.iloc[history_length:]

In [12]:
def create_history_encoded_df(orig_df, history_length, expirements, participants):
    """History-encode every (experiment, participant) recording and stack the results.

    For each experiment id in `expirements` and each participant id in
    `participants`, the rows of `orig_df` whose 'action_file_index' and 'partc'
    columns match are selected, those two columns are dropped, and the
    remaining frame is passed to `create_history_encoded_single_exp`.
    The per-recording results are concatenated row-wise with a fresh index.
    """
    bookkeeping_cols = ['partc', 'action_file_index']

    def encode_recording(exp_id, part_id):
        # rows that belong to this participant in this experiment only
        mask = (orig_df['partc'] == part_id) & (orig_df['action_file_index'] == exp_id)
        recording = orig_df[mask].drop(bookkeeping_cols, axis=1)
        return create_history_encoded_single_exp(recording, history_length)

    encoded_recordings = [
        encode_recording(exp_id, part_id)
        for exp_id in expirements
        for part_id in participants
    ]
    return pd.concat(encoded_recordings, axis=0, ignore_index=True)

In [13]:
# list() keeps the concatenation valid on Python 3 too, where range objects cannot be added
exps = list(range(1, 9 + 1)) + list(range(11, 16 + 1)) # no expirement 10
parts = list(range(1, 24 + 1)) # 24 participants

# every sample is extended with its 5 previous samples; can try encoding more/less history
hist_df = create_history_encoded_df(df, history_length=5, expirements=exps, participants=parts)

#### Sanity check

There are 15 experiments and 24 participants in each experiment <br>
for history encoded data with a history length of 5 samples we are losing 5 data samples from each (experiment, participant) recording <br>
that sums up to 15 \* 24 \* 5 = 1800 <br>
and indeed in the new data set there are exactly 1800 rows fewer than the original data set <br>
and on the other hand exactly 12 \* (history_length + 1 (for original data)) + 1 label column = 73 columns

In [14]:
# Compare the encoded frame with the original one: shapes first, then label counts.
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(hist_df.shape)
print(df.shape)
print(hist_df["action"].value_counts())
print(df["action"].value_counts())

(1411065, 73)
(1412865, 15)
wlk    343928
sit    338538
std    306187
ups    156925
jog    133991
dws    131496
Name: action, dtype: int64
wlk    344288
sit    338778
std    306427
ups    157285
jog    134231
dws    131856
Name: action, dtype: int64


##### Shuffle the data and divide to train and test

In [15]:
# hist_df = hist_df.sample(frac=1).reset_index(drop=True) # shuffle the dataset
# X, y = hist_df.drop(["action"], axis=1), hist_df["action"]

In [16]:
# num_training = int(hist_df.shape[0] * 0.8)
# # use 80% for training and 20% for test. if parameter tuning is needed use cross-validation, not the test data!
# X_train, y_train = X[:num_training], y[:num_training]
# X_test, y_test = X[num_training:], y[num_training:]

##### Save the history data frame as pickle

In [17]:
# Persist the history-encoded frame; the "5" in the file name is the history length used above.
pd.to_pickle(hist_df, "history_5_encoded.pkl")