In [1]:
import pandas as pd

df = pd.read_csv('data/WISDM_ar_v1.1_raw.txt', header=None, names=['user', 'label', 'timestamp', 'x', 'y', 'z'], comment=';')
df = df.sort_values('timestamp')


In [2]:
# The sampling rate to use
SAMPLING_RATE = 20.0

# Window length to use for classification (Samples @ 10 Hz)
WINDOW_LENGTH = 100 

# Stride length to use between windows (decrease to increase dataset size)
WINDOW_STRIDE = WINDOW_LENGTH * 5

# Mapping from label index to label name
IDX_TO_LABELS = df.label.unique()

# Mapping from label name to label index
LABELS_TO_IDX = {label:idx for idx, label in enumerate(IDX_TO_LABELS)}

In [3]:
import numpy as np
from utils import sliding_window, split_user_df
from preprocessing import resample_df

# Split the label DataFrame per user
users = df.groupby(('label', 'user'))

wdws = []
y = []
ids = []
for (label, user), user_df in users:
    
    # Split the user DataFrame per recording session
    splits = split_user_df(user_df)
    
    for split_df in splits:

        # Calculate the timestamp index in seconds
        ts =(split_df.timestamp - split_df.timestamp.iloc[0]) / 1e9

        if ts.iloc[-1] < 10:
            # Skip sessions that are smaller than 10 seconds
            continue
            
        # Resample the DataFrame to SAMPLING_RATE
        split_df = resample_df(split_df, SAMPLING_RATE)
        
        # Calculate sliding windows
        for i in xrange(0, len(split_df) - WINDOW_LENGTH, WINDOW_STRIDE):
            wdws.append(split_df.iloc[i:i+WINDOW_LENGTH])
            y.append(LABELS_TO_IDX[label])
            ids.append(int(user))
         
        
y = np.asarray(y)
ids = np.asarray(ids)

In [5]:
y

array([3, 3, 3, ..., 0, 0, 0])

In [149]:
from sklearn.decomposition import PCA
import math


DATA_COLUMNS = ['x', 'y', 'z']

X = []

for wdw_df in wdws:
    # Calculate PCA for window
    pca_values = PCA(len(DATA_COLUMNS)).fit_transform(wdw_df.loc[:, DATA_COLUMNS].fillna(0))
    pca_wdw_df = pd.DataFrame(pca_values, columns=DATA_COLUMNS)
    
    # calculate FFT for axis X
    ffts_x = np.abs(np.fft.fft(pca_wdw_df.x.values))
    ffts_x = ffts_x[1:ffts_x.shape[-1]/2 + 1]
    
    ffts_y = np.abs(np.fft.fft(pca_wdw_df.y.values))
    ffts_y = ffts_y[1:ffts_y.shape[-1]/2 + 1]
    
    ffts_z = np.abs(np.fft.fft(pca_wdw_df.z.values))
    ffts_z = ffts_x[1:ffts_z.shape[-1]/2 + 1]
    
    #calculate the mean of the amplitudes of the vectors
    vector_ampli = 0.0
    x1=0.0
    y1=0.0
    z1=0.0

    
    for i in range(0,len(wdw_df.index)):
              #print('-------------')
              #print(i)
              x1 = wdw_df.x.iloc[i]**2
              #print(x)
              y1 = wdw_df.y.iloc[i]**2
              #print(y)
              z1= wdw_df.z.iloc[i]**2
              #print(z)
              vector_ampli1= math.sqrt(x1+y1+z1)
              vector_ampli += vector_ampli1
              
              #print('---')
              #print(vector_ampli)

              #print(vector_ampli1)
              #print(vector_ampli)
          
    
    vector_ampli_mean=np.float64(vector_ampli/(len(wdw_df.index)))
    
    
    # Calculate features #wdws is a list of dataframes. 
    features = np.array([
        wdw_df.x.mean(), # Orientation features
        wdw_df.y.mean(), # Orientation features
        wdw_df.z.mean(), # Orientation features
        
        #1
        #wdw_df.x.std(),
        #wdw_df.y.std(),
        #wdw_df.z.std(),
        
        wdw_df.x.diff(periods=2).mean(),
        wdw_df.y.diff(periods=2).mean(),
        wdw_df.x.diff(periods=2).mean(),
        ffts_x.mean(),
        ffts_y.mean(),
        ffts_z.mean(),
        ffts_x.max(),
        ffts_y.max(),
        ffts_z.mean(),
        
        
        #2
        vector_ampli_mean,
        
        pca_wdw_df.x.std(),
        pca_wdw_df.y.std(),
        pca_wdw_df.z.std(),

    ])
    #print(type(wdw_df.x.mean()))
    #print(type(vector_ampli_mean))
    # Concatenate fft X with features
    features = np.concatenate((features, ffts_x))
    
    X.append(features)
    
X = np.nan_to_num(np.asarray(X), 0)    
print(type(X))


<type 'numpy.ndarray'>


In [150]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import random
import math

# Fix random values
random.seed = 1234
np.random.seed(1234)

# Create Random Forest model
model = RandomForestClassifier(n_estimators=40)


# Apply cross validation on the users
nr_folds = 5
uniq_ids = np.unique(ids)
kf = KFold(n_splits=nr_folds, shuffle=True)
splits = kf.split(uniq_ids)
    
cm = np.zeros([len(LABELS_TO_IDX), len(LABELS_TO_IDX)])
for fold, (train_idx, test_idx) in enumerate(splits):
    # Determine train & test user indices for this fold
    train_idx = np.in1d(ids, uniq_ids[train_idx])
    test_idx = np.in1d(ids, uniq_ids[test_idx])

    # Determine train & test window indices for this fold

    train_X, train_y = X[train_idx], y[train_idx]
    test_X, test_y = X[test_idx], y[test_idx]

    # Train the model on train set
    model.fit(train_X, train_y)

    # Perform prediction on test set
    pred_y = model.predict(test_X)
    
    # Calculate confusion matrix
    cm += confusion_matrix(y_true=test_y, y_pred=pred_y)


# Calculate the mean of the Confusion Matrix
cm = cm / np.float(nr_folds)

# normalize Confusion matrix to 1 per class
cm = cm / np.sum(cm, axis=1).reshape(-1, 1).astype(float)
accuracy = cm.diagonal().sum() / len(cm)

# convert Confusion Matrix to DataFrame
cm = pd.DataFrame((100*cm).round(1))
cm.columns = IDX_TO_LABELS
cm.index = IDX_TO_LABELS

print cm
print
print "Accuracy: {}%".format((accuracy * 100).round(1))


            Walking  Jogging  Upstairs  Downstairs  Standing  Sitting
Walking        90.3      4.3       2.3         3.0       0.0      0.0
Jogging         1.2     96.8       1.3         0.7       0.0      0.0
Upstairs       33.9     13.0      45.8         6.5       0.7      0.0
Downstairs     46.3      2.6      13.0        38.1       0.0      0.0
Standing        0.0      0.0       1.9         0.0      97.2      0.9
Sitting         0.0      0.0       2.4         0.0       8.7     89.0

Accuracy: 76.2%
