## Preprocessing .csv

### Changes made to your data
* `Extension.csv`
  - Added 0-timestep for first sample. It was missing.
  - Removed the last sample, as it was incomplete.
  - Removed `-,-,-,-` at the end.
* `Flexion.csv`
  - Removed the last sample, as it was incomplete.
  - Removed `-,-,-,-` at the end.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the Extension and Flexion CSV files into DataFrames.
# (Alternate inputs: './testExt.csv' and './testFlex.csv'.)
ext, flex = (pd.read_csv(csv_path) for csv_path in ('./Extension.csv', './Flexion.csv'))

In [None]:
def convert_to_numpy(df):
    """Return the value columns of ``df`` as a float NumPy array.

    Rows whose 'TimeCoordinate' entry is the placeholder '-' are discarded.
    The first column (time) and then the last remaining column (label) are
    removed; everything left is converted to ``float``.
    """
    # Keep only rows that carry real data (no '-' placeholder in the time column).
    is_data_row = df['TimeCoordinate'] != '-'
    kept = df.loc[is_data_row]

    # NOTE: LSTM inherently has a time component, so the time column is not needed.
    time_col = kept.columns[0]
    without_time = kept.drop(columns=[time_col])

    # The last remaining column holds the label, which is generated separately.
    label_col = without_time.columns[-1]
    features = without_time.drop(columns=[label_col])

    return features.to_numpy().astype(float)

# Convert both DataFrames to float arrays with the same helper.
ext_np, flex_np = map(convert_to_numpy, (ext, flex))

In [None]:
# Compare each DataFrame's shape with the shape of the array converted from it.
ext.shape, ext_np.shape, flex.shape, flex_np.shape

In [None]:
# Number of samples per class, derived from the arrays instead of the
# previously hard-coded row counts (3200 and 3020), so it stays correct when a
# different CSV is loaded. Each sample holds 20 rows x 2 value columns; using
# .size keeps the result the same whether or not the arrays were reshaped yet.
ext_np.size // (20 * 2), flex_np.size // (20 * 2)

In [None]:
# Group consecutive rows into samples of 20 rows x 2 value columns each;
# -1 lets NumPy infer the number of samples.
ext_np, flex_np = (rows.reshape(-1, 20, 2) for rows in (ext_np, flex_np))

ext_np.shape, flex_np.shape

In [None]:
# One label per sample: extension is class 0, flexion is class 1.
ext_labels = np.zeros(len(ext_np))
flex_labels = np.ones(len(flex_np))

In [None]:
# Merge the two classes (extension samples first, then flexion samples)
X = np.concatenate((ext_np, flex_np), axis=0)
y = np.concatenate((ext_labels, flex_labels), axis=0)

# Shuffle along the first axis only, applying the SAME permutation to X and y
# so every sample stays paired with its label.
# A fixed seed makes the shuffle -- and any file saved from it -- reproducible
# across notebook runs.
rng = np.random.default_rng(42)
perm = rng.permutation(len(X))

X_shuffled = X[perm]
y_shuffled = y[perm]

X_shuffled, y_shuffled

In [None]:
# Spot check: show one shuffled sample with its label so it can be compared
# by eye against the original csv.
X_shuffled[1], y_shuffled[1]

In [None]:
# Expected total: 311 samples (160 extension + 151 flexion).
X_shuffled.shape, y_shuffled.shape

In [None]:
# Saving both to an uncompressed .npz archive (np.savez does not compress; np.savez_compressed does)
# np.savez("./shuffled_data.npz", X=X_shuffled, y=y_shuffled) # shuffled_test_data

## Checking shuffled_data.npz

In [None]:
import numpy as np

# Load the saved archive back from disk
# (alternate file: "./shuffled_test_data.npz").
dat = np.load("./shuffled_data.npz")

In [None]:
# Pull the shuffled samples (X) and their labels (y) out of the archive.
X, y = dat['X'], dat['y']

In [None]:
# Shapes of the arrays read back from the file.
X.shape, y.shape

In [None]:
# Inspect the sample at index 1 and its label after the round trip through the file.
X[1], y[1]

## Data Scaling
- [ ] sklearn.preprocessing.RobustScaler (IQR)
- [ ] sklearn.preprocessing.MinMaxScaler (0-1)
- [ ] sklearn.preprocessing.StandardScaler (Z-score)

In [41]:
import numpy as np

# Load from .npz file (alternate file: "./shuffled_test_data.npz").
# np.load returns an NpzFile that keeps the underlying file open; the context
# manager closes it as soon as both arrays have been read into memory.
with np.load("./shuffled_data.npz") as dat:
    X, y = dat['X'], dat['y'] # Shuffled data

In [42]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
import numpy as np

scaler = RobustScaler()

# Scale every sample independently: the scaler is re-fitted on each 2-D slice
# X[i], so its statistics are computed per sample and per column.
X_scaled = np.empty_like(X)
for i, sample in enumerate(X):
    X_scaled[i] = scaler.fit_transform(sample)

# Hand-computed references for sample 0, one per candidate scaler:
# min-max, standard (z-score) and robust (median / IQR).
col_min, col_max = X[0].min(axis=0), X[0].max(axis=0)
X_mm = (X[0] - col_min) / (col_max - col_min)
X_ss = (X[0] - X[0].mean(axis=0)) / X[0].std(axis=0)
q25, q75 = np.quantile(X[0], [0.25, 0.75], axis=0)
X_rs = (X[0] - np.median(X[0], axis=0)) / (q75 - q25)

# Only the reference matching the scaler chosen above should compare equal.
tuple(np.allclose(ref, X_scaled[0]) for ref in (X_mm, X_ss, X_rs))

(False, False, True)

In [43]:
# The scaled array should have the same shape as the input.
X.shape, X_scaled.shape

((311, 20, 2), (311, 20, 2))

In [44]:
# Save the scaled samples together with the (unscaled) labels to an .npz file.
# Presumably the file name is changed to match the scaler in use,
# e.g. "minmaxscaled_data" -- confirm before re-running with another scaler.
np.savez("./robustscaled_data.npz", X=X_scaled, y=y)

### Mix data sources

In [45]:
import numpy as np

# Load the two datasets that are going to be combined.
dat_orig = np.load("./shuffled_data.npz") # original dataset
dat_new = np.load("./shuffled_test_data.npz") # additional dataset

# X, y = dat['X'], dat['y'] # Shuffled data

In [57]:
# Shapes of the samples and labels in each source before they are combined.
dat_orig['X'].shape, dat_new['X'].shape, dat_orig['y'].shape, dat_new['y'].shape

((311, 20, 2), (50, 20, 2), (311,), (50,))

In [53]:
# Stack along the first axis: original samples first, then the new ones,
# with the labels stacked in the same order.
X_mix = np.concatenate((dat_orig['X'], dat_new['X']))
y_mix = np.concatenate((dat_orig['y'], dat_new['y']))

In [54]:
# Write the combined samples and labels to a single .npz file.
np.savez("./unshuffled_mix_data.npz", X=X_mix, y=y_mix)

In [55]:
import numpy as np

# Load the combined dataset back from its .npz file.
# np.load returns an NpzFile that keeps the underlying file open; the context
# manager closes it as soon as both arrays have been read into memory.
with np.load("./unshuffled_mix_data.npz") as dat:
    X, y = dat['X'], dat['y'] # Combined data (per the file name, not re-shuffled)

X.shape, y.shape

((361, 20, 2), (361,))