In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

**This notebook builds the final augmented dataset used to train the model. It has two major parts:**


**1. Post-processing: signal post-processing of the raw data collected from both the human and the robot.**
   1. For human: downsample (decimate) both the EMG and IMU signals to a rate of 10 Hz
   2. For robot: downsampling (decimate) one segment of datapoints to 8 points, add 2 static points at the end of each segment. Integrate 4 segments into single sequence. 
   3. Concatenate 10 repeats for both human and robot data, and then align them to obtain the original dataset.

**2. Data augmentation: apply augmentation to the original dataset in order to train the mVAE model**
   1. Use sklearn.preprocessing.MinMaxScaler to normalize the original data to the range [-1, 1]
   2. Horizontally concatenate all data points at the current time (t) with those at the previous time (t - 1)
   3. Split dataset into training set and testing set at ratio of 80:20
   4. Mask all robot data in the training set with the value -2 to obtain the case 2 dataset; mask all original data at t in the training set with the value -2 to obtain the case 3 dataset
   5. Vertically concatenate original data with case 2 and case 3 data to obtain the final augmented training set

In [None]:
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
from scipy.signal import detrend
from scipy import signal
import math
from sklearn import preprocessing

In [None]:
########### Below is the part for processing emg data ###########
### function for getting emg data in one task for one arm 
def process_emg(i, arm):
  """Load and post-process the Myo EMG recording of one task for one arm.

  Parameters
  ----------
  i : int
    Task number used in the file name (``task_<i>-<arm>_myo-emg.csv``).
  arm : str
    Arm tag used in the file name (the callers pass ``'RL'`` or ``'RU'``).

  Returns
  -------
  pandas.DataFrame
    Columns ``emg_0`` .. ``emg_7`` (detrended, rectified, smoothed and
    decimated by a factor of 20) followed by ``time_elapsed``, an evenly
    spaced time axis from 0 to the duration of the kept rows.
  """
  df_emg = pd.read_csv('/content/drive/MyDrive/finalProject/human_/task_'  + str(i) + '-' + arm +'_myo-emg.csv')
  # slice rows : 10 trials (40s) in total
  df_emg = df_emg.iloc[200:8200, :]

  # Each '.data' cell is a bracketed, comma-separated string; strip the
  # brackets and parse it into one row of channel samples.  The channels go
  # into their own frame so nothing is written back into the sliced input.
  emg_names = ['emg_0', 'emg_1', 'emg_2', 'emg_3', 'emg_4', 'emg_5', 'emg_6', 'emg_7']
  samples = np.vstack([np.fromstring(x[1:-1], sep=',') for x in df_emg['.data']])
  channels = pd.DataFrame(samples, columns=emg_names, index=df_emg.index)

  # time_elapsed: seconds since the first kept sample (vectorised)
  stamps = pd.to_datetime(df_emg['time'])
  time_elapsed = (stamps - stamps.iloc[0]).dt.total_seconds()

  # post_process
  # NOTE(review): scipy recommends chaining several decimate calls when an
  # IIR downsampling factor exceeds 13; q = 20 is kept here so the generated
  # dataset does not change.
  q = 20  # downsampling factor -> 10hz (400 points for 40s) per the original notes
  df_emg_proc = pd.DataFrame()
  for emg in emg_names:
    centered = detrend(channels[emg].to_numpy(), type='constant')   ## remove any constant offset
    rectified = np.abs(centered)                                    ## full-wave rectification
    ## smoothing: moving average over 50 samples (250ms window per the original notes)
    smoothed = pd.Series(rectified).rolling(window=50, min_periods=1).mean()
    df_emg_proc[emg] = signal.decimate(smoothed.to_numpy(), q)

  duration = time_elapsed.tail(1).item()
  df_emg_proc['time_elapsed'] = np.linspace(0, duration, len(df_emg_proc))

  return df_emg_proc
#################################################################

In [None]:
########### Below is the part for processing imu data ###########
### function for getting imu data in one task for one arm 
def process_imu(i, arm):
  """Load and downsample the Myo IMU recording of one task for one arm.

  Parameters
  ----------
  i : int
    Task number used in the file name (``task_<i>-<arm>_myo-imu.csv``).
  arm : str
    Arm tag used in the file name (the callers pass ``'RL'`` or ``'RU'``).

  Returns
  -------
  pandas.DataFrame
    The 4 orientation, 3 angular-velocity and 3 linear-acceleration
    columns (in that order), each decimated by a factor of 5, followed by
    ``time_elapsed``, an evenly spaced time axis from 0 to the duration of
    the kept rows.
  """
  df_imu = pd.read_csv('/content/drive/MyDrive/finalProject/human_/task_'  + str(i) + '-' + arm +'_myo-imu.csv')
  df_imu = df_imu.iloc[50:2050, :]

  # time_elapsed: seconds since the first kept sample (vectorised, and
  # computed without writing new columns into the sliced frame)
  stamps = pd.to_datetime(df_imu['time'])
  time_elapsed = (stamps - stamps.iloc[0]).dt.total_seconds()

  # Output column order matters: later cells slice these columns by position.
  channel_names = [
      '.orientation.x', '.orientation.y', '.orientation.z', '.orientation.w',
      '.angular_velocity.x', '.angular_velocity.y', '.angular_velocity.z',
      '.linear_acceleration.x', '.linear_acceleration.y', '.linear_acceleration.z',
  ]

  # post_process
  q = 5  # downsampling factor -> 10hz (400 points for 40s) per the original notes
  df_imu_proc = pd.DataFrame()
  for name in channel_names:
    df_imu_proc[name] = signal.decimate(df_imu[name].to_numpy(), q)

  duration = time_elapsed.tail(1).item()
  df_imu_proc['time_elapsed'] = np.linspace(0, duration, len(df_imu_proc))

  return df_imu_proc
###################################################################

In [None]:
########### Below is the part for processing robot data ###########
### function for getting robo data segment: 4 seg in one sequence
def build_robo_segment(task, trial, seg):
  """Load one robot trajectory segment and resample it to 10 rows.

  Parameters
  ----------
  task, trial, seg : int
    Identify the file ``<task>/trial_<trial>_<seg>-points_data.csv``.

  Returns
  -------
  pandas.DataFrame
    Columns ``pos_1`` .. ``pos_7`` (decimated joint positions) and
    ``proc_time``.  When decimation yields fewer than 10 rows, the last row
    is repeated ("static points") until there are 10.
  """
  df_robo = pd.read_csv('/content/drive/MyDrive/finalProject/robot_/' + str(task) + '/trial_' + str(trial) + '_' + str(seg) + '-points_data.csv')

  # Each '.position' cell is a bracketed, comma-separated string holding
  # 9 values; strip the brackets and parse it into one row.
  pos_names = ['pos_1', 'pos_2', 'pos_3', 'pos_4', 'pos_5', 'pos_6', 'pos_7', 'f_pos_1', 'f_pos_2']
  positions = np.vstack([np.fromstring(x[1:-1], sep=',') for x in df_robo['.position']])

  # extract joint position data (first 7 of the 9 values); .copy() so the
  # time column below is added to an independent frame, not to a slice
  joints_pos_names = pos_names[:7]
  df_robo_pos = pd.DataFrame(positions, columns=pos_names, index=df_robo.index)[joints_pos_names].copy()
  df_robo_pos['real_time'] = np.arange(len(df_robo_pos)) / 10

  # process robo data: decimate so that at most 8 points remain
  # (Gaussian noise injection was disabled in the original and stays off)
  q = math.ceil(len(df_robo_pos) / 8)  # downsampling factor, same for every joint
  df_robo_proc = pd.DataFrame()
  for pos in joints_pos_names:
    df_robo_proc[pos] = signal.decimate(df_robo_pos[pos].to_numpy(), q, n=3)  # n is the order of filter
  duration = df_robo_pos['real_time'].tail(1).item()
  df_robo_proc['proc_time'] = np.linspace(0, duration, len(df_robo_proc))

  ## add more static points in the end: repeat the final row up to 10 rows.
  ## DataFrame.append was removed in pandas 2.0, so pad with a single concat.
  n_missing = 10 - len(df_robo_proc)
  if n_missing > 0:
    end_pos = df_robo_proc.tail(1)
    df_robo_proc = pd.concat([df_robo_proc] + [end_pos] * n_missing, ignore_index=True)

  return df_robo_proc

In [None]:
### function for building integrated robo data: pos and vel for whole single trial
def build_robo_integrated(task, trial):
  """Join the 4 segments of one trial and derive joint velocities.

  Segments 1..4 are loaded with ``build_robo_segment`` and stacked in order.
  ``proc_time`` is then overwritten with an evenly spaced axis from 0 to 4,
  and ``vel_1`` .. ``vel_7`` are added as the finite difference of the
  matching ``pos_*`` column over ``proc_time``; the first velocity is 0.
  """
  segments = [build_robo_segment(task, trial, seg=idx) for idx in range(1, 5)]
  ## integrate one trial
  df_robo_integrated = pd.concat(segments, ignore_index=True)

  ### fabricate time stamp and generate vel
  df_robo_integrated['proc_time'] = np.linspace(0, 4, len(df_robo_integrated))
  dt = np.diff(df_robo_integrated['proc_time'].to_numpy())
  for joint in range(1, 8):
    pos_col = 'pos_' + str(joint)
    vel_col = 'vel_' + str(joint)
    dpos = np.diff(df_robo_integrated[pos_col].to_numpy())
    # start from 0 vel, then one difference quotient per following row
    df_robo_integrated[vel_col] = np.concatenate(([0], dpos / dt))

  return df_robo_integrated

In [None]:
### function for getting repeats for 10 trials in one task
def process_robo(k):
  """Stack trials 1..10 of task ``k`` (each built by ``build_robo_integrated``) into one frame with a fresh index."""
  trials = [build_robo_integrated(task=k, trial=number) for number in range(1, 11)]
  return pd.concat(trials, ignore_index=True)
########################################################################

In [None]:
########### Below is the part for aligning human and robot data ########
### function for getting the original dataset
def get_original_dataset(n):  # n is the nums of task
  """Build the aligned human + robot dataset for tasks 1..n.

  For each task the frames are placed side by side in the order
  RL imu, RL emg, RU imu, RU emg, robot (time columns dropped); the
  per-task frames are then stacked vertically with a fresh index.
  """
  task_list = []
  for task in range(1, n + 1):
    parts = []
    for arm in ('RL', 'RU'):
      parts.append(process_imu(task, arm).drop(columns=['time_elapsed']))
      parts.append(process_emg(task, arm).drop(columns=['time_elapsed']))
    parts.append(process_robo(task).drop(columns=['proc_time']))
    ### align human/robo data for one task (column-wise, matched on row index)
    task_list.append(pd.concat(parts, axis=1))

  return pd.concat(task_list, ignore_index=True)

In [None]:
### obtain the original dataset
# get_original_dataset(5) processes tasks 1..5 and stacks them into one frame.
df_original_dataset = get_original_dataset(5)
### save as csv
# %cd is an IPython magic that changes the kernel's working directory, so the
# relative paths used by this and the later to_csv/read_csv calls resolve
# inside the project folder.
%cd /content/drive/MyDrive/finalProject
# to_csv is called with its defaults, so the row index is written as the first column.
df_original_dataset.to_csv('original_data_only_t.csv')
########################################################################

In [None]:
########### Below is the part for data augmentation ########
### normalize within -1 and 1 -> coln by coln
from sklearn.preprocessing import MinMaxScaler

# fit_transform learns each column's min/max and applies the scaling in one
# pass; the separate scaler.fit() that used to precede it only repeated the fit.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed further down - confirm this is intended.
scaler = MinMaxScaler(feature_range=(-1, 1))
scaled = scaler.fit_transform(df_original_dataset)
scaled_original_dataset = pd.DataFrame(scaled, columns=df_original_dataset.columns)
scaled_original_dataset

In [None]:
### pair every sample (t) with its predecessor (t - 1)
# Dropping the first row gives the "current" samples and dropping the last row
# gives the "previous" ones, so both frames have the same number of rows.
# NOTE(review): the shift runs over the whole stacked dataset, so a pair can
# straddle two trials or two tasks - confirm this is acceptable.
df_original_dataset_cur = scaled_original_dataset.iloc[1:]
df_original_dataset_prev = scaled_original_dataset.iloc[:-1]
# index 0 -> cur, index 1 -> prev
dataset_list = [df_original_dataset_cur, df_original_dataset_prev]

In [None]:
#[df_RL_imu, df_RL_emg, df_RU_imu, df_RU_emg, df_robo_pos, robo_vel]
### function for build a dataset with data at t and at t-1
def create_cur_prev_dataset(i):
  """Split ``dataset_list[i]`` into its six feature groups.

  ``i`` selects the frame: 0 for the current samples, 1 for the previous
  ones.  Returns ``[RL_imu, RL_emg, RU_imu, RU_emg, robo_pos, robo_vel]``,
  each cut out by column position and re-indexed from 0.
  """
  frame = dataset_list[i]
  # (start, stop) column positions, in the order of the returned list
  spans = [(0, 10), (10, 18), (18, 28), (28, 36), (36, 43), (43, 50)]
  return [frame.iloc[:, start:stop].reset_index(drop=True) for start, stop in spans]

In [None]:
cur_data_list = create_cur_prev_dataset(0)
prev_data_list = create_cur_prev_dataset(1)

# Interleave the groups so each feature group at t is immediately followed by
# the same group at t - 1: [RL_imu(t), RL_imu(t-1), RL_emg(t), RL_emg(t-1), ...]
item_list = [group for pair in zip(cur_data_list, prev_data_list) for group in pair]

data_with_cur_prev = pd.concat(item_list, axis=1)
# 1999 rows × 100 columns
data_with_cur_prev.to_csv('original_data_with_cur_prev.csv')

In [None]:
####split dataset to train 80%, test 20%
from sklearn.model_selection import train_test_split
# A fixed random_state makes the shuffled split (and therefore the CSV files
# written below) identical on every Restart & Run All.
# NOTE(review): rows are shuffled individually and every row also carries its
# t - 1 neighbour, so test rows share samples with training rows - confirm
# this is acceptable for evaluation.
training_data, testing_data = train_test_split(data_with_cur_prev, test_size=0.2, random_state=42)
training_data.to_csv('raw_training_data.csv')
testing_data.to_csv('testing_data.csv')

In [None]:
### load training data
# case 1: original data
# Reload the saved training split: skiprows=1 drops the header line and
# header=None gives the columns plain integer labels, index_col=[0] consumes
# the saved index column, and reset_index renumbers the rows from 0.
raw_training_frame = pd.read_csv('raw_training_data.csv', header=None, skiprows=1, index_col=[0])
training_data = raw_training_frame.reset_index(drop=True)  # 1599 rows × 100 columns

In [None]:
#### augment for training data
# case 2: mask robot data
# Columns 0..71 (the human IMU/EMG features) are kept; the last 28 columns
# (robot position/velocity at t and t - 1) are replaced by the mask value -2.
n_training_rows = training_data.shape[0]
masked_robo = pd.DataFrame(np.full((n_training_rows, 28), -2))
human_features = training_data.iloc[:, :72]
training_no_robo = pd.concat([human_features, masked_robo], axis=1)
training_no_robo.columns = training_data.columns
# 1599 rows × 100 columns
training_no_robo.to_csv('training_no_robo.csv')

In [None]:
# case 3: mask data at t
# Every "current" (t) feature group is replaced by a block of -2 of the same
# width, while the matching "previous" (t - 1) group is kept as is.
n_training_rows = training_data.shape[0]
masked_imu_cur = pd.DataFrame(np.full((n_training_rows, 10), -2))
masked_emg_cur = pd.DataFrame(np.full((n_training_rows, 8), -2))
masked_robo_cur = pd.DataFrame(np.full((n_training_rows, 7), -2))

# (mask standing in for the group at t, column span of the group at t - 1)
cur_prev_layout = [
    (masked_imu_cur, (10, 20)),    # RL imu
    (masked_emg_cur, (28, 36)),    # RL emg
    (masked_imu_cur, (46, 56)),    # RU imu
    (masked_emg_cur, (64, 72)),    # RU emg
    (masked_robo_cur, (79, 86)),   # robot position
    (masked_robo_cur, (93, 100)),  # robot velocity
]
aug_item_list = []
for mask_block, (start, stop) in cur_prev_layout:
  aug_item_list.append(mask_block)
  aug_item_list.append(training_data.iloc[:, start:stop])

training_no_cur = pd.concat(aug_item_list, axis=1)
training_no_cur.columns = training_data.columns
# 1599 rows × 100 columns
training_no_cur.to_csv('training_no_cur.csv')

In [None]:
## stack case 1 (original), case 2 (robot masked) and case 3 (t masked)
## vertically, renumbering the rows
aug_training_list = [training_data, training_no_robo, training_no_cur]
aug_training_data = pd.concat(aug_training_list, ignore_index=True)
# 4797 rows × 100 columns
aug_training_data

In [None]:
### aug + original
# The unmasked training data is repeated three times (once per case) and put
# to the right of the augmented rows, giving each augmented row its original
# counterpart in the last 100 columns.
original_training_data_for_label = pd.concat([training_data] * 3, ignore_index=True)
final_list = [aug_training_data, original_training_data_for_label]
final_aug_training = pd.concat(final_list, axis=1)
### save final aug training data
# 4797 rows × 200 columns
final_aug_training.to_csv('final_aug_training_data.csv')