#### Prepare all data to be windowed 

In [1]:
import pandas as pd 
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

# Seed NumPy's global random generator so np.random-based steps are repeatable.
seed = 7
np.random.seed(seed)

In [2]:
# Directory prefix that is prepended to every file name read or written below.
file_path = '../Data/'
# CSV with the feature columns, and CSV whose 'maneuver' column labels each row.
candidate_file, label_file = 'data_10Hz_candidates.csv','data_10Hz_labeled.csv'
# Window length in rows, and the fraction of a window shared with the next one.
window_size , overlap = 32, 0.5

In [3]:
def data_loader(candidate_file, label_file):
    """Read the candidate and label CSV files found under ``file_path``.

    Args:
        candidate_file: File name of the candidate (feature) CSV.
        label_file: File name of the label CSV.

    Returns:
        A ``(dataset, label)`` tuple of DataFrames, in that order.
    """
    candidates_csv = file_path + candidate_file
    labels_csv = file_path + label_file
    return pd.read_csv(candidates_csv), pd.read_csv(labels_csv)

In [4]:
def data_processing(dataset, label, window_size, overlap):
    """Cut the recording into overlapping windows with one label per window.

    The selected feature columns are taken from ``dataset``, ``pred`` is
    one-hot encoded, and the frame is sliced into windows of ``window_size``
    rows. Each window is labelled with the most frequent ``maneuver`` value
    among its rows, and the labels are encoded as integers following the
    sorted order of the distinct labels.

    Args:
        dataset: DataFrame containing at least the feature columns listed in
            ``feat`` below.
        label: DataFrame with a ``maneuver`` column, one entry per row of
            ``dataset``.
        window_size: Number of rows per window (positive integer).
        overlap: Fraction of a window shared with the next one; must be
            smaller than 1 so that the window start advances.

    Returns:
        ``(X, y)`` where ``X`` is a float array of shape
        ``(n_windows, window_size, n_features)`` and ``y`` is an integer
        array of shape ``(n_windows,)``.

    Raises:
        ValueError: If ``window_size`` is not positive, ``overlap`` is 1 or
            more, ``label`` and ``dataset`` differ in length, or ``dataset``
            has fewer rows than ``window_size``.
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")
    if overlap >= 1:
        raise ValueError(f"overlap must be smaller than 1, got {overlap}")

    feat = ['AccelX', 'AccelY', 'AccelZ', 'GyroX', 'GyroY', 'GyroZ',
            'Heading', 'Speed', 'SAS_Angle', 'lateral_offset', 'pred']

    # 'pred' is categorical, so it is expanded into one indicator column per
    # value; the indicator columns are appended after the other features.
    data = pd.get_dummies(dataset.loc[:, feat], columns=['pred'])

    maneuver = pd.Series(list(label['maneuver']))
    if len(maneuver) != len(data):
        raise ValueError(
            f"label has {len(maneuver)} rows but dataset has {len(data)}"
        )
    if len(data) < window_size:
        raise ValueError(
            f"dataset has {len(data)} rows, fewer than window_size={window_size}"
        )

    # Labels are kept out of the feature matrix: stacking a string column
    # together with the numeric ones would force an object-dtype array, on
    # which np.nan_to_num does nothing.
    features = data.fillna(0).to_numpy(dtype=float)
    maneuver = maneuver.fillna(0)

    # Index of the last window whose end still fits inside the data.
    k = int((len(data) - window_size) / (window_size * (1 - overlap)))
    # Fractional steps are truncated to a whole row offset.
    starts = [int(i * (1 - overlap) * window_size) for i in range(k + 1)]

    X = np.stack([features[s:s + window_size] for s in starts])
    # Replaces any remaining non-finite entries (e.g. inf) with finite numbers.
    X = np.nan_to_num(X)

    # value_counts sorts by frequency, so its first index entry is the
    # majority label of the window.
    majority = [
        maneuver.iloc[s:s + window_size].value_counts().index[0]
        for s in starts
    ]

    # The inverse indices of np.unique are the positions of each label in the
    # sorted list of distinct labels, i.e. an integer label encoding.
    y = np.unique(np.array(majority), return_inverse=True)[1]

    return X, y

In [5]:
# Load both CSVs, then cut them into fixed-length windows with one label each.
dataset0, label0 = data_loader(candidate_file, label_file)

X, y = data_processing(dataset0, label0, window_size , overlap)
# Show how many windows were produced and the per-window array shape.
print(X.shape, y.shape)

(3433, 32, 13) (3433,)


In [6]:
# # Multivariate Timeseries Standardization 
# from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# # X = TimeSeriesScalerMeanVariance(mu=0, std=1).fit_transform(X)

# X0 = TimeSeriesScalerMeanVariance(mu=0, std=1).fit_transform(X[:,:,:6])
# X1 = X[:,:,6:]
# X = np.concatenate((X0, X1), axis = 2)

In [7]:
# Train-Test split
# Hold out 20% of the windows, stratified on the encoded label, with a fixed
# random_state so the split is repeatable.
# NOTE(review): consecutive windows overlap and train_test_split shuffles by
# default, so windows that share rows can land on both sides of the split;
# confirm this leakage is acceptable when comparing scores.
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X, y, stratify = y, test_size=0.2, random_state=0)

print(train_X.shape)
print(train_y.shape)

(2746, 32, 13)
(2746,)


In [8]:
# save windowed data into .pkl format file
# The list order [train_X, test_X, train_y, test_y] is what a reader gets back
# from pickle.load, so any consumer must unpack in the same order.
data = [train_X, test_X, train_y, test_y]

with open(file_path + 'data_no_std.pkl', 'wb') as f:
    pickle.dump(data, f)

In [9]:
## test_results to find proper window_size and overlap

# window_size 32 overlap 0.5  // 0.82 0.74 0.77
# window_size 32 overlap 0.75 // 0.87 0.81 0.84
# window_size 32 overlap 0.875 // 0.93 0.93 0.93 

### With a higher overlap, the train/test split puts many near-identical frames on both sides, which seems to be what pushes the score up

# window_size 32 overlap 0.6875 // 0.89 0.83 0.86
# window_size 32 overlap 0.625 // 0.89 0.75 0.8
# window_size 32 overlap 0.5625 // 0.77 0.77 0.76
# window_size 32 overlap 0.59375 // 0.86 0.80 0.82


# window_size 64 overlap 0.5 // 0.71 0.66 0.67
# window_size 64 overlap 0.75 // 0.83 0.76 0.79
# window_size 64 overlap 0.625 // 0.78 0.7 0.73

# window_size 48 overlap 0.5 // 0.75 0.65 0.66
# window_size 48 overlap 0.75 // 0.84 0.75 0.79
# window_size 48 overlap 0.625 // 0.81 0.73 0.76

# window_size 24 overlap 0.5 // 0.83 0.76 0.79
# window_size 24 overlap 0.75 // 0.89 0.88 0.88
# window_size 24 overlap 0.625 // 0.88 0.83 0.85

# window_size 16 overlap 0 // 0.81 0.74 0.77
# window_size 16 overlap 0.25 // 0.80 0.79 0.79