# Useful tools for TSC
<a href="https://colab.research.google.com/github/jarusgnuj/ioctm358/blob/master/notebooks/time_series_classification/Appendix_1_tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Split the dataset into development and final test sets

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sklearn.utils

import tensorflow as tf

In [None]:
# Master switch for the save cells: when False, no dataset files are written.
do_save = True

In [None]:
def load_data(filename, url_root=None):
    """Load a tab-separated, header-less data file into a NumPy array.

    Args:
        filename: Name of the file to load, relative to ``url_root``.
        url_root: Base URL or local directory that contains ``filename``.
            Defaults to the SonyAIBORobotSurface1_IoC folder of the
            workshop's GitHub repo, so existing calls are unaffected.

    Returns:
        A 2-D ``numpy.ndarray`` holding the file's values, one row per line.
    """
    if url_root is None:
        url_root = 'https://raw.githubusercontent.com/jarusgnuj/ai-ml-wksh/master/data/UCR_TSC_archive/SonyAIBORobotSurface1_IoC'
    # Tolerate a trailing slash on the root so the joined location has exactly one.
    url = url_root.rstrip('/') + '/' + filename
    robot_df = pd.read_csv(url, sep='\t', header=None)
    print('Loaded from', url)
    robot_data = robot_df.values
    print('The shape of robot_data is', robot_data.shape)
    return robot_data


# Fetch the '_ALL' data file; load_data returns its contents as a NumPy array.
robot_data = load_data('SonyAIBORobotSurface1_IoC_ALL.txt')

In [None]:
# Column 0 of every row is treated as the class label (values 1 and 2).
y_data = robot_data[:, 0]
for message, label in (('Number of samples of class 1', 1.0),
                       ('Number of samples of class 2', 2.0)):
    print(message, np.count_nonzero(y_data == label))
# Bar chart of how many samples each class has.
y_df = pd.DataFrame(y_data)
y_df[0].value_counts().plot(kind='bar')

In [None]:
# Shuffle the rows.
# If your data samples are in chronological order, this will mix them up.
# A fixed random_state makes this shuffle repeatable, so the samples that are
# selected from the shuffled data do not change from one run to the next.
robot_data = sklearn.utils.shuffle(robot_data, random_state=21)

# Create a balanced dataset

In [None]:
# Build a balanced dataset: keep the same number of samples from each class.
robot_df = pd.DataFrame(robot_data)
class1_df = robot_df[robot_df[0] == 1]
class2_df = robot_df[robot_df[0] == 2]
# Row counts of each class (len() counts rows directly).
n1 = len(class1_df)
n2 = len(class2_df)
n = min(n1, n2)
print('Selection set sizes:', n1, n2)
print('Min selection set size:', n)
# Truncate both classes to the size of the smaller one.
class1_df = class1_df.iloc[:n]
class2_df = class2_df.iloc[:n]
balanced_df = pd.concat([class1_df, class2_df])
# Interleave the two classes again; the fixed random_state makes this
# shuffle repeatable.
balanced_df = sklearn.utils.shuffle(balanced_df, random_state=21)
balanced_df[0].value_counts().plot(kind='bar')

# Save the balanced dataset

In [None]:
# Write the balanced dataset as tab-separated text (only when do_save is set).
data_name = 'SonyAIBORobotSurface1_IoC'
data_filename = data_name + '_BALANCED.txt'
if do_save:
    balanced_array = balanced_df.to_numpy()
    np.savetxt(data_filename, balanced_array, fmt='%8e', delimiter='\t')
    print('Data saved to file', data_filename)

# Reload and check the balanced dataset

In [None]:
# Read the saved file back and confirm its shape and class balance.
balanced_path = Path(data_filename)
balanced_data = np.loadtxt(balanced_path)
print('The shape of balanced_data is', balanced_data.shape)
y_data = balanced_data[:, 0]
class1_count = (y_data == 1.0).sum()
class2_count = (y_data == 2.0).sum()
print('Number of samples of class 1', class1_count)
print('Number of samples of class 2', class2_count)
# Bar chart of the per-class sample counts.
y_df = pd.DataFrame(y_data)
y_df[0].value_counts().plot(kind='bar')

# Split the data

In [None]:
# Hold out 100 samples as the test set, stratified on the label column (0)
# and seeded via random_state.
train_data, test_data = train_test_split(
    balanced_data,
    test_size=100,
    random_state=21,
    stratify=balanced_data[:, 0],
)
print('The shape of train_data is', train_data.shape)
print('The shape of test_data is', test_data.shape)
# Report the class counts of each subset.
for heading, subset in (('Training data:', train_data), ('Test data:', test_data)):
    print(heading)
    subset_labels = subset[:, 0]
    print('Number of samples of class 1', (subset_labels == 1.0).sum())
    print('Number of samples of class 2', (subset_labels == 2.0).sum())

# Save the development and final test datasets

In [None]:
# Write the development and final test sets as tab-separated text files
# (only when do_save is set).
data_name = 'SonyAIBORobotSurface1_IoC'
if do_save:
    for suffix, subset in (('_DEV.txt', train_data), ('_FINAL_TEST.txt', test_data)):
        data_filename = data_name + suffix
        np.savetxt(data_filename, subset, fmt='%8e', delimiter='\t')

# Standardise the data
We find that the robot data has already been standardised; the full dataset has a mean of zero and a standard deviation of one. Here we show how a dataset can be standardised. The `StandardScaler` `transform` function subtracts the mean of the training set (u), divides by the standard deviation of the training set (s), and returns


z = (data - u)/s


Notice that we use the mean and standard deviation of the training set to standardise the test data. This avoids leaking information about the test set into the training process.

In [None]:
# Column 0 holds the class label, so the statistics and the scaling below use
# only the feature columns (1 onwards); scaling the labels would corrupt them.
train_features = train_data[:, 1:]
train_data_mean = train_features.mean()
train_data_std = train_features.std()
print('train_data_mean', train_data_mean)
print('train_data_std', train_data_std)

do_standardise = False
if do_standardise:
    scaler = StandardScaler()
    # Fit on the training features only; the same fitted scaler is then
    # applied to the test features so no test-set statistics are used.
    # Note: StandardScaler standardises each feature column separately.
    scaler.fit(train_features)
    # Re-attach the untouched label column in front of the scaled features.
    train_data = np.hstack([train_data[:, :1], scaler.transform(train_features)])
    test_data = np.hstack([test_data[:, :1], scaler.transform(test_data[:, 1:])])
    print('Standardisation done.')
    # Report the feature statistics (labels excluded) after scaling.
    print('train_data.mean()', train_data[:, 1:].mean())
    print('train_data.std()', train_data[:, 1:].std())

# Speed - GPU
Using a GPU can speed up calculations. However, it can take longer to transfer the data to the GPU.

You are more likely to see a speed-up if the batch size is large. As you increase the batch size, check that validation accuracy does not deteriorate.

To use a GPU in colab select Edit - Notebook settings and then set Hardware accelerator to GPU

In [None]:
# Report whether TensorFlow can see a GPU as device 0.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found a GPU at: {}'.format(device_name))
else:
    print('GPU device not found')