<a href="https://colab.research.google.com/github/LoriSchuan-dev/transferLearning_AIwearable/blob/main/UNT_CSCE5280_Fall2021_TransferLearning_Iteration2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import sys
import platform

#!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
#from google.colab import drive
from oauth2client.client import GoogleCredentials

# Quick sanity check of the runtime's working directory. Only the last
# expression of a cell is echoed, so the os.getcwd() value is not displayed;
# the directory listing is.
os.getcwd()
os.listdir()
# Disabled alternative: mount Google Drive and inspect it instead.
#drive.mount('/content/drive')
#os.getcwd()
#os.listdir("./drive/MyDrive")


In [None]:
# cloning our team's repo on github
#!git clone https://github.com/LoriSchuan-dev/transferLearning_AIwearable.git
#os.listdir()

## **Confusion Matrix**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    '''
    Plot a confusion matrix as an annotated Seaborn heatmap.

    Arguments
    ---------
    cf:            Confusion matrix (2-D array-like, e.g. the result of
                   sklearn.metrics.confusion_matrix). Rows are treated as the
                   true labels and columns as the predicted labels.
    group_names:   Sequence of strings, one per cell in row-major order, shown on the
                   first line of each square. Ignored unless its length equals cf.size.
    categories:    List of strings used as tick labels on both axes. Default is 'auto'.
    count:         If True, show the raw count in each square. Default is True.
    percent:       If True, show each cell as a percentage of the grand total.
                   Default is True.
    cbar:          If True, show the color bar. Default is True.
    xyticks:       If falsy, hide the tick labels on both axes. Default is True.
    xyplotlabels:  If True, label the axes 'True label' / 'Predicted label'.
                   Default is True.
    sum_stats:     If True, append summary statistics to the x-axis label: accuracy
                   for any matrix, plus precision, recall and F1 score (for the class
                   in row/column 1) when the matrix is 2x2. Default is True.
    figsize:       Figure size passed to plt.figure. Default is the matplotlib
                   rcParams 'figure.figsize' value.
    cmap:          Colormap name passed to the heatmap. Default is 'Blues'.
                   See http://matplotlib.org/examples/color/colormaps_reference.html
    title:         Title for the heatmap. Default is None (no title).

    Returns
    -------
    None. The heatmap is drawn on a newly created matplotlib figure.

    Notes
    -----
    Any ratio whose denominator is zero (an all-zero matrix, or a class that is
    never predicted / never present) is reported as 0 instead of NaN.
    '''

    def _ratio(numerator, denominator):
        # Undefined ratios (x / 0) are reported as 0.0 rather than NaN.
        return float(numerator) / float(denominator) if denominator else 0.0

    # Accept plain nested lists as well as numpy arrays.
    cf = np.asarray(cf)
    total = np.sum(cf)

    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = [''] * cf.size

    # `is not None` (rather than truthiness) also accepts numpy arrays of names.
    if group_names is not None and len(group_names) == cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(_ratio(value, total)) for value in cf.flatten()]
    else:
        group_percentages = blanks

    # One annotation string per cell; strip() removes the trailing newline left
    # behind when the last enabled component is not the percentage.
    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels, group_counts, group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0], cf.shape[1])

    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        # Accuracy is the sum of the diagonal divided by total observations.
        accuracy = _ratio(np.trace(cf), total)

        # A binary confusion matrix gets the extra per-class metrics.
        if len(cf) == 2:
            precision = _ratio(cf[1, 1], np.sum(cf[:, 1]))
            recall = _ratio(cf[1, 1], np.sum(cf[1, :]))
            f1_score = _ratio(2 * precision * recall, precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy, precision, recall, f1_score)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""

    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        # Fall back to the default figure size.
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Passing False as tick labels makes the heatmap omit them.
        categories = False

    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar, xticklabels=categories, yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)

    if title:
        plt.title(title)

# **DATASET**

In [None]:
# Authenticate the Colab user. The PyDrive client setup below is disabled,
# so no PyDrive client is actually created in this cell.
auth.authenticate_user()
#gauth = GoogleAuth()
#gauth.credentials = GoogleCredentials.get_application_default()
#gDrive = GoogleDrive(gauth)


# Download the dataset archive from the team's shared Google Drive folder,
# addressed by its Drive file id.
# NOTE(review): newer gdown releases reportedly deprecate the `--id` flag in
# favour of passing the id positionally; confirm against the installed version.
!gdown --id '1X3vtb0ZPQBgv6Fdu_iHul9PNGbSnJqGN'

# Optional check that the download succeeded (disabled).
#os.listdir()

In [None]:
# extract from the dataset compressed package
!unzip "gestures-dataset.zip"
# check if extraction successful
os.listdir("./gestures-dataset")

/
/


['lib64',
 'home',
 'media',
 'tmp',
 'opt',
 'run',
 'sbin',
 'sys',
 'mnt',
 'root',
 'dev',
 'lib',
 'usr',
 'proc',
 'srv',
 'bin',
 'var',
 'boot',
 'etc',
 'content',
 '.dockerenv',
 'tools',
 'datalab',
 'tensorflow-1.15.2',
 'lib32',
 'python-apt']

In [None]:
import os
import pandas as pd
import warnings
warnings.filterwarnings("ignore")




# Gesture ids (folder names) to load; kept sorted because later cells use this
# list as the ordered set of class names.
gesture_subset = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12','13','14','15','16','17','18','19','20']

gesture_subset.sort()

print("Loadind Dataset for gestures: ", gesture_subset)

### Path to the dataset. Adjust it to where the notebook runs (Google Colab or a
### local machine) and where the files were extracted.
### Prefer a relative path ("./" or "../"): a path starting with "/" is resolved
### from a filesystem root, not from the current working directory.
path = r'./gestures-dataset'

# Subject folders that are loaded; any other folder under `path` is ignored.
subject_ids = ('U01', 'U02', 'U03', 'U04', 'U05', 'U06', 'U07', 'U08')

# Column names assigned to the space-separated sample files, in file order.
raw_columns = ['System.currentTimeMillis()',
               'System.nanoTime()',
               'sample.timestamp',
               'X',
               'Y',
               'Z']

samples = 0
# Frames are collected in a list and concatenated once at the end; concatenating
# inside the loop would re-copy the growing dataset for every file.
sample_frames = []

# Expected layout: <path>/<subject>/<gesture>/<sample file>
for subject in os.listdir(path):
    subject_dir = os.path.join(path, subject)
    if os.path.isfile(subject_dir) or subject not in subject_ids:
        continue

    for gesture in os.listdir(subject_dir):
        gesture_dir = os.path.join(subject_dir, gesture)
        gesture = str(gesture)
        if os.path.isfile(gesture_dir) or gesture not in gesture_subset:
            continue

        for samplefile in os.listdir(gesture_dir):
            sample_path = os.path.join(gesture_dir, samplefile)
            if not os.path.isfile(sample_path):
                continue

            df = pd.read_csv(sample_path, sep=' ', names=raw_columns)
            # .copy() so the column assignments below write to an independent
            # frame instead of a slice of the one returned by read_csv.
            df = df[["sample.timestamp", "X", "Y", "Z"]].copy()
            if df.empty:
                # An empty file has no first timestamp to rebase on; skip it.
                continue

            # Rebase timestamps so each sample starts at 0, then divide by 1e7.
            # NOTE(review): presumably converts nanoseconds into 10 ms units - confirm.
            start = df["sample.timestamp"].iloc[0]
            df["sample.timestamp"] = (df["sample.timestamp"] - start) / 10000000
            df["subject"] = subject
            df["gesture"] = gesture
            # Sample id is the file name without its last four characters
            # (presumably the extension).
            df["sample"] = str(samplefile[:-4])
            samples += 1
            sample_frames.append(df)

if not sample_frames:
    raise FileNotFoundError("No gesture samples found under " + repr(path))

dataset = pd.concat(sample_frames)
dataset = dataset.sort_values(by=['gesture','subject','sample','sample.timestamp'])
data = dataset
print(str(samples) + " samples loaded")

## Scaling

In [None]:
print("Scaling Dataset for gestures: ", gesture_subset)
from sklearn.preprocessing import StandardScaler

# A single scaler object is reused; fit_transform re-fits it on every sample, so
# each sample is standardised independently of all the others.
scaler = StandardScaler()
scaled_frames = []

samples = 0

for gesture in gesture_subset:
    df_gesture = data[data['gesture'] == gesture]
    for subject in df_gesture['subject'].unique():
        df_subject = df_gesture[df_gesture['subject'] == subject]
        for sample in df_subject['sample'].unique():
            df_sample = df_subject[df_subject['sample'] == sample].copy()
            # sort_values returns a new frame; keep it so the order is applied.
            df_sample = df_sample.sort_values(by=['sample.timestamp'])

            # Write the scaled array back positionally. Going through an
            # intermediate DataFrame would align on index labels instead of row
            # position and silently misplace values when the two differ.
            df_sample[["X", "Y", "Z"]] = scaler.fit_transform(df_sample[["X", "Y", "Z"]])

            scaled_frames.append(df_sample)
            samples += 1

# Concatenate once; remains None when there was nothing to scale.
dataset_scaled = pd.concat(scaled_frames) if scaled_frames else None
print(str(samples) + " samples scaled")
data = dataset_scaled
data = dataset_scaled



## **Cleaning**

In [None]:
print("Cleaning Dataset for gestures: ", gesture_subset)

# A sample is an outlier when its number of readings lies more than this many
# standard deviations away from the mean length of its (gesture, subject) group.
std_factor = 1.0

outlier_frames = []
cleaned_frames = []

samples = 0
outliers = 0

for gesture in gesture_subset:
    df_gesture = data[data['gesture'] == gesture]
    for subject in df_gesture['subject'].unique():
        df_subject = df_gesture[df_gesture['subject'] == subject]

        # Number of readings in each sample of this (gesture, subject) group.
        sample_lengths = df_subject.groupby('sample')['sample.timestamp'].count()
        length_mean = sample_lengths.mean()
        # Sample standard deviation; NaN when the group holds a single sample.
        # Both comparisons below are then False, so that sample is kept as clean.
        length_std = sample_lengths.std()
        time_max = length_mean + std_factor * length_std
        time_min = length_mean - std_factor * length_std

        for sample in df_subject['sample'].unique():
            df_sample = df_subject[df_subject['sample'] == sample]
            df_sample_count = df_sample['sample.timestamp'].count()
            if df_sample_count < time_min or df_sample_count > time_max:
                outlier_frames.append(df_sample)
                outliers += 1
            else:
                cleaned_frames.append(df_sample)
                samples += 1

# Concatenate once per partition; a partition without samples stays None.
dataset_outliers = pd.concat(outlier_frames) if outlier_frames else None
dataset_cleaned = pd.concat(cleaned_frames) if cleaned_frames else None
print(str(samples) + " samples cleaned")
print(str(outliers) + " samples outliers")
data = dataset_cleaned

## **Time Slicing of Cleaned Data**

In [None]:
print("Time slicing Cleaned Dataset for gestures: ", gesture_subset)
dataset_timecut = None
samples = 0
damaged = 0
for i, gesture in enumerate(data['gesture'].unique()):
  df_gesture = data[data['gesture']==gesture]
  for j, subject in enumerate(df_gesture['subject'].unique()):
    df_subject = df_gesture[df_gesture['subject']==subject] 
    time_max = 19 # 18 * 11 = 198
    for i, sample in enumerate(df_subject['sample'].unique()):
                df_sample = df_subject[df_subject['sample']==sample]
                df_sample_count = df_sample.count()['sample.timestamp']
                #print(df_sample_count)
                if df_sample_count >= time_max:
                    df_sample = df_sample[df_sample['sample.timestamp'] <= (11 * (time_max-1))]
                    df_sample_count = df_sample.count()['sample.timestamp']
                    #print(df_sample_count)
                elif df_sample_count < time_max:
                    for tmp in range(df_sample_count * 11, (time_max) * 11, 11):
                        df = pd.DataFrame([[tmp, 0.0, 0.0, 0.0, gesture, subject, sample]], columns=['sample.timestamp', 'X', 'Y', 'Z', 'gesture', 'subject', 'sample'])
                        df_sample = df_sample.append(df, ignore_index=True)            
                #print(df_sample)
                df_sample_count = df_sample.count()['sample.timestamp']
                #print(df_sample_count)
                if df_sample_count != time_max:
                    damaged += 1
                    continue
                if dataset_timecut is None:
                    dataset_timecut = df_sample.copy()
                else:
                    dataset_timecut = pd.concat([dataset_timecut, df_sample])
                samples += 1

dataset_cleaned = dataset_timecut
print(str(samples) + " cleaned samples sliced")
print(str(damaged) + " cleaned samples damaged")

## **Slicing outliers**

In [None]:
data = dataset_outliers
print("Time slicing Outliers Dataset for gestures: ", gesture_subset)
dataset_timecut = None
samples = 0
damaged = 0
for i, gesture in enumerate(data['gesture'].unique()):
        df_gesture = data[data['gesture']==gesture]
        for j, subject in enumerate(df_gesture['subject'].unique()):
            df_subject = df_gesture[df_gesture['subject']==subject] 
            time_max = 19 # 18 * 11 = 198
            for i, sample in enumerate(df_subject['sample'].unique()):
                df_sample = df_subject[df_subject['sample']==sample]
                df_sample_count = df_sample.count()['sample.timestamp']
                #print(df_sample_count)
                if df_sample_count >= time_max:
                    df_sample = df_sample[df_sample['sample.timestamp'] <= (11 * (time_max-1))]
                    df_sample_count = df_sample.count()['sample.timestamp']
                    #print(df_sample_count)
                elif df_sample_count < time_max:
                    for tmp in range(df_sample_count * 11, (time_max) * 11, 11):
                        df = pd.DataFrame([[tmp, 0.0, 0.0, 0.0, gesture, subject, sample]], columns=['sample.timestamp', 'X', 'Y', 'Z', 'gesture', 'subject', 'sample'])
                        df_sample = df_sample.append(df, ignore_index=True)            
                #print(df_sample)
                df_sample_count = df_sample.count()['sample.timestamp']
                #print(df_sample_count)
                if df_sample_count != time_max:
                    damaged += 1
                    continue
                if dataset_timecut is None:
                    dataset_timecut = df_sample.copy()
                else:
                    dataset_timecut = pd.concat([dataset_timecut, df_sample])
                samples += 1

dataset_outliers = dataset_timecut
print(str(samples) + " outliers samples sliced")1
print(str(damaged) + " outliers samples damaged")

## **Hyperparameter tuning**

In [None]:

from keras.models import Sequential
from keras.layers import Bidirectional
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import adam_v2
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
#    from scikeras.wrappers import KerasClassifier

import sklearn
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

import numpy as np
 
# fix random seed for reproducibility
seed = 1000
np.random.seed(seed)


# create the dataset
def get_dataset(data, classes=None):
    """Turn the long-format sample table into model inputs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns 'gesture', 'subject', 'sample',
        'sample.timestamp', 'X', 'Y' and 'Z'.
    classes : sequence of gesture labels, optional
        Fixed, ordered list of all gesture labels. When given, each label is
        encoded as its position in this list, so the codes are the same no
        matter which gestures happen to occur in `data`. When omitted, the
        labels are encoded with a LabelEncoder fitted on the gestures present
        in `data` only - two calls on frames containing different gesture sets
        then produce codes that are not comparable.

    Returns
    -------
    X_train : numpy.ndarray
        One entry per sample holding its [X, Y, Z] rows ordered by timestamp;
        a 3-D array when every sample has the same number of rows.
    Y_train : numpy.ndarray
        Integer class code of each sample.
    groups : list
        Subject id of each sample.

    Raises
    ------
    KeyError
        If `classes` is given and `data` contains a gesture that is not in it.
    """
    X_train = []
    Y_train = []
    groups = []
    for gesture in data['gesture'].unique():
        df_gesture = data[data['gesture'] == gesture]
        for subject in df_gesture['subject'].unique():
            df_subject = df_gesture[df_gesture['subject'] == subject]
            for sample in df_subject['sample'].unique():
                df_sample = df_subject[df_subject['sample'] == sample]
                ordered = df_sample.sort_values(by='sample.timestamp')
                # Pull the three axes out as one array instead of iterating rows.
                X_train.append(ordered[['X', 'Y', 'Z']].to_numpy(dtype=float))
                Y_train.append(gesture)
                groups.append(subject)
    X_train = np.asarray(X_train)
    if classes is None:
        Y_train = LabelEncoder().fit_transform(Y_train)
    else:
        code_of = {label: code for code, label in enumerate(classes)}
        Y_train = np.asarray([code_of[label] for label in Y_train], dtype=int)
    return X_train, Y_train, groups

# Function to create model, required for KerasClassifier
def create_model(dropout_rate=0.8, units=128, optimizer=None):
    """Build and compile the bidirectional-LSTM gesture classifier.

    Architecture: Bidirectional(LSTM(units)) -> Dropout(dropout_rate)
    -> Dense(units, relu) -> Dense(len(gesture_subset), softmax), compiled with
    sparse categorical cross-entropy and the accuracy metric.

    Parameters
    ----------
    dropout_rate : float
        Rate passed to the Dropout layer.
    units : int
        Width of the LSTM and of the hidden Dense layer.
    optimizer : optimizer instance or name, optional
        Optimizer passed to compile(). When omitted, a new Adam optimizer with
        learning_rate=0.001 is created for every model. A default created in
        the signature would be one object shared by every model the grid
        search builds.

    Returns
    -------
    The compiled Sequential model. The output width is read from the
    module-level `gesture_subset` list.
    """
    if optimizer is None:
        optimizer = adam_v2.Adam(learning_rate=0.001)

    model = Sequential()
    model.add(
        Bidirectional(
            LSTM(units=units),
            # The input shape (19 time steps x 3 axes) is declared on the
            # wrapper, which is the layer Sequential receives.
            # NOTE(review): this should let the model be built, and
            # model.summary() work, before the first fit - confirm.
            input_shape=(19, 3)
        )
    )
    model.add(Dropout(rate=dropout_rate))
    model.add(Dense(units=units, activation='relu'))
    model.add(Dense(len(gesture_subset), activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Wrap the Keras builder for scikit-learn and split by subject so that no
# subject appears in both the training and validation part of a fold.
model = KerasClassifier(build_fn=create_model, verbose=0)
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=1000)

# Features, integer labels and subject groups of the cleaned samples.
X, y, g = get_dataset(dataset_cleaned)

# Search space: 2 epoch settings x 3 layer widths x 9 dropout rates, one batch size.
batch_size = [19]
epochs = [64, 128]
units = [32, 64, 128]
dropout_rate = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
param_grid = {
    'epochs': epochs,
    'units': units,
    'batch_size': batch_size,
    'dropout_rate': dropout_rate,
}

print("Hyperparameter tunning started for Dataset for gestures: ", gesture_subset)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=cv, verbose=1)
grid_result = grid.fit(X, y, groups=g)

# Summarize results: best configuration first, then one line per configuration.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
train_mean = cv_results['mean_fit_time']
train_std = cv_results['std_fit_time']
score_mean = cv_results['mean_score_time']
score_std = cv_results['std_score_time']
params = cv_results['params']
report_columns = (means, stds, train_mean, train_std, score_mean, score_std, params)
for report_row in zip(*report_columns):
    print("accuracy: %f (%f) train time: %f (%f) score time: %f (%f) with: %r" % report_row)
print("Hyperparameter tunning completed for Dataset: ", gesture_subset)


# Continue with the estimator refitted on the best parameter combination.
model = grid_result.best_estimator_


In [None]:
# Make a folder to store results. exist_ok=True keeps the cell re-runnable:
# creating a folder that already exists is not an error.
os.makedirs("results", exist_ok=True)
os.listdir()

In [None]:
#gesture_subset = []
modelStorePath = "./results/"

import pickle

def save_model(model, gesture_subset, filePath):
    """Persist a fitted classifier wrapper under the prefix `filePath`.

    Two artifacts are written, both named after the sorted gesture ids joined
    with '-':
      * <filePath><name>_model_classes.pkl - pickled `model.classes_`
      * <filePath><name>_lstm              - the network, via `model.model.save`

    `filePath` is used as a plain string prefix (e.g. "./results/"), so both
    artifacts end up in the same place. `gesture_subset` is not modified.
    """
    name = '-'.join(sorted(gesture_subset))
    # saving model
    with open(filePath + name + '_model_classes.pkl', 'wb') as classes_file:
        pickle.dump(model.classes_, classes_file)
    model.model.save(filePath + name + '_lstm')

import tensorflow as tf
def load_model(gesture_subset, filePath):
    """Rebuild the classifier wrapper written by `save_model`.

    Looks for the network at <filePath><name>_lstm. If that does not exist but
    <name>_lstm exists in the working directory, that copy is loaded instead.
    The class labels are read from <filePath><name>_model_classes.pkl.
    `gesture_subset` is not modified.

    Returns a KerasClassifier whose `classes_` and `model` attributes are set
    from the stored artifacts.
    """
    name = '-'.join(sorted(gesture_subset))
    model_path = filePath + name + '_lstm'
    legacy_path = name + '_lstm'
    if not os.path.exists(model_path) and os.path.exists(legacy_path):
        # Fallback for a network that was saved without the filePath prefix.
        model_path = legacy_path

    # loading model
    def build_model():
        return tf.keras.models.load_model(model_path)

    classifier = KerasClassifier(build_fn=build_model, epochs=1, batch_size=10, verbose=0)
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # this notebook wrote itself.
    with open(filePath + name + '_model_classes.pkl', 'rb') as classes_file:
        classifier.classes_ = pickle.load(classes_file)
    classifier.model = build_model()
    return classifier

# NOTE: the save and load calls below are commented out, so this cell only
# prints the status messages; nothing is written to or read from disk.
# Uncomment the calls to actually persist / restore the model.
print("Saving model to disk started for Dataset gestures: ", gesture_subset)
#save_model(model, gesture_subset, modelStorePath)
print("Saving model to disk completed for Dataset gestures: ", gesture_subset)

print("Loading model to disk started for Dataset gestures: ", gesture_subset)
#model = load_model(gesture_subset, modelStorePath)
#print(model.model.summary())
print("Loading model to disk completed for Dataset gestures: ", gesture_subset)


In [None]:

print("Testing model against outliers for Dataset gestures: ", gesture_subset)
data = dataset_outliers
X, y, g = get_dataset(dataset_outliers)
y_pred = model.predict(X)

# get_dataset encodes labels against the gestures present in the frame it is
# given (sorted). If a gesture has no outlier samples, those codes are shifted
# relative to positions in gesture_subset. Translate them back to positions in
# the full, sorted gesture_subset list so they line up with the class names.
# NOTE(review): assumes the model was trained on data containing every gesture
# in gesture_subset, so its predicted codes are positions in that list - confirm.
outlier_gestures = sorted(dataset_outliers['gesture'].unique())
y = np.asarray([gesture_subset.index(outlier_gestures[code]) for code in y])

# Report on every class explicitly. Without `labels`, a class that is absent
# from both y and y_pred makes the label count differ from the number of names.
all_labels = list(range(len(gesture_subset)))

from sklearn.metrics import classification_report
print(classification_report(y, y_pred, labels=all_labels, target_names=gesture_subset))

from sklearn.metrics import confusion_matrix
cf_matrix = confusion_matrix(y, y_pred, labels=all_labels)
make_confusion_matrix(cf_matrix, categories=gesture_subset, figsize=[8,8])
#return grid_result

# **ZIP RESULTS AND UPLOAD TO THE TEAM'S FOLDER ON GOOGLE DRIVE**

In [None]:


import time
# zipping results fodler with timestamp
resultZip = "result_" + str(time.time()) + ".zip"
print (resultZip)
!zip $resultZip "./results"
os.listdir()

# Authenticate and create the PyDrive client.
auth.authenticate_user()

# upload to our team's shared folder on gg drive
!pip install --upgrade gupload
!gupload --to "17ftcqLR_52zZ8byGL_s0aXxhyP2Ko7Y3" $resultZip