# HW5 - Brian Hicks, Joe Everton
## CS 498, Applied Machine Learning
Using the ADL (Activities of Daily Living) dataset, available from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Dataset+for+ADL+Recognition+with+Wrist-worn+Accelerometer#).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

In [None]:
import os
import ntpath
# https://stackoverflow.com/questions/8384737/extract-file-name-from-path-no-matter-what-the-os-path-format
def path_leaf(path):
    """Return the last component of *path*, ignoring one trailing separator.

    ``ntpath`` is used so that both ``/`` and ``\\`` are treated as
    separators regardless of the host operating system.
    """
    parent, leaf = ntpath.split(path)
    if leaf:
        return leaf
    # The path ended with a separator, so the leaf is the parent's last component.
    return ntpath.basename(parent)

def load_data(root='HMP_Dataset'):
  """Load every recording found under *root*, one class per leaf directory.

  Args:
    root: Directory to walk. Defaults to 'HMP_Dataset', the location the
      notebook originally hard-coded.

  Returns:
    A ``(dataset, labels)`` pair. ``dataset`` is a list of
    ``(array, label_index)`` tuples, where ``array`` is whatever
    ``np.loadtxt`` parses from one file. ``labels`` is the list of class
    (directory) names, so ``labels[label_index]`` is the class name of an
    example.
  """
  dataset = []
  labels = []
  label_index = 0
  for dirname, dirnames, filenames in os.walk(root):
    # os.walk yields entries in arbitrary filesystem order. Sorting in place
    # also fixes the traversal order, so label indices are reproducible
    # across runs and machines.
    dirnames.sort()
    # Target leaf directories whose path does not contain '_MODEL'.
    if len(dirnames) == 0 and '_MODEL' not in dirname:
      labels.append(path_leaf(dirname))
      for filename in sorted(filenames):
        recording = np.loadtxt(os.path.join(dirname, filename))
        dataset.append((recording, label_index))
      label_index += 1
  return dataset, labels

In [None]:
def window_stack(a, stepsize, width):
  """Cut *a* into sliding windows along axis 0 and flatten each window.

  Args:
    a: Array whose first axis is the sequence axis.
    stepsize: Number of rows between the starts of consecutive windows.
    width: Number of consecutive rows in each window.

  Returns:
    For 2-D input of shape ``(n, c)``, an array of shape
    ``(n_windows, width * c)`` whose row ``j`` is rows
    ``j*stepsize .. j*stepsize + width - 1`` of *a* laid side by side.
    If *a* has fewer than *width* rows, the result has zero rows.
  """
  n = a.shape[0]
  # Number of valid window start positions. Clamping at 0 keeps the slice
  # stop from going negative (which would index from the end) when n < width.
  n_starts = max(n - width + 1, 0)
  # np.hstack needs a real sequence; newer NumPy rejects generators.
  columns = [a[i:i + n_starts:stepsize] for i in range(width)]
  return np.hstack(columns)

def get_flattened_data(data, stepsize, width):
  """Window every example in *data* and stack all windows into one array.

  Args:
    data: Sequence of examples; element 0 of each example is the array
      passed to ``window_stack``.
    stepsize: Stride between consecutive windows.
    width: Number of rows per window.

  Returns:
    All flattened windows from all examples, concatenated along axis 0 in
    the order the examples appear in *data*.

  Raises:
    ValueError: If *data* is empty (``np.concatenate`` has nothing to join).
  """
  # Iterate the `data` argument itself rather than any module-level variable,
  # and concatenate once instead of re-copying the growing array per example.
  windows = [window_stack(example[0], stepsize, width) for example in data]
  return np.concatenate(windows, axis=0)

# def split_and_flatten_example(example, k, stepsize):
#   chunk_list = window_stack(example, stepsize, k)
# #   chunk_list = np.array_split(example, np.arange(k, len(example), step=k))
#   if len(chunk_list[-1]) != k:
#     chunk_list = chunk_list[:-1]
#   if stepsize != k:
#     chunk_list = chunk_list[1:]
# #   chunk_list = [np.ravel(chunk) for chunk in chunk_list]
#   return chunk_list

In [None]:
class h_kmeans:
  """Hierarchical k-means: a tree of KMeans models.

  ``n_clusters`` gives K for each depth, so the depth of the tree equals its
  length. Each node clusters its input into ``n_clusters[0]`` groups and owns
  one child per group, built from ``n_clusters[1:]``. ``predict`` returns a
  flattened cluster id in ``range(prod(n_clusters))`` that is unique across
  the whole tree.
  """
  # Default parallelism; kept as a class attribute for backward compatibility.
  n_jobs = -1

  def __init__(self, n_clusters, n_jobs=-1):
    """Build this node and, recursively, its children.

    Args:
      n_clusters: An int (flat k-means) or a tuple/list of K per depth.
      n_jobs: Forwarded to ``KMeans`` when the installed scikit-learn still
        accepts it, and to every child node.
    """
    if isinstance(n_clusters, (tuple, list)):
      n_clusters = tuple(n_clusters)
    else:
      # A bare number means a single-level tree.
      n_clusters = (n_clusters,)
    # Stored per instance so that building one tree cannot change another.
    self.n_jobs = n_jobs
    self.my_k = n_clusters
    try:
      self.km = KMeans(n_clusters=self.my_k[0], n_jobs=n_jobs)
    except TypeError:
      # scikit-learn >= 1.0 no longer accepts n_jobs on KMeans.
      self.km = KMeans(n_clusters=self.my_k[0])
    self.children = []
    if len(n_clusters) > 1:
      # Remove my k, pass the rest (and n_jobs) to one fresh child per cluster.
      self.children = [h_kmeans(n_clusters[1:], n_jobs=n_jobs)
                       for _ in range(self.my_k[0])]

  def fit(self, X):
    """Fit this node on *X*, then fit each child on the rows assigned to it.

    Internal nodes with more than 5000 rows fit their own KMeans on 5000
    rows drawn at random (with replacement); every row is still routed to a
    child afterwards.

    Returns:
      self, so calls can be chained.
    """
    if self.children and len(X) > 5000:
      sample = np.random.randint(len(X), size=5000)
      self.km.fit(X[sample])
    else:
      self.km.fit(X)
    if self.children:
      clusters = self.km.predict(X)
      for index, child_node in enumerate(self.children):
        # Pick out the rows that were predicted for this child's cluster.
        child_node.fit(X[clusters == index])
    return self

  def predict(self, X):
    """Return the flattened leaf-cluster id for every row of *X*.

    For a leaf node this is the KMeans prediction. For an internal node the
    id is ``own_cluster * leaves_per_child + child_id``, where
    ``leaves_per_child`` is the product of all deeper K values, so ids from
    different subtrees never overlap at any depth.
    """
    clusters = self.km.predict(X)
    if not self.children:
      return clusters
    # Number of distinct ids a single child subtree can return.
    leaves_per_child = int(np.prod(self.my_k[1:]))
    flat_ids = np.zeros(len(X), dtype=np.int64)
    for index, child_node in enumerate(self.children):
      mask = clusters == index
      # Skip children that received no rows rather than predicting on an
      # empty array.
      if mask.any():
        flat_ids[mask] = index * leaves_per_child + child_node.predict(X[mask])
    return flat_ids

In [None]:
def get_histograms(km, dataset, K, k, stepsize=11):
  """Turn each example into a normalised histogram of cluster assignments.

  Args:
    km: Fitted model whose ``predict`` maps window rows to cluster ids.
    dataset: Sequence of examples; element 0 is the array to window and
      element 1 is the class label.
    K: Total number of clusters; histograms have K bins with edges 0..K.
    k: Window width passed to ``window_stack``.
    stepsize: Stride between windows. Defaults to 11, the value that was
      previously hard-coded.

  Returns:
    ``(hist_points, class_labels)`` as NumPy arrays, with one histogram row
    and one label per example. Both are empty when *dataset* is empty.
  """
  # The bin edges are identical for every example, so build them once.
  bins = np.arange(K + 1)
  hist_points = []
  class_labels = []
  for example in dataset:
    windows = window_stack(example[0], stepsize, k)
    clusters = km.predict(windows)
    hist, _ = np.histogram(clusters, density=True, bins=bins)
    hist_points.append(hist)
    class_labels.append(example[1])
  return np.array(hist_points), np.array(class_labels)

In [None]:
def measure_accuracy_with_folding(hist_points, class_labels):
  """Print random-forest accuracy over a shuffled 3-fold split.

  For each fold a fresh RandomForestClassifier (750 trees, max depth 120) is
  trained on the training rows and scored on the held-out rows. The mean and
  the per-fold accuracies are printed; nothing is returned.
  """
  splitter = KFold(n_splits=3, shuffle=True)
  fold_scores = []
  for train_rows, test_rows in splitter.split(hist_points, class_labels):
    features_train = hist_points[train_rows]
    features_test = hist_points[test_rows]
    expected_train = class_labels[train_rows]
    expected_test = class_labels[test_rows]
    forest = RandomForestClassifier(n_estimators=750, max_depth=120)
    forest.fit(features_train, expected_train)
    predicted = forest.predict(features_test)
    # Fraction of held-out examples whose predicted label matches.
    n_correct = np.sum(predicted == expected_test)
    fold_scores.append(n_correct / len(expected_test))
  print("accuracy mean: {}\n\tindividual: {}".format(np.mean(fold_scores), fold_scores))

In [None]:
# Load all recordings as (array, label_index) pairs, plus the class names
# indexed by label_index.
dataset, labels = load_data()

In [None]:
# Cut every recording into windows of k consecutive rows (stride 11) and
# stack all of them into the dictionary that the clustering is fitted on.
k = 32
dictionary = get_flattened_data(dataset, 11, k)
# Sanity check: (total number of windows, flattened window length).
print(dictionary.shape)

In [None]:
# Earlier experiment grids, kept for reference:
# k_experiments = ((32, 15), 480, (40, 12))#, (15, 8, 4))
# k_experiments = (100, 200, 300, 400, 480, 500)
# k_experiments = (10, 25, 50, 100, 125, 150, 175, 200, 225, 250)
# k_experiments = (200, 480)
# An int is flat k-means with that many clusters; a tuple gives K per level
# of a hierarchical tree.
k_experiments = (400, 480, (32, 15), (40, 12), (15, 8, 4))
for k_val in k_experiments:
  print("k_val: {}".format(k_val))
  km = h_kmeans(k_val, n_jobs=-1).fit(dictionary)
  # np.prod(k_val) is the total cluster count, i.e. the number of histogram bins.
  hist_points, class_labels = get_histograms(km, dataset, np.prod(k_val), k)
  measure_accuracy_with_folding(hist_points, class_labels)

# Snippets

## Split experiments.
Probably going to use kfold anyway.

## Experiments with NDArray Views.
Never found a way to use integer (fancy) indexing to produce a view: NumPy advanced indexing always returns a copy, and only basic slicing returns a view.

## Sliding Window