In [14]:
import pandas

# import CSV files as DataFrames
driving_data = pandas.read_csv("driving.csv", header=0, usecols=(10, 11, 12))
walking_data = pandas.read_csv("walking.csv", header=0, usecols=(10, 11, 12))
running_data = pandas.read_csv("running.csv", header=0, usecols=(10, 11, 12))
sitting_data = pandas.read_csv("static.csv", header=0, usecols=(10, 11, 12))
climbing_data = pandas.read_csv("upstair.csv", header=0, usecols=(10, 11, 12))

'''
Super huge thank you to Heather for letting me use her data!! I had my own originally,
which I recoreded by setting a herd of iPhoned freshmen loose in the Nelson (more efficient
lol), but then we found out about the can't-turn-off-your-screen thing, which invalidated
all the data.

Then I went to record it all myself on my Android using Androsensor, but after I recorded all
the data, I found out that accelerometer data didn't record because my phone's gyroscope
is broken. This was the moment at which I decided to just ask someone else if I could use their
data.
'''
# ignore this
1

1

In [13]:
'''
This is all just useful information. Thanks for the helpful
builtin, pandas! This was helpful in figuring out what my
features should be. It's commented out because it's not
necessary for the code to run, and it has a long output, but
you can un-comment it if you want to see the results. 
''' 
1
# print("--------")
# print("driving")
# print("--------")
# print(driving_data.describe())
# print("--------")
# print("walking")
# print("--------")
# print(walking_data.describe())
# print("--------")
# print("running")
# print("--------")
# print(running_data.describe())
# print("--------")
# print("sitting")
# print("--------")
# print(sitting_data.describe())
# print("--------")
# print("climbing")
# print("--------")
# print(climbing_data.describe())

1

In [3]:
import random

# chunk_data : DataFrame -> listof(DataFrame)
# input : data - a DataFrame of length ~30k holding the dataset you want
#                to chunk up
# output : a list of 30 DataFrames, each DataFrame being 1000 long (except
#          for the very last one, which might be missing like 10 points,
#          but whatever)
def chunk_data(data):
    toReturn = []
    for i in range(30):
        # 975 instead of 1000 bc datasets are a few points short
        # this should only cause a trivial difference if any at all
        toReturn.append(data[i*975 : (i+1)*975])
    return toReturn

# chunk the data!
driving_chunks = chunk_data(driving_data)
sitting_chunks = chunk_data(sitting_data)
walking_chunks = chunk_data(walking_data)
climbing_chunks = chunk_data(climbing_data)
running_chunks = chunk_data(running_data)

# shuffle is there because it makes the data more, like, balanced
#    --> cause like what if you have random crap at the end
random.shuffle(driving_chunks)
random.shuffle(sitting_chunks)
random.shuffle(walking_chunks)
random.shuffle(climbing_chunks)
random.shuffle(running_chunks)

# get x, y, and z

# get_x : listof(DataFrame) -> listof(Series)
# input : aloc - a list of DataFrame chunks
# output : a list of Series containing only the x-accelerometer
#          data from those chunks
def get_x(aloc):
    toReturn = []
    for chunk in aloc:
        toReturn.append(chunk["user_acc_x"])
    return toReturn

# get_y : listof(DataFrame) -> listof(Series)
# input : aloc - a list of DataFrame chunks
# output : a list of Series containing only the y-accelerometer
#          data from those chunks
def get_y(aloc):
    toReturn = []
    for chunk in aloc:
        toReturn.append(chunk["user_acc_y"])
    return toReturn

# get_z : listof(DataFrame) -> listof(Series)
# input : aloc - a list of DataFrame chunks
# output : a list of Series containing only the z-accelerometer
#          data from those chunks
def get_z(aloc):
    toReturn = []
    for chunk in aloc:
        toReturn.append(chunk["user_acc_z"])
    return toReturn

In [4]:
import numpy

# make_feature_chunk : Series -> listof(float)
# input : chunk - a Series with ten seconds of data for a single
#         activity in one dimension (x, y, or z)
# output : a three-element array holding the max, min, and max
#          of the fft for the chunk, in that order
def make_feature_chunk(chunk):
    return [chunk.as_matrix().max(), chunk.as_matrix().mean(), chunk.as_matrix().std()]

# extract_features : listof(DataFrame) -> listof(listof(float)))
# input : aloc - a list of DataFrame chunks
# output : a list of 3-elt lists, where 0th elt is max, 1st is
#          mean, 2nd is standard deviation
def extract_features(aloc):
    toReturn = []
    for chunk in aloc:
        toReturn.append(make_feature_chunk(chunk))
    return toReturn

# features_by_plane : listof(DataFrame) -> listof(listof(listof(float)))
# input : aloc - list of DataFrame chunks
# output : a list of three lists, divvied up by x, y, and z. those lists
#          have 30 elts of 3-elt lists, which contain the min & max, in
#          that order
def features_by_plane(aloc):
    return [extract_features(get_x(aloc)), extract_features(get_y(aloc)), extract_features(get_z(aloc))]

# make the array with the features!!
driving_features = features_by_plane(driving_chunks)
sitting_features = features_by_plane(sitting_chunks)
walking_features = features_by_plane(walking_chunks)
climbing_features = features_by_plane(climbing_chunks)
running_features = features_by_plane(running_chunks)

In [5]:
# get_train : listof(listof(listof(float))) -> listof(listof(listof(float)))
# input : features - 3-elt list of lists, where 0th is x, 1st is y, and 2nd
#         is z. inner list is 30-elts long and has all the chunks
#         the chunks are all the features (max, mean, and std)
# output : the same data structure, but only the first 24 of the
#          thirty chunks
def get_train(features):
    toReturn = []
    for dimension in features:
        toReturn.append(dimension[:24])
    return toReturn

# get_test : listof(listof(listof(float))) -> listof(listof(listof(float)))
# input : features - 3-elt list of lists, where 0th is x, 1st is y, and 2nd
#         is z. inner list is 30-elts long and has all the chunks
#         the chunks are all the features (max, mean, and std)
# output : the same data structure, but only the last 6 of the
#          thirty chunks
def get_test(features):
    toReturn = []
    for dimension in features:
        toReturn.append(dimension[24:])
    return toReturn

# divvy up the training data / testing data
driving_train = get_train(driving_features)
driving_test = get_test(driving_features)

sitting_train = get_train(sitting_features)
sitting_test = get_test(sitting_features)

walking_train = get_train(walking_features)
walking_test = get_test(walking_features)

climbing_train = get_train(climbing_features)
climbing_test = get_test(climbing_features)

running_train = get_train(running_features)
running_test = get_test(running_features)

In [6]:
# concatenate the x, y, and z arrays bc srsly why did i do that

# collapse_arr : listof(listof(listof(float))) -> listof(listof(float))
# input : data - 3-elt list of lists, where 0th is x, 1st is y, and 2nd is z.
#         inner list is 30-elts long and has all the chunks
#         the chunks are all the features (max, mean, and std)
# output : all the chunks, but not separated by x, y, or z (so it'll
#          be all of x, then all of y, then all of z, continously)
def collapse_arr(data):
    toReturn = []
    for plane in data:
        for chunk in plane:
            toReturn.append(chunk)
    return toReturn

total_driving_train = collapse_arr(driving_train)
total_driving_test = collapse_arr(driving_test)

total_sitting_train = collapse_arr(sitting_train)
total_sitting_test = collapse_arr(sitting_test)

total_walking_train = collapse_arr(walking_train)
total_walking_test = collapse_arr(walking_test)

total_climbing_train = collapse_arr(climbing_train)
total_climbing_test = collapse_arr(climbing_test)

total_running_train = collapse_arr(running_train)
total_running_test = collapse_arr(running_test)

# combine the data
all_the_training_data = total_driving_train + total_sitting_train + total_walking_train + total_climbing_train + total_running_train
all_the_testing_data = total_driving_test + total_sitting_test + total_walking_test + total_climbing_test + total_running_test
    
# make_labels : int -> listof(int)
# input : n - the number of labels you need
#         k - how many instances of each of those labels you need
# output : an array with n unique k-length labels
# example : make_labels(2, 3) --> [0, 0, 0, 1, 1, 1]
def make_labels(n, k):
    toReturn = []
    for i in range(n):
        for j in range(k):
            toReturn.append(i)
    return toReturn

# make the training labels
training_labels = make_labels(5, 72)
testing_labels = make_labels(5, 18)

In [16]:
# Now it is time for magic!!!
from sklearn import linear_model

logistic_regression = linear_model.LogisticRegression()
logistic_regression.fit(all_the_training_data, training_labels)
# logistic_regression.score(all_the_training_data, training_labels)
logistic_regression.score(all_the_testing_data, testing_labels)

0.78888888888888886

In [8]:
# the number above is not magic
# sad :C

from sklearn import svm

svm = svm.SVC()
svm.fit(all_the_training_data, training_labels)
# svm.score(all_the_training_data, training_labels)
svm.score(all_the_testing_data, testing_labels)

0.74444444444444446

In [9]:
# neither is that one

from sklearn import neighbors

k_neighbors = neighbors.KNeighborsClassifier()
k_neighbors.fit(all_the_training_data, training_labels)
# k_neighbors.score(all_the_training_data, training_labels)
k_neighbors.score(all_the_testing_data, testing_labels)

0.97777777777777775

In [11]:
# the number above kind of is [magic]!!!

'''
K-Nearest-Neighbors is definitely the best one here. I suspect this goes back
to something we talked about in class (or something I read, can't remember)
where we concluded that elaborate equations do not necessarily make a classifier
accurate, but instead what makes your algorithm accurate is having a ton of data.
In this way, even a relatively simple algorithm (like k-n-n) will do well. The
thing that surpises me about that here is that I don't have that much data,
so it's not immediately obvious to me why knn worked well in this instance.
I also feel like if knn did well, shouldn't logistic regression have done well?
They both simple and I feel like you can describe their algorithm as "well,
here's a group... let's draw the line through it." So if I had to guess, I
would have hypothesized they would have done similarly, but they didn't.

Earlier, I attempted fft, and while I wasn't able to get it to work
programmatically,  I think finding the max of the fft would have been
a good way to approach finding some of the differences. This is because
the max would essentially be describing, for example, the space
between steps (the cadence of the gait), which is what's going to be one
of the biggest differences between the activities (especially running and
walking).

If I had gotten fft to work, I also would have used it to approach the bonus
problem. The way I would have done that is by collecting my breathing data,
then running it thorugh fft. Theoretically, I'd get a result that looks something
like this:

^
|
|              __
|             /  \
|            /    \
|   __      /      \
|  /  \    /        \
|_/    \__/          \_______>
|___________________________________>

Now, before even using this, we could assume a two things:
a) my heartrate is faster than my respiratory rate
b) breathing makes my chest rise more than my heart beating does

With these assumptions in mind, we could look at the result of this fft
and note that the first bump, which is high frequency and low amplitude,
probably is the heartrate. Thus the larger bump, which is lower frequency
and higher amplitude (to be honest I don't totally understand what the y-axis
for the result of an fft is, but we know that the bigger your bump is on the
y-dimension, the bigger the movement probably was), is probably my
respiratory rate.

This assignment was fun! My midnight Slack struggles probably didn't indicate
as much, but doing ML for the first time made me feel like a ~*~real
computer scientist~*~.

'''
# this is so my text doesn't get printed below cause that's ugly
# I know I could have used Markdown, but I tried it and it did not adhere
# to my aesthetic
1

2