In [9]:
import pandas as pd
import numpy as np
from datetime import datetime
#Attempt to load csv file test data for EDx course PH526x final project.  Useless columns are dropped.
train_time_series = pd.read_csv('https://courses.edx.org/assets/courseware/v1/b98039c3648763aae4f153a6ed32f38b/asset-v1:HarvardX+PH526x+1T2020+type@asset+block/train_time_series.csv')
train_time_series = train_time_series.drop(['Unnamed: 0','accuracy','UTC time'],axis='columns')
train_labels = pd.read_csv('https://courses.edx.org/assets/courseware/v1/d64e74647423e525bbeb13f2884e9cfa/asset-v1:HarvardX+PH526x+1T2020+type@asset+block/train_labels.csv')
train_labels = train_labels.drop(['Unnamed: 0','UTC time'],axis='columns')
test_time_series = pd.read_csv('https://courses.edx.org/assets/courseware/v1/1ca4f3d4976f07b8c4ecf99cf8f7bdbc/asset-v1:HarvardX+PH526x+1T2020+type@asset+block/test_time_series.csv')
test_labels = pd.read_csv('https://courses.edx.org/assets/courseware/v1/72d5933c310cf5eac3fa3f28b26d9c39/asset-v1:HarvardX+PH526x+1T2020+type@asset+block/test_labels.csv')
#test_time_series = test_time_series.drop(['Unnamed: 0','accuracy','UTC time'],axis='columns')

In [15]:
#expand the train_labels to so we have an arry for each entry in the test_time_series.  Assume the activities between timestamps are consistent and continuous\n",
#go through each train_time_series and get the nearest label
def getNearestLabel(timestamp,labels):
    #helper function to return the label from the provided labels dataframe nearest to the given timestamp
    closest = labels.iloc[(labels['timestamp']-timestamp).abs().argsort()[1:2]] 
    #chooses the label from the next highest timestamp
    return(closest.label.values[0])
ext = train_time_series.timestamp.apply(lambda x: getNearestLabel(x,train_labels))

In [18]:
#implement and test randomforest on data
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=4, random_state=0)
regressor.fit(train_time_series[['x','y','z']], ext)
y_pred = regressor.predict(train_time_series[['x','y','z']])
#predictions come out as floats, we need to choose which prediction wins
y_pred_round = [ int(round(f)) for f in y_pred] #round all predictions to nearest int.

In [19]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(ext.tolist(),y_pred_round))
print(classification_report(ext.tolist(),y_pred_round))
print(accuracy_score(ext.tolist(), y_pred_round))

[[ 199   66    1    1]
 [   2 1998  126    8]
 [   0  180  681   16]
 [   0   33  121  312]]
              precision    recall  f1-score   support

           1       0.99      0.75      0.85       267
           2       0.88      0.94      0.91      2134
           3       0.73      0.78      0.75       877
           4       0.93      0.67      0.78       466

    accuracy                           0.85      3744
   macro avg       0.88      0.78      0.82      3744
weighted avg       0.86      0.85      0.85      3744

0.8520299145299145


In [20]:
#rounding the number to the nearest int may not be the best method of choosing.   Since we assume that a given activity continues over multiple measurements, then we can apply moving median over several entries to smooth the category
#create bigger list of predictions by padding the ends
begin = y_pred[0]
end = y_pred[len(y_pred)-1]
expanded_pred = [begin, begin, *y_pred, end] #Pandas rolling function extends the tail, so the beginning needs to be padded.
#create the moving averages list\n",
pred_avg = pd.Series(expanded_pred).rolling(window=4).median().dropna() #Pandas creates NaN entries for the leading window measurements. 
#compare
pred_avg_round = [ int(round(f)) for f in pred_avg] #round all predictions to nearest int
print(confusion_matrix(ext.tolist(),pred_avg_round))
print(classification_report(ext.tolist(),pred_avg_round))
print(accuracy_score(ext.tolist(), pred_avg_round))
#smoothing the output produces a significantly better categorization than simply rounding the output."

[[ 209   57    1    0]
 [   0 2106   28    0]
 [   0   37  840    0]
 [   0    4  168  294]]
              precision    recall  f1-score   support

           1       1.00      0.78      0.88       267
           2       0.96      0.99      0.97      2134
           3       0.81      0.96      0.88       877
           4       1.00      0.63      0.77       466

    accuracy                           0.92      3744
   macro avg       0.94      0.84      0.88      3744
weighted avg       0.93      0.92      0.92      3744

0.9212072649572649


In [22]:
#Run the train_time_series to generate the graded output
startTime = datetime.now()
test_predictions = regressor.predict(test_time_series[['x','y','z']])
begin = test_predictions[0]
end = test_predictions[len(test_predictions)-1]
ETP = [begin, begin, *test_predictions, end] #Pandas rolling function extends the tail, so the beginning needs to be padded

#create the moving averages list
test_predictions = pd.Series(ETP).rolling(window=4).median().dropna() #Pandas creates NaN entries for the leading window measurements.   Median works better than mean() for accuracy\
test_predictions = [ int(round(f)) for f in test_predictions]
#match test predictions to test_time_series
test_time_series['label'] = pd.Series(test_predictions)
#fill in the test_labels spreadsheet with the prediction for that timestamps
answer = test_time_series[test_time_series.timestamp.isin(test_labels.timestamp)]
#output results in prefered format
endtime = datetime.now()
answer['label'].to_csv('randomocean.csv')
print(endtime - startTime)
print(answer.label.to_list())

0:00:00.072885
[3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 2, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 2, 3, 2, 2, 3, 3, 3, 3, 2, 2, 3, 2, 2, 3, 3, 2, 2, 3, 3, 3, 2, 3, 3, 2, 3, 2, 2, 3, 3, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 2, 2, 2, 2, 2, 3, 2, 3, 2, 2, 2, 2, 2, 3, 2]
