# IMPORTS

In [1]:
import arff
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# PREPROCESSING
### ARFF to DataFrame

In [2]:
## Function that joins ARFF files into single pandas dataframe. Assumes files have a systematic format,
## e.g. phone_accelerometer_001.arff, where the number (unique) would change for every set of data.
# Inputs: File prefix & suffix before identifying number, Range of file numbers
# Output: DataFrame
def arffToDataFrame(directory, prefix, suffix, lo, hi):
    df_list = []
    for i in range(lo, hi+1):
        filename = os.path.join(directory, prefix + str(i) + suffix)
        try:
            data = arff.load(open(filename))
            labels = [label[0] for label in data['attributes']]
            df = pd.DataFrame(data['data'], columns=labels)
            df['class'] = i # Corrects class label to UID. Incorrectly labelled when read from ARFF file.
            df_list.append(df)
        except:
            pass
    
    df_concat = pd.concat(df_list)
    return df_concat

### Convert for each sensor on smartphone (2) & smartwatch (2)

In [3]:
df_phone_accel = arffToDataFrame('phone_accel', 'data_', '_accel_phone.arff', 1600, 1640)
df_phone_gyro = arffToDataFrame('phone_gyro', 'data_', '_gyro_phone.arff', 1600, 1640)
df_watch_accel = arffToDataFrame('watch_accel', 'data_', '_accel_watch.arff', 1600, 1640)
df_watch_gyro = arffToDataFrame('watch_gyro', 'data_', '_gyro_watch.arff', 1600, 1640)

In [4]:
# Sample DataFrame
df_phone_accel.head()

Unnamed: 0,ACTIVITY,X0,X1,X2,X3,X4,X5,X6,X7,X8,...,ZMFCC11,ZMFCC12,XYCOS,XZCOS,YZCOS,XYCOR,XZCOR,YZCOR,RESULTANT,class
0,A,0.235,0.47,0.275,0.02,0.0,0.0,0.0,0.0,0.0,...,0.486106,0.479859,-0.550668,0.049864,0.121354,-0.251024,0.164468,-0.110722,10.0518,1600
1,A,0.275,0.44,0.27,0.015,0.0,0.0,0.0,0.0,0.0,...,0.479571,0.473409,-0.633171,0.072129,0.161492,-0.386416,0.21568,-0.034375,10.1171,1600
2,A,0.32,0.43,0.245,0.0,0.005,0.0,0.0,0.0,0.0,...,0.483005,0.476798,-0.659493,0.087043,0.162157,-0.325151,0.27238,-0.077274,9.98384,1600
3,A,0.315,0.495,0.185,0.005,0.0,0.0,0.0,0.0,0.0,...,0.480711,0.474534,-0.712081,0.00381,0.210015,-0.364285,0.203131,0.015328,10.106,1600
4,A,0.215,0.455,0.325,0.005,0.0,0.0,0.0,0.0,0.0,...,0.468836,0.462811,-0.534933,0.047553,0.275833,-0.216423,0.2385,-0.00987,10.0521,1600


### Algorithms to Use for Building Models
API can be found here:
http://scikit-learn.org/stable/modules/classes.html

In [5]:
algorithms = [RandomForestClassifier()]#, DecisionTreeClassifier(), KNeighborsClassifier()]

# PERSONAL MODELS

In [16]:
## Function that buids and tests a list of given machine learning algorithms.
# Inputs: DataFrame (data),
#         List of algorithms (from sklearn - e.g. sklearn.tree.DecisionTreeClassifier),
#         Name of label being tested (e.g. 'ACTIVITY' for Activity Recognition)
# Outputs: Individual scores (1 per algorithm x Number of users)
#          Confunsion Matrices (1 per algorithm x Number of users)
def runPersonalModels(df, algorithms, label_name):
    # List to hold scores (list of lists since there are several algorithms per user)
    individual_scores = []
    individual_confusion_matrices = []
    
    # Build a model per user
    for user in df['class'].unique():
        scores = []
        confusion_matrices = []
        data = df.loc[df['class'] == user]
        
        # For every user, build a model per algorithm
        for algorithm in algorithms:
            # 10-Fold Cross Validation (cv=10)
            # See http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html
            predicted = cross_val_predict(algorithm, data.drop(label_name, axis=1), data[label_name], cv=10)
            
            # Accuracy: % of Correctly Classified Labels
            # See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
            accuracy = accuracy_score(data[label_name], predicted)
            
            # Compute confusion matrix.
            # Use full list of activities as labels (to account for missing data)
            # See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
            cm = confusion_matrix(data[label_name], predicted, labels=df[label_name].unique())
            
            # Output Status
            print(algorithm)
            print(accuracy)
            
            # Add score and confusion matrix for the algorithm
            scores.append(accuracy)
            confusion_matrices.append(cm)
        
        # Add score and confusion matrix for the user
        individual_scores.append(scores)
        individual_confusion_matrices.append(confusion_matrices)
    
    return individual_scores, individual_confusion_matrices

In [6]:
## Function that aggregates individual scores and confusion matrices (returned by runPersonalModels function)
# Input: Scores (per algorithm per user)
#        Confusion Matrices (per algorithm per user)
#        List of labels for labelling the confusion matrices
# Output: Scores (per algorithm), Confusion Matrices (per algorithm)
def aggregateIndividualScores(scores, cms, labels):
    avg_scores = []
    agg_cms = []
    
    # Average individual scores for each algorithm 
    for i in range(len(scores[0])):
        avg_scores.append(np.mean([x[i] for x in scores]))
    
    # Aggregate individual confusion matrices
    cms = [sum(i) for i in zip(*cms)] # As list of arrays
    
    # Convert to a dataframe for visualization
    for cm in cms:
        cms_df = pd.DataFrame(cm, columns=labels, index=labels)
        agg_cms.append(cms_df)
    
    # Add column for accuracy per activity
    for cm in agg_cms:
        per_act = []
        
        # Calculate % of correct labels
        for act in cm.columns:
            percent_correct = cm.loc[act][act] / cm.loc[act].sum()
            per_act.append(percent_correct)
        
        cm['PER_ACT_ACC'] = per_act
    
    return avg_scores, agg_cms

## Results
### Watch Accel

In [12]:
personal_scores, personal_cms = runPersonalModels(df_watch_accel, algorithms, 'ACTIVITY')

NameError: name 'training_data' is not defined

In [9]:
agg_personal_scores, agg_personal_cms = aggregateIndividualScores(personal_scores, personal_cms, df_watch_accel['ACTIVITY'].unique())

In [10]:
agg_personal_cms[0]

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,O,P,Q,R,S,PER_ACT_ACC
A,645,1,50,1,2,0,1,0,0,1,0,0,21,1,1,0,0,3,0.887208
B,2,689,2,0,1,0,0,0,0,0,0,0,1,11,2,1,0,3,0.967697
C,45,7,624,1,2,0,1,0,0,1,0,0,28,2,4,0,2,12,0.855967
D,1,2,7,628,8,17,8,2,14,3,12,5,2,0,0,5,1,15,0.860274
E,11,0,5,17,674,8,2,3,4,5,5,3,5,0,1,5,0,8,0.891534
F,0,0,1,13,12,666,6,2,7,5,2,8,0,0,1,2,1,3,0.91358
G,0,0,1,4,1,4,674,2,18,5,4,4,0,1,0,0,3,9,0.923288
H,2,1,2,2,3,1,9,648,4,29,5,16,0,0,0,2,1,4,0.888889
I,0,0,2,12,0,10,24,16,578,26,18,44,0,1,1,3,0,1,0.785326
J,0,1,0,6,3,3,3,50,32,609,10,22,0,1,0,2,1,2,0.81745


In [11]:
agg_personal_scores

[0.87690843263807672]

### Watch Gyro

In [32]:
personal_scores, personal_cms = runPersonalModels(df_watch_gyro, algorithms, 'ACTIVITY')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.850152905199
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.873456790123
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,


In [33]:
agg_personal_scores, agg_personal_cms = aggregateIndividualScores(personal_scores, personal_cms, df_watch_gyro['ACTIVITY'].unique())

In [34]:
agg_personal_cms[0]

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,O,P,Q,R,S,PER_ACT_ACC
A,646,9,34,1,1,1,6,1,1,0,0,0,17,1,1,0,0,5,0.892265
B,15,664,4,1,0,0,0,0,1,0,0,0,5,7,5,0,1,10,0.931276
C,43,10,524,3,6,0,11,0,5,0,0,2,76,9,2,0,1,36,0.71978
D,2,0,17,466,77,22,12,5,25,25,21,11,6,0,1,18,11,10,0.639232
E,6,0,4,78,469,27,18,16,23,25,17,26,6,4,2,24,4,6,0.621192
F,0,0,1,23,18,592,13,14,17,5,7,16,4,0,1,17,1,2,0.80985
G,9,1,13,3,9,10,604,9,23,9,8,11,7,0,1,1,4,6,0.82967
H,1,1,1,7,15,13,21,539,29,40,28,22,7,0,0,5,0,2,0.737346
I,2,0,10,19,23,19,31,39,483,42,7,43,5,0,2,6,3,1,0.657143
J,1,0,3,8,20,14,15,61,48,513,10,37,0,1,0,10,1,3,0.688591


In [35]:
agg_personal_scores

[0.76395064126465484]

### Phone Accel

In [15]:
personal_scores, personal_cms = runPersonalModels(df_phone_accel, algorithms, 'ACTIVITY')

<class 'generator'>
<generator object _BaseKFold.split at 0x116a2f7d8>


NameError: name 'training_data' is not defined

In [37]:
agg_personal_scores, agg_personal_cms = aggregateIndividualScores(personal_scores, personal_cms, df_phone_accel['ACTIVITY'].unique())

In [38]:
agg_personal_cms[0]

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,O,P,Q,R,S,PER_ACT_ACC
A,941,3,12,0,0,0,0,0,0,0,0,0,8,3,1,0,0,4,0.968107
B,35,957,8,0,0,0,0,0,0,0,0,0,4,0,0,0,0,2,0.951292
C,21,32,839,3,0,0,0,0,0,0,0,0,20,2,5,0,0,0,0.909978
D,7,5,28,860,2,5,5,13,9,6,11,2,2,4,0,6,8,4,0.880246
E,3,3,10,10,902,3,6,4,4,4,0,5,2,7,9,1,4,13,0.911111
F,3,2,4,2,7,857,1,10,6,2,2,3,3,3,4,3,1,9,0.929501
G,1,2,2,8,3,9,869,11,9,7,1,25,0,5,5,8,3,7,0.891282
H,3,0,0,23,4,8,31,775,19,20,8,25,1,2,6,9,9,2,0.820106
I,2,0,2,8,7,11,15,29,844,6,13,13,0,1,1,4,17,4,0.863869
J,1,2,0,1,2,1,9,28,18,710,7,13,0,2,3,18,10,3,0.857488


In [39]:
agg_personal_scores

[0.8898115572560219]

### Phone Gyro

In [40]:
personal_scores, personal_cms = runPersonalModels(df_phone_gyro, algorithms, 'ACTIVITY')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.719626168224
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.740740740741
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,


In [41]:
agg_personal_scores, agg_personal_cms = aggregateIndividualScores(personal_scores, personal_cms, df_phone_gyro['ACTIVITY'].unique())

In [42]:
agg_personal_cms[0]

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,O,P,Q,R,S,PER_ACT_ACC
A,694,3,15,0,0,0,1,0,0,0,0,0,4,6,1,0,0,1,0.957241
B,18,703,18,2,0,0,0,0,0,0,0,0,7,2,0,0,0,0,0.937333
C,28,32,592,2,0,0,0,0,0,0,0,1,41,7,6,0,0,3,0.831461
D,1,5,21,478,31,17,18,27,20,30,43,14,11,8,8,21,21,11,0.608917
E,1,1,1,34,500,11,27,17,14,12,18,12,3,14,11,9,20,41,0.670241
F,0,1,1,25,27,538,13,21,9,19,23,18,1,3,1,29,9,7,0.722148
G,2,2,2,15,37,22,473,22,34,25,11,23,4,7,5,12,26,15,0.641791
H,0,1,1,52,19,25,42,329,52,64,51,43,2,3,4,24,11,13,0.447011
I,0,1,0,28,34,19,52,51,380,42,35,49,1,3,5,21,25,10,0.502646
J,1,0,0,36,22,35,42,77,54,339,36,33,0,1,1,23,14,3,0.472803


In [43]:
agg_personal_scores

[0.68592649409305673]

# IMPERSONAL MODELS

In [12]:
## Function that buids and tests a list of given machine learning algorithms.
# Inputs: DataFrame (data),
#         List of algorithms (from sklearn - e.g. sklearn.tree.DecisionTreeClassifier),
#         Name of label being tested (e.g. 'ACTIVITY' for Activity Recognition,
def runImpersonalTests(df, algorithms, label_name):
    # List to hold scores (list of lists since there are several algorithms per user)
    individual_scores = []
    individual_confusion_matrices = []
    
    # Build a model per user
    for user in df['class'].unique():
        scores = []
        confusion_matrices = []
        
        # Split data into training and test. Training data is every user except one, test is remaining user
        train_data = df.loc[df['class'] != user]
        test_data = df.loc[df['class'] == user]
        
        # For every user, build a model per algorithm
        for algorithm in algorithms:
            # Build model and predict
            algorithm.fit(train_data.drop(label_name, axis=1), train_data[label_name])
            predicted = algorithm.predict(test_data.drop(label_name, axis=1))
            
            # Accuracy: % of Correctly Classified Labels
            # See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
            accuracy = accuracy_score(test_data[label_name], predicted)
            
            # Compute confusion matrix.
            # Use full list of activities as labels (to account for missing data)
            # See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
            cm = confusion_matrix(test_data[label_name], predicted, labels=df[label_name].unique())
            
            # Output Status
            print(algorithm)
            print(accuracy)
            
            # Add score and confusion matrix for the algorithm
            scores.append(accuracy)
            confusion_matrices.append(cm)
        
        # Add score and confusion matrix for the user
        individual_scores.append(scores)
        individual_confusion_matrices.append(confusion_matrices)
    
    return individual_scores, individual_confusion_matrices

## Results

In [13]:
impersonal_scores, impersonal_cms = runImpersonalTests(df_watch_accel, algorithms, 'ACTIVITY')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.810397553517
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.651234567901
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,


In [14]:
agg_impersonal_scores, agg_impersonal_cms = aggregateIndividualScores(impersonal_scores, impersonal_cms, df_watch_accel['ACTIVITY'].unique())

In [15]:
agg_impersonal_cms[0]

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,O,P,Q,R,S,PER_ACT_ACC
A,501,0,123,4,1,0,4,1,0,0,0,1,53,3,16,1,2,17,0.689133
B,0,653,5,0,0,0,1,0,0,1,0,0,4,36,9,0,1,2,0.917135
C,119,5,455,3,0,1,11,5,0,4,2,1,54,12,2,4,2,49,0.624143
D,0,0,9,464,36,55,7,19,28,22,21,17,2,0,1,31,2,16,0.635616
E,0,0,9,91,468,4,14,15,17,16,61,32,6,0,0,9,5,9,0.619048
F,0,0,2,81,7,495,0,3,7,17,11,27,1,0,0,75,0,3,0.679012
G,7,2,13,12,29,2,523,8,27,21,9,27,17,6,1,3,10,13,0.716438
H,2,0,0,23,20,5,14,409,35,98,25,68,1,0,1,10,1,17,0.561043
I,0,0,5,22,40,12,22,60,334,78,73,76,2,0,0,7,0,5,0.453804
J,1,0,0,55,26,7,5,93,88,370,27,55,0,0,0,4,0,14,0.496644


In [16]:
agg_impersonal_scores

[0.63871085144201112]

# Subset of Activities

In [17]:
exercise_activities = ['A', 'B', 'C', 'D', 'E', 'M', 'O', 'P']
eating_activities = ['H', 'I', 'J', 'K', 'L']
other_activities = ['F', 'G', 'Q', 'R', 'S']

In [18]:
subset1_scores, subset1_cms = runPersonalModels(df_watch_accel.loc[df_watch_accel['ACTIVITY'].isin(exercise_activities)],
                                                  algorithms, 'ACTIVITY')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.958333333333
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.958333333333
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,


In [20]:
subset1_scores, subset1_cms = aggregateIndividualScores(subset1_scores, subset1_cms, exercise_activities)

In [21]:
subset1_cms[0]

Unnamed: 0,A,B,C,D,E,M,O,P,PER_ACT_ACC
A,643,3,53,2,7,18,0,1,0.884457
B,3,693,3,0,2,2,5,4,0.973315
C,37,5,633,0,0,47,4,3,0.868313
D,5,3,8,684,20,7,2,1,0.936986
E,8,0,11,33,691,13,0,0,0.914021
M,32,6,63,3,7,613,5,2,0.838577
O,3,13,8,1,3,8,669,23,0.918956
P,2,7,3,0,1,2,38,688,0.928475


In [22]:
subset1_scores

[0.90777765239286834]

In [23]:
subset2_scores, subset2_cms = runPersonalModels(df_watch_accel.loc[df_watch_accel['ACTIVITY'].isin(eating_activities)],
                                                  algorithms, 'ACTIVITY')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.966666666667
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.922222222222
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,


In [24]:
subset2_scores, subset2_cms = aggregateIndividualScores(subset2_scores, subset2_cms, eating_activities)

In [26]:
subset2_cms[0]

Unnamed: 0,H,I,J,K,L,PER_ACT_ACC
H,654,21,25,15,14,0.897119
I,17,598,35,28,58,0.8125
J,48,39,625,13,20,0.838926
K,17,48,25,622,41,0.826029
L,23,83,47,57,528,0.715447


In [27]:
subset2_scores

[0.81701454906579973]

In [28]:
subset3_scores, subset3_cms = runPersonalModels(df_watch_accel.loc[df_watch_accel['ACTIVITY'].isin(other_activities)],
                                                  algorithms, 'ACTIVITY')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.978494623656
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
0.955555555556
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,


In [29]:
subset3_scores, subset3_cms = aggregateIndividualScores(subset3_scores, subset3_cms, other_activities)

In [30]:
subset3_cms[0]

Unnamed: 0,F,G,Q,R,S,PER_ACT_ACC
F,698,11,12,1,7,0.957476
G,9,700,5,6,10,0.958904
Q,12,6,706,15,24,0.925295
R,3,15,2,702,7,0.962963
S,1,13,9,17,703,0.946164


In [31]:
subset3_scores

[0.95159444306134022]

# Train / Test Curve

## Personal Models

In [126]:
## Function that buids and tests a list of given machine learning algorithms.
# Inputs: DataFrame (data),
#         List of algorithms (from sklearn - e.g. sklearn.tree.DecisionTreeClassifier),
#         Name of label being tested (e.g. 'ACTIVITY' for Activity Recognition)
# Outputs: Individual scores (1 per algorithm x Number of users)
#          Confunsion Matrices (1 per algorithm x Number of users)
def runPersonalModels(df, algorithms, label_name):
    # List to hold scores (list of lists since there are several algorithms per user)
    individual_scores = []
    individual_confusion_matrices = []
    
    # Build a model per user
    for user in df['class'].unique():
        scores = []
        confusion_matrices = []
        data = df.loc[df['class'] == user]
        
        skf = StratifiedKFold(n_splits=10)
        
        # For every user, build a model per algorithm
        for algorithm in algorithms:
            # 10-Fold Cross Validation (cv=10)
            # See http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html
            predicted = cross_val_predict(algorithm, data.drop(label_name, axis=1), data[label_name], cv=10)
            print(predicted)
            
            # Accuracy: % of Correctly Classified Labels
            # See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
            accuracy = accuracy_score(data[label_name], predicted)
            
            # Compute confusion matrix.
            # Use full list of activities as labels (to account for missing data)
            # See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
            cm = confusion_matrix(data[label_name], predicted, labels=df[label_name].unique())
            
            # Output Status
            print(algorithm)
            print(accuracy)
            
            # Add score and confusion matrix for the algorithm
            scores.append(accuracy)
            confusion_matrices.append(cm)
        
        # Add score and confusion matrix for the user
        individual_scores.append(scores)
        individual_confusion_matrices.append(confusion_matrices)
    
    return individual_scores, individual_confusion_matrices

## Impersonal Models

In [17]:
s = StratifiedKFold(n_splits=10)

In [19]:
s.split(df_phone_accel.drop('ACTIVITY', axis=1), df_phone_accel['ACTIVITY'])

ValueError: too many values to unpack (expected 2)