- Generate epoch files with epoch length as 10 seconds.
- Load the epoch files and label files with 10 second worth of labels.
- Train the model.
- Use the model to make predictions on test set and compare with actual annotations we have.
- Make the confusion matrix.
- Do it for: original model and custom features model.
- Use those models for 22 random participants.

In [5]:
def get_files(data_path) -> None:
    total_csv_zipped = []
    for path, dirnames, filenames in os.walk(data_path):
        for file in filenames:
            if file.endswith(".csv.gz"):
                total_csv_zipped.append(os.path.join(path, file))
                
            if file.endswith(".csv") and file[0]!='c': #ignore the capture24 file
                # paths contains the base directory for that file.
                # dirnames contains other directories within this folder.
                # filenames contains the list of filenames within path.
                total_csv_zipped.append(os.path.join(path, file))
    return sorted(total_csv_zipped)

In [6]:
import pandas as pd
import os
epoch_size = 5 # 5 seconds

epochs = "/home/aayush/accelerometer/compare_classification/epoch_data/5_sec/original_features"
epoch_files = get_files(epochs)

labels = "/home/yacine/accel/capture24/participants/"
label_files = get_files(labels)

In [7]:
import csv
def create_labels_dict(labels_dict_location = "/home/aayush/accelerometer/accprocess/anno-label.csv"):
    labels_dict = {}
    with open(labels_dict_location, "r") as annotation_dict:
        reader = csv.DictReader(annotation_dict)
        for row in reader:
            if labels_dict.get(row['annotation']) is None:
                labels_dict[row['annotation']] = [row['label:Walmsley2020']]
            else:
                labels_dict[row['annotation']].append(row['label:Walmsley2020'])
    return labels_dict

labels_dict = create_labels_dict()

# replace the annotated labels with the same format of strings as predicted labels using mapping from labels_dict
flat_dict = {k: v[0] for k, v in labels_dict.items()}

In [8]:
def parse_datetime(dt_string):
    clean_datetime_str = dt_string.split('[')[0].strip()
    dt_object = pd.to_datetime(clean_datetime_str)
    return dt_object


def parse_datetime_df_time(dt_string):
    clean_datetime_str = dt_string.split('[')[0].strip()
    clean_datetime_str = clean_datetime_str.split('+')[0].strip()
    dt_object = pd.to_datetime(clean_datetime_str)
    return dt_object

In [10]:
df_all = []
for data_file in get_files("/home/aayush/accelerometer/compare_classification/training_data/5_sec/original_features"):
    df = pd.read_csv(data_file)
    df_all.append(df)

In [11]:
epochs = pd.concat(df_all)

In [12]:
df = epochs
df.head()

Unnamed: 0,enmoTrunc,enmoAbs,xMean,yMean,zMean,xRange,yRange,zRange,xStd,yStd,...,p625,totalPower,temp,samples,dataErrors,clipsBeforeCalibr,clipsAfterCalibr,rawSamples,participant,label
0,0.000645,0.003413,-0.459854,-0.52342,0.714303,0.015839,0.015632,0.031827,0.004623,0.006924,...,-17.915316,-14.848814,0.0,500,0,0,0,501,1.0,sleep
1,0.00104,0.003455,-0.461786,-0.522951,0.714494,0.015839,0.031264,0.031827,0.006518,0.006803,...,-16.959734,-14.294128,0.0,500,0,0,0,502,1.0,sleep
2,0.000892,0.002792,-0.461406,-0.523857,0.714557,0.015839,0.015632,0.031827,0.006239,0.007136,...,-18.161487,-15.943441,0.0,500,0,0,0,502,1.0,sleep
3,0.000744,0.002928,-0.460994,-0.52342,0.714526,0.015839,0.031264,0.031827,0.005894,0.006994,...,-17.928012,-15.81256,0.0,500,0,0,0,502,1.0,sleep
4,0.000636,0.002756,-0.462546,-0.521231,0.715035,0.015839,0.031264,0.031827,0.006982,0.005316,...,-18.134155,-15.430904,0.0,500,0,0,0,502,1.0,sleep


In [13]:
df.rename(columns={'annotation':'label'}, inplace=True)
print(len(df))

1766564


In [14]:
print(df['label'].unique())

['sleep' 'light' 'sedentary' 'moderate-vigorous']


In [None]:
# df = df.drop(labels=["time"], axis=1)

In [15]:
df = df.dropna(subset=["label"])
print(df['label'].unique())

['sleep' 'light' 'sedentary' 'moderate-vigorous']


In [16]:
# df["label"] = df["label"].astype('string', copy=False)
len(df)

1766564

In [17]:
df.to_csv("/home/aayush/accelerometer/compare_classification/training_data/5_sec/original_features/5s_training_original_features_all.csv", index=False)

In [18]:
test_participants = "101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151"
test_participants = [float(t) for t in test_participants.split(",")]
    
test_features_df = df[df["participant"].isin(test_participants)].iloc[:, :-2]
test_labels_df = df[df["participant"].isin(test_participants)]["label"]

print(len(test_features_df))
print(test_labels_df.unique())

583801
['sleep' 'sedentary' 'light' 'moderate-vigorous']


In [19]:
train_all_df = df[~df["participant"].isin(test_participants)]
train_features_df = df[~df["participant"].isin(test_participants)].iloc[:, :-2]
train_labels_predict = df[~df["participant"].isin(test_participants)]["label"]

print(len(train_features_df))
print(train_labels_predict)

1182763
0         sleep
1         sleep
2         sleep
3         sleep
4         sleep
          ...  
118769    sleep
118770    sleep
118771    sleep
118772    sleep
118773    sleep
Name: label, Length: 1182763, dtype: object


In [20]:
def _Model(**kwargs):
    return BalancedRandomForestClassifier(
        n_estimators=3000,
        max_depth=10,
        min_samples_leaf=1,
        replacement=True,
        sampling_strategy='not minority',
        random_state=42,
        **kwargs
    )

In [21]:
from imblearn.ensemble import BalancedRandomForestClassifier

print('Training...')
model = _Model(n_jobs=10, verbose=1)
# fit the model as numpy array so that we do not get warnings during prediction
X_features = train_features_df.to_numpy()
X_labels = train_labels_predict.to_numpy()
model = model.fit(X_features, X_labels)
model.verbose = 0  # silence future calls to .predict()
labels = model.classes_

Training...


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    9.6s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:   47.4s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:  1.8min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:  3.1min
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:  4.8min
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:  7.0min
[Parallel(n_jobs=10)]: Done 2430 tasks      | elapsed:  9.6min
[Parallel(n_jobs=10)]: Done 3000 out of 3000 | elapsed: 11.8min finished


In [22]:
print(labels)
pid = df[~df["participant"].isin(test_participants)]["participant"].to_numpy()
print(len(pid))
print(len(train_labels_predict))

['light' 'moderate-vigorous' 'sedentary' 'sleep']
1182763
1182763


In [23]:
def saveToTar(tarOut, **kwargs):
    """Save objects to tar file. Objects must be passed as keyworded arguments,
    then the key is used for the object name in the tar file.

    :param **kwargs: Objects to be saved passed as keyworded arguments.

    :return: tar file written to <tarOut>
    :rtype: void
    """

    try:

        tmpdir = tempfile.mkdtemp()

        with tarfile.open(tarOut, mode='w') as tf:

            for key, val in kwargs.items():
                pth = os.path.join(tmpdir, key)
                joblib.dump(val, pth, compress=True)
                tf.add(pth, arcname=key)

        print('Models saved to', tarOut)

    finally:

        try:
            shutil.rmtree(tmpdir)
        except OSError as e:
            print("Error: %s - %s." % (e.filename, e.strerror))


def getFileFromTar(tarArchive, targetFile):
    """Read file from tar

    This is currently more tricky than it should be see
    https://github.com/numpy/numpy/issues/7989

    :param str tarArchive: Input tarfile object
    :param str targetFile: Target individual file within .tar

    :return: file object byte stream
    :rtype: object
    """

    with tarfile.open(tarArchive, 'r') as t:
        b = BytesIO()
        try:
            b.write(t.extractfile(targetFile).read())
        except KeyError:
            return None
        b.seek(0)

    return b


def trainHMM(Y_prob, Y_true, labels=None, uniform_prior=True):
    """ https://en.wikipedia.org/wiki/Hidden_Markov_model

    :return: Dictionary containing prior, emission and transition
        matrices, and corresponding labels.
    :rtype: dict

    """

    if labels is None:
        labels = np.unique(Y_true)

    if uniform_prior:
        # All labels with equal probability
        prior = np.ones(len(labels)) / len(labels)
    else:
        # Label probability equals empirical rate
        prior = np.mean(Y_true.reshape(-1, 1) == labels, axis=0)

    emission = np.vstack(
        [np.mean(Y_prob[Y_true == label], axis=0) for label in labels]
    )
    transition = np.vstack(
        [np.mean(Y_true[1:][(Y_true == label)[:-1]].reshape(-1, 1) == labels, axis=0)
            for label in labels]
    )

    params = {'prior': prior, 'emission': emission, 'transition': transition, 'labels': labels}

    return params

In [24]:
from sklearn.model_selection import cross_val_predict, cross_val_score
import numpy as np

print('Cross-predicting to derive the observations for HMM...')

NJOBS_PER_CV_MODEL = min(2, 10)
cvp = cross_val_predict(
    _Model(n_jobs=NJOBS_PER_CV_MODEL), X_features, X_labels, groups=pid,
    cv=10,
    n_jobs=10 // NJOBS_PER_CV_MODEL,
    method="predict_proba",
    verbose=3,
)

Cross-predicting to derive the observations for HMM...


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   5 out of  10 | elapsed: 33.3min remaining: 33.3min
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed: 66.8min finished


In [25]:
labels

array(['light', 'moderate-vigorous', 'sedentary', 'sleep'], dtype=object)

In [26]:
print('Training HMM...')
# train_labels_predict_array = np.array(train_labels_predict)
# hmmParams = trainHMM(cvp,  train_labels_predict_array)

hmmParams = trainHMM(cvp,  X_labels)

Training HMM...


In [27]:
hmmParams

{'prior': array([0.25, 0.25, 0.25, 0.25]),
 'emission': array([[0.33591968, 0.28668   , 0.29047209, 0.08692823],
        [0.30827429, 0.358804  , 0.261873  , 0.07104872],
        [0.28982134, 0.24954853, 0.3430877 , 0.11754243],
        [0.08788251, 0.06791808, 0.12158693, 0.72261248]]),
 'transition': array([[9.91444636e-01, 1.54675846e-03, 6.78829604e-03, 2.20309811e-04],
        [6.27018452e-03, 9.92577764e-01, 1.07650758e-03, 7.55443918e-05],
        [2.89921932e-03, 1.03401512e-04, 9.96901932e-01, 9.54475496e-05],
        [1.80909633e-04, 9.77889909e-06, 5.37839450e-05, 9.99755528e-01]]),
 'labels': array(['light', 'moderate-vigorous', 'sedentary', 'sleep'], dtype=object)}

In [28]:
# METs = {y: train_all_df[train_all_df["label"] == y].groupby("label")["MET"].mean().mean() 
#     for y in model.classes_}
METs = {'light': 2.270833333333333,
 'moderate-vigorous': 4.682608695652173,
 'sedentary': 1.5634920634920637,
 'sleep': 0.9499999999999998}
METs

{'light': 2.270833333333333,
 'moderate-vigorous': 4.682608695652173,
 'sedentary': 1.5634920634920637,
 'sleep': 0.9499999999999998}

In [29]:
import os
import tempfile
import shutil
import joblib
import tarfile

featureCols = np.array(train_features_df.columns)
# Write model to file
outFile = os.path.join("/home/aayush/accelerometer/compare_classification/accProcess_output/5_sec/original_features/model_used", '5s_without_extra_model.tar')
saveToTar(outFile,
          model=model,
          labels=labels,
          featureCols=featureCols,
          hmmParams=hmmParams,
          METs=METs)
print(f'Output trained model written to: {outFile}')

Models saved to /home/aayush/accelerometer/compare_classification/accProcess_output/5_sec/original_features/model_used/5s_without_extra_model.tar
Output trained model written to: /home/aayush/accelerometer/compare_classification/accProcess_output/5_sec/original_features/model_used/5s_without_extra_model.tar


In [30]:
# Display feature importances
feature_importances = pd.Series(model.feature_importances_, index=df.columns[:-2])
print(feature_importances.nlargest(30))  # Show the top 30 features

max          0.079299
xRange       0.074254
sd           0.068114
75thp        0.061402
zRange       0.051310
xStd         0.050279
mean         0.047256
zStd         0.038432
fft3         0.035838
xMean        0.029427
yRange       0.027779
fft1         0.027379
fft2         0.026150
fft4         0.020270
pmax         0.019904
median       0.019336
enmoAbs      0.018958
MAD          0.017909
yawg         0.017906
fft5         0.016953
enmoTrunc    0.015377
MPD          0.014529
yMean        0.013758
pitchg       0.013085
fft8         0.012896
yStd         0.012630
fft6         0.010826
p1           0.010187
entropy      0.009841
fft7         0.009762
dtype: float64


In [31]:
import sklearn.metrics as metrics

# test the performance
Y_labels_test_pred = model.predict(test_features_df)
print(metrics.classification_report(test_labels_df, Y_labels_test_pred))
testScore = metrics.f1_score(test_labels_df, Y_labels_test_pred, average='macro', zero_division=0)
print(f'Score: {testScore:.2f}')



                   precision    recall  f1-score   support

            light       0.43      0.34      0.38    124699
moderate-vigorous       0.17      0.42      0.24     36007
        sedentary       0.67      0.51      0.58    226843
            sleep       0.83      0.93      0.88    196252

         accuracy                           0.61    583801
        macro avg       0.52      0.55      0.52    583801
     weighted avg       0.64      0.61      0.62    583801

Score: 0.52


In [32]:
def viterbi(Y_obs, hmm_params):
    """ Perform HMM smoothing over observations via Viteri algorithm

    https://en.wikipedia.org/wiki/Viterbi_algorithm

    :param dict hmm_params: Dictionary containing prior, emission and transition
        matrices, and corresponding labels

    :return: Smoothed sequence of activities
    :rtype: numpy.array
    """

    def log(x):
        SMALL_NUMBER = 1e-16
        return np.log(x + SMALL_NUMBER)

    prior = hmm_params['prior']
    emission = hmm_params['emission']
    transition = hmm_params['transition']
    labels = hmm_params['labels']

    nobs = len(Y_obs)
    nlabels = len(labels)

    Y_obs = np.where(Y_obs.reshape(-1, 1) == labels)[1]  # to numeric

    probs = np.zeros((nobs, nlabels))
    probs[0, :] = log(prior) + log(emission[:, Y_obs[0]])
    for j in range(1, nobs):
        for i in range(nlabels):
            probs[j, i] = np.max(
                log(emission[i, Y_obs[j]]) +
                log(transition[:, i]) +
                probs[j - 1, :])  # probs already in log scale
    viterbi_path = np.zeros_like(Y_obs)
    viterbi_path[-1] = np.argmax(probs[-1, :])
    for j in reversed(range(nobs - 1)):
        viterbi_path[j] = np.argmax(
            log(transition[:, viterbi_path[j + 1]]) +
            probs[j, :])  # probs already in log scale

    viterbi_path = labels[viterbi_path]  # to labels

    return viterbi_path

In [33]:
YpredHmm = viterbi(Y_labels_test_pred, hmmParams)

print('\nTest performance (HMM):')
print(metrics.classification_report(test_labels_df, YpredHmm))
testHmmScore = metrics.f1_score(test_labels_df, YpredHmm, average='macro', zero_division=0)
print(f'Score: {testHmmScore:.2f}')


Test performance (HMM):
                   precision    recall  f1-score   support

            light       0.51      0.35      0.42    124699
moderate-vigorous       0.18      0.48      0.27     36007
        sedentary       0.74      0.59      0.65    226843
            sleep       0.86      0.98      0.92    196252

         accuracy                           0.66    583801
        macro avg       0.57      0.60      0.56    583801
     weighted avg       0.70      0.66      0.67    583801

Score: 0.56
