- Generate epoch files with epoch length as 10 seconds.
- Load the epoch files and label files with 10 second worth of labels.
- Train the model.
- Use the model to make predictions on test set and compare with actual annotations we have.
- Make the confusion matrix.
- Do it for: original model and custom features model.
- Use those models for 22 random participants.

In [1]:
def get_files(data_path) -> None:
    total_csv_zipped = []
    for path, dirnames, filenames in os.walk(data_path):
        for file in filenames:
            if file.endswith(".csv.gz"):
                total_csv_zipped.append(os.path.join(path, file))
                
            if file.endswith(".csv") and file[0]!='c': #ignore the capture24 file
                # paths contains the base directory for that file.
                # dirnames contains other directories within this folder.
                # filenames contains the list of filenames within path.
                total_csv_zipped.append(os.path.join(path, file))
    return sorted(total_csv_zipped)

In [2]:
import pandas as pd
import os
epoch_size = 5 # 5 seconds

epochs = "/home/aayush/accelerometer/compare_classification/epoch_data/5_sec/original_features"
epoch_files = get_files(epochs)

labels = "/home/yacine/accel/capture24/participants/"
label_files = get_files(labels)

In [3]:
import csv
def create_labels_dict(labels_dict_location = "/home/aayush/accelerometer/accprocess/anno-label.csv"):
    labels_dict = {}
    with open(labels_dict_location, "r") as annotation_dict:
        reader = csv.DictReader(annotation_dict)
        for row in reader:
            if labels_dict.get(row['annotation']) is None:
                labels_dict[row['annotation']] = [row['label:Walmsley2020']]
            else:
                labels_dict[row['annotation']].append(row['label:Walmsley2020'])
    return labels_dict

labels_dict = create_labels_dict()

# replace the annotated labels with the same format of strings as predicted labels using mapping from labels_dict
flat_dict = {k: v[0] for k, v in labels_dict.items()}

In [4]:
def parse_datetime(dt_string):
    clean_datetime_str = dt_string.split('[')[0].strip()
    dt_object = pd.to_datetime(clean_datetime_str)
    return dt_object


def parse_datetime_df_time(dt_string):
    clean_datetime_str = dt_string.split('[')[0].strip()
    clean_datetime_str = clean_datetime_str.split('+')[0].strip()
    dt_object = pd.to_datetime(clean_datetime_str)
    return dt_object

In [5]:
all_epoch_dfs = []
for i, (epochfilename, labelfilename) in enumerate(zip(epoch_files[30:60], label_files[30:60])):
    epoch_df = pd.read_csv(epochfilename)
    label_df = pd.read_csv(labelfilename)

    # Take the timestamp after every thirty seconds
    label_df = label_df[["annotation", "time"]][0::epoch_size]
    
    # Convert to datetime object
    label_df["time"] = label_df["time"].apply(parse_datetime)

    # replace the label with the categories we expect
    label_df['annotation'] = label_df['annotation'].replace(flat_dict)

    # convert epoch df time stamp with datetime object
    epoch_df['time'] = epoch_df['time'].apply(parse_datetime_df_time)
    epoch_df["participant"] = float(epochfilename.split("/")[-1].split("-")[0][1:])

    # predropped_epoch = len(epoch_df)
    # predropped_label = len(label_df)
    
    # # keep only those time stamps which are present in both epoch and label df
    # label_df = label_df[label_df["time"].isin(epoch_df['time'])]
    # epoch_df = epoch_df[epoch_df["time"].isin(label_df['time'])]

    # print(f"We removed {predropped_label-len(label_df)} rows from label df because timestamps were not in epoch df.")
    # print(f"We removed {predropped_epoch-len(epoch_df)} rows from epoch_df because timestamps were not in label_df.")
    
    label_df = label_df[["time", "annotation"]]
    label_df.set_index("time", inplace=True)
    epoch_df.set_index("time", inplace=True)
    
    # epoch_df["label"] = label_df['annotation']
    epoch_df=epoch_df.join(label_df, on="time")
    
    all_epoch_dfs.append(epoch_df)
    print(f"{i+1}/{len(epoch_files)} done...")

  exec(code_obj, self.user_global_ns, self.user_ns)


1/151 done...
2/151 done...
3/151 done...
4/151 done...
5/151 done...
6/151 done...
7/151 done...
8/151 done...
9/151 done...
10/151 done...
11/151 done...
12/151 done...
13/151 done...
14/151 done...
15/151 done...
16/151 done...
17/151 done...
18/151 done...
19/151 done...
20/151 done...
21/151 done...
22/151 done...
23/151 done...
24/151 done...
25/151 done...
26/151 done...
27/151 done...
28/151 done...
29/151 done...
30/151 done...


In [7]:
epochs = pd.concat(all_epoch_dfs)

In [8]:
df = epochs
df.head()

Unnamed: 0_level_0,enmoTrunc,enmoAbs,xMean,yMean,zMean,xRange,yRange,zRange,xStd,yStd,...,p625,totalPower,temp,samples,dataErrors,clipsBeforeCalibr,clipsAfterCalibr,rawSamples,participant,annotation
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-03-30 00:53:00,0.00102,0.002526,-0.663161,0.69535,0.275062,0.015426,0.015587,0.015899,0.00216,0.005566,...,-17.287757,-14.2246,0.0,500,0,0,0,501,31.0,sleep
2016-03-30 00:53:05,0.001086,0.002506,-0.662945,0.696223,0.273854,0.015426,0.015587,0.015899,0.001192,0.006304,...,-17.169523,-14.317169,0.0,500,0,0,0,502,31.0,sleep
2016-03-30 00:53:10,0.001161,0.002885,-0.663099,0.696628,0.271819,0.015426,0.015587,0.015899,0.001936,0.00658,...,-17.009971,-13.91144,0.0,500,0,0,0,502,31.0,sleep
2016-03-30 00:53:15,0.001231,0.002597,-0.66313,0.696129,0.274331,0.015426,0.015587,0.015899,0.002051,0.006235,...,-17.485773,-14.258189,0.0,500,0,0,0,502,31.0,sleep
2016-03-30 00:53:20,0.001138,0.002736,-0.663142,0.695524,0.274797,0.015597,0.015641,0.016498,0.001935,0.005744,...,-17.263014,-14.378538,0.0,500,0,0,0,502,31.0,sleep


In [9]:
df.rename(columns={'annotation':'label'}, inplace=True)
print(len(df))

556084


In [10]:
print(df['label'].unique())

['sleep' nan 'light' 'sedentary' 'moderate-vigorous']


In [11]:
# df = df.drop(labels=["time"], axis=1)

In [12]:
df = df.dropna(subset=["label"])
print(df['label'].unique())

['sleep' 'light' 'sedentary' 'moderate-vigorous']


In [13]:
# df["label"] = df["label"].astype('string', copy=False)
len(df)

352540

In [15]:
df.to_csv("/home/aayush/accelerometer/compare_classification/training_data/5_sec/original_features/5s_training_original_features_30_60.csv", index=False)

In [None]:
test_participants = "101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151"
test_participants = [float(t) for t in test_participants.split(",")]
    
test_features_df = df[df["participant"].isin(test_participants)].iloc[:, :-2]
test_labels_df = df[df["participant"].isin(test_participants)]["label"]

print(len(test_features_df))
print(test_labels_df.unique())

In [None]:
train_all_df = df[~df["participant"].isin(test_participants)]
train_features_df = df[~df["participant"].isin(test_participants)].iloc[:, :-2]
train_labels_predict = df[~df["participant"].isin(test_participants)]["label"]

print(len(train_features_df))
print(train_labels_predict)

In [None]:
def _Model(**kwargs):
    return BalancedRandomForestClassifier(
        n_estimators=3000,
        max_depth=10,
        min_samples_leaf=1,
        replacement=True,
        sampling_strategy='not minority',
        random_state=42,
        **kwargs
    )

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier

print('Training...')
model = _Model(n_jobs=10, verbose=1)
# fit the model as numpy array so that we do not get warnings during prediction
X_features = train_features_df.to_numpy()
X_labels = train_labels_predict.to_numpy()
model = model.fit(X_features, X_labels)
model.verbose = 0  # silence future calls to .predict()
labels = model.classes_

In [None]:
print(labels)
pid = df[~df["participant"].isin(test_participants)]["participant"].to_numpy()
print(len(pid))
print(len(train_labels_predict))

In [None]:
def saveToTar(tarOut, **kwargs):
    """Save objects to tar file. Objects must be passed as keyworded arguments,
    then the key is used for the object name in the tar file.

    :param **kwargs: Objects to be saved passed as keyworded arguments.

    :return: tar file written to <tarOut>
    :rtype: void
    """

    try:

        tmpdir = tempfile.mkdtemp()

        with tarfile.open(tarOut, mode='w') as tf:

            for key, val in kwargs.items():
                pth = os.path.join(tmpdir, key)
                joblib.dump(val, pth, compress=True)
                tf.add(pth, arcname=key)

        print('Models saved to', tarOut)

    finally:

        try:
            shutil.rmtree(tmpdir)
        except OSError as e:
            print("Error: %s - %s." % (e.filename, e.strerror))


def getFileFromTar(tarArchive, targetFile):
    """Read file from tar

    This is currently more tricky than it should be see
    https://github.com/numpy/numpy/issues/7989

    :param str tarArchive: Input tarfile object
    :param str targetFile: Target individual file within .tar

    :return: file object byte stream
    :rtype: object
    """

    with tarfile.open(tarArchive, 'r') as t:
        b = BytesIO()
        try:
            b.write(t.extractfile(targetFile).read())
        except KeyError:
            return None
        b.seek(0)

    return b


def trainHMM(Y_prob, Y_true, labels=None, uniform_prior=True):
    """ https://en.wikipedia.org/wiki/Hidden_Markov_model

    :return: Dictionary containing prior, emission and transition
        matrices, and corresponding labels.
    :rtype: dict

    """

    if labels is None:
        labels = np.unique(Y_true)

    if uniform_prior:
        # All labels with equal probability
        prior = np.ones(len(labels)) / len(labels)
    else:
        # Label probability equals empirical rate
        prior = np.mean(Y_true.reshape(-1, 1) == labels, axis=0)

    emission = np.vstack(
        [np.mean(Y_prob[Y_true == label], axis=0) for label in labels]
    )
    transition = np.vstack(
        [np.mean(Y_true[1:][(Y_true == label)[:-1]].reshape(-1, 1) == labels, axis=0)
            for label in labels]
    )

    params = {'prior': prior, 'emission': emission, 'transition': transition, 'labels': labels}

    return params

In [None]:
from sklearn.model_selection import cross_val_predict, cross_val_score
import numpy as np

print('Cross-predicting to derive the observations for HMM...')

NJOBS_PER_CV_MODEL = min(2, 10)
cvp = cross_val_predict(
    _Model(n_jobs=NJOBS_PER_CV_MODEL), X_features, X_labels, groups=pid,
    cv=10,
    n_jobs=10 // NJOBS_PER_CV_MODEL,
    method="predict_proba",
    verbose=3,
)

In [None]:
labels

In [None]:
print('Training HMM...')
# train_labels_predict_array = np.array(train_labels_predict)
# hmmParams = trainHMM(cvp,  train_labels_predict_array)

hmmParams = trainHMM(cvp,  X_labels)

In [None]:
hmmParams

In [None]:
# METs = {y: train_all_df[train_all_df["label"] == y].groupby("label")["MET"].mean().mean() 
#     for y in model.classes_}
METs = {'light': 2.270833333333333,
 'moderate-vigorous': 4.682608695652173,
 'sedentary': 1.5634920634920637,
 'sleep': 0.9499999999999998}
METs

In [None]:
import os
import tempfile
import shutil
import joblib
import tarfile

featureCols = np.array(train_features_df.columns)
# Write model to file
outFile = os.path.join("/home/aayush/accelerometer/compare_classification/accProcess_output/5_sec/original_features/model_used", '5s_without_extra_model.tar')
saveToTar(outFile,
          model=model,
          labels=labels,
          featureCols=featureCols,
          hmmParams=hmmParams,
          METs=METs)
print(f'Output trained model written to: {outFile}')

In [None]:
# Display feature importances
feature_importances = pd.Series(model.feature_importances_, index=df.columns[:-2])
print(feature_importances.nlargest(30))  # Show the top 30 features

In [None]:
import sklearn.metrics as metrics

# test the performance
Y_labels_test_pred = model.predict(test_features_df)
print(metrics.classification_report(test_labels_df, Y_labels_test_pred))
testScore = metrics.f1_score(test_labels_df, Y_labels_test_pred, average='macro', zero_division=0)
print(f'Score: {testScore:.2f}')

In [None]:
def viterbi(Y_obs, hmm_params):
    """ Perform HMM smoothing over observations via Viteri algorithm

    https://en.wikipedia.org/wiki/Viterbi_algorithm

    :param dict hmm_params: Dictionary containing prior, emission and transition
        matrices, and corresponding labels

    :return: Smoothed sequence of activities
    :rtype: numpy.array
    """

    def log(x):
        SMALL_NUMBER = 1e-16
        return np.log(x + SMALL_NUMBER)

    prior = hmm_params['prior']
    emission = hmm_params['emission']
    transition = hmm_params['transition']
    labels = hmm_params['labels']

    nobs = len(Y_obs)
    nlabels = len(labels)

    Y_obs = np.where(Y_obs.reshape(-1, 1) == labels)[1]  # to numeric

    probs = np.zeros((nobs, nlabels))
    probs[0, :] = log(prior) + log(emission[:, Y_obs[0]])
    for j in range(1, nobs):
        for i in range(nlabels):
            probs[j, i] = np.max(
                log(emission[i, Y_obs[j]]) +
                log(transition[:, i]) +
                probs[j - 1, :])  # probs already in log scale
    viterbi_path = np.zeros_like(Y_obs)
    viterbi_path[-1] = np.argmax(probs[-1, :])
    for j in reversed(range(nobs - 1)):
        viterbi_path[j] = np.argmax(
            log(transition[:, viterbi_path[j + 1]]) +
            probs[j, :])  # probs already in log scale

    viterbi_path = labels[viterbi_path]  # to labels

    return viterbi_path

In [None]:
YpredHmm = viterbi(Y_labels_test_pred, hmmParams)

print('\nTest performance (HMM):')
print(metrics.classification_report(test_labels_df, YpredHmm))
testHmmScore = metrics.f1_score(test_labels_df, YpredHmm, average='macro', zero_division=0)
print(f'Score: {testHmmScore:.2f}')