In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [None]:
"""This file runs our ML model fit from the PPG_DaLia dataset onto the Big Ideas dataset.
        - we train a random forest model with 300 trees
        - we also report the average accuracy, precision, and recall of this classifier using leave-one-individual-out cross validation
        - we condense no-activity to contain baseline, clean_baseline, lunch, working, driving
        - we condense activity to contain stairs, soccer, cycling, walking
        - after applying our model to the Big Ideas dataset, we keep predicted activity periods only if they last at least 5 minutes
        - identified periods separated by less than a minute are first combined into one, and a period is deleted if food was consumed within the hour before or after it"""

In [None]:
""" Combines biosignal csvs into one dataframe and saves in a dictionary """
# Build one merged feature table per PPG-DaLiA subject (1-15), keyed "data<i>".
ppg_dataframes = {}
for i in range(1, 16):
    name = f"data{i}"
    subject_dir = f"D:/REU_2024/PPG/S{i}/S{i}_E4"
    # simple windowed biosignals with activity from ppg dataset
    acc = pd.read_csv(f"{subject_dir}/ACC_specific_windowed_features.csv")
    hr = pd.read_csv(f"{subject_dir}/HR_specific_windowed_features.csv")
    bvp = pd.read_csv(f"{subject_dir}/BVP_specific_windowed_features.csv")

    # Parse the window bounds of every signal into datetimes before merging.
    for frame in (acc, hr, bvp):
        for column in ('start_time', 'end_time'):
            frame[column] = pd.to_datetime(frame[column])

    # Keep only windows present in all three signals with the same label.
    common_columns = ['start_time', 'end_time', 'activity']
    data = acc.merge(hr, on=common_columns, how='inner').merge(bvp, on=common_columns, how='inner')
    print(f"Subject {i} Combined Data Length: {len(data)}")

    # Discard rows with activity code 0. The explicit .copy() makes the
    # relabelling below write to an independent frame instead of a slice of
    # the merged one (avoids chained-assignment / SettingWithCopyWarning).
    data = data[~data['activity'].isin([0])].copy()
    # Collapse to binary labels. Order matters: original code 1 must become 0
    # *before* codes 3-6 are rewritten to the new label 1.
    data.loc[data['activity'].isin([1, 2, 7, 8, 9]), 'activity'] = 0
    data.loc[data['activity'].isin([3, 4, 5, 6]), 'activity'] = 1
    ppg_dataframes[name] = data

# Build one merged feature table per Big Ideas subject (1-16), keyed "data<i>".
bigideas_dataframes = {}
for i in range(1, 17):
    name = f"data{i}"
    subject_id = f"{i:03d}"  # folders and files use zero-padded ids: 001 ... 016
    subject_dir = f"D:/REU_2024/BigIdeas/{subject_id}"
    # simple windowed biosignals from big ideas dataset
    acc, hr, bvp = (
        pd.read_csv(f"{subject_dir}/{signal}_{subject_id}_simple_windowed.csv")
        for signal in ("ACC", "HR", "BVP")
    )

    # Window bounds become datetimes in all three signal tables.
    for frame in (acc, hr, bvp):
        for column in ('start_time', 'end_time'):
            frame[column] = pd.to_datetime(frame[column])

    common_columns = ['start_time', 'end_time']
    merged = acc.merge(hr, on=common_columns, how='inner')
    merged = merged.merge(bvp, on=common_columns, how='inner')
    data = merged.dropna() # removes rows with missing data due to differing collection frequencies
    print(f"Subject {i} Combined Data Length: {len(data)}")

    # Strip spaces from the column names.
    data.columns = data.columns.str.replace(' ', '', regex=False)
    bigideas_dataframes[name] = data

Subject 1 Combined Data Length: 1324
Subject 2 Combined Data Length: 1181
Subject 3 Combined Data Length: 1254
Subject 4 Combined Data Length: 1302
Subject 5 Combined Data Length: 1328
Subject 6 Combined Data Length: 1413
Subject 7 Combined Data Length: 1334
Subject 8 Combined Data Length: 1160
Subject 9 Combined Data Length: 1232
Subject 10 Combined Data Length: 1532
Subject 11 Combined Data Length: 1299
Subject 12 Combined Data Length: 1126
Subject 13 Combined Data Length: 1312
Subject 14 Combined Data Length: 1280
Subject 15 Combined Data Length: 1139
Subject 1 Combined Data Length: 73779
Subject 2 Combined Data Length: 84518
Subject 3 Combined Data Length: 2232
Subject 4 Combined Data Length: 35926
Subject 5 Combined Data Length: 96772
Subject 6 Combined Data Length: 59134
Subject 7 Combined Data Length: 87953
Subject 8 Combined Data Length: 83423
Subject 9 Combined Data Length: 82157
Subject 10 Combined Data Length: 89398
Subject 11 Combined Data Length: 69695
Subject 12 Combined 

In [None]:
""" Shows random forest accuracy, precision, recall in supervised setting using LOOCV"""
# Leave-one-subject-out evaluation: each PPG subject is held out once while a
# fresh 300-tree forest is trained on all remaining subjects.
accuracies, precisions, recalls = [], [], []

dataset_names = list(ppg_dataframes.keys())
non_feature_columns = ['start_time', 'end_time', 'activity']

for test_name in dataset_names:
    print(f"Testing on {test_name}")

    # Training set = every subject except the held-out one, in original order.
    train_names = [candidate for candidate in dataset_names if candidate != test_name]
    train_data = pd.concat([ppg_dataframes[train_name] for train_name in train_names], ignore_index=True)
    test_data = ppg_dataframes[test_name]

    X_train = train_data.drop(columns=non_feature_columns)
    y_train = train_data['activity']
    X_test = test_data.drop(columns=non_feature_columns)
    y_test = test_data['activity']

    model = RandomForestClassifier(n_estimators=300, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # accuracy: correctly classified windows over all windows
    acc = accuracy_score(y_test, y_pred)
    # precision: how many predicted positives are truly positive
    prec = precision_score(y_test, y_pred, average='binary', zero_division=1)
    # recall: how many true positives the model found
    recall = recall_score(y_test, y_pred, average='binary', zero_division=1)

    for scores, value in ((accuracies, acc), (precisions, prec), (recalls, recall)):
        scores.append(value)

    print(f"{test_name}: Accuracy = {acc:.4f}, Precision = {prec:.4f}, Recall = {recall:.4f}")

# Unweighted mean of the per-subject scores.
average_accuracy = np.mean(accuracies)
average_precision = np.mean(precisions)
average_recall = np.mean(recalls)

print(f"Overall Average Accuracy: {average_accuracy:.4f}")
print(f"Overall Average Precision: {average_precision:.4f}")
print(f"Overall Average Recall: {average_recall:.4f}")

Testing on data1
data1: Accuracy = 0.9313, Precision = 0.8811, Recall = 0.8431
Testing on data2
data2: Accuracy = 0.9079, Precision = 0.8086, Recall = 0.8846
Testing on data3
data3: Accuracy = 0.9138, Precision = 0.8179, Recall = 0.8740
Testing on data4
data4: Accuracy = 0.9333, Precision = 0.8906, Recall = 0.8708
Testing on data5
data5: Accuracy = 0.8267, Precision = 0.6566, Recall = 0.6905
Testing on data6
data6: Accuracy = 0.9442, Precision = 0.9208, Recall = 0.8873
Testing on data7
data7: Accuracy = 0.9371, Precision = 0.8893, Recall = 0.8662
Testing on data8
data8: Accuracy = 0.8278, Precision = 0.7204, Recall = 0.5776
Testing on data9
data9: Accuracy = 0.8716, Precision = 0.7953, Recall = 0.7426
Testing on data10
data10: Accuracy = 0.8228, Precision = 0.8261, Recall = 0.5723
Testing on data11
data11: Accuracy = 0.9203, Precision = 0.8642, Recall = 0.8358
Testing on data12
data12: Accuracy = 0.9384, Precision = 0.8783, Recall = 0.9167
Testing on data13
data13: Accuracy = 0.9132, P

In [None]:
""" Trains random forest on whole PPG-DaLia dataset and applies on Big Ideas dataset """
# Fit the scaler and the final forest on every PPG subject, then label each
# Big Ideas window with the predicted activity (stored in an 'activity' column).
non_feature_columns = ['start_time', 'end_time', 'activity']
ppg_feature_frame = pd.concat([df.drop(columns=non_feature_columns) for df in ppg_dataframes.values()], axis=0)
ppg_activity = pd.concat([df['activity'] for df in ppg_dataframes.values()], axis=0)
feature_columns = list(ppg_feature_frame.columns)  # training feature order

# Fit on a plain array so that fit and transform both see un-named inputs
# (mixing a named DataFrame at fit time with arrays later makes sklearn warn).
sc = StandardScaler()
ppg_features = sc.fit_transform(ppg_feature_frame.to_numpy())

# Fixed seed, matching the cross-validation cell, so the saved labels are reproducible.
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(ppg_features, ppg_activity)

for key, df in bigideas_dataframes.items():
    # errors='ignore': on a re-run the frames already carry a predicted
    # 'activity' column, which must not be fed back in as a feature.
    feature_frame = df.drop(columns=non_feature_columns, errors='ignore')
    # When the feature names match the training set, align them by name so a
    # different column order cannot silently feed the wrong feature into the
    # scaler/model. Otherwise fall back to positional order (original behaviour).
    if set(feature_frame.columns) == set(feature_columns):
        feature_frame = feature_frame[feature_columns]
    bigideas_features = sc.transform(feature_frame.to_numpy()) # normalizes big ideas dataset according to ppg dataset metrics

    bigideas_activity = rf.predict(bigideas_features)

    bigideas_dataframes[key]['activity'] = bigideas_activity
    print(f"Dataset {key} predicted")



Dataset data1 predicted




Dataset data2 predicted
Dataset data3 predicted




Dataset data4 predicted




Dataset data5 predicted




Dataset data6 predicted




Dataset data7 predicted




Dataset data8 predicted




Dataset data9 predicted




Dataset data10 predicted




Dataset data11 predicted




Dataset data12 predicted




Dataset data13 predicted




Dataset data14 predicted




Dataset data15 predicted




Dataset data16 predicted


In [None]:
""" Saves Big Ideas dataset including activity """
# Write each subject's window bounds plus predicted label to its own folder.
# Subjects are numbered 1..16 in dictionary insertion order.
for i, df in enumerate(bigideas_dataframes.values(), start=1):
    file = f"D:/REU_2024/BigIdeas/{i:03d}/windowed_activity.csv" # edit file path accordingly

    # Remove any previous export before writing the new one.
    if os.path.exists(file):
        os.remove(file)
    df[['start_time', 'end_time', 'activity']].to_csv(file, index=False)
    print(f"Person {i} saved!")

Person 1 saved!
Person 2 saved!
Person 3 saved!
Person 4 saved!
Person 5 saved!
Person 6 saved!
Person 7 saved!
Person 8 saved!
Person 9 saved!
Person 10 saved!
Person 11 saved!
Person 12 saved!
Person 13 saved!
Person 14 saved!
Person 15 saved!
Person 16 saved!


In [None]:
""" Defines funtion to grab long activity periods, limiting to 5 min, differences in interval time as 1 min, and removes food as confounder"""
def filter_activity_intervals(file_path, i, min_duration=300, diff_threshold=60,
                              base_dir="D:/REU_2024/BigIdeas"):
    """Extract sustained activity bouts that are not close to a logged meal.

    Steps, in order:
      1. Consecutive rows with ``activity == 1`` are joined into intervals
         running from the first row's ``start_time`` to the last row's
         ``end_time``.
      2. An interval starting less than ``diff_threshold`` seconds after the
         previous one ended is merged into it.
      3. Merged intervals shorter than ``min_duration`` seconds are dropped.
      4. An interval is dropped if any food-log entry falls within the hour
         before its start or the hour after its end (bounds inclusive).

    Parameters
    ----------
    file_path : str
        CSV with ``start_time``, ``end_time`` and ``activity`` columns.
    i : int
        Subject number; the food log is read from
        ``{base_dir}/{i:03d}/Food_Log_{i:03d}.csv`` and must provide ``date``
        and ``time`` columns.
    min_duration : int or float, default 300
        Minimum bout length in seconds.
    diff_threshold : int or float, default 60
        Gaps strictly shorter than this many seconds are bridged.
    base_dir : str, default "D:/REU_2024/BigIdeas"
        Root folder holding the per-subject directories.

    Returns
    -------
    pandas.DataFrame
        Columns ``start_time`` and ``end_time``, one row per retained bout;
        empty (zero rows, same columns) when nothing qualifies.
    """
    df = pd.read_csv(file_path)
    df['start_time'] = pd.to_datetime(df['start_time'])
    df['end_time'] = pd.to_datetime(df['end_time'])

    subject_id = f"{i:03d}"  # zero-padded folder / file id, e.g. 007 or 012
    food = pd.read_csv(f"{base_dir}/{subject_id}/Food_Log_{subject_id}.csv")

    # 1. Collapse runs of consecutive active windows into (start, end) pairs.
    intervals = []
    bout_start = bout_end = None
    for is_active, window_start, window_end in zip(df['activity'] == 1, df['start_time'], df['end_time']):
        if is_active:
            if bout_start is None:
                bout_start = window_start
            bout_end = window_end
        elif bout_start is not None:
            intervals.append((bout_start, bout_end))
            bout_start = None
    if bout_start is not None:  # file ended while a bout was still open
        intervals.append((bout_start, bout_end))

    # No active window at all: return an empty result rather than failing on
    # intervals[0] below.
    if not intervals:
        return pd.DataFrame(columns=['start_time', 'end_time'])

    # 2. Bridge short gaps between neighbouring bouts.
    merged_intervals = [intervals[0]]
    for start, end in intervals[1:]:
        prev_start, prev_end = merged_intervals[-1]
        if (start - prev_end).total_seconds() < diff_threshold:
            merged_intervals[-1] = (prev_start, end)
        else:
            merged_intervals.append((start, end))

    # 3. Keep only bouts that are long enough.
    final_intervals = [(s, e) for s, e in merged_intervals if (e - s).total_seconds() >= min_duration]

    # 4. Remove bouts with food logged in the hour before or after.
    # NOTE(review): food logged strictly inside a bout (start < t < end) does
    # not exclude it; confirm that this is intended.
    food_times = pd.to_datetime(food['date'] + ' ' + food['time'])
    one_hour = pd.Timedelta(hours=1)
    filtered_intervals = []
    for start, end in final_intervals:
        ate_before = food_times.between(start - one_hour, start)
        ate_after = food_times.between(end, end + one_hour)
        if not (ate_before | ate_after).any():
            filtered_intervals.append((start, end))

    return pd.DataFrame(filtered_intervals, columns=['start_time', 'end_time'])


In [None]:
# Print (number of bouts, 2) for every Big Ideas subject.
for i in range(1, 17):
    # Subject folders are zero-padded to three digits (001 ... 016).
    data = f"D:/REU_2024/BigIdeas/{i:03d}/windowed_activity.csv"
    bouts = filter_activity_intervals(data, i)
    print(bouts.shape)

(6, 2)
(29, 2)
(0, 2)
(20, 2)
(7, 2)
(10, 2)
(16, 2)
(65, 2)
(33, 2)
(8, 2)
(35, 2)
(2, 2)
(30, 2)
(0, 2)
(43, 2)
(53, 2)


In [None]:
# Compute the filtered activity bouts for every Big Ideas subject and save
# them next to that subject's windowed predictions.
for i in range(1, 17):
    subject_dir = f"D:/REU_2024/BigIdeas/{i:03d}"  # zero-padded folder: 001 ... 016
    data = f"{subject_dir}/windowed_activity.csv" # edit where to grab data
    file = f"{subject_dir}/activity_bouts.csv" # and where to save data

    # Remove any previous export before writing the new one.
    if os.path.exists(file):
        os.remove(file)
    filter_activity_intervals(data, i).to_csv(file, index=False)
    print(f"Person {i} activity saved!")
    # (The former trailing `i += 1` was removed: the for-loop rebinds i on
    # every iteration, so the increment never had any effect.)

Person 1 activity saved!
Person 2 activity saved!
Person 3 activity saved!
Person 4 activity saved!
Person 5 activity saved!
Person 6 activity saved!
Person 7 activity saved!
Person 8 activity saved!
Person 9 activity saved!
Person 10 activity saved!
Person 11 activity saved!
Person 12 activity saved!
Person 13 activity saved!
Person 14 activity saved!
Person 15 activity saved!
Person 16 activity saved!
