In [58]:
# Import the necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier

In [59]:
# Read the datasets
subject_data = pd.read_csv("subjectID_26.csv")
fatigue_data = pd.read_csv('fatiguePROs_subject_26.csv')

In [60]:
# Function to parse the timestamps of the wearable dataset in the proper date format
def parse_wearable_timestamp(ts):
    return datetime.strptime(ts, '%d.%m.%y %H:%M')

In [61]:
# Function to parse the timestamps of the fatigue dataset in the proper date format
def parse_fatigue_timestamp_adjusted(ts):
    try:
        return datetime.strptime(ts, '%d.%m.%y %H:%M')
    except ValueError:
        return datetime.strptime(ts, '%d.%m.%y')

In [62]:
# Apply the parsing to the datasets
subject_data['Timestamp'] = subject_data['Timestamp'].apply(parse_wearable_timestamp)
fatigue_data['Days'] = fatigue_data['Days'].apply(parse_fatigue_timestamp_adjusted)

In [63]:
# Create a date column for merging
subject_data['Date'] = subject_data['Timestamp'].dt.date
fatigue_data['Date'] = fatigue_data['Days'].dt.date

In [64]:
# Summarize each column by the mean
wearable_daily_summary = subject_data.groupby('Date').agg({
    'ActivityCounts': 'median',
    'Barometer': 'median',
    'BloodPerfusion': 'median',
    'BloodPulseWave': 'median',
    'EnergyExpenditure': 'median',
    'GalvanicSkinResponse': 'median',
    'HR': 'median',
    'HRV': 'median',
    'RESP': 'median',
    'Steps': 'sum',  # Assuming sum is more appropriate for steps
    'SkinTemperature': 'median'
}).reset_index()

In [65]:
# Group the data by the mean for Question 1
# Describe fatigue on a scale of 1 to 10, where 1 means you don’t feel tired at all and 10 means the worst tiredness you can imagine
fatigue_daily_summary = fatigue_data.groupby('Date').agg({
    'Ans for Q1': 'mean'
}).reset_index()

In [66]:
# Merge the summaries on Date
daily_combined_summary = pd.merge(wearable_daily_summary, fatigue_daily_summary, on='Date', how='outer')

In [67]:
# Prepare the data for modeling
data_for_modeling = daily_combined_summary.dropna(subset=['Ans for Q1'])
X = data_for_modeling.drop(['Date', 'Ans for Q1'], axis=1)
y = data_for_modeling['Ans for Q1']

In [68]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [69]:
# Create a pipeline for imputation, scaling, and modeling
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [70]:
# Fit the model
pipeline.fit(X_train, y_train)

In [71]:
# Extract and print feature importance
feature_importances = pipeline.named_steps['model'].feature_importances_
feature_importance_dict = dict(zip(X.columns, feature_importances))
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True)

In [72]:
# Displaying the sorted feature importance
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance:.2f}")

GalvanicSkinResponse: 0.18
BloodPerfusion: 0.14
SkinTemperature: 0.13
Steps: 0.12
BloodPulseWave: 0.12
Barometer: 0.08
HR: 0.07
RESP: 0.05
HRV: 0.05
EnergyExpenditure: 0.04
ActivityCounts: 0.02


The analysis from the Random Forest Regression model suggests that the most influential factors in determining quality of life include physiological parameters like Galvanic Skin Response and Blood Perfusion, alongside Skin Temperature. Metrics related to physical exertion, such as Steps and Blood Pulse Wave, also play a significant role, in addition to environmental measures like Barometer readings. Notably, traditional heart function indicators (HR and HRV) and aspects of physical activity (Energy Expenditure) contribute to a lesser extent. This implies a relationship between physiological responses, physical activity, and environmental conditions in shaping overall well-being. Furthermore, it underscores the value of data from wearable devices in shedding light on indicators of health-related quality of life.

In [73]:
features = ['HR', 'Steps', 'BloodPulseWave', 'HRV', 'EnergyExpenditure']

In [74]:
phrase_to_remove = "Describe fatigue on a scale of 1 to 10, where 1 means you don’t feel tired at all and 10 means the worst tiredness you can imagine"
filtered_fatigue_data = fatigue_data.loc[~fatigue_data['Question'].str.contains(phrase_to_remove, case=False, na=False)]
new_fatigue_data = filtered_fatigue_data.drop("Ans for Q1", axis=1)
new_fatigue_data

Unnamed: 0,SubjectID,Days,Timezone,Question,Ans for Q2-Q4,Date
1,26,2018-01-26 21:00:00,CET,"Are you feeling better, worse or the same as y...",Worse,2018-01-26
2,26,2018-01-26 21:00:00,CET,"Physically, today how often did you feel exhau...",Regularly,2018-01-26
3,26,2018-01-26 21:00:00,CET,"Mentally, today how often did you feel exhausted?",Often,2018-01-26
5,26,2018-01-27 21:00:00,CET,"Are you feeling better, worse or the same as y...",Better,2018-01-27
6,26,2018-01-27 21:00:00,CET,"Physically, today how often did you feel exhau...",Sometimes,2018-01-27
...,...,...,...,...,...,...
222,26,2018-08-14 00:21:00,CEST,"Physically, today how often did you feel exhau...",Sometimes,2018-08-14
223,26,2018-08-14 00:21:00,CEST,"Mentally, today how often did you feel exhausted?",Regularly,2018-08-14
225,26,2018-08-15 00:40:00,CEST,"Are you feeling better, worse or the same as y...",Worse,2018-08-15
226,26,2018-08-15 00:40:00,CEST,"Physically, today how often did you feel exhau...",Often,2018-08-15


In [75]:
finished_fatigue_data = new_fatigue_data.set_index("Question", append=True).swaplevel().loc[new_fatigue_data["Question"].unique()].reset_index(level=0)

In [76]:
fatigue_data_q2 = finished_fatigue_data[finished_fatigue_data['Question'] == "Are you feeling better, worse or the same as yesterday?"]
fatigue_data_q3 = finished_fatigue_data[finished_fatigue_data['Question'] == "Physically, today how often did you feel exhausted?"]
fatigue_data_q4 = finished_fatigue_data[finished_fatigue_data['Question'] == "Mentally, today how often did you feel exhausted?"]

fatigue_data_q2 = fatigue_data_q2.reset_index(drop=True)
fatigue_data_q3 = fatigue_data_q3.reset_index(drop=True)
fatigue_data_q4 = fatigue_data_q4.reset_index(drop=True)

In [77]:
# Mappings for encoding
encoding_q2 = {'Worse': 0, 'Same': 1, 'Better': 2}
encoding_q3 = {'Never': 0, 'Sometimes': 1, 'Regularly': 2, 'Often': 3, 'Always': 4}
encoding_q4 = {'Never': 0, 'Sometimes': 1, 'Regularly': 2, 'Often': 3, 'Always': 4}

In [78]:
# Apply each encodings to the mapping
fatigue_data_q2['Ans_Encoded'] = fatigue_data_q2['Ans for Q2-Q4'].map(encoding_q2)
fatigue_data_q3['Ans_Encoded'] = fatigue_data_q3['Ans for Q2-Q4'].map(encoding_q3)
fatigue_data_q4['Ans_Encoded'] = fatigue_data_q4['Ans for Q2-Q4'].map(encoding_q4)

fatigue_data_q2 = fatigue_data_q2.drop("Days", axis=1)
fatigue_data_q3 = fatigue_data_q3.drop("Days", axis=1)
fatigue_data_q4 = fatigue_data_q4.drop("Days", axis=1)

In [79]:
# Data cleaning to prepare the data for the model
fatigue_data_q2_merged = pd.merge(fatigue_data_q2, wearable_daily_summary, on='Date', how='outer')
fatigue_data_q3_merged = pd.merge(fatigue_data_q3, wearable_daily_summary, on='Date', how='outer')
fatigue_data_q4_merged = pd.merge(fatigue_data_q4, wearable_daily_summary, on='Date', how='outer')

fatigue_data_q2_merged = fatigue_data_q2_merged.dropna(subset=['Ans_Encoded'])
fatigue_data_q3_merged = fatigue_data_q3_merged.dropna(subset=['Ans_Encoded'])
fatigue_data_q4_merged = fatigue_data_q4_merged.dropna(subset=['Ans_Encoded'])

In [81]:
# The 'Ans_Encoded' column is the target variable, and the rest are the features
X_q2 = fatigue_data_q2_merged[['HR', 'Steps', 'BloodPulseWave', 'HRV', 'EnergyExpenditure']]
y_q2 = fatigue_data_q2_merged['Ans_Encoded']

X_q3 = fatigue_data_q3_merged[['HR', 'Steps', 'BloodPulseWave', 'HRV', 'EnergyExpenditure']]
y_q3 = fatigue_data_q3_merged['Ans_Encoded']

X_q4 = fatigue_data_q4_merged[['HR', 'Steps', 'BloodPulseWave', 'HRV', 'EnergyExpenditure']]
y_q4 = fatigue_data_q4_merged['Ans_Encoded']

In [82]:
X_train_q2, X_test_q2, y_train_q2, y_test_q2 = train_test_split(X_q2, y_q2, test_size=0.2, random_state=42)

# Initialize and train the classifier
model = HistGradientBoostingClassifier()

# Fit the model
model.fit(X_train_q2, y_train_q2)

# Make predictions
y_pred_q2 = model.predict(X_test_q2)

# Evaluate the model
accuracy_q2 = accuracy_score(y_test_q2, y_pred_q2)
print(f"Accuracy: {accuracy_q2}")

Accuracy: 0.16666666666666666


In [83]:
X_train_q3, X_test_q3, y_train_q3, y_test_q3 = train_test_split(X_q3, y_q3, test_size=0.2, random_state=42)

# Initialize and train the classifier
model = HistGradientBoostingClassifier()

# Fit the model
model.fit(X_train_q3, y_train_q3)

# Make predictions
y_pred_q3 = model.predict(X_test_q3)

# Evaluate the model
accuracy_q3 = accuracy_score(y_test_q3, y_pred_q3)
print(f"Accuracy: {accuracy_q3}")

Accuracy: 0.5


In [84]:
X_train_q4, X_test_q4, y_train_q4, y_test_q4 = train_test_split(X_q4, y_q4, test_size=0.2, random_state=42)

# Initialize and train the classifier
model = HistGradientBoostingClassifier()

# Fit the model
model.fit(X_train_q4, y_train_q4)

# Make predictions
y_pred_q4 = model.predict(X_test_q4)

# Evaluate the model
accuracy_q4 = accuracy_score(y_test_q4, y_pred_q4)
print(f"Accuracy: {accuracy_q4}")

Accuracy: 0.5


In [85]:
full_fatigue_data = fatigue_data_q2_merged.merge(fatigue_data_q3_merged, on='Date', how='inner', suffixes=('_q2', '_q3'))
full_fatigue_data = full_fatigue_data.merge(fatigue_data_q4_merged, on='Date', how='inner', suffixes=('', '_q4'))

X_full = full_fatigue_data[['HR', 'Steps', 'BloodPulseWave', 'HRV', 'EnergyExpenditure', 'Ans_Encoded_q3', 'Ans_Encoded']]
# 'Ans_Encoded' from Q4 might have to be renamed to avoid collision if not done in the merge step.

y_full = full_fatigue_data['Ans_Encoded_q2']  # Target variable from Q2

X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

model_full = HistGradientBoostingClassifier()

# Fit the model
model_full.fit(X_train_full, y_train_full)

# Make predictions
y_pred_full = model_full.predict(X_test_full)

# Evaluate the model
accuracy_full = accuracy_score(y_test_full, y_pred_full)
print(f"Accuracy with data from Question 3 and Question 4: {accuracy_full}")

Accuracy with data from Question 3 and Question 4: 0.2608695652173913


In [86]:
# For Question 2
answer_distribution_q2 = fatigue_data_q2['Ans for Q2-Q4'].value_counts()

# For Question 3: Physically, today how often did you feel exhausted?
answer_distribution_q3 = fatigue_data_q3['Ans for Q2-Q4'].value_counts()

# For Question 4: Mentally, today how often did you feel exhausted?
answer_distribution_q4 = fatigue_data_q4['Ans for Q2-Q4'].value_counts()

# If you want to see the distribution as percentages of the total responses:
answer_distribution_q2_percentage = fatigue_data_q2['Ans for Q2-Q4'].value_counts(normalize=True) * 100
answer_distribution_q3_percentage = fatigue_data_q3['Ans for Q2-Q4'].value_counts(normalize=True) * 100
answer_distribution_q4_percentage = fatigue_data_q4['Ans for Q2-Q4'].value_counts(normalize=True) * 100

# Print the distributions
print("Distribution for Question 2:")
print(answer_distribution_q2)
print("\nAs percentages:")
print(answer_distribution_q2_percentage)

print("Distribution for Question 3 (Physical Exhaustion):")
print(answer_distribution_q3)
print("\nAs percentages:")
print(answer_distribution_q3_percentage)

print("\nDistribution for Question 4 (Mental Exhaustion):")
print(answer_distribution_q4)
print("\nAs percentages:")
print(answer_distribution_q4_percentage)

Distribution for Question 2:
Worse     22
Better    22
Same      13
Name: Ans for Q2-Q4, dtype: int64

As percentages:
Worse     38.596491
Better    38.596491
Same      22.807018
Name: Ans for Q2-Q4, dtype: float64
Distribution for Question 3 (Physical Exhaustion):
Sometimes    31
Regularly    18
Often         5
Never         2
Always        1
Name: Ans for Q2-Q4, dtype: int64

As percentages:
Sometimes    54.385965
Regularly    31.578947
Often         8.771930
Never         3.508772
Always        1.754386
Name: Ans for Q2-Q4, dtype: float64

Distribution for Question 4 (Mental Exhaustion):
Sometimes    23
Regularly    15
Often        13
Never         4
Always        2
Name: Ans for Q2-Q4, dtype: int64

As percentages:
Sometimes    40.350877
Regularly    26.315789
Often        22.807018
Never         7.017544
Always        3.508772
Name: Ans for Q2-Q4, dtype: float64
