In [14]:
# Import the necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier

In [15]:
# Load the wearable recordings and the fatigue questionnaire answers from their CSV files
subject_data, fatigue_data = (
    pd.read_csv(csv_path)
    for csv_path in ("subjectID_28.csv", "fatiguePROs_subject_28.csv")
)

In [16]:
# Parser for the timestamp strings of the wearable dataset
def parse_wearable_timestamp(ts):
    """Convert a 'DD.MM.YY HH:MM' string into a ``datetime``.

    Raises ``ValueError`` when ``ts`` does not match that layout.
    """
    wearable_format = '%d.%m.%y %H:%M'
    parsed = datetime.strptime(ts, wearable_format)
    return parsed

In [17]:
# Parser for the timestamp strings of the fatigue dataset
def parse_fatigue_timestamp_adjusted(ts):
    """Convert a fatigue-survey timestamp string into a ``datetime``.

    The 'DD.MM.YY HH:MM' layout is tried first; if that fails, the string is
    parsed as a bare 'DD.MM.YY' date (time defaults to midnight). A string
    matching neither layout raises ``ValueError``.
    """
    with_time = '%d.%m.%y %H:%M'
    date_only = '%d.%m.%y'
    try:
        parsed = datetime.strptime(ts, with_time)
    except ValueError:
        # No time component present: fall back to the date-only layout
        parsed = datetime.strptime(ts, date_only)
    return parsed

In [18]:
# Convert the raw timestamp strings of both datasets into datetime values
subject_data['Timestamp'] = subject_data['Timestamp'].map(parse_wearable_timestamp)
fatigue_data['Days'] = fatigue_data['Days'].map(parse_fatigue_timestamp_adjusted)

In [19]:
# Derive a calendar-date key on both frames so they can be joined per day
for frame, stamp_col in ((subject_data, 'Timestamp'), (fatigue_data, 'Days')):
    frame['Date'] = frame[stamp_col].dt.date

In [20]:
# Summarize each wearable signal per day: the median for every column except
# Steps, which is summed over the day
wearable_daily_summary = subject_data.groupby('Date').agg({
    'ActivityCounts': 'median',
    'Barometer': 'median',
    'BloodPerfusion': 'median',
    'BloodPulseWave': 'median',
    'EnergyExpenditure': 'median',
    'GalvanicSkinResponse': 'median',
    'HR': 'median',
    'HRV': 'median',
    'RESP': 'median',
    'Steps': 'sum',  # Assuming sum is more appropriate for steps
    'SkinTemperature': 'median'
}).reset_index()

In [21]:
# Average the Question 1 score per day.
# Question 1: describe fatigue on a scale of 1 to 10, where 1 means you don’t feel tired at all
# and 10 means the worst tiredness you can imagine
q1_mean_by_date = fatigue_data.groupby('Date')[['Ans for Q1']].mean()
fatigue_daily_summary = q1_mean_by_date.reset_index()

In [22]:
# Outer-join the daily wearable and fatigue summaries on their shared Date key
daily_combined_summary = wearable_daily_summary.merge(
    fatigue_daily_summary,
    how='outer',
    on='Date',
)

In [23]:
# Keep only the days that have a Question 1 score, then separate the target from the predictors
target_column = 'Ans for Q1'
data_for_modeling = daily_combined_summary.dropna(subset=[target_column])
y = data_for_modeling[target_column]
X = data_for_modeling.drop(columns=['Date', target_column])

In [24]:
# Hold out 20% of the days for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Chain mean imputation, standardisation and a random-forest regressor into a single estimator
pipeline_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)

In [26]:
# Fit the model: the imputer, the scaler and the random-forest regressor are
# all fitted on the training split only
pipeline.fit(X_train, y_train)

In [27]:
# Pair every predictor column with its random-forest importance, most important first
feature_importances = pipeline.named_steps['model'].feature_importances_
feature_importance_dict = {
    column: weight for column, weight in zip(X.columns, feature_importances)
}
sorted_feature_importance = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [28]:
# Print one "feature: importance" line per predictor, importance rounded to two decimals
for feature, importance in sorted_feature_importance:
    print(feature, f"{importance:.2f}", sep=": ")

BloodPulseWave: 0.21
RESP: 0.12
GalvanicSkinResponse: 0.12
BloodPerfusion: 0.10
HRV: 0.08
SkinTemperature: 0.08
ActivityCounts: 0.08
Barometer: 0.08
HR: 0.07
Steps: 0.05
EnergyExpenditure: 0.01


In [29]:
# Wearable summary columns selected as predictors for the Question 2-4 classification models below
features = ['HR', 'Steps', 'BloodPulseWave', 'HRV', 'EnergyExpenditure']

In [30]:
# Prompt text of Question 1; its rows are excluded here so that only the
# categorical questions (answered in 'Ans for Q2-Q4') remain
phrase_to_remove = "Describe fatigue on a scale of 1 to 10, where 1 means you don’t feel tired at all and 10 means the worst tiredness you can imagine"

# regex=False matches the prompt as literal text, so punctuation in the phrase
# can never be interpreted as a regular-expression pattern
is_q1_row = fatigue_data['Question'].str.contains(
    phrase_to_remove, case=False, na=False, regex=False
)
filtered_fatigue_data = fatigue_data.loc[~is_q1_row]

# The numeric Question 1 answer column carries no information for the remaining rows
new_fatigue_data = filtered_fatigue_data.drop(columns="Ans for Q1")
new_fatigue_data

Unnamed: 0,SubjectID,Days,Timezone,Question,Ans for Q2-Q4,Date
1,28,2018-08-09 22:26:00,CEST,"Are you feeling better, worse or the same as y...",Same,2018-08-09
2,28,2018-08-09 22:26:00,CEST,"Physically, today how often did you feel exhau...",Never,2018-08-09
3,28,2018-08-09 22:26:00,CEST,"Mentally, today how often did you feel exhausted?",Never,2018-08-09
5,28,2018-08-10 23:13:00,CEST,"Are you feeling better, worse or the same as y...",Same,2018-08-10
6,28,2018-08-10 23:13:00,CEST,"Physically, today how often did you feel exhau...",Never,2018-08-10
7,28,2018-08-10 23:13:00,CEST,"Mentally, today how often did you feel exhausted?",Sometimes,2018-08-10
9,28,2018-08-13 21:39:00,CEST,"Are you feeling better, worse or the same as y...",Same,2018-08-13
10,28,2018-08-13 21:39:00,CEST,"Physically, today how often did you feel exhau...",Sometimes,2018-08-13
11,28,2018-08-13 21:39:00,CEST,"Mentally, today how often did you feel exhausted?",Regularly,2018-08-13
13,28,2018-08-14 23:27:00,CEST,"Are you feeling better, worse or the same as y...",Worse,2018-08-14


In [31]:
# Regroup the rows question by question, in order of each question's first appearance:
# "Question" is appended to the index and swapped to the outer level, .loc then selects
# the rows of each unique question in turn, and reset_index(level=0) turns "Question"
# back into a regular column while the original row labels remain as the index.
finished_fatigue_data = new_fatigue_data.set_index("Question", append=True).swaplevel().loc[new_fatigue_data["Question"].unique()].reset_index(level=0)

In [32]:
# Split the long-format answers into one frame per question, each renumbered from 0
question_column = finished_fatigue_data['Question']

fatigue_data_q2 = finished_fatigue_data[
    question_column == "Are you feeling better, worse or the same as yesterday?"
].reset_index(drop=True)

fatigue_data_q3 = finished_fatigue_data[
    question_column == "Physically, today how often did you feel exhausted?"
].reset_index(drop=True)

fatigue_data_q4 = finished_fatigue_data[
    question_column == "Mentally, today how often did you feel exhausted?"
].reset_index(drop=True)

In [33]:
# Ordinal codes for the categorical answers: Q2 runs from 'Worse' (0) to 'Better' (2),
# Q3 and Q4 share the same five-step frequency scale from 'Never' (0) to 'Always' (4)
encoding_q2 = dict(Worse=0, Same=1, Better=2)

exhaustion_levels = ('Never', 'Sometimes', 'Regularly', 'Often', 'Always')
encoding_q3 = {label: rank for rank, label in enumerate(exhaustion_levels)}
encoding_q4 = dict(encoding_q3)

In [34]:
# Add the integer-encoded answer to each question frame and discard the raw timestamp column.
# Answers missing from the mapping become NaN in 'Ans_Encoded'.
fatigue_data_q2 = fatigue_data_q2.assign(
    Ans_Encoded=fatigue_data_q2['Ans for Q2-Q4'].map(encoding_q2)
).drop(columns="Days")

fatigue_data_q3 = fatigue_data_q3.assign(
    Ans_Encoded=fatigue_data_q3['Ans for Q2-Q4'].map(encoding_q3)
).drop(columns="Days")

fatigue_data_q4 = fatigue_data_q4.assign(
    Ans_Encoded=fatigue_data_q4['Ans for Q2-Q4'].map(encoding_q4)
).drop(columns="Days")

In [35]:
# Attach the daily wearable summary to each question's answers (outer join on Date),
# then keep only the rows that carry an encoded answer
fatigue_data_q2_merged = (
    fatigue_data_q2
    .merge(wearable_daily_summary, on='Date', how='outer')
    .dropna(subset=['Ans_Encoded'])
)

fatigue_data_q3_merged = (
    fatigue_data_q3
    .merge(wearable_daily_summary, on='Date', how='outer')
    .dropna(subset=['Ans_Encoded'])
)

fatigue_data_q4_merged = (
    fatigue_data_q4
    .merge(wearable_daily_summary, on='Date', how='outer')
    .dropna(subset=['Ans_Encoded'])
)

In [36]:
# 'Ans_Encoded' is the target variable; the predictors are the wearable columns
# named in the `features` list defined earlier, so the selection lives in one place
X_q2 = fatigue_data_q2_merged[features]
y_q2 = fatigue_data_q2_merged['Ans_Encoded']

X_q3 = fatigue_data_q3_merged[features]
y_q3 = fatigue_data_q3_merged['Ans_Encoded']

X_q4 = fatigue_data_q4_merged[features]
y_q4 = fatigue_data_q4_merged['Ans_Encoded']

In [37]:
# Question 2: hold out 20% of the rows, fit a gradient-boosting classifier, report test accuracy.
# NOTE(review): the answer counts printed at the end of the notebook show only a handful of
# responses per question, so this accuracy rests on very few test rows.
X_train_q2, X_test_q2, y_train_q2, y_test_q2 = train_test_split(
    X_q2,
    y_q2,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained
model = HistGradientBoostingClassifier().fit(X_train_q2, y_train_q2)

# Predict the held-out rows and score them against the true labels
y_pred_q2 = model.predict(X_test_q2)
accuracy_q2 = accuracy_score(y_true=y_test_q2, y_pred=y_pred_q2)
print(f"Accuracy: {accuracy_q2}")

Accuracy: 0.0


In [38]:
# Question 3: hold out 20% of the rows, fit a gradient-boosting classifier, report test accuracy.
# NOTE(review): the answer counts printed at the end of the notebook show only a handful of
# responses per question, so this accuracy rests on very few test rows.
X_train_q3, X_test_q3, y_train_q3, y_test_q3 = train_test_split(
    X_q3,
    y_q3,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained
model = HistGradientBoostingClassifier().fit(X_train_q3, y_train_q3)

# Predict the held-out rows and score them against the true labels
y_pred_q3 = model.predict(X_test_q3)
accuracy_q3 = accuracy_score(y_true=y_test_q3, y_pred=y_pred_q3)
print(f"Accuracy: {accuracy_q3}")

Accuracy: 1.0


In [39]:
# Question 4: hold out 20% of the rows, fit a gradient-boosting classifier, report test accuracy.
# NOTE(review): the answer counts printed at the end of the notebook show only a handful of
# responses per question, so this accuracy rests on very few test rows.
X_train_q4, X_test_q4, y_train_q4, y_test_q4 = train_test_split(
    X_q4,
    y_q4,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained
model = HistGradientBoostingClassifier().fit(X_train_q4, y_train_q4)

# Predict the held-out rows and score them against the true labels
y_pred_q4 = model.predict(X_test_q4)
accuracy_q4 = accuracy_score(y_true=y_test_q4, y_pred=y_pred_q4)
print(f"Accuracy: {accuracy_q4}")

Accuracy: 0.5


In [40]:
# Combine the Q2, Q3 and Q4 answers into one row per survey entry.
# Joining on 'Date' alone pairs every Q2 row of a day with every Q3 and Q4 row of that
# day, so a day with several survey entries is multiplied into a cartesian product and
# the duplicated rows can land on both sides of the train/test split. A per-date running
# counter is therefore added to the join key.
# NOTE(review): this assumes the three question frames list a day's entries in the same
# order - confirm against the raw questionnaire export.
def _tag_daily_entry(frame):
    """Return a copy of ``frame`` with a 0-based 'DailyEntry' counter of its rows within each Date."""
    return frame.assign(DailyEntry=frame.groupby('Date').cumcount())


merge_keys = ['Date', 'DailyEntry']

full_fatigue_data = _tag_daily_entry(fatigue_data_q2_merged).merge(
    _tag_daily_entry(fatigue_data_q3_merged),
    on=merge_keys,
    how='inner',
    suffixes=('_q2', '_q3'),
    validate='one_to_one',  # fail loudly if a key still matches more than one row
)
full_fatigue_data = full_fatigue_data.merge(
    _tag_daily_entry(fatigue_data_q4_merged),
    on=merge_keys,
    how='inner',
    suffixes=('', '_q4'),
    validate='one_to_one',
)

# After the first merge every shared non-key column carries a '_q2' or '_q3' suffix, so
# in the second merge nothing collides: the unsuffixed wearable columns and the plain
# 'Ans_Encoded' come from the Q4 frame, while 'Ans_Encoded_q3' is the Q3 answer.
X_full = full_fatigue_data[['HR', 'Steps', 'BloodPulseWave', 'HRV', 'EnergyExpenditure', 'Ans_Encoded_q3', 'Ans_Encoded']]

y_full = full_fatigue_data['Ans_Encoded_q2']  # Target variable from Q2

# Hold out 20% of the combined rows for testing
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

model_full = HistGradientBoostingClassifier()

# Fit the model
model_full.fit(X_train_full, y_train_full)

# Make predictions
y_pred_full = model_full.predict(X_test_full)

# Evaluate the model
accuracy_full = accuracy_score(y_test_full, y_pred_full)
print(f"Accuracy with data from Question 3 and Question 4: {accuracy_full}")

Accuracy with data from Question 3 and Question 4: 0.6666666666666666


In [41]:
# Tabulate how often each answer was given per question, as raw counts and as percentages
answer_column = 'Ans for Q2-Q4'

# Raw counts
answer_distribution_q2 = fatigue_data_q2[answer_column].value_counts()
# Question 3: Physically, today how often did you feel exhausted?
answer_distribution_q3 = fatigue_data_q3[answer_column].value_counts()
# Question 4: Mentally, today how often did you feel exhausted?
answer_distribution_q4 = fatigue_data_q4[answer_column].value_counts()

# Shares of the total responses, in percent
answer_distribution_q2_percentage = fatigue_data_q2[answer_column].value_counts(normalize=True) * 100
answer_distribution_q3_percentage = fatigue_data_q3[answer_column].value_counts(normalize=True) * 100
answer_distribution_q4_percentage = fatigue_data_q4[answer_column].value_counts(normalize=True) * 100

# Print every question with the same layout: heading, counts, then percentages
report_sections = [
    ("Distribution for Question 2:",
     answer_distribution_q2, answer_distribution_q2_percentage),
    ("Distribution for Question 3 (Physical Exhaustion):",
     answer_distribution_q3, answer_distribution_q3_percentage),
    ("Distribution for Question 4 (Mental Exhaustion):",
     answer_distribution_q4, answer_distribution_q4_percentage),
]
for position, (heading, counts, percentages) in enumerate(report_sections):
    if position:
        # Separate consecutive sections with exactly one blank line
        print()
    print(heading)
    print(counts)
    print("\nAs percentages:")
    print(percentages)

Distribution for Question 2:
Same      3
Better    2
Worse     1
Name: Ans for Q2-Q4, dtype: int64

As percentages:
Same      50.000000
Better    33.333333
Worse     16.666667
Name: Ans for Q2-Q4, dtype: float64
Distribution for Question 3 (Physical Exhaustion):
Never        4
Sometimes    2
Name: Ans for Q2-Q4, dtype: int64

As percentages:
Never        66.666667
Sometimes    33.333333
Name: Ans for Q2-Q4, dtype: float64

Distribution for Question 4 (Mental Exhaustion):
Sometimes    4
Never        1
Regularly    1
Name: Ans for Q2-Q4, dtype: int64

As percentages:
Sometimes    66.666667
Never        16.666667
Regularly    16.666667
Name: Ans for Q2-Q4, dtype: float64
