In [18]:
import pandas as pd
from scipy.stats import ttest_ind

# Read the two exports: per the original note, the data covers 16 days before
# and after starting medication (pre-medication file first, then post-medication).
pre_med_df, post_med_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/premed.csv", "data/postmed.csv")
)


In [19]:
# Restrict both frames to the columns they share.
# Walking pre_med_df.columns (rather than intersecting two sets) keeps the
# column order of the pre-medication file and yields the same order on every
# run; a set of strings has no stable ordering across interpreter sessions,
# which made every downstream table's row order non-reproducible.
common_columns = [col for col in pre_med_df.columns if col in post_med_df.columns]
pre_med_df = pre_med_df[common_columns]
post_med_df = post_med_df[common_columns]

In [20]:
# Show the column names that are present in both DataFrames.
common_columns

['Average HRV',
 'Steps',
 'REM Sleep Duration',
 'Temperature Deviation (°C)',
 'Lowest Resting Heart Rate',
 'Equivalent Walking Distance',
 'Average Resting Heart Rate',
 'Sleep Latency',
 'Rest Time',
 'date',
 'Readiness Score',
 'Sleep Score',
 'Total Sleep Duration',
 'Average MET',
 'High Activity Time',
 'Deep Sleep Duration']

In [21]:
# Names of the numeric-dtype columns of the pre-medication frame; anything
# non-numeric (such as the date column) is left out of the analysis.
numeric_columns = pre_med_df.select_dtypes(include="number").columns

In [22]:
# For every numeric metric, run an independent two-sample t-test between the
# pre- and post-medication values without assuming equal variances
# (equal_var=False) and ignoring NaNs (nan_policy='omit').
# Only the p-value is kept, keyed by metric name.
stat_results = {
    metric: ttest_ind(
        pre_med_df[metric],
        post_med_df[metric],
        equal_var=False,
        nan_policy='omit',
    ).pvalue
    for metric in numeric_columns
}

In [23]:
# Significance threshold, defined once so the dict filter and the DataFrame
# flag below cannot drift apart.
ALPHA = 0.05

# Metrics whose p-value falls below the threshold.
# NOTE(review): one test is run per metric with no multiple-comparison
# correction, so some of these "hits" may be false positives -- consider a
# Bonferroni or FDR adjustment before drawing conclusions.
significant_results = {col: p for col, p in stat_results.items() if p < ALPHA}

# Tabulate every metric with its p-value and a significance flag.
df_results = pd.DataFrame(list(stat_results.items()), columns=["Metric", "P-value"])
df_results["Statistically Significant"] = df_results["P-value"] < ALPHA

# Bare last expression -> rich notebook rendering instead of plain-text print(),
# consistent with how the other result cells in this notebook are displayed.
df_results

                         Metric   P-value  Statistically Significant
0                   Average HRV  0.001216                       True
1                         Steps  0.020042                       True
2            REM Sleep Duration  0.827407                      False
3    Temperature Deviation (°C)  0.531599                      False
4     Lowest Resting Heart Rate  0.004806                       True
5   Equivalent Walking Distance  0.260142                      False
6    Average Resting Heart Rate  0.002567                       True
7                 Sleep Latency  0.078035                      False
8                     Rest Time  0.048369                       True
9               Readiness Score  0.016355                       True
10                  Sleep Score  0.756648                      False
11         Total Sleep Duration  0.303377                      False
12                  Average MET  0.351524                      False
13           High Activity Time  0

In [24]:
# Per-metric mean of each period, rounded to two decimals.
pre_avg = pre_med_df[numeric_columns].mean().round(2)
post_avg = post_med_df[numeric_columns].mean().round(2)

# Side-by-side comparison table, one row per metric. The raw arrays are used
# so the values are placed positionally, in numeric_columns order.
avg_comparison_df = pd.DataFrame(
    {
        "Metric": numeric_columns,
        "Pre-Medication Average": pre_avg.to_numpy(),
        "Post-Medication Average": post_avg.to_numpy(),
    }
)


In [17]:
# Display the pre/post average comparison table.
# NOTE(review): this cell's execution count (17) is lower than the preceding
# cell's (24), and the recorded output carries an "Unnamed: 0" column that the
# constructing cell does not create -- the output may be stale; re-run the
# notebook top to bottom to confirm.
avg_comparison_df

Unnamed: 0,Metric,Pre-Medication Average,Post-Medication Average
0,Average HRV,76.44,62.38
1,Steps,17712.76,13243.81
2,REM Sleep Duration,4400.62,4248.46
3,Temperature Deviation (°C),-0.17,-0.12
4,Lowest Resting Heart Rate,52.81,56.46
5,Equivalent Walking Distance,20007.59,17294.88
6,Average Resting Heart Rate,59.2,63.54
7,Sleep Latency,894.38,583.85
8,Rest Time,31694.12,26182.5
9,Readiness Score,78.19,68.54
