In [19]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


In [20]:
# Load every raw study table from its CSV (paths are relative to the working
# directory). The file order below matches the order of the target names.
(
    t_skin,
    vo2_max,
    glucose,
    df_hrv,
    hormones_selfreport,
    resting_heart_rate,
) = map(
    pd.read_csv,
    (
        'computed_temperature.csv',
        'demographic_vo2_max.csv',
        'glucose.csv',
        'heart_rate_variability_details.csv',
        'hormones_and_selfreport.csv',
        'resting_heart_rate.csv',
    ),
)



In [22]:
# Keys shared by every table. Each feature table below is reduced to exactly
# one row per key combination before it is merged onto the labelled rows.
merge_keys = ['id', 'study_interval', 'day_in_study']


# GLUCOSE: collapse the individual readings into per-key summary statistics.
# Named aggregation produces flat, explicitly named columns, so no manual
# renaming of a MultiIndex header is needed.
glucose_daily = (
    glucose
    .groupby(merge_keys)
    .agg(
        glucose_mean=('glucose_value', 'mean'),
        glucose_std=('glucose_value', 'std'),
        glucose_min=('glucose_value', 'min'),
        glucose_max=('glucose_value', 'max'),
    )
    .reset_index()
)


# HRV: per-key mean/std of rmssd plus the mean of the other HRV measures.
df_hrv_daily = (
    df_hrv
    .groupby(merge_keys)
    .agg(
        rmssd_mean=('rmssd', 'mean'),
        rmssd_std=('rmssd', 'std'),
        coverage_mean=('coverage', 'mean'),
        low_freq_mean=('low_frequency', 'mean'),
        high_freq_mean=('high_frequency', 'mean'),
    )
    .reset_index()
)


# SKIN TEMP: the temperature table is keyed by the day the sleep started, so
# that column is renamed to line up with `day_in_study`.
# The table is then collapsed to one row per key. Without this, any key with
# more than one temperature record would duplicate the matching labelled rows
# in the left merge below. The mean is used to combine such records.
# NOTE(review): confirm that averaging is the right way to combine multiple
# records for one key (vs. keeping a single record).
temp_feature_cols = ['nightly_temperature', 'baseline_relative_nightly_standard_deviation']
t_skin_clean = (
    t_skin[['id', 'study_interval', 'sleep_start_day_in_study'] + temp_feature_cols]
    .rename(columns={'sleep_start_day_in_study': 'day_in_study'})
    .groupby(merge_keys)[temp_feature_cols]
    .mean()
    .reset_index()
)


# Target variable - keep only rows with phase data
hormones_clean = hormones_selfreport[hormones_selfreport['phase'].notna()].copy()

# Start from the labelled rows and left-join each feature table so that every
# labelled row is kept even when a feature is missing for it.
# validate='many_to_one' makes pandas check that the right-hand keys are
# unique, i.e. that no merge can multiply the labelled rows.
df_merged = (
    hormones_clean[merge_keys + ['phase', 'lh', 'estrogen']]
    .merge(t_skin_clean, on=merge_keys, how='left', validate='many_to_one')
    .merge(glucose_daily, on=merge_keys, how='left', validate='many_to_one')
    .merge(df_hrv_daily, on=merge_keys, how='left', validate='many_to_one')
)


# Check the merged data
print("Merged dataset shape:", df_merged.shape)
print("\nMissing values per column:")
print(df_merged.isnull().sum())
print("\nPhase distribution:")
print(df_merged['phase'].value_counts())
print("\nNumber of unique participants:", df_merged['id'].nunique())
print("\nData sample:")
print(df_merged.head(20))

Merged dataset shape: (6653, 17)

Missing values per column:
id                                                 0
study_interval                                     0
day_in_study                                       0
phase                                              0
lh                                               346
estrogen                                         347
nightly_temperature                             1151
baseline_relative_nightly_standard_deviation    1458
glucose_mean                                    3268
glucose_std                                     3269
glucose_min                                     3268
glucose_max                                     3268
rmssd_mean                                       870
rmssd_std                                        874
coverage_mean                                    870
low_freq_mean                                    870
high_freq_mean                                   870
dtype: int64

Phase distribution:
phas

In [23]:
# Analyze data completeness by participant
print("Data completeness analysis by participant:\n")

# Feature column in df_merged -> name of its missing-count column in the summary.
tracked_columns = {
    'rmssd_mean': 'hrv_missing',
    'nightly_temperature': 'temp_missing',
    'glucose_mean': 'glucose_missing',
    'lh': 'lh_missing',
    'estrogen': 'estrogen_missing',
}

# One boolean "is missing" column per tracked feature, plus the participant id.
# The id is attached positionally so the result does not depend on the index.
missing_flags = (
    df_merged[list(tracked_columns)]
    .isna()
    .rename(columns=tracked_columns)
    .assign(id=df_merged['id'].to_numpy())
)

# A single groupby replaces re-filtering the whole frame once per participant.
# sort=False keeps participants in order of first appearance.
# Rows whose id is NaN are not counted (groupby drops NaN keys by default).
per_participant = missing_flags.groupby('id', sort=False)
participant_df = per_participant.sum()  # summing booleans counts the missing rows
participant_df.insert(0, 'total_days', per_participant.size())

# Calculate percentages for the features used by the exclusion rule
for feature in ('hrv', 'temp', 'glucose'):
    participant_df[f'{feature}_missing_pct'] = (
        participant_df[f'{feature}_missing'] / participant_df['total_days'] * 100
    )

participant_df = participant_df.reset_index()
print(participant_df.to_string())

# Identify participants to exclude (e.g., >50% missing in any key feature)
threshold = 50  # percentage
pct_columns = ['hrv_missing_pct', 'temp_missing_pct', 'glucose_missing_pct']
over_threshold = participant_df[pct_columns].gt(threshold).any(axis=1)
participants_to_exclude = participant_df.loc[over_threshold, 'id'].tolist()

print(f"\n\nParticipants to exclude (>{threshold}% missing data): {participants_to_exclude}")
print(f"Excluding {len(participants_to_exclude)} out of {len(participant_df)} participants")

# Filter dataset to exclude these participants
df_filtered = df_merged[~df_merged['id'].isin(participants_to_exclude)].copy()

print("\n\nDataset after filtering:")
print(f"Original: {len(df_merged)} samples from {df_merged['id'].nunique()} participants")
print(f"Filtered: {len(df_filtered)} samples from {df_filtered['id'].nunique()} participants")

print("\nMissing values after filtering:")
print(df_filtered.isnull().sum())
print("\nPercentage missing after filtering:")
print((df_filtered.isnull().sum() / len(df_filtered) * 100).round(2))

# Update df_merged to the filtered version.
# NOTE: this rebinds df_merged, so re-running this cell reports the already
# filtered frame as "Original"; re-run the merge cell first for true counts.
df_merged = df_filtered.copy()

print("\nPhase distribution after filtering:")
print(df_merged['phase'].value_counts())

Data completeness analysis by participant:

    id  total_days  hrv_missing  temp_missing  glucose_missing  lh_missing  estrogen_missing  hrv_missing_pct  temp_missing_pct  glucose_missing_pct
0    1          94           94            53               34           3                 3       100.000000         56.382979            36.170213
1    2         113            7            23                4           0                 0         6.194690         20.353982             3.539823
2    3          96           96            15               26           9                 9       100.000000         15.625000            27.083333
3    4          91           19            25               16          14                14        20.879121         27.472527            17.582418
4    6          94           32            31               34           9                 9        34.042553         32.978723            36.170213
5    7         102            6            14                9

In [11]:
# NOTE(review): this cell's execution count (In [11]) is lower than the cells
# above it, so it was run before them in the kernel session. It reads only the
# raw tables, not df_merged.
# Target: the raw `phase` column, including rows where phase is missing.
y = hormones_selfreport['phase']
# X is a plain Python list of three Series taken from three separate, unmerged
# tables; nothing here aligns them with each other or with y.
# NOTE(review): presumably superseded by the merged df_merged — TODO confirm
# and remove or rebuild X/y from df_merged.
X = [t_skin['nightly_temperature'], glucose['glucose_value'], df_hrv['rmssd']]