In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings
import os
warnings.filterwarnings('ignore')

In [10]:
# Raw activity columns; the per-subject loop sums them and derives one
# `<name>_prop` column from each.
activity_cols = [
    'IN_VEHICLE',
    'ON_BIKE',
    'ON_FOOT',
    'RUNNING',
    'STILL',
    'WALKING',
]

# Mood item columns (Q1-Q5) used for the lag / rolling features and the
# activity-mood correlation heatmap.
mood_cols = [
    'Q1_SAD',
    'Q2_HAPP',
    'Q3_FATIG',
    'Q4_EN',
    'Q5_REL',
]

In [9]:
# Named groups of candidate feature columns, keyed by the kind of signal.
# NOTE(review): the '<activity>_rate' columns are not created by the
# feature-engineering code visible in this notebook excerpt — confirm they
# exist before selecting the 'activity_based' group.
feature_sets = {
    # All '<activity>_prop' names first, then all '<activity>_rate' names.
    'activity_based': [
        f'{activity}_{suffix}'
        for suffix in ('prop', 'rate')
        for activity in ('IN_VEHICLE', 'ON_BIKE', 'ON_FOOT',
                         'RUNNING', 'STILL', 'WALKING')
    ],
    'phone_usage': [
        'daily_screen_hours',
        'avg_session_duration',
        'usage_per_day_hours',
        'UNLOCK_EVENTS_NUM',
    ],
    'physical_activity': [
        'activity_intensity',
        'activity_intensity_ma7',
        'activity_intensity_std7',
    ],
    'sleep': ['sleep_hours', 'sleep_hours_ma7'],
    'temporal': ['dayofweek', 'is_weekend', 'month'],
}

In [None]:
# Per-subject feature engineering and exploratory summary.
#
# For every CSV file in `data/` this cell:
#   1. loads the file and orders it chronologically,
#   2. derives activity-proportion, phone-usage, calendar, physical-activity,
#      sleep and mood lag/rolling columns (each optional group only when its
#      source columns are present),
#   3. prints descriptive statistics and draws the activity-mood correlation
#      heatmap.
# Relies on `activity_cols` and `mood_cols` from the constants cell above.

# Only CSV files are subjects; other directory entries (hidden files,
# sub-directories) would make `pd.read_csv` fail.
subject_ids = sorted(
    entry for entry in os.listdir('data')
    if entry.lower().endswith('.csv')
)

# NOTE(review): nothing in this cell writes to `results` — presumably it is
# filled by a later modelling cell; confirm.
results = {}

for subject_id in subject_ids:
    print(f'Processing {subject_id}...')

    # Load data
    df = pd.read_csv(os.path.join('data', subject_id))
    df['date'] = pd.to_datetime(df['date'])

    # shift() and rolling() below operate on row order, so the rows must be
    # chronological for "previous day" / "last 7 rows" to mean what they say.
    # NOTE(review): the 7-row windows assume one row per calendar day with no
    # gaps — confirm against the data.
    df = df.sort_values('date').reset_index(drop=True)

    # Activity proportions: share of each activity in the row's total count.
    df['total_activities'] = df[activity_cols].sum(axis=1)
    # Zero denominators become NaN so the ratio is "missing" rather than +/-inf.
    activity_total = df['total_activities'].replace(0, np.nan)
    for col in activity_cols:
        df[f'{col}_prop'] = df[col] / activity_total

    # Phone-usage features
    if 'SCREEN_ON_SECONDS' in df.columns:
        df['daily_screen_hours'] = df['SCREEN_ON_SECONDS'] / 3600

    if 'UNLOCK_EVENTS_NUM' in df.columns and 'USAGE_DURATION_MIN' in df.columns:
        # Days with zero unlocks would otherwise yield inf, which downstream
        # estimators reject; treat them as missing instead.
        unlock_count = df['UNLOCK_EVENTS_NUM'].replace(0, np.nan)
        df['avg_session_duration'] = df['USAGE_DURATION_MIN'] / unlock_count
        df['usage_per_day_hours'] = df['USAGE_DURATION_MIN'] / 60

    # Calendar features (weekday: Monday=0 ... Sunday=6)
    df['dayofweek'] = df['date'].dt.weekday
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
    df['month'] = df['date'].dt.month

    # Physical-activity features. rolling(window=7) uses the default
    # min_periods, so the first six rows of every rolling column are NaN.
    if 'MIMS_SUM_WEAR' in df.columns:
        df['activity_intensity'] = df['MIMS_SUM_WEAR']
        df['activity_intensity_ma7'] = df['MIMS_SUM_WEAR'].rolling(window=7).mean()
        df['activity_intensity_std7'] = df['MIMS_SUM_WEAR'].rolling(window=7).std()

    # Sleep features
    if 'SLEEP_MINUTES' in df.columns:
        df['sleep_hours'] = df['SLEEP_MINUTES'] / 60
        df['sleep_hours_ma7'] = df['sleep_hours'].rolling(window=7).mean()

    # Mood lag / rolling features, restricted to the mood columns this file
    # actually contains. The same list is reused for the statistics and the
    # heatmap below so a missing column cannot raise KeyError there.
    present_mood_cols = [col for col in mood_cols if col in df.columns]
    for col in present_mood_cols:
        df[f'{col}_lag1'] = df[col].shift(1)   # previous row (day)
        df[f'{col}_lag2'] = df[col].shift(2)   # two rows (days) back
        # NOTE(review): both 7-row windows include the current row's mood
        # value; if these are later used to predict that same row's mood this
        # leaks the target — consider df[col].shift(1).rolling(7).
        df[f'{col}_ma7'] = df[col].rolling(window=7).mean()
        df[f'{col}_std7'] = df[col].rolling(window=7).std()

    print("\nActivity Counts Summary:")
    print(df[activity_cols].describe())

    # Exactly the proportion columns created above (no substring matching on
    # arbitrary column names).
    activity_props = [f'{col}_prop' for col in activity_cols]

    if not present_mood_cols:
        # Nothing to summarise or correlate for this subject.
        continue

    # Mood patterns
    print("\nMood Statistics:")
    print(df[present_mood_cols].describe())

    # Correlations between activities and moods
    correlations = df[present_mood_cols + activity_props].corr()

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(correlations.loc[present_mood_cols, activity_props],
                annot=True, fmt='.2f', cmap='RdBu_r', ax=ax)
    ax.set_title(f'Activity-Mood Correlations for {subject_id}')
    fig.tight_layout()
    plt.show()
    # Release the figure so a long subject list does not accumulate open figures.
    plt.close(fig)

Processing afflictedrevenueepilepsy_daily_metrics.csv...


AttributeError: 'RangeIndex' object has no attribute 'dayofweek'