# Merge Preprocessed MESA Data

The notebook `preprocess_data.ipynb` is responsible for generating the data files that are currently housed within the `data/mesa/processed/` directory.
These files will now be processed and consolidated.

## Dataset Characteristics:

- **HR Data**: This data is logged every second.
- **Activity and Sleep Data**: Entries for these datasets are made every 30 seconds.

## Procedure:

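1. **Resampling HR Data**: The HR data will be resampled to align with the 30-second interval used by the activity and sleep datasets: each reading is assigned to the 30-second bin it falls in, and the readings in each bin are averaged.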
2. **Timestamp Comparison**: Timestamps across different data files will be compared to ensure synchronization.
3. **Merging Data**: For each subject, data across different files will be merged based on matching timestamps.
4. **Final Data Output**: The aggregated data for all subjects will be consolidated and saved to `final_merged_data.csv`.
5. **Data Anonymization**: Before the consolidated file is written, all personally identifying information, including the subject ID (`file_id`) and timestamp columns, will be stripped to ensure privacy.





## Constants

In [None]:
# Folder holding the processed per-subject CSV files; main() resolves it
# relative to the current working directory.
PROCESSED_FOLDER_PATH_PREFIX = 'data/mesa/processed/'

In [None]:
import pandas as pd
import os
import logging
import sys
# Ask for DEBUG-level output on the root logger. Note that basicConfig() is a
# no-op when the root logger already has handlers attached.
logging.basicConfig(level=logging.DEBUG)

In [None]:

# Build a handler that writes log records to standard output, rendered as
# "<timestamp> - <level> - <message>".
stdout_handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
stdout_handler.setFormatter(formatter)

# Reconfigure the root logger: let DEBUG and above through, discard whatever
# handlers were attached before, and leave the stdout handler as the only one.
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)
root_logger.handlers = []
root_logger.addHandler(stdout_handler)


In [None]:
def get_file_ids(data_dir):
    """Collect the distinct IDs embedded in the CSV file names of ``data_dir``.

    For every entry whose name ends in ``.csv``, the ID is the text between
    the last underscore and the first dot that follows it (for example
    ``hr_min_len_0001.csv`` gives ``"0001"``). Entries with other extensions
    are ignored.

    Args:
        data_dir: Path of the directory to scan.

    Returns:
        A set of ID strings, one per distinct ID found.
    """
    ids = set()
    for name in os.listdir(data_dir):
        if not name.endswith('.csv'):
            continue
        # Text after the last underscore (the whole name if there is none) ...
        tail = name.rpartition('_')[2]
        # ... cut at the first dot.
        ids.add(tail.partition('.')[0])
    return ids



In [None]:
def load_data_for_id(data_dir, id_val):
    """Read the activity, HR and sleep CSV files belonging to one ID.

    Args:
        data_dir: Directory containing the CSV files.
        id_val: ID that appears as the suffix of each file name.

    Returns:
        A 3-tuple ``(activity_data, hr_data, sleep_data)`` of DataFrames, or
        ``(None, None, None)`` if any of the three files does not exist.
    """
    # Read order matters only in that the first missing file aborts the load.
    file_names = (
        f'activity_min_len_{id_val}.csv',
        f'hr_min_len_{id_val}.csv',
        f'sleep_min_len_{id_val}.csv',
    )
    try:
        frames = tuple(
            pd.read_csv(os.path.join(data_dir, file_name))
            for file_name in file_names
        )
    except FileNotFoundError:
        logging.debug(f"Files for ID: {id_val} not found.")
        return None, None, None
    return frames



In [None]:
def resample_and_merge(activity_data, hr_data, sleep_data, *, bin_seconds=30):
    """Resample HR data onto fixed-width time bins and merge the three datasets.

    Each HR row is assigned to the bin its ``'Time (s)'`` value falls in
    (floored to a multiple of ``bin_seconds``), and all columns are averaged
    per bin. The result is inner-joined with ``activity_data`` on
    ``'Elapsed Time'`` and then with ``sleep_data`` on ``'Timestamp'``, so only
    time points present in all three inputs survive.

    None of the input DataFrames is modified.

    Args:
        activity_data: DataFrame with an ``'Elapsed Time'`` column.
        hr_data: DataFrame with a ``'Time (s)'`` column.
        sleep_data: DataFrame with a ``'Timestamp'`` column.
        bin_seconds: Width of a resampling bin, in the units of
            ``'Time (s)'``. Defaults to 30.

    Returns:
        A tuple ``(merged_data, hr_data_resampled)``. In ``merged_data`` the
        join key is exposed as a single ``'Timestamp'`` column; the redundant
        ``'Time (s)'`` and sleep ``'Timestamp'`` key columns are dropped.
    """
    # Floor every HR timestamp to the start of its bin. ``assign`` returns a
    # new frame, so the caller's ``hr_data`` keeps its original timestamps.
    binned_time = (hr_data['Time (s)'] // bin_seconds) * bin_seconds
    hr_data_resampled = (
        hr_data.assign(**{'Time (s)': binned_time})
        .groupby('Time (s)')
        .mean()
        .reset_index()
    )

    merged_data = pd.merge(
        activity_data, hr_data_resampled,
        left_on='Elapsed Time', right_on='Time (s)', how='inner',
    )
    merged_data = pd.merge(
        merged_data, sleep_data,
        left_on='Elapsed Time', right_on='Timestamp', how='inner',
    )

    # After the joins all three key columns hold the same values; keep one and
    # publish it under the name 'Timestamp'.
    merged_data = (
        merged_data
        .drop(columns=['Time (s)', 'Timestamp'])
        .rename(columns={'Elapsed Time': 'Timestamp'})
    )
    return merged_data, hr_data_resampled


In [None]:
def gather_statistics(dataframes, statistics):
    """Add one ID's counts to the running totals in ``statistics``.

    ``dataframes`` is matched positionally against the labels
    ``activity_data``, ``hr_data``, ``sleep_data``, ``hr_data_resampled`` and
    ``merged_data``. ``statistics`` is updated in place:

    * ``statistics["nan_counts"]`` receives the number of null cells, for the
      three raw datasets only;
    * ``statistics["zero_counts"]`` receives the number of rows whose
      ``'Activity Value'`` equals 0, for the activity dataset only;
    * ``statistics["cumulative_counts"]`` receives the row count of every
      dataset.

    Args:
        dataframes: DataFrames in the label order given above.
        statistics: Nested dict of running totals; mutated by this call.
    """
    raw_labels = ("activity_data", "hr_data", "sleep_data")
    all_labels = raw_labels + ("hr_data_resampled", "merged_data")

    for label, frame in zip(all_labels, dataframes):
        # Null cells are only tracked for the datasets as loaded from disk.
        if label in raw_labels:
            statistics["nan_counts"][label] += frame.isnull().sum().sum()

        if label == "activity_data":
            zero_rows = frame['Activity Value'] == 0
            statistics["zero_counts"][label] += zero_rows.sum()

        statistics["cumulative_counts"][label] += len(frame)




In [None]:
def process_data_for_id(data_dir, id_val, statistics):
    """Load, merge and tally the datasets of a single ID.

    Args:
        data_dir: Directory containing the per-ID CSV files.
        id_val: ID whose files should be processed.
        statistics: Running totals, updated in place via ``gather_statistics``.

    Returns:
        The merged DataFrame with an added ``'file_id'`` column set to
        ``id_val``, or ``None`` when the data for this ID could not be loaded.
    """
    activity_data, hr_data, sleep_data = load_data_for_id(data_dir, id_val)

    # A failed load is reported as None; there is nothing to merge then.
    if activity_data is None:
        return None

    merged_data, hr_data_resampled = resample_and_merge(activity_data, hr_data, sleep_data)
    merged_data['file_id'] = id_val

    frames_in_label_order = [
        activity_data,
        hr_data,
        sleep_data,
        hr_data_resampled,
        merged_data,
    ]
    gather_statistics(frames_in_label_order, statistics)

    return merged_data



In [None]:
def main():
    """Merge the processed data of every subject and write one CSV file.

    For each ID found in the processed-data folder (resolved relative to the
    current working directory) the activity, HR and sleep datasets are merged.
    The per-ID results are concatenated, ``'Stage Value'`` is reduced to a
    binary label (0 stays 0, anything else becomes 1), the ``'file_id'`` and
    ``'Timestamp'`` columns are dropped, summary statistics are logged at
    DEBUG level, and the result is written to ``final_merged_data.csv`` in the
    current working directory.

    If no ID yields any data, a warning is logged and no file is written.
    """
    data_dir = os.path.join(os.getcwd(), PROCESSED_FOLDER_PATH_PREFIX)

    # get_file_ids returns a set, whose iteration order can differ between
    # runs; sorting keeps the row order of the output file reproducible.
    file_ids = sorted(get_file_ids(data_dir))

    statistics = {
        "nan_counts": {"activity_data": 0, "hr_data": 0, "sleep_data": 0},
        "zero_counts": {"activity_data": 0},
        "cumulative_counts": {
            "activity_data": 0, "hr_data": 0, "sleep_data": 0,
            "hr_data_resampled": 0, "merged_data": 0,
        },
    }

    # IDs whose files could not be loaded come back as None and are skipped.
    merged_data_list = []
    for id_val in file_ids:
        merged_data = process_data_for_id(data_dir, id_val, statistics)
        if merged_data is not None:
            merged_data_list.append(merged_data)

    # pd.concat raises ValueError on an empty list; report the situation
    # explicitly instead.
    if not merged_data_list:
        logging.warning(f"No mergeable data found in {data_dir}; nothing written.")
        return

    final_merged_data = pd.concat(merged_data_list, ignore_index=True)
    # Binarize the stage label: 0 stays 0, every other value becomes 1.
    final_merged_data['Stage Value'] = (final_merged_data['Stage Value'] != 0).astype('int64')
    # Remove the columns that identify the subject and the recording time.
    final_merged_data = final_merged_data.drop(columns=['file_id', 'Timestamp'])

    # Logging statistics
    logging.debug("\nNaN Counts in Final Merged Dataset:")
    logging.debug(final_merged_data.isnull().sum())
    logging.debug("\nNaN Counts in Original Files:")
    for key, value in statistics["nan_counts"].items():
        logging.debug(f"{key}: {value}")
    logging.debug("\nZero Counts in Activity Data:")
    for key, value in statistics["zero_counts"].items():
        logging.debug(f"{key}: {value}")
    logging.debug("\nCumulative Counts:")
    for key, value in statistics["cumulative_counts"].items():
        logging.debug(f"{key}: {value}")

    final_merged_data.to_csv('final_merged_data.csv', index=False)



In [None]:
# Run the full merge pipeline defined above.
main()
