This script is dedicated to the data analysis paper. In this script, CSV files are read and data is extracted to look for further information about the data collected by three biosensor devices: the Empatica device, the MZ3 watch, and the Oura ring.

Import all libraries needed

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from scipy.stats import ks_2samp
import scipy.stats as stats

Read the CSV files

In [None]:
# Read the three device exports; each CSV stores its timestamps under a
# differently named column, which is parsed as datetimes on load.
oura_df, mz3_df, empatica_df = (
    pd.read_csv(csv_name, parse_dates=[time_column])
    for csv_name, time_column in [
        ('oura.csv', 'timestamp'),
        ('mz3.csv', 'Time'),
        ('empatica.csv', 'minute'),
    ]
)

Set all times to one specific format so that calculations are easier

In [None]:
# Give every frame the same time column name ('Timestamp') and strip any
# timezone information from it, one device at a time.
for device_frame, native_time_col in (
    (oura_df, 'timestamp'),
    (mz3_df, 'Time'),
    (empatica_df, 'minute'),
):
    device_frame.rename(columns={native_time_col: 'Timestamp'}, inplace=True)
    # tz_localize(None) removes timezone info if present
    device_frame['Timestamp'] = device_frame['Timestamp'].dt.tz_localize(None)

In [None]:
# Index every frame by time so it can be resampled.
for device_frame in (oura_df, mz3_df, empatica_df):
    device_frame.set_index('Timestamp', inplace=True)

# Mean / max / min of each device's signal column per 3-minute bin.
basic_stats = ['mean', 'max', 'min']
oura_3min = oura_df.resample('3min').agg({'bpm': basic_stats})
mz3_3min = mz3_df.resample('3min').agg({'hr': basic_stats})
empatica_3min = empatica_df.resample('3min').agg({'entry_count': basic_stats})

# Replace the two-level (column, stat) header with flat "<column>_<stat>" names.
oura_3min.columns = ['bpm_' + stat for stat in basic_stats]
mz3_3min.columns = ['hr_' + stat for stat in basic_stats]
empatica_3min.columns = ['entry_count_' + stat for stat in basic_stats]


Get summary stats for a specific time interval or activity. Summary stats are calculated for every 3-minute bin over a 30-minute window.

Day 2: 1/22 rest activity

In [None]:
# Rest-activity window (Day 2, 1/22).
start_time = '2024-01-22 05:36:00'
end_time = '2024-01-22 06:06:00'

# Statistics computed for every 3-minute bin (std and count added to the basics).
window_stats = ['mean', 'max', 'min', 'std', 'count']

oura_3min_stats = oura_df.resample('3min').agg({'bpm': window_stats})
mz3_3min_stats = mz3_df.resample('3min').agg({'hr': window_stats})
empatica_3min_stats = empatica_df.resample('3min').agg({'entry_count': window_stats})

# Flatten the two-level headers into "<column>_<stat>" names.
oura_3min_stats.columns = [f'bpm_{stat}' for stat in window_stats]
mz3_3min_stats.columns = [f'hr_{stat}' for stat in window_stats]
empatica_3min_stats.columns = [f'entry_count_{stat}' for stat in window_stats]

# Keep only the bins inside the window; label slicing keeps both endpoints,
# so the 05:36 and the 06:06 bins are both included.
filtered_oura = oura_3min_stats.loc[start_time:end_time]
filtered_mz3 = mz3_3min_stats.loc[start_time:end_time]
filtered_empatica = empatica_3min_stats.loc[start_time:end_time]

# Show each device's table under its own heading.
for heading, table in (
    ("Oura Statistics", filtered_oura),
    ("\nMZ3 Statistics", filtered_mz3),
    ("\nEmpatica Statistics", filtered_empatica),
):
    print(heading)
    print(table)

Oura Statistics
                      bpm_mean  bpm_max  bpm_min   bpm_std  bpm_count
Timestamp                                                            
2024-01-22 05:36:00  63.000000     64.0     62.0  1.414214          2
2024-01-22 05:39:00  66.500000     67.0     66.0  0.707107          2
2024-01-22 05:42:00  69.000000     69.0     69.0       NaN          1
2024-01-22 05:45:00  69.333333     72.0     67.0  2.516611          3
2024-01-22 05:48:00        NaN      NaN      NaN       NaN          0
2024-01-22 05:51:00  68.000000     71.0     65.0  3.000000          3
2024-01-22 05:54:00  70.000000     70.0     70.0       NaN          1
2024-01-22 05:57:00  72.500000     74.0     71.0  2.121320          2
2024-01-22 06:00:00  65.000000     66.0     64.0  1.000000          3
2024-01-22 06:03:00        NaN      NaN      NaN       NaN          0
2024-01-22 06:06:00  65.666667     67.0     64.0  1.527525          3

MZ3 Statistics
                       hr_mean  hr_max  hr_min    hr_std  

Day 2: 1/22 house work activity

In [None]:
# House-work window (Day 2, 1/22).
# NOTE: neither bound is a 3-minute bin label, so the bins kept are the ones
# labelled 07:36 through 08:03.
start_time = "2024-01-22T07:34:00"
end_time = "2024-01-22T08:04:00"

# Statistics computed for every 3-minute bin.
window_stats = ['mean', 'max', 'min', 'std', 'count']

oura_3min_stats = oura_df.resample('3min').agg({'bpm': window_stats})
mz3_3min_stats = mz3_df.resample('3min').agg({'hr': window_stats})
empatica_3min_stats = empatica_df.resample('3min').agg({'entry_count': window_stats})

# Flatten the two-level headers into "<column>_<stat>" names.
oura_3min_stats.columns = [f'bpm_{stat}' for stat in window_stats]
mz3_3min_stats.columns = [f'hr_{stat}' for stat in window_stats]
empatica_3min_stats.columns = [f'entry_count_{stat}' for stat in window_stats]

# Restrict every table to the window.
filtered_oura = oura_3min_stats.loc[start_time:end_time]
filtered_mz3 = mz3_3min_stats.loc[start_time:end_time]
filtered_empatica = empatica_3min_stats.loc[start_time:end_time]

# Place the three devices side by side, aligned on the bin timestamp, and show them.
summary_stats = pd.concat(
    [filtered_oura, filtered_mz3, filtered_empatica],
    axis=1,
)
print(summary_stats)


                     bpm_mean  bpm_max  bpm_min  bpm_std  bpm_count  \
Timestamp                                                             
2024-01-22 07:36:00      61.0     61.0     61.0      NaN          1   
2024-01-22 07:39:00       NaN      NaN      NaN      NaN          0   
2024-01-22 07:42:00      61.0     61.0     61.0      NaN          1   
2024-01-22 07:45:00       NaN      NaN      NaN      NaN          0   
2024-01-22 07:48:00      62.0     62.0     62.0      NaN          1   
2024-01-22 07:51:00      65.0     65.0     65.0      NaN          1   
2024-01-22 07:54:00       NaN      NaN      NaN      NaN          0   
2024-01-22 07:57:00      66.0     66.0     66.0      NaN          1   
2024-01-22 08:00:00       NaN      NaN      NaN      NaN          0   
2024-01-22 08:03:00      64.0     64.0     64.0      NaN          1   

                       hr_mean  hr_max  hr_min     hr_std  hr_count  \
Timestamp                                                             
2024-

Day 2: 1/22 computer work


In [None]:
# Computer-work window (Day 2, 1/22).
# NOTE: neither bound is a 3-minute bin label, so the bins kept are the ones
# labelled 08:09 through 08:36.
start_time = "2024-01-22T08:07:00"
end_time = "2024-01-22T08:37:00"

# Statistics computed for every 3-minute bin.
window_stats = ['mean', 'max', 'min', 'std', 'count']

oura_3min_stats = oura_df.resample('3min').agg({'bpm': window_stats})
mz3_3min_stats = mz3_df.resample('3min').agg({'hr': window_stats})
empatica_3min_stats = empatica_df.resample('3min').agg({'entry_count': window_stats})

# Flatten the two-level headers into "<column>_<stat>" names.
oura_3min_stats.columns = [f'bpm_{stat}' for stat in window_stats]
mz3_3min_stats.columns = [f'hr_{stat}' for stat in window_stats]
empatica_3min_stats.columns = [f'entry_count_{stat}' for stat in window_stats]

# Restrict every table to the window.
filtered_oura = oura_3min_stats.loc[start_time:end_time]
filtered_mz3 = mz3_3min_stats.loc[start_time:end_time]
filtered_empatica = empatica_3min_stats.loc[start_time:end_time]

# Place the three devices side by side, aligned on the bin timestamp, and show them.
summary_stats = pd.concat(
    [filtered_oura, filtered_mz3, filtered_empatica],
    axis=1,
)
print(summary_stats)

                     bpm_mean  bpm_max  bpm_min  bpm_std  bpm_count  \
Timestamp                                                             
2024-01-22 08:09:00       NaN      NaN      NaN      NaN          0   
2024-01-22 08:12:00      62.0     62.0     62.0      NaN          1   
2024-01-22 08:15:00       NaN      NaN      NaN      NaN          0   
2024-01-22 08:18:00      63.0     63.0     63.0      NaN          1   
2024-01-22 08:21:00      63.0     63.0     63.0      NaN          1   
2024-01-22 08:24:00       NaN      NaN      NaN      NaN          0   
2024-01-22 08:27:00      62.0     62.0     62.0      NaN          1   
2024-01-22 08:30:00       NaN      NaN      NaN      NaN          0   
2024-01-22 08:33:00      63.0     63.0     63.0      NaN          1   
2024-01-22 08:36:00      63.0     63.0     63.0      NaN          1   

                       hr_mean  hr_max  hr_min    hr_std  hr_count  \
Timestamp                                                            
2024-01

Day 2: 1/22 exercise activity

In [None]:
# Exercise window (Day 2, 1/22).
# NOTE: neither bound is a 3-minute bin label, so the bins kept are the ones
# labelled 06:15 through 06:42.
start_time = "2024-01-22T06:14:00"
end_time = "2024-01-22T06:44:00"

# Statistics computed for every 3-minute bin.
window_stats = ['mean', 'max', 'min', 'std', 'count']

oura_3min_stats = oura_df.resample('3min').agg({'bpm': window_stats})
mz3_3min_stats = mz3_df.resample('3min').agg({'hr': window_stats})
empatica_3min_stats = empatica_df.resample('3min').agg({'entry_count': window_stats})

# Flatten the two-level headers into "<column>_<stat>" names.
oura_3min_stats.columns = [f'bpm_{stat}' for stat in window_stats]
mz3_3min_stats.columns = [f'hr_{stat}' for stat in window_stats]
empatica_3min_stats.columns = [f'entry_count_{stat}' for stat in window_stats]

# Restrict every table to the window.
filtered_oura = oura_3min_stats.loc[start_time:end_time]
filtered_mz3 = mz3_3min_stats.loc[start_time:end_time]
filtered_empatica = empatica_3min_stats.loc[start_time:end_time]

# Place the three devices side by side, aligned on the bin timestamp, and show them.
summary_stats = pd.concat(
    [filtered_oura, filtered_mz3, filtered_empatica],
    axis=1,
)
print(summary_stats)

                      bpm_mean  bpm_max  bpm_min   bpm_std  bpm_count  \
Timestamp                                                               
2024-01-22 06:15:00  66.500000     67.0     66.0  0.707107          2   
2024-01-22 06:18:00  64.000000     64.0     64.0       NaN          1   
2024-01-22 06:21:00  65.000000     67.0     63.0  2.000000          3   
2024-01-22 06:24:00        NaN      NaN      NaN       NaN          0   
2024-01-22 06:27:00  66.333333     68.0     65.0  1.527525          3   
2024-01-22 06:30:00  66.000000     67.0     65.0  1.414214          2   
2024-01-22 06:33:00  63.000000     63.0     63.0       NaN          1   
2024-01-22 06:36:00  65.666667     68.0     64.0  2.081666          3   
2024-01-22 06:39:00        NaN      NaN      NaN       NaN          0   
2024-01-22 06:42:00  66.000000     67.0     65.0  1.000000          3   

                        hr_mean  hr_max  hr_min    hr_std  hr_count  \
Timestamp                                           

In [None]:
# Merge the datasets
# pd.merge defaults to an inner join, so only 3-minute bins present in all
# three per-device tables survive. The suffixes only apply to overlapping
# column names; the bpm_/hr_/entry_count_ prefixes already keep them distinct.
merged_df = pd.merge(oura_3min, mz3_3min, left_index=True, right_index=True, suffixes=('_oura', '_mz3'))
merged_df = pd.merge(merged_df, empatica_3min, left_index=True, right_index=True)

# Define intervals for testing: one 30-minute window per Day 2 (1/22) activity.
# The exercise window was previously listed twice, which left the
# computer-work window out of the comparison.
intervals = [
    ("2024-01-22T05:36:00", "2024-01-22T06:06:00"),  # rest
    ("2024-01-22T06:14:00", "2024-01-22T06:44:00"),  # exercise
    ("2024-01-22T07:34:00", "2024-01-22T08:04:00"),  # house work
    ("2024-01-22T08:07:00", "2024-01-22T08:37:00"),  # computer work
    # Add more intervals as needed
]

Calculate the statistical measures: KS test (statistic and p-value) and correlation
- For every 30-minute window, the script prints these stats for Oura, MZ3 and Empatica

In [None]:
# Statistical functions
def ks_test(series1, series2):
    """Two-sample Kolmogorov-Smirnov test on the non-missing values of two series.

    Parameters
    ----------
    series1, series2 : pandas.Series
        Samples to compare. NaN entries are dropped from each series
        independently before testing.

    Returns
    -------
    tuple
        ``(statistic, p_value)`` as returned by ``scipy.stats.ks_2samp``, or
        ``(nan, nan)`` when either series has no non-missing values.
    """
    sample1 = series1.dropna()
    sample2 = series2.dropna()

    # A device with no readings in the window yields an all-NaN column; the
    # test is undefined for an empty sample, so report NaN instead of failing.
    if sample1.empty or sample2.empty:
        return float('nan'), float('nan')

    stat, p_value = ks_2samp(sample1, sample2)
    return stat, p_value

def correlation(series1, series2):
    """Return the correlation of ``series1`` with ``series2`` using ``Series.corr`` defaults."""
    coefficient = series1.corr(series2)
    return coefficient

# Compare the three devices pairwise inside every interval.
for start_time, end_time in intervals:
    print(f"\nInterval: {start_time} to {end_time}")
    filtered_df = merged_df.loc[start_time:end_time]

    if filtered_df.empty:
        print("No data available for this interval.")
        continue

    # Run the KS test and the correlation for each aggregated statistic.
    for stat in ['mean', 'max', 'min']:
        # Column that holds this statistic for each device.
        device_cols = {
            'Oura': f'bpm_{stat}',
            'MZ3': f'hr_{stat}',
            'Empatica': f'entry_count_{stat}',
        }

        # Skip the statistic when any device column is absent.
        if any(col not in filtered_df.columns for col in device_cols.values()):
            print(f"Missing columns for {stat}. Skipping...")
            continue

        pairs = [('Oura', 'MZ3'), ('Oura', 'Empatica'), ('MZ3', 'Empatica')]

        # Compute everything first, then print, so output is only produced
        # once all comparisons for this statistic have succeeded.
        ks_results = [
            ks_test(filtered_df[device_cols[first]], filtered_df[device_cols[second]])
            for first, second in pairs
        ]
        corr_results = [
            correlation(filtered_df[device_cols[first]], filtered_df[device_cols[second]])
            for first, second in pairs
        ]

        print(f"\nStatistic Results for {stat.capitalize()}:")
        for (first, second), (ks_stat, ks_p) in zip(pairs, ks_results):
            print(f"KS Test ({first} vs {second}): Statistic={ks_stat:.4f}, p-value={ks_p:.4f}")
        for (first, second), corr_value in zip(pairs, corr_results):
            print(f"Correlation ({first} vs {second}): {corr_value:.4f}")


Interval: 2024-01-22T05:36:00 to 2024-01-22T06:06:00

Statistic Results for Mean:
KS Test (Oura vs MZ3): Statistic=0.7273, p-value=0.0041
KS Test (Oura vs Empatica): Statistic=0.4444, p-value=0.2083
KS Test (MZ3 vs Empatica): Statistic=0.9091, p-value=0.0001
Correlation (Oura vs MZ3): 0.1586
Correlation (Oura vs Empatica): 0.6623
Correlation (MZ3 vs Empatica): 0.0617

Statistic Results for Max:
KS Test (Oura vs MZ3): Statistic=0.8182, p-value=0.0008
KS Test (Oura vs Empatica): Statistic=0.1313, p-value=0.9998
KS Test (MZ3 vs Empatica): Statistic=0.9091, p-value=0.0001
Correlation (Oura vs MZ3): 0.2431
Correlation (Oura vs Empatica): 0.7334
Correlation (MZ3 vs Empatica): -0.0444

Statistic Results for Min:
KS Test (Oura vs MZ3): Statistic=0.6364, p-value=0.0187
KS Test (Oura vs Empatica): Statistic=0.4444, p-value=0.2083
KS Test (MZ3 vs Empatica): Statistic=0.7273, p-value=0.0044
Correlation (Oura vs MZ3): 0.1007
Correlation (Oura vs Empatica): 0.0734
Correlation (MZ3 vs Empatica): 0.1

OLD ANALYSIS

In [None]:
import pandas as pd


def extract_device_data(device_name, start_time, end_time, output_file):
    """Write 3-minute summary statistics for one device and time window to a CSV.

    Parameters
    ----------
    device_name : str
        One of 'oura', 'mz3' or 'empatica'.
    start_time, end_time : str
        Bounds of the window; used to label-slice the resampled index, so
        both endpoints are included.
    output_file : str
        Path of the CSV to write.

    Returns
    -------
    None
        On success the CSV holds the columns Timestamp, mean, max, min, std
        and date. For an unknown ``device_name`` a message is printed and
        nothing is read or written.
    """
    # Per device: source CSV, its timestamp column, and its signal column.
    device_sources = {
        'oura': ('oura.csv', 'timestamp', 'bpm'),
        'mz3': ('mz3.csv', 'Time', 'hr'),
        'empatica': ('empatica.csv', 'minute', 'entry_count'),
    }

    # Validate before touching the disk so a bad name fails fast.
    if device_name not in device_sources:
        print("Invalid device name. Choose from: oura, mz3, empatica")
        return

    csv_path, time_col, col_name = device_sources[device_name]

    # Only the requested device's file is read; the timestamp column is
    # renamed to the common 'Timestamp' and used as the index.
    df = (
        pd.read_csv(csv_path, parse_dates=[time_col])
        .rename(columns={time_col: 'Timestamp'})
        .set_index('Timestamp')
    )

    # Resample and compute statistics
    df_resampled = df.resample('3min').agg({col_name: ['mean', 'max', 'min', 'std']})
    df_resampled.columns = ['mean', 'max', 'min', 'std']

    # Filter by the specified time range
    filtered_data = df_resampled.loc[start_time:end_time].reset_index()
    filtered_data['date'] = filtered_data['Timestamp'].dt.date

    # Save to CSV
    filtered_data.to_csv(output_file, index=False)
    print(f"Extracted data saved to {output_file}")

# Example usage in Google Colab:
# Export the Oura statistics for the 05:36-06:06 window on 1/22.
# Set device to 'mz3' or 'empatica' for the other sensors.
device = 'oura'
start_time, end_time = '2024-01-22 05:36:00', '2024-01-22 06:06:00'
output_file = 'output.csv'

extract_device_data(
    device_name=device,
    start_time=start_time,
    end_time=end_time,
    output_file=output_file,
)


Extracted data saved to output.csv


NEW ANALYSIS


In [None]:
import pandas as pd

def extract_device_data(device_name, time_ranges, output_file):
    """Write 3-minute summary statistics for one device over several windows to a CSV.

    Parameters
    ----------
    device_name : str
        One of 'oura', 'mz3' or 'empatica'.
    time_ranges : iterable of (start_time, end_time)
        Windows used to label-slice the resampled, US/Central-indexed data;
        both endpoints of each window are included. Results are written in
        the order the windows are given.
    output_file : str
        Path of the CSV to write.

    Returns
    -------
    None
        On success the CSV holds the columns Date, Time, mean, max, min, std
        and count (header only when ``time_ranges`` is empty). For an unknown
        ``device_name`` a message is printed and nothing is read or written.
    """
    # Per device: source CSV, its timestamp column, and its signal column.
    device_sources = {
        'oura': ('oura.csv', 'timestamp', 'bpm'),
        'mz3': ('mz3.csv', 'Time', 'hr'),
        'empatica': ('empatica.csv', 'minute', 'entry_count'),
    }

    # Validate before touching the disk so a bad name fails fast.
    if device_name not in device_sources:
        print("Invalid device name. Choose from: oura, mz3, empatica")
        return

    csv_path, time_col, col_name = device_sources[device_name]

    # Only the requested device's file is read.
    df = pd.read_csv(csv_path, parse_dates=[time_col]).rename(columns={time_col: 'Timestamp'})

    # Drop any timezone info (wall-clock values are kept), treat the values as
    # UTC, then convert to Central Time.
    # NOTE(review): a source column carrying a non-UTC offset would have its
    # wall-clock time reinterpreted as UTC here - confirm the exports are UTC.
    df['Timestamp'] = (
        df['Timestamp']
        .dt.tz_localize(None)
        .dt.tz_localize('UTC')
        .dt.tz_convert('US/Central')
    )
    df = df.set_index('Timestamp')

    # Resample and compute statistics
    stat_names = ['mean', 'max', 'min', 'std', 'count']
    df_resampled = df.resample('3min').agg({col_name: stat_names})
    df_resampled.columns = stat_names

    # Collect filtered data for multiple time ranges
    output_columns = ['Date', 'Time'] + stat_names
    results = []
    for start_time, end_time in time_ranges:
        filtered_data = df_resampled.loc[start_time:end_time].reset_index()
        filtered_data['Date'] = filtered_data['Timestamp'].dt.strftime('%m/%d/%Y')
        filtered_data['Time'] = filtered_data['Timestamp'].dt.strftime('%H:%M:%S')
        results.append(filtered_data)

    # pd.concat rejects an empty list, so fall back to a header-only table.
    if results:
        final_data = pd.concat(results, ignore_index=True)[output_columns]
    else:
        final_data = pd.DataFrame(columns=output_columns)

    # Save to CSV
    final_data.to_csv(output_file, index=False)
    print(f"Extracted data saved to {output_file}")

# Example usage in Google Colab:
# Export the Oura statistics for every window below into one CSV.
# Set device to 'mz3' or 'empatica' for the other sensors.
device = 'oura'

# (start, end) pairs; rows are written in this order.
time_ranges = [
    ('2024-01-22 04:00:00', '2024-01-22 04:30:00'),
    ('2024-01-22 05:36:00', '2024-01-22 06:06:00'),
    # original note on the next window: "6pm" - TODO confirm whether it is AM or PM
    ('2024-01-22 06:14:00', '2024-01-22 06:44:00'),
    ('2024-01-22 07:36:00', '2024-01-22 07:45:00'),
    ('2024-01-22 08:07:00', '2024-01-22 08:37:00'),
    ('2024-01-23 06:07:00', '2024-01-23 06:37:00'),
    ('2024-01-23 11:30:00', '2024-01-23 12:00:00'),
    ('2024-01-23 12:15:00', '2024-01-23 12:45:00'),
    ('2024-01-23 17:30:00', '2024-01-23 18:00:00'),
    ('2024-01-23 10:10:00', '2024-01-23 10:40:00'),
]

output_file = 'output.csv'

extract_device_data(
    device_name=device,
    time_ranges=time_ranges,
    output_file=output_file,
)


Extracted data saved to output.csv


In [None]:
import pytz

def extract_device_data(device_name, time_ranges, output_file):
    """Write 3-minute summary statistics for one device over several windows to a CSV.

    Timestamps are parsed as UTC and converted to US/Central before
    resampling, so the windows are matched against Central-time labels.

    Parameters
    ----------
    device_name : str
        One of 'oura', 'mz3' or 'empatica'.
    time_ranges : iterable of (start_time, end_time)
        Windows used to label-slice the resampled data; both endpoints of
        each window are included. Results are written in the given order.
    output_file : str
        Path of the CSV to write.

    Returns
    -------
    None
        On success the CSV holds the columns Date, Time, mean, max, min, std
        and count (header only when ``time_ranges`` is empty). For an unknown
        ``device_name`` a message is printed and nothing is read or written.
    """
    # Per device: source CSV, its timestamp column, and its signal column.
    device_sources = {
        'oura': ('oura.csv', 'timestamp', 'bpm'),
        'mz3': ('mz3.csv', 'Time', 'hr'),
        'empatica': ('empatica.csv', 'minute', 'entry_count'),
    }

    # Validate before touching the disk so a bad name fails fast.
    if device_name not in device_sources:
        print("Invalid device name. Choose from: oura, mz3, empatica")
        return

    csv_path, time_col, col_name = device_sources[device_name]

    # Only the requested device's file is read.
    df = pd.read_csv(csv_path, parse_dates=[time_col]).rename(columns={time_col: 'Timestamp'})

    # Interpret the timestamps as UTC, then convert to Central Time.
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], utc=True).dt.tz_convert('US/Central')
    df = df.set_index('Timestamp')

    # Resample and compute statistics
    stat_names = ['mean', 'max', 'min', 'std', 'count']
    df_resampled = df.resample('3min').agg({col_name: stat_names})
    df_resampled.columns = stat_names

    # Collect filtered data for multiple time ranges
    output_columns = ['Date', 'Time'] + stat_names
    results = []
    for start_time, end_time in time_ranges:
        filtered_data = df_resampled.loc[start_time:end_time].reset_index()
        filtered_data['Date'] = filtered_data['Timestamp'].dt.strftime('%m/%d/%Y')
        filtered_data['Time'] = filtered_data['Timestamp'].dt.strftime('%H:%M:%S')
        results.append(filtered_data)

    # pd.concat rejects an empty list, so fall back to a header-only table.
    if results:
        final_data = pd.concat(results, ignore_index=True)[output_columns]
    else:
        final_data = pd.DataFrame(columns=output_columns)

    # Save to CSV
    final_data.to_csv(output_file, index=False)
    print(f"Extracted data saved to {output_file}")
# Example usage: define the call's inputs at module scope so this cell runs
# on a fresh kernel (the call previously raised NameError for `device`).
device = 'oura'  # Change to 'mz3' or 'empatica' as needed
time_ranges = [
    ('2024-01-22 04:00:00', '2024-01-22 04:30:00'),
    ('2024-01-22 05:36:00', '2024-01-22 06:06:00'),
    ('2024-01-22 06:14:00', '2024-01-22 06:44:00'),
    ('2024-01-22 07:36:00', '2024-01-22 07:45:00'),
    ('2024-01-22 08:07:00', '2024-01-22 08:37:00'),
    ('2024-01-23 06:07:00', '2024-01-23 06:37:00'),
    ('2024-01-23 11:30:00', '2024-01-23 12:00:00'),
    ('2024-01-23 12:15:00', '2024-01-23 12:45:00'),
    ('2024-01-23 17:30:00', '2024-01-23 18:00:00'),
    ('2024-01-23 10:10:00', '2024-01-23 10:40:00'),
]
output_file = 'output.csv'

extract_device_data(device, time_ranges, output_file)



NameError: name 'device' is not defined

In [None]:
import pandas as pd


def extract_device_data(device_name, start_time, end_time, output_file):
    """Write 3-minute summary statistics for one device and time window to a CSV.

    Timezone information is removed from the timestamps before resampling.

    Parameters
    ----------
    device_name : str
        One of 'oura', 'mz3' or 'empatica'.
    start_time, end_time : str
        Bounds of the window; used to label-slice the resampled index, so
        both endpoints are included.
    output_file : str
        Path of the CSV to write.

    Returns
    -------
    None
        On success the CSV holds the columns Timestamp, mean, max, min, std
        and date. For an unknown ``device_name`` a message is printed and
        nothing is read or written.
    """
    # Per device: source CSV, its timestamp column, and its signal column.
    device_sources = {
        'oura': ('oura.csv', 'timestamp', 'bpm'),
        'mz3': ('mz3.csv', 'Time', 'hr'),
        'empatica': ('empatica.csv', 'minute', 'entry_count'),
    }

    # Validate before touching the disk so a bad name fails fast.
    if device_name not in device_sources:
        print("Invalid device name. Choose from: oura, mz3, empatica")
        return

    csv_path, time_col, col_name = device_sources[device_name]

    # Only the requested device's file is read.
    df = pd.read_csv(csv_path, parse_dates=[time_col]).rename(columns={time_col: 'Timestamp'})

    # Remove timezone info if present
    df['Timestamp'] = df['Timestamp'].dt.tz_localize(None)
    df = df.set_index('Timestamp')

    # Resample and compute statistics
    df_resampled = df.resample('3min').agg({col_name: ['mean', 'max', 'min', 'std']})
    df_resampled.columns = ['mean', 'max', 'min', 'std']

    # Filter by the specified time range
    filtered_data = df_resampled.loc[start_time:end_time].reset_index()
    filtered_data['date'] = filtered_data['Timestamp'].dt.date

    # Save to CSV
    filtered_data.to_csv(output_file, index=False)
    print(f"Extracted data saved to {output_file}")

# Example usage in Google Colab:
# Export the Oura statistics for the 05:36-06:06 window on 1/22.
# Set device to 'mz3' or 'empatica' for the other sensors.
device = 'oura'
start_time, end_time = '2024-01-22 05:36:00', '2024-01-22 06:06:00'
output_file = 'output.csv'

extract_device_data(
    device_name=device,
    start_time=start_time,
    end_time=end_time,
    output_file=output_file,
)

Extracted data saved to output.csv
