# Check Workspace and Import Packages

In [None]:
import os
import subprocess
import numpy as np
import pandas as pd
import pandas
from datetime import datetime
now = datetime.now()

In [None]:
# Resolve the workspace bucket from the environment (presumably a gs:// URI).
# `my_bucket` is interpolated into gsutil commands in later cells, so the name is kept.
my_bucket = os.getenv('WORKSPACE_BUCKET')
if not my_bucket:
    # Fail here with a clear message instead of running `gsutil ls -r None`.
    raise EnvironmentError("WORKSPACE_BUCKET is not set; cannot list bucket contents")

# Recursively list the objects in the bucket. The command is passed as an
# argument list (no shell), so the bucket value is never parsed by a shell.
print(subprocess.check_output(["gsutil", "ls", "-r", my_bucket]).decode('utf-8'))

----

Create function to set seeds for reproducibility

----

In [None]:
def set_seeds(offset=0):
    """Seed NumPy, Python's ``random`` module and TensorFlow with ``42 + offset``.

    The same value is also written to the ``PYTHONHASHSEED`` environment
    variable as a string.

    Args:
        offset: Number added to the base seed of 42 to obtain the seed used.
    """
    import os
    import random
    import numpy as np
    import tensorflow as tf

    seed = 42 + offset

    # Seed each library's global generator, in order: NumPy, random, TensorFlow
    for apply_seed in (np.random.seed, random.seed, tf.random.set_seed):
        apply_seed(seed)

    # Publish the seed through the environment as well.
    # NOTE(review): the running interpreter reads PYTHONHASHSEED at startup, so
    # this presumably only affects processes launched afterwards - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)


    ### Hold off on more extensive seeds (below) until verified necessary


    # # Configure TensorFlow to use a single thread if required
    # tf.config.threading.set_intra_op_parallelism_threads(1)
    # tf.config.threading.set_inter_op_parallelism_threads(1)

    # # Optionally, set environment variables to control NumPy threading behavior
    # os.environ['OMP_NUM_THREADS'] = '1'
    # os.environ['MKL_NUM_THREADS'] = '1'

    # # Example to demonstrate reproducibility
    # print("Numpy Random:", np.random.rand(3))
    # print("Python Random:", random.random())

    # # TensorFlow example
    # tf_example = tf.random.uniform([3])
    # print("TensorFlow Random:", tf_example)

    # # PyTorch Example (if using PyTorch)
    # import torch

    # torch.manual_seed(42+offset)
    # if torch.cuda.is_available():
    #     torch.cuda.manual_seed(42+offset)
    #     torch.cuda.manual_seed_all(42+offset)  # if using multi-GPU.
    #     torch.backends.cudnn.deterministic = True  # cuDNN
    #     torch.backends.cudnn.benchmark = False

    # # Generate reproducible random numbers with PyTorch
    # print("PyTorch Random:", torch.rand(3))

----

Create function to start/stop logging RAM usage to file

----

In [None]:
import os
import psutil
import threading
import time
from google.cloud import storage

def log_memory_usage(stop_event, file_name, interval=30):
    """Periodically log system RAM usage until ``stop_event`` is set.

    Each sample is printed, written to the local file ``file_name`` (which is
    truncated when logging starts) and flushed; the file is then uploaded to
    GCS as ``logs/<file_name>`` via ``upload_to_gcs``. An upload failure is
    reported with a printed message and does not end the loop.

    Args:
        stop_event: ``threading.Event``; the loop ends once it is set.
        file_name: Path of the local log file.
        interval: Seconds to wait between samples (default 30).
    """
    with open(file_name, 'w') as f:
        while not stop_event.is_set():
            # Record the memory currently in use, in GiB
            memory_info = psutil.virtual_memory()
            gb_used = memory_info.used / (1024 ** 3)
            mem_usage = f"{time.ctime()}: {gb_used:.2f} GB\n"
            print(mem_usage)
            f.write(mem_usage)
            f.flush()

            # Upload the local file to GCS (best effort)
            try:
                destination_blob_name = f'logs/{file_name}'
                upload_to_gcs(file_name, destination_blob_name)
            except Exception as e:
                print(f"Failed to upload to GCS: {e}")

            # Wait on the event instead of time.sleep() so that a stop request
            # wakes the thread immediately rather than after a full interval.
            stop_event.wait(interval)
            
def upload_to_gcs(source_file_name, destination_blob_name):
    """Upload a local file to the workspace's GCS bucket.

    The bucket is read from the ``WORKSPACE_BUCKET`` environment variable,
    which may be given with or without a leading ``gs://`` scheme.

    Args:
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name to write inside the bucket.

    Raises:
        ValueError: If ``WORKSPACE_BUCKET`` is unset or empty.
    """
    # Get the bucket name
    my_bucket = os.getenv('WORKSPACE_BUCKET')
    if not my_bucket:
        raise ValueError("WORKSPACE_BUCKET environment variable is not set")

    # The storage client wants a bare bucket name: strip the scheme only when
    # it is actually present, and drop any trailing slash.
    scheme = 'gs://'
    bucket_name = my_bucket[len(scheme):] if my_bucket.startswith(scheme) else my_bucket
    bucket_name = bucket_name.rstrip('/')

    # Initialize a storage client
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # Upload the file
    blob.upload_from_filename(source_file_name)

#     print(f"File {source_file_name} uploaded to {destination_blob_name}.")

In [None]:
# Module-level state shared by RAM_start() and RAM_stop().
stop_event = threading.Event()  # set to ask the memory-logging loop to exit
memory_thread = None  # the current logging thread; None when none has been started
# threading.Lock is not reentrant: code holding it must not call another
# function that also acquires it.
thread_lock = threading.Lock()  # To ensure thread-safe operations

def RAM_start():
    """Start (or restart) the background memory-logging thread.

    If a logging thread is already running it is signalled to stop and joined
    first. A new thread running ``log_memory_usage`` is then started, writing
    to ``memory_usage.txt``.
    """
    global stop_event
    global memory_thread

    file_name = 'memory_usage.txt'

    with thread_lock:
        # Stop the existing thread if it is running. This is done inline rather
        # than via RAM_stop(): RAM_stop() acquires thread_lock, which is not
        # reentrant, so calling it from here would deadlock.
        if memory_thread and memory_thread.is_alive():
            stop_event.set()
            memory_thread.join()

        # Clear the event only after the old thread is gone; clearing it
        # earlier would leave it set and the new thread would exit at once.
        stop_event.clear()

        # Create and start a new memory logging thread
        memory_thread = threading.Thread(target=log_memory_usage, args=(stop_event, file_name))
        memory_thread.start()
        print("Memory logging started")

def RAM_stop():
    """Signal the memory-logging thread to stop and wait for it to finish.

    The stop event is always set. If a thread object is recorded it is joined,
    the reference is reset to ``None`` and a confirmation is printed.
    """
    global stop_event
    global memory_thread

    with thread_lock:
        # Ask the logging loop to exit
        stop_event.set()

        # Nothing to wait for when no thread has been recorded
        if not memory_thread:
            return

        memory_thread.join()
        memory_thread = None
        print("Memory logging stopped")

In [None]:
RAM_start()

----
----

----
----

---
---
---
---
---
---
---

# Read `daily_df` from CSV

In [None]:
!gsutil cp {my_bucket}/data/dfs/daily_df_v2.csv daily_df_v2.csv
!gsutil cp {my_bucket}/data/dfs/daily_df_labels_v2.csv daily_df_labels_v2.csv
!gsutil cp {my_bucket}/data/dfs/demographics_df.csv demographics_df.csv

In [None]:
# Read Demographics CSV
# demo_df = pd.read_csv(f"{my_bucket}/data/dfs/demographics_df.csv")
demo_df = pd.read_csv(f"demographics_df.csv")

In [None]:
# daily_df = pd.read_csv(f"{my_bucket}/data/dfs/daily_df.csv")
# daily_df = pd.read_csv(f"{my_bucket}/data/dfs/daily_df_v2.csv")
daily_df = pd.read_csv(f"daily_df_v2.csv")

In [None]:
# labels = pd.read_csv(f"{my_bucket}/data/dfs/daily_df_labels_v2.csv", index_col=0)
labels = pd.read_csv(f"daily_df_labels_v2.csv", index_col=0)

# Prepare Data

## Set Seeds for Reproducibility

In [None]:
set_seeds()

## Data Preparation

### Remove naps

In [None]:
daily_df.shape

__Remove naps and only keep rows for main sleep__

In [None]:
# Keep only the main-sleep rows (drop naps), then remove the flag column.
# The guard makes the cell safe to re-run: once `is_main_sleep` has been
# dropped, running it again is a no-op instead of raising KeyError.
if 'is_main_sleep' in daily_df.columns:
    daily_df = daily_df[daily_df['is_main_sleep']].drop(columns=['is_main_sleep'])

# Show the remaining column names
daily_df.columns.values

In [None]:
daily_df.shape

### Check % 0's and remove columns if appropriate

In [None]:
def perc_zeros(df):
    """Print the percentage of zero values for every column that has any.

    Args:
        df: DataFrame to inspect. Columns with no zeros are not printed.
    """
    # Share of cells equal to 0 in each column, as a percentage
    zero_share = df.eq(0).mean().mul(100)

    # Report only the columns that contain at least one zero
    for name, pct in zero_share[zero_share > 0].items():
        print(f"Column {name:>25}: {pct:8.4f}% zeros")
            
perc_zeros(daily_df)

### Consider removing `minute_after_wakeup` (leaving for now)

`minute_after_wakeup`: The total number of minutes after the user woke up

In [None]:
daily_df.minute_after_wakeup.describe()

In [None]:
# Print selected upper percentiles of minute_after_wakeup, computed in one call
wakeup_percentiles = (75, 76, 80, 85, 90, 95, 99)
wakeup_values = np.percentile(daily_df.minute_after_wakeup, q=wakeup_percentiles)
for i, pct_value in zip(wakeup_percentiles, wakeup_values):
    print(f'minute_after_wakeup {i} percentile: {pct_value:4.1f}')

leaving `minute_after_wakeup` for now

---


In [None]:
print(daily_df.columns.values)

## Imputation

In [None]:
daily_df.isnull().any()

### __If there are NaN values that make sense to fill with 0's, replace__

In [None]:
def perc_nan(df):
    """Print the percentage of NaN values for every column that has any.

    Args:
        df: DataFrame to inspect. Columns with no NaN values are not printed.
    """
    # Share of missing cells in each column, as a percentage
    nan_share = df.isnull().mean().mul(100)

    # Report only the columns that contain at least one NaN
    for name, pct in nan_share[nan_share > 0].items():
        print(f"Column {name:>18}: {pct:8.4f}% NaN")
            
perc_nan(daily_df)

### Drop `minute_restless` because ~70% of its values are missing

In [None]:
daily_df = daily_df.drop(columns=['minute_restless'])

In [None]:
perc_nan(daily_df)

### ~~Fill `calorie_count` and `minute_in_zone` with 0's~~

All of Us Controlled Tier Dataset v7 CDR Data Dictionary (C2022Q4R9)

`calorie_count`: Number calories burned within the custom heart rate zone.

`minute_in_zone`: Number minutes within the specified heart rate zone.

In [None]:
# daily_df.loc[:, ['calorie_count','minute_in_zone']] = daily_df.loc[:, ['calorie_count','minute_in_zone']].fillna(0)

In [None]:
# perc_nan(daily_df)

### Drop `elevation` 

__because `device.src_id` is not available to determine the unit of measure (using `floors` instead, which has a unified unit)__  


All of Us Controlled Tier Dataset v7 CDR Data Dictionary (C2022Q4R9)

`elevation`: The elevation traveled for the day displayed in the units defined by the data source. When __src_id__ is __PTSC__, the unit is feet. When __src_id__ is __TPC__, the unit is meters.

In [None]:
daily_df = daily_df.drop(columns=['elevation'])

### Fill `floors` with 0's
All of Us Controlled Tier Dataset v7 CDR Data Dictionary (C2022Q4R9)

`floors`: The floors provides ONLY the count of how many floors the Fitbit device counted as the user ascended in elevation. The researcher can determine how many "feet" or "meters" the user climbed as the device determines a floor every time the user ascends 10 feet (3 meters). Essentially 1 Floor = 10 Feet (3 Meters).  

In [None]:
# Replace missing `floors` values with 0
daily_df['floors'] = daily_df['floors'].fillna(0)

In [None]:
perc_nan(daily_df)

### ~~__Fill `minute_`* for sleep with 0's; `floors` with 0's__~~
### __Drop `'minute_deep', 'minute_light', 'minute_rem', 'minute_wake'` because ~29% of their values are missing__

In [None]:
# # Replace NaN values in those columns with 0 and reassign using .loc
# daily_df.loc[:, ['minute_deep', 'minute_light','minute_rem', 'minute_wake']] = daily_df.loc[:, 
#                 ['minute_deep', 'minute_light', 'minute_rem', 'minute_wake']].fillna(0)

daily_df = daily_df.drop(columns=['minute_deep', 'minute_light', 'minute_rem', 'minute_wake'])

In [None]:
perc_nan(daily_df)

In [None]:
# Fraction of participants with at least one row missing any heart-rate summary column
columns_to_check = ['std_hr', 'morning_hr', 'afternoon_hr', 'evening_hr', 'night_hr']
rows_with_nan = daily_df[columns_to_check].isna().any(axis=1)
nan_person_ids = daily_df.loc[rows_with_nan, 'person_id'].unique()
print(len(nan_person_ids) / daily_df.person_id.nunique(dropna=False))

---

98% of participants have a missing value in at least one of these columns.  

This will be handled in `get_random_chunk` by making multiple attempts to draw a 10-day span without NaNs.

---

In [None]:
print(daily_df.columns.values)

In [None]:
print(demo_df.columns.values)

# Write Prepared (unchunked) Data to CSV

In [None]:
daily_df = daily_df.sort_values(['person_id','date'])
daily_df.head(5)

In [None]:
daily_df.to_csv(f"daily_df_v2_prepped.csv", index=False)

In [None]:
!gsutil -o GSUtil:parallel_composite_upload_threshold=150M cp daily_df_v2_prepped.csv {my_bucket}/data/dfs/daily_df_v2_prepped.csv

In [None]:
RAM_stop()

---