In [None]:
import sys
import os
import logging
import pandas as pd
import datasets
from datasets import disable_caching; disable_caching()
from pprint import pprint
# Marker used to locate the workspace root: everything in the current working
# directory path that precedes the first occurrence of KEY is the root.
KEY = '2-NOTEBOOK'
WORKSPACE_PATH = os.getcwd().partition(KEY)[0]
print(WORKSPACE_PATH)
# All relative paths in SPACE below are resolved against the workspace root.
os.chdir(WORKSPACE_PATH)

LOG_FORMAT = '[%(levelname)s:%(asctime)s:(%(filename)s@%(lineno)d %(name)s)]: %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

# Relative locations (from the workspace root) of data, code and model folders.
SPACE = dict(
    DATA_RAW='_Data/0-Data_Raw',
    DATA_RFT='_Data/1-Data_RFT',
    DATA_CASE='_Data/2-Data_CASE',
    DATA_AIDATA='_Data/3-Data_AIDATA',
    DATA_EXTERNAL='code/external',
    DATA_HFDATA='_Data/5-Data_HFData',
    CODE_FN='code/pipeline',
    MODEL_ROOT='./_Model',
)

# Fail early if the pipeline code folder is missing, then make it importable.
code_fn_dir = SPACE['CODE_FN']
assert os.path.exists(code_fn_dir), f'{code_fn_dir} not found'
print(code_fn_dir)
sys.path.append(code_fn_dir)

# os.environ["CUDA_VISIBLE_DEVICES"]="1"

# AI Data

In [None]:
# Load the pre-split dataset saved on disk under the HF data folder.
HFDataName = 'FairGlucoBench-Bf24h-Af8h-Split'
remove_unused_columns = True # if using the processed dataset, set to True. 
path = os.path.join(SPACE['DATA_HFDATA'], HFDataName)
split_to_dataset = datasets.load_from_disk(path)
print(split_to_dataset)

# Wrap each split as {'ds_tfm': <dataset>} keyed by split name.
Name_to_Data = {
    split_name: {'ds_tfm': split_ds}
    for split_name, split_ds in split_to_dataset.items()
}


In [None]:
# data_config owns the vocab registry; CF_to_CFvocab is an alias to the same dict.
data_config = {'CF_to_CFvocab': {}}
CF_to_CFvocab = data_config['CF_to_CFvocab']

# 'HM5MinStep' vocabulary: one 'HH:MM' token per 5-minute step of a day
# (24 * 12 = 288 tokens, '00:00' through '23:55').
CFName = 'HM5MinStep'
interval_delta = pd.Timedelta(minutes=5)
steps_per_day = 24 * 12
day_start = pd.Timestamp('2022-01-01 00:00:00')  # only hour/minute are used
idx2tkn = [
    f'{stamp.hour:02d}:{stamp.minute:02d}'
    for stamp in (day_start + interval_delta * step for step in range(steps_per_day))
]
tkn2idx = dict(zip(idx2tkn, range(len(idx2tkn))))
CF_to_CFvocab[CFName] = {'idx2tkn': idx2tkn, 'tkn2idx': tkn2idx}

In [None]:
# 'CGMValue' vocabulary: 3 special tokens + 7 'Other_*' tokens + the integers
# 10..400 as strings. The 10 leading tokens make the index of str(v) equal to v.
CFName = 'CGMValue'
special_tokens = ["PAD", "UNKNOWN", "MASK"]
other_tokens = [f'Other_{k}' for k in range(7)]
value_tokens = list(map(str, range(10, 401)))
idx2tkn = special_tokens + other_tokens + value_tokens
tkn2idx = dict(zip(idx2tkn, range(len(idx2tkn))))
CF_to_CFvocab[CFName] = {'idx2tkn': idx2tkn, 'tkn2idx': tkn2idx}

# Config

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error

def arima_forecast_with_rmse(example):
    """Fit a non-seasonal auto-ARIMA on one example and score its forecast.

    Args:
        example: Mapping with an 'input_ids' sequence (the history the model is
            fitted on) and a 'labels' sequence (the values to be forecast).

    Returns:
        dict: 'forecast' holds the predicted values as a list, one per label.
        'rmse@6', 'rmse@12', 'rmse@24' and 'rmse@72' hold the RMSE over the
        first n forecast steps, or None when there are fewer than n labels.
    """
    history = np.array(example['input_ids'])
    targets = np.array(example['labels'])
    horizon = len(targets)

    # Stepwise search over non-seasonal (p, d, q) orders; d is chosen by auto_arima.
    fitted = auto_arima(
        history,
        start_p=0, start_q=0,
        max_p=5, max_q=5,
        d=None,
        seasonal=False,
        stepwise=True,
        error_action='ignore',
        suppress_warnings=True,
        trace=False,
    )

    # Predict exactly as many steps as there are labels.
    forecast = fitted.predict(n_periods=horizon)

    results = {'forecast': forecast.tolist()}
    for n in (6, 12, 24, 72):
        key = f'rmse@{n}'
        if horizon < n:
            results[key] = None  # Not enough labels
            continue
        results[key] = np.sqrt(mean_squared_error(targets[:n], forecast[:n]))
    return results

In [None]:
# Select the 'test-id' split from Name_to_Data and show its summary as the cell output.
dataset = Name_to_Data['test-id']['ds_tfm']
dataset

In [None]:
# Convert the currently selected split to a pandas DataFrame for inspection.
df = dataset.to_pandas()
df

In [None]:
# 'input_ids' of the first row, used below as a sample input for get_mode_stats.
x = df.iloc[0]['input_ids']

def get_mode_stats(arr):
    """
    Return the fraction of elements in ``arr`` equal to its most frequent value.

    Args:
        arr: One-dimensional array-like of values (list, NumPy array, pandas
            Series). Multi-dimensional input is flattened before counting.

    Returns:
        float: Share of the elements taken by the modal value, in (0, 1];
        1 means every element is identical. NaN when ``arr`` is empty.
    """
    values = np.asarray(arr).ravel()
    if values.size == 0:
        # No elements means no mode; report NaN instead of dividing by zero.
        return float('nan')
    # Counting with NumPy always yields a plain scalar, so the result does not
    # depend on the installed SciPy version's return shape for stats.mode.
    _, counts = np.unique(values, return_counts=True)
    return counts.max() / values.size


# Try get_mode_stats on the single sample `x` before applying it to every row.
mode_percentage = get_mode_stats(x)
mode_percentage
# x


In [None]:
# Apply get_mode_stats to the 'input_ids' of every row of df.
s = df['input_ids'].apply(get_mode_stats)
s

In [None]:
# Rows whose get_mode_stats result is 1, i.e. every element of 'input_ids' is the same value.
df[s == 1]

In [None]:
# Run the ARIMA baseline on a fixed-seed random subset of the 'test-od' split.
N_EVAL_EXAMPLES = 100  # upper bound on the number of examples to evaluate
dataset = Name_to_Data['test-od']['ds_tfm']
# Clamp to the split size so select() cannot index past the end of a small split.
n_selected = min(N_EVAL_EXAMPLES, len(dataset))
dataset = dataset.shuffle(seed=42).select(range(n_selected))
# Adds 'forecast' and 'rmse@{6,12,24,72}' columns, computed in 10 worker processes.
dataset = dataset.map(arima_forecast_with_rmse, num_proc=10)
dataset

In [None]:
# Summary statistics of the per-example RMSE columns at each horizon.
df = dataset.to_pandas()
rmse_columns = [f'rmse@{n}' for n in (6, 12, 24, 72)]
df[rmse_columns].describe()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_forecast(input_ids, labels, predicted_values, title="ARIMA Forecast vs Ground Truth"):
    """Plot a history series followed by its forecast and the ground-truth labels.

    Args:
        input_ids: Historical values, drawn from x = 0.
        labels: Ground-truth future values, drawn starting at x = len(input_ids).
        predicted_values: Forecast values, drawn starting at x = len(input_ids).
        title: Figure title.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(12, 5))

    # Both future series start right after the last historical step.
    history_len = len(input_ids)
    forecast_x = range(history_len, history_len + len(predicted_values))
    truth_x = range(history_len, history_len + len(labels))

    ax.plot(input_ids, label='Historical', color='blue')
    ax.plot(forecast_x, predicted_values, label='Forecast', color='red')
    ax.plot(truth_x, labels, label='Ground Truth (Labels)', color='green', linestyle='dashed')

    ax.set_title(title)
    ax.set_xlabel("Time Step")
    ax.set_ylabel("Value")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()
    
# plot_forecast(results['series'], results['labels'], results['forecast'])