# Data Imputation
As the data has many missing values, we need to look at methods of imputation to maximise the usefulness of the existing data.

In [1]:
# IMPORTS
import numpy as np
import pandas as pd
import polars as pl
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from helper.helper import float_to_time, time_to_float, float_time_range, float_time_minus
import random
import tqdm
import xgboost as xgb
import optuna
from sklearn.model_selection import KFold, RandomizedSearchCV,train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Set seed for repeatability
def seed_everything(seed):
    """Seed every random number generator this notebook imports.

    Seeds NumPy's legacy global generator, Python's ``random`` module and
    PyTorch so that repeated runs draw the same random numbers.

    Args:
        seed: Integer seed shared by all generators.
    """
    # Imported locally: the notebook's top-level `import torch` sits below the
    # call to this function, so the name is not yet bound at module level.
    import torch

    np.random.seed(seed) # np random seed
    random.seed(seed) # py random seed
    torch.manual_seed(seed) # torch random seed


seed_everything(seed=1024)
import torch
print('torch version: ',torch.__version__)
print('Cuda available: ',torch.cuda.is_available())
# Choose the device first so the report below cannot raise on a CPU-only machine
# (querying the CUDA device name without CUDA is an error).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Running on ', torch.cuda.get_device_name(torch.cuda.current_device()) if device.type == 'cuda' else 'cpu')

torch version:  2.5.1+cu124
Cuda available:  True
Running on  NVIDIA GeForce GTX 1060 6GB


Load the schema and useful column lists into memory

In [3]:
# Activity labels; each one is a Boolean column in the datasets
activities = [
    'Aerobic Workout', 'Dancing', 'HIIT', 'Indoor climbing', 'Outdoor Bike', 'Run', 'Spinning',
    'Sport', 'Swim', 'Walk', 'Weights', 'Workout', 'Yoga',
]

# Row id, participant flags, sensor readings, then the activity flags
lstm__train_features = [
    'id',
    'p_num_p01', 'p_num_p02', 'p_num_p03', 'p_num_p04', 'p_num_p05', 'p_num_p06',
    'p_num_p10', 'p_num_p11', 'p_num_p12', 'p_num_p15', 'p_num_p16', 'p_num_p18', 'p_num_p19',
    'p_num_p21', 'p_num_p22', 'p_num_p24',
    'bg', 'insulin', 'carbs', 'hr', 'steps', 'cals',
    *activities,
]

lstm_target = ['bg+1:00']

# Column dtypes for the training CSVs. Insertion order is kept identical to the
# column order listed above: id/time columns, participant flags, readings,
# activity flags, target.
lstm_train_schema = {'id': pl.String, 'time_delta': pl.Float64, 'time': pl.Float64}
lstm_train_schema.update(
    {name: pl.Boolean for name in lstm__train_features if name.startswith('p_num_')}
)
lstm_train_schema.update(
    {name: pl.Float64 for name in ('bg', 'insulin', 'carbs', 'hr', 'steps', 'cals')}
)
for activity in activities:
    lstm_train_schema[activity] = pl.Boolean
lstm_train_schema['bg+1:00'] = pl.Float64

# The test CSVs carry no target column, so their schema is the train schema without it
lstm_test_schema = {
    name: dtype for name, dtype in lstm_train_schema.items() if name != 'bg+1:00'
}


In [4]:
# Participant flag columns listed for the training data
train_participants = [
    'p_num_p01', 'p_num_p02', 'p_num_p03', 'p_num_p04', 'p_num_p05',
    'p_num_p06', 'p_num_p10', 'p_num_p11', 'p_num_p12',
]
# Participant flag columns listed for the test data
test_participants = [
    'p_num_p01', 'p_num_p02', 'p_num_p04', 'p_num_p05', 'p_num_p06',
    'p_num_p10', 'p_num_p11', 'p_num_p12', 'p_num_p15', 'p_num_p16',
    'p_num_p18', 'p_num_p19', 'p_num_p21', 'p_num_p22', 'p_num_p24',
]
# Sorted union of both lists, without duplicates
all_participants = sorted({*train_participants, *test_participants})

Features selected for predicting heart rate

In [5]:
# Input columns for the heart-rate model. The target `hr` is deliberately NOT
# listed: the training cell selects `hr_pred_feature_cols + ['hr']`, so listing
# it here would request the column twice and would feed the target to the model
# as one of its own features.
hr_pred_feature_cols = ['time','bg','insulin','carbs','steps','cals'] + activities + all_participants

In [29]:
# Read the `lstm_sg_*` CSVs, forcing the column dtypes declared in the schemas
lstm_train_df = pl.read_csv(
    '../../data/lstm_sg_train.csv',
    schema_overrides=lstm_train_schema,
)
lstm_test_df = pl.read_csv(
    '../../data/lstm_sg_test.csv',
    schema_overrides=lstm_test_schema,
)

In [42]:
# Stack train and test rows (without id/time_delta and the train-only target)
# and keep only the rows where `hr` is present.
all_hr_pred_df = pl.concat([
    lstm_train_df.drop(['id','time_delta','bg+1:00']),
    lstm_test_df.drop(['id','time_delta']),
]).filter(pl.col('hr').is_not_null())
print(all_hr_pred_df.columns)

# Min-max scale the features and the target with separate scalers;
# the target is flattened back to 1-D after scaling.
X_scaler = MinMaxScaler()
t_scaler = MinMaxScaler()
X = X_scaler.fit_transform(all_hr_pred_df.drop('hr').to_numpy())
t = t_scaler.fit_transform(all_hr_pred_df['hr'].to_numpy().reshape(-1,1)).ravel()

X.shape, t.shape

['time', 'p_num_p01', 'p_num_p02', 'p_num_p03', 'p_num_p04', 'p_num_p05', 'p_num_p06', 'p_num_p10', 'p_num_p11', 'p_num_p12', 'p_num_p15', 'p_num_p16', 'p_num_p18', 'p_num_p19', 'p_num_p21', 'p_num_p22', 'p_num_p24', 'bg', 'insulin', 'carbs', 'hr', 'steps', 'cals', 'Aerobic Workout', 'Dancing', 'HIIT', 'Indoor climbing', 'Outdoor Bike', 'Run', 'Spinning', 'Sport', 'Swim', 'Walk', 'Weights', 'Workout', 'Yoga']


((9673371, 35), (9673371,))

In [44]:
# Show the raw max/min the target scaler was fitted on (the `hr` column in the preceding cell)
t_scaler.data_max_, t_scaler.data_min_

(array([199.3]), array([37.6]))

In [None]:
# DMatrix over all rows (the cross-validation loop below builds its own per-fold matrices)
dtrain_full = xgb.DMatrix(X, label=t, missing=np.nan)

# Values reported as the best trial by the Optuna search in this notebook
best_boost_round = 467

best_params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    booster='gbtree',
    tree_method='hist',
    device='cuda',
    max_depth=10,
    learning_rate=0.7057888338320615,
    min_child_weight=8,
    reg_lambda=0.01696500898391533,
)

cv_scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 5-fold CV: fit on four folds, early-stop and score (MSE on the scaled target) on the fifth
for train_index, val_index in kf.split(X):
    X_train, t_train = X[train_index], t[train_index]
    X_val, t_val = X[val_index], t[val_index]

    dtrain = xgb.DMatrix(X_train, label=t_train, missing=np.nan)
    dval = xgb.DMatrix(X_val, label=t_val, missing=np.nan)

    model = xgb.train(
        best_params,
        dtrain,
        num_boost_round=best_boost_round,
        evals=[(dval, 'eval')],
        early_stopping_rounds=50,
        verbose_eval=False,
    )

    preds = model.predict(dval)
    score = mean_squared_error(t_val, preds)
    print(f'Fold score: {score}')
    cv_scores.append(score)

mean_cv = np.mean(cv_scores)
mean_cv

KeyboardInterrupt: 

In [None]:
import xgboost as xgb
import optuna
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated MSE of an XGBoost regressor.

    Reads the module-level arrays ``X`` (features) and ``t`` (target).

    Args:
        trial: Optuna trial used to sample the booster parameters and the
            number of boosting rounds.

    Returns:
        Mean of the per-fold mean squared errors (lower is better).
    """
    # Sampling order is kept: max_depth, learning_rate, min_child_weight,
    # reg_lambda, then num_boost_round.
    booster_params = dict(
        objective='reg:squarederror',
        eval_metric='rmse',
        booster='gbtree',
        tree_method='hist',
        device='cuda',
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    )
    n_rounds = trial.suggest_int('num_boost_round', 50, 500)

    fold_mse = []
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)

    for fit_idx, holdout_idx in splitter.split(X):
        holdout_target = t[holdout_idx]
        fit_matrix = xgb.DMatrix(X[fit_idx], label=t[fit_idx], missing=np.nan)
        holdout_matrix = xgb.DMatrix(X[holdout_idx], label=holdout_target, missing=np.nan)

        # The held-out fold drives early stopping and is also the fold that is scored
        booster = xgb.train(
            booster_params,
            fit_matrix,
            num_boost_round=n_rounds,
            evals=[(holdout_matrix, 'eval')],
            early_stopping_rounds=50,
            verbose_eval=False,
        )

        fold_mse.append(mean_squared_error(holdout_target, booster.predict(holdout_matrix)))

    return np.mean(fold_mse)


def optimize_hyperparameters(n_trials=20, seed=None):
    """Run an Optuna study that minimises ``objective`` and report the best trial.

    Args:
        n_trials: Number of trials to run.
        seed: Optional seed for the TPE sampler so the sequence of suggested
            parameters is repeatable. ``None`` keeps Optuna's default
            (unseeded) behaviour.

    Returns:
        ``study.best_params``: the sampled values of the best trial. This
        includes ``num_boost_round``, which ``objective`` passes to
        ``xgb.train`` separately from the booster parameter dict.
    """
    # TPE is the sampler Optuna uses by default for single-objective studies;
    # it is created explicitly here only so that it can be seeded.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    print("Best trial:")
    trial = study.best_trial
    print(f"  Value: {trial.value}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    return study.best_params

# Same seed as seed_everything() so the search can be repeated
best_params = optimize_hyperparameters(n_trials=50, seed=1024)


[I 2024-11-27 03:50:49,983] A new study created in memory with name: no-name-30c34ab3-b6b5-4f76-848d-f6f993e510d7
[I 2024-11-27 03:53:28,203] Trial 0 finished with value: 0.0039870210187792085 and parameters: {'max_depth': 4, 'learning_rate': 0.534466221123504, 'min_child_weight': 9, 'reg_lambda': 2.7560299352139753e-08, 'num_boost_round': 130}. Best is trial 0 with value: 0.0039870210187792085.
[I 2024-11-27 04:01:57,864] Trial 1 finished with value: 0.004107003943048157 and parameters: {'max_depth': 8, 'learning_rate': 0.00652046496450349, 'min_child_weight': 6, 'reg_lambda': 7.779330490591112e-07, 'num_boost_round': 466}. Best is trial 0 with value: 0.0039870210187792085.
[I 2024-11-27 04:05:59,179] Trial 2 finished with value: 0.0025609188299128964 and parameters: {'max_depth': 7, 'learning_rate': 0.36065959143027126, 'min_child_weight': 9, 'reg_lambda': 0.1624143688183076, 'num_boost_round': 292}. Best is trial 2 with value: 0.0025609188299128964.
[I 2024-11-27 04:10:04,050] Trial

Best trial:
  Value: 0.0004114389510458025
  Params: 
    max_depth: 10
    learning_rate: 0.7057888338320615
    min_child_weight: 8
    reg_lambda: 0.01696500898391533
    num_boost_round: 467


Load the datasets

# Imputing the bg rows

In [4]:
# Read the `lstm_hr_*` CSVs, forcing the column dtypes declared in the schemas
lstm_hr_train_df = pl.read_csv(
    '../../data/lstm_hr_train.csv',
    schema_overrides=lstm_train_schema,
)
lstm_hr_test_df = pl.read_csv(
    '../../data/lstm_hr_test.csv',
    schema_overrides=lstm_test_schema,
)

In [5]:
# Stack train and test rows (without id/time_delta and the train-only target)
# and keep only the rows where `bg` is present.
all_bg_pred_df = pl.concat([
    lstm_hr_train_df.drop(['id','time_delta','bg+1:00']),
    lstm_hr_test_df.drop(['id','time_delta']),
]).filter(pl.col('bg').is_not_null())

# Min-max scale the features and the target with separate scalers.
# The scaled target is left as a single-column 2-D array here.
X_scaler = MinMaxScaler()
t_scaler = MinMaxScaler()
X = X_scaler.fit_transform(all_bg_pred_df.drop('bg').to_numpy())
t = t_scaler.fit_transform(all_bg_pred_df['bg'].to_numpy().reshape(-1,1))

X.shape, t.shape

((12848343, 35), (12848343, 1))

In [8]:
# List the columns of the bg prediction frame (`bg` is still present; it is only dropped when building X)
all_bg_pred_df.columns

['time',
 'p_num_p01',
 'p_num_p02',
 'p_num_p03',
 'p_num_p04',
 'p_num_p05',
 'p_num_p06',
 'p_num_p10',
 'p_num_p11',
 'p_num_p12',
 'p_num_p15',
 'p_num_p16',
 'p_num_p18',
 'p_num_p19',
 'p_num_p21',
 'p_num_p22',
 'p_num_p24',
 'bg',
 'insulin',
 'carbs',
 'hr',
 'steps',
 'cals',
 'Aerobic Workout',
 'Dancing',
 'HIIT',
 'Indoor climbing',
 'Outdoor Bike',
 'Run',
 'Spinning',
 'Sport',
 'Swim',
 'Walk',
 'Weights',
 'Workout',
 'Yoga']

In [6]:
# Show the raw max/min the target scaler was fitted on (the `bg` column in this section)
t_scaler.data_max_, t_scaler.data_min_

(array([27.8]), array([2.2]))

In [7]:
import xgboost as xgb
import optuna
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated MSE of an XGBoost regressor.

    Reads the module-level arrays ``X`` (features) and ``t`` (target).

    Args:
        trial: Optuna trial used to sample the booster parameters and the
            number of boosting rounds.

    Returns:
        Mean of the per-fold mean squared errors (lower is better).
    """
    # Sampling order is kept: max_depth, learning_rate, min_child_weight,
    # reg_lambda, then num_boost_round.
    booster_params = dict(
        objective='reg:squarederror',
        eval_metric='rmse',
        booster='gbtree',
        tree_method='hist',
        device='cuda',
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    )
    n_rounds = trial.suggest_int('num_boost_round', 50, 500)

    fold_mse = []
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)

    for fit_idx, holdout_idx in splitter.split(X):
        holdout_target = t[holdout_idx]
        fit_matrix = xgb.DMatrix(X[fit_idx], label=t[fit_idx], missing=np.nan)
        holdout_matrix = xgb.DMatrix(X[holdout_idx], label=holdout_target, missing=np.nan)

        # The held-out fold drives early stopping and is also the fold that is scored
        booster = xgb.train(
            booster_params,
            fit_matrix,
            num_boost_round=n_rounds,
            evals=[(holdout_matrix, 'eval')],
            early_stopping_rounds=50,
            verbose_eval=False,
        )

        fold_mse.append(mean_squared_error(holdout_target, booster.predict(holdout_matrix)))

    return np.mean(fold_mse)


def optimize_hyperparameters(n_trials=20, seed=None):
    """Run an Optuna study that minimises ``objective`` and report the best trial.

    Args:
        n_trials: Number of trials to run.
        seed: Optional seed for the TPE sampler so the sequence of suggested
            parameters is repeatable. ``None`` keeps Optuna's default
            (unseeded) behaviour.

    Returns:
        ``study.best_params``: the sampled values of the best trial. This
        includes ``num_boost_round``, which ``objective`` passes to
        ``xgb.train`` separately from the booster parameter dict.
    """
    # TPE is the sampler Optuna uses by default for single-objective studies;
    # it is created explicitly here only so that it can be seeded.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    print("Best trial:")
    trial = study.best_trial
    print(f"  Value: {trial.value}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    return study.best_params

# Same seed as seed_everything() so the search can be repeated
best_params = optimize_hyperparameters(n_trials=25, seed=1024)


[I 2024-11-27 11:12:25,999] A new study created in memory with name: no-name-08104fb7-8c00-4e54-a446-7942fc5b97b1
[I 2024-11-27 11:14:44,338] Trial 0 finished with value: 0.006891279743088864 and parameters: {'max_depth': 8, 'learning_rate': 0.3043410100585595, 'min_child_weight': 2, 'reg_lambda': 1.0246609805689392e-07, 'num_boost_round': 76}. Best is trial 0 with value: 0.006891279743088864.
[I 2024-11-27 11:17:19,851] Trial 1 finished with value: 0.007333521855663806 and parameters: {'max_depth': 5, 'learning_rate': 0.8695852939610327, 'min_child_weight': 4, 'reg_lambda': 3.917752038200939e-05, 'num_boost_round': 153}. Best is trial 0 with value: 0.006891279743088864.
[I 2024-11-27 11:21:17,513] Trial 2 finished with value: 0.009861646730914718 and parameters: {'max_depth': 9, 'learning_rate': 0.0047074619929575795, 'min_child_weight': 4, 'reg_lambda': 8.941119954360705e-07, 'num_boost_round': 150}. Best is trial 0 with value: 0.006891279743088864.
[I 2024-11-27 11:22:59,160] Trial 

Best trial:
  Value: 0.0020402501658796112
  Params: 
    max_depth: 9
    learning_rate: 0.9654913454263729
    min_child_weight: 7
    reg_lambda: 0.06472962504852985
    num_boost_round: 423


# Validation Plotting

In [None]:
# Frames read from the `lstm_*` CSVs
lstm_train_df = pl.read_csv(
    '../../data/lstm_train.csv',
    schema_overrides=lstm_train_schema,
)
lstm_test_df = pl.read_csv(
    '../../data/lstm_test.csv',
    schema_overrides=lstm_test_schema,
)
# Frames read from the `lstm_hr_*` CSVs (the files the hr imputation helper in this notebook writes)
lstm_hr_train_df = pl.read_csv(
    '../../data/lstm_hr_train.csv',
    schema_overrides=lstm_train_schema,
)
lstm_hr_test_df = pl.read_csv(
    '../../data/lstm_hr_test.csv',
    schema_overrides=lstm_test_schema,
)

Split the data into some small sections to assess different imputation methods

In [None]:
# One row per `id` holding its `time` and `hr` values as lists, restricted to
# the ids whose first `hr` entry is not null.
bg_train_pred_df = (
    lstm_train_df
    .group_by('id')
    .agg(pl.col('time'), pl.col('hr'))
    .filter(
        pl.col('hr').list.eval(pl.element().is_not_null()).list.get(0)
    )
)
bg_train_pred_df



In [None]:
# Pick one section id and pull its rows from each version of the data.
# Nulls are replaced by 0 in every extracted section.
ids = 'p05_550'
train_section = lstm_train_df.filter(pl.col('id') == ids).fill_null(0)
hr_section = lstm_hr_train_df.filter(pl.col('id') == ids).fill_null(0)
# NOTE(review): `imputed_clean_hr_df` is only created by the KNN imputation cell
# further down the notebook, so that cell has to be run before this line.
knn_section = imputed_clean_hr_df.filter(pl.col('id') == ids).fill_null(0)
train_section

In [None]:
def _plot_section(ax, section_pd):
    """Draw `hr` as a red line on `ax` and the other readings as scatter points on a twin y-axis.

    Returns the twin axis that carries the scatter series.
    """
    sns.lineplot(data=section_pd, x='time', y='hr', marker='.', label='hr', color='red', ax=ax)
    twin = ax.twinx()
    for column, colour in (('bg', 'green'), ('insulin', 'blue'), ('carbs', 'orange'),
                           ('steps', 'purple'), ('cals', 'pink')):
        sns.scatterplot(data=section_pd, x='time', y=column, marker='.', label=column, ax=twin, color=colour)
    return twin


# Three stacked panels, one per version of the selected section
fig, (ax1, ax3, ax5) = plt.subplots(3, 1, figsize=(20, 10))

# Convert each section to pandas once instead of once per plotted series
ax2 = _plot_section(ax1, train_section.to_pandas())
ax1.set_title('train_section (lstm_train_df)')

ax4 = _plot_section(ax3, hr_section.to_pandas())
ax3.set_title('hr_section (lstm_hr_train_df)')

# The third panel shows the KNN-imputed hr. It previously re-plotted
# `train_section`, which duplicated the first panel and left `knn_section` unused.
sns.lineplot(data=knn_section.to_pandas(), x='time', y='hr', marker='.', label='hr', color='red', ax=ax5)
ax5.set_title('knn_section (imputed_clean_hr_df)')

plt.tight_layout()
plt.show()


In [None]:
# Obtained from hyperparameter tuning on google colab
best_params = dict(
    subsample=1.0,
    n_estimators=200,
    max_depth=10,
    learning_rate=0.2,
    colsample_bytree=0.8,
)

final_model = xgb.XGBRegressor(
    **best_params,
    objective="reg:squarederror",
    tree_method='hist',
    device='cuda',
    n_jobs=-1,
    random_state=42,
)

# Rows with a known `hr`, reduced to the model's feature columns plus the target
clean_train_hr_df, clean_test_hr_df = (
    frame.filter(pl.col('hr').is_not_null()).select(hr_pred_feature_cols + ['hr'])
    for frame in (lstm_train_df, lstm_test_df)
)

clean_hr_df = pl.concat([clean_train_hr_df, clean_test_hr_df])

# Split into feature matrix and 1-D target vector, then fit on every known-hr row
X = clean_hr_df.select(hr_pred_feature_cols)
y = clean_hr_df.select('hr')
X_np, y_np = X.to_numpy(), y.to_numpy().ravel()

final_model.fit(X_np, y_np)

def imputate_hr_values(df, model, output_path):
    """Fill the null `hr` values of `df` with model predictions and write the result as CSV.

    Args:
        df: Polars DataFrame containing `hr` and every column named in the
            module-level `hr_pred_feature_cols`.
        model: Fitted regressor exposing `predict`; it receives the feature
            columns of the rows whose `hr` is null.
        output_path: Destination path of the CSV file.

    Returns:
        None. The completed frame is written to `output_path`.
    """
    # The row mask and the feature rows come from the same polars null test,
    # so the predictions always line up one-to-one with the masked rows.
    missing_mask = df['hr'].is_null().to_numpy()
    missing_count = missing_mask.sum()
    print(f"Number of missing 'hr' values: {missing_count}")

    pd_df = df.to_pandas()

    # Only predict when something is missing, so the model is never called on an empty matrix
    if missing_count > 0:
        X_missing_np = (
            df.filter(pl.col('hr').is_null())
            .select(hr_pred_feature_cols)
            .to_numpy()
        )
        # Use the model that was passed in (previously the global `final_model` was used)
        pd_df.loc[missing_mask, 'hr'] = model.predict(X_missing_np)

    pd_df.to_csv(output_path, index=False)

# imputate_hr_values(lstm_train_df, final_model, '../../data/lstm_hr_train.csv')
# imputate_hr_values(lstm_test_df, final_model, '../../data/lstm_hr_test.csv')

    




In [None]:
# Horizontal bar chart of the fitted model's importance score per feature column
importances = final_model.feature_importances_
plt.figure(figsize=(10, 6))
plt.barh(y=hr_pred_feature_cols, width=importances, color='skyblue')
plt.xlabel(xlabel='Feature Importance')
plt.title(label='Feature Importance from XGBoost')


Trying to use the KNN Imputer from scikit-learn

In [None]:
# List the columns that exist in only one of the two frames
print('Train columns not in test')
for col in lstm_train_df.columns:
    if col in lstm_test_df.columns:
        continue
    print(col)

print('Test columns not in train')
for col in lstm_test_df.columns:
    if col in lstm_train_df.columns:
        continue
    print(col)

In [None]:
from sklearn.impute import KNNImputer

# Stack train (without its target) and test rows. `id` is kept on the frame so
# it can be re-attached after imputation, but it is left out of the matrix
# handed to the imputer.
knn_input_df = pl.concat([lstm_train_df.drop('bg+1:00'), lstm_test_df])
knn_feature_cols = [name for name in knn_input_df.columns if name != 'id']
knn_data = knn_input_df.select(knn_feature_cols).to_numpy()


knn_imputer = KNNImputer(n_neighbors=5, weights='uniform')

knn_cleaned = knn_imputer.fit_transform(knn_data)

# Rebuild the frame with exactly the columns that were imputed. The previous
# `schema=lstm_test_schema` still listed `id`, i.e. one more name than the
# matrix has columns. `id` is put back as the first column so the result can
# be filtered by section id.
# NOTE(review): the imputer returns a float array, so the flag columns are
# expected to come back as 0.0/1.0 floats rather than Boolean — confirm that
# is acceptable downstream.
imputed_clean_hr_df = (
    pl.DataFrame(knn_cleaned, schema=knn_feature_cols, orient='row')
    .with_columns(knn_input_df['id'])
    .select(['id'] + knn_feature_cols)
)



