<a href="https://colab.research.google.com/github/jhdaws/mhealth_classification/blob/main/lstm/lstm_split%2Bstandardize.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Split + Standardize

## 1) Loading Data and Packages

In [1]:
import os
from google.colab import files
import shutil
import pickle as pkl
import random

import math
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
def save_pkl(data, save_path):
    """Pickle ``data`` into the file at ``save_path``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(save_path, "wb") as sink:
        pkl.dump(data, sink)

def load_pkl(file_path):
    """Return the object unpickled from the file at ``file_path``.

    Note: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(file_path, 'rb') as source:
        return pkl.load(source)

Upload the pickled sequence dictionaries from the local machine and move them into `/content/data`.

In [3]:
# Every uploaded file is moved into this folder (created if it is missing).
destination_folder = '/content/data'
os.makedirs(destination_folder, exist_ok=True)

# files.upload() returns a mapping keyed by the uploaded file names.
uploaded = files.upload()
for filename in uploaded:
    shutil.move(filename, os.path.join(destination_folder, filename))

Saving fitbit_seq_dicts_study_14.pkl to fitbit_seq_dicts_study_14.pkl
Saving fitbit_seq_dicts_sequence_14.pkl to fitbit_seq_dicts_sequence_14.pkl
Saving fitbit_seq_dicts_sequence_7.pkl to fitbit_seq_dicts_sequence_7.pkl
Saving fitbit_seq_dicts_study_7.pkl to fitbit_seq_dicts_study_7.pkl
Saving fitbit_seq_dicts_impute_sequence_7.pkl to fitbit_seq_dicts_impute_sequence_7.pkl
Saving fitbit_seq_dicts_impute_study_7.pkl to fitbit_seq_dicts_impute_study_7.pkl
Saving fitbit_seq_dicts_impute_sequence_14.pkl to fitbit_seq_dicts_impute_sequence_14.pkl
Saving fitbit_seq_dicts_impute_study_14.pkl to fitbit_seq_dicts_impute_study_14.pkl


In [37]:
# Load each pickled sequence dictionary from /content/data:
# {study, sequence} x {7, 14}, plus the matching "impute" variants.
fitbit_seq_dicts_study_14 = load_pkl(file_path='/content/data/fitbit_seq_dicts_study_14.pkl')
fitbit_seq_dicts_sequence_14 = load_pkl(file_path='/content/data/fitbit_seq_dicts_sequence_14.pkl')
fitbit_seq_dicts_study_7 = load_pkl(file_path='/content/data/fitbit_seq_dicts_study_7.pkl')
fitbit_seq_dicts_sequence_7 = load_pkl(file_path='/content/data/fitbit_seq_dicts_sequence_7.pkl')
fitbit_seq_dicts_impute_sequence_7 = load_pkl(file_path='/content/data/fitbit_seq_dicts_impute_sequence_7.pkl')
fitbit_seq_dicts_impute_study_7 = load_pkl(file_path='/content/data/fitbit_seq_dicts_impute_study_7.pkl')
fitbit_seq_dicts_impute_sequence_14 = load_pkl(file_path='/content/data/fitbit_seq_dicts_impute_sequence_14.pkl')
fitbit_seq_dicts_impute_study_14 = load_pkl(file_path='/content/data/fitbit_seq_dicts_impute_study_14.pkl')

## 2) Splitting Data

In [5]:
# Seed the `random` module so the random.sample-based train/val/test split is
# reproducible when the notebook is run top to bottom on a fresh kernel.
SEED = 42
random.seed(SEED)

# Fraction of ids assigned to each subset.
splits = {'train': 0.6, 'val': 0.2, 'test': 0.2}
assert math.isclose(sum(splits.values()), 1.0), 'split fractions must sum to 1'

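We want a stratified split: ids that have at least one sequence with label 1 and ids that have none are split into train/validation/test separately, and the two results are then combined.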

In [6]:
def user_id_w_visit(fitbit_seq_dict):
  """Return the ids that have at least one entry whose label equals 1.

  Args:
    fitbit_seq_dict: mapping id -> inner mapping whose values are
      2-item ``(features, label)`` pairs.

  Returns:
    List of ids, in the iteration order of ``fitbit_seq_dict``, for which
    at least one inner entry has ``label == 1``.
  """
  # Number of label == 1 entries per id.
  visits_per_id = {
      fid: sum(1 for _, label in fid_dict.values() if label == 1)
      for fid, fid_dict in fitbit_seq_dict.items()
  }

  return [fid for fid, n_visits in visits_per_id.items() if n_visits > 0]

In [7]:
def train_test_val_split(id_list, splits, seed=None):
  """Randomly partition ``id_list`` into train, validation and test lists.

  The train and test sizes are ``floor(len(id_list) * fraction)``; the
  validation set receives whatever is left, so the three sizes always add
  up to ``len(id_list)``.

  Args:
    id_list: sequence of hashable ids (every call site in this notebook
      passes ids that are dictionary keys).
    splits: mapping with ``'train'`` and ``'test'`` fractions; ``'val'`` is
      implied by the remainder.
    seed: optional seed. When given, a private ``random.Random(seed)`` is
      used so the result is reproducible and the global RNG state is left
      alone. When ``None`` (default) the module-level ``random`` functions
      are used, exactly as before.

  Returns:
    Tuple ``(train_ids, val_ids, test_ids)`` of disjoint lists.

  Raises:
    ValueError: from ``sample`` if a requested size is negative or larger
      than the population (e.g. fractions that sum to more than 1).
  """
  rng = random if seed is None else random.Random(seed)

  id_count = len(id_list)
  test_count = math.floor(id_count * splits['test'])
  train_count = math.floor(id_count * splits['train'])
  val_count = id_count - test_count - train_count

  train_ids = rng.sample(id_list, train_count)
  # Set lookups keep the filtering linear instead of quadratic.
  train_lookup = set(train_ids)
  remaining_ids = [uid for uid in id_list if uid not in train_lookup]

  val_ids = rng.sample(remaining_ids, val_count)
  val_lookup = set(val_ids)
  test_ids = [uid for uid in remaining_ids if uid not in val_lookup]

  return train_ids, val_ids, test_ids

In [8]:
def get_splits(visit_ids, all_ids, splits):
  """Split ids into train/val/test, stratified by membership in ``visit_ids``.

  The ids in ``visit_ids`` and the ids of ``all_ids`` that are not in
  ``visit_ids`` are each split with ``train_test_val_split``; the matching
  parts are then concatenated (visit ids first).

  Args:
    visit_ids: list of ids forming the first stratum.
    all_ids: iterable of every id.
    splits: fractions forwarded to ``train_test_val_split``.

  Returns:
    Dict with keys ``'train'``, ``'val'`` and ``'test'`` mapping to id lists.
  """
  novisit_ids = [uid for uid in all_ids if uid not in visit_ids]

  # The visit stratum is split first, then the no-visit stratum.
  visit_parts = train_test_val_split(visit_ids, splits)
  novisit_parts = train_test_val_split(novisit_ids, splits)

  return {
      subset: with_visit + without_visit
      for subset, with_visit, without_visit in zip(
          ('train', 'val', 'test'), visit_parts, novisit_parts)
  }

In [9]:
# Ids with at least one label == 1 entry, computed for each dataset variant.
vids_study_14 = user_id_w_visit(fitbit_seq_dict=fitbit_seq_dicts_study_14)
vids_sequence_14 = user_id_w_visit(fitbit_seq_dict=fitbit_seq_dicts_sequence_14)
vids_study_7 = user_id_w_visit(fitbit_seq_dict=fitbit_seq_dicts_study_7)
vids_sequence_7 = user_id_w_visit(fitbit_seq_dict=fitbit_seq_dicts_sequence_7)
vids_study_14_impute = user_id_w_visit(fitbit_seq_dict=fitbit_seq_dicts_impute_study_14)
vids_sequence_14_impute = user_id_w_visit(fitbit_seq_dict=fitbit_seq_dicts_impute_sequence_14)
vids_study_7_impute = user_id_w_visit(fitbit_seq_dict=fitbit_seq_dicts_impute_study_7)
vids_sequence_7_impute = user_id_w_visit(fitbit_seq_dict=fitbit_seq_dicts_impute_sequence_7)

In [10]:
# Stratified train/val/test id lists for each dataset variant.
split_study_14 = get_splits(
    visit_ids=vids_study_14,
    all_ids=fitbit_seq_dicts_study_14.keys(),
    splits=splits,
)
split_sequence_14 = get_splits(
    visit_ids=vids_sequence_14,
    all_ids=fitbit_seq_dicts_sequence_14.keys(),
    splits=splits,
)
split_study_7 = get_splits(
    visit_ids=vids_study_7,
    all_ids=fitbit_seq_dicts_study_7.keys(),
    splits=splits,
)
split_sequence_7 = get_splits(
    visit_ids=vids_sequence_7,
    all_ids=fitbit_seq_dicts_sequence_7.keys(),
    splits=splits,
)
split_study_14_impute = get_splits(
    visit_ids=vids_study_14_impute,
    all_ids=fitbit_seq_dicts_impute_study_14.keys(),
    splits=splits,
)
split_sequence_14_impute = get_splits(
    visit_ids=vids_sequence_14_impute,
    all_ids=fitbit_seq_dicts_impute_sequence_14.keys(),
    splits=splits,
)
split_study_7_impute = get_splits(
    visit_ids=vids_study_7_impute,
    all_ids=fitbit_seq_dicts_impute_study_7.keys(),
    splits=splits,
)
split_sequence_7_impute = get_splits(
    visit_ids=vids_sequence_7_impute,
    all_ids=fitbit_seq_dicts_impute_sequence_7.keys(),
    splits=splits,
)

In [11]:
# Persist every split dictionary under /content/data ...
save_pkl(data=split_study_14, save_path='/content/data/split_study_14.pkl')
save_pkl(data=split_sequence_14, save_path='/content/data/split_sequence_14.pkl')
save_pkl(data=split_study_7, save_path='/content/data/split_study_7.pkl')
save_pkl(data=split_sequence_7, save_path='/content/data/split_sequence_7.pkl')
save_pkl(data=split_study_14_impute, save_path='/content/data/split_study_14_impute.pkl')
save_pkl(data=split_sequence_14_impute, save_path='/content/data/split_sequence_14_impute.pkl')
save_pkl(data=split_study_7_impute, save_path='/content/data/split_study_7_impute.pkl')
save_pkl(data=split_sequence_7_impute, save_path='/content/data/split_sequence_7_impute.pkl')

# ... then trigger a browser download for each saved file.
for _split_path in (
    '/content/data/split_study_14.pkl',
    '/content/data/split_sequence_14.pkl',
    '/content/data/split_study_7.pkl',
    '/content/data/split_sequence_7.pkl',
    '/content/data/split_study_14_impute.pkl',
    '/content/data/split_sequence_14_impute.pkl',
    '/content/data/split_study_7_impute.pkl',
    '/content/data/split_sequence_7_impute.pkl',
):
    files.download(_split_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 3) Standardize

In [12]:
# One-hot response columns; std_mean_values assigns these (mean, std) = (0, 1).
oh_features = [
    # diet
    'diet_response_0.0',
    'diet_response_1.0',
    'diet_response_2.0',
    'diet_response_nan',
    # medication
    'medication_response_0.0',
    'medication_response_1.0',
    'medication_response_2.0',
    'medication_response_nan',
    # symptoms
    'symptoms_response_0.0',
    'symptoms_response_1.0',
    'symptoms_response_2.0',
    'symptoms_response_nan',
]

# Columns scaled for the non-impute datasets.
all_features = [
    'avgWeight_per_day',
    'calories',
    'heart',
    'steps',
    'minutes_asleep',
    'minutes_awake',
    'temp/skin_nightlyRelative',
    'spo2_avg',
    'spo2_min',
    'spo2_max',
    'hrv_dailyRmssd',
    'hrv_deepRmssd',
    'br_breathingRate',
    'out_of_range_zone_cal',
    'out_of_range_zone_max_hr',
    'out_of_range_zone_min_hr',
    'fat_burn_zone_cal',
    'fat_burn_zone_max_hr',
    'fat_burn_zone_min_hr',
    'cardio_zone_cal',
    'cardio_zone_max_hr',
    'cardio_zone_min_hr',
    'peak_zone_cal',
    'peak_zone_max_hr',
    'peak_zone_min_hr',
]

# Columns scaled for the impute datasets.
base_features = [
    'calories',
    'heart',
    'steps',
]

In [13]:
def std_mean_values(train_ids, fitbit_seq_dict, scale_features, oh_features):
  """Fit per-column scaling statistics on the rows belonging to ``train_ids``.

  Args:
    train_ids: ids whose feature frames are used to fit the scaler.
    fitbit_seq_dict: mapping id -> inner mapping whose values hold a
      features DataFrame at index 0.
    scale_features: columns passed to ``StandardScaler.fit``.
    oh_features: columns that receive the identity scaling ``(0, 1)``.

  Returns:
    Dict mapping column name -> ``(mean, std)``, where ``std`` is the square
    root of the scaler's fitted variance. A message is printed for every
    fitted column whose variance is exactly 0.
  """
  # Gather every feature frame belonging to a training id, in order.
  frames = [
      entry[0]
      for uid in train_ids
      for entry in fitbit_seq_dict[uid].values()
  ]
  stacked = pd.concat(frames).reset_index(drop=True)

  scaler = StandardScaler()
  scaler.fit(stacked[scale_features])

  scaling_vals = {}
  fitted = zip(scaler.feature_names_in_, scaler.mean_, scaler.var_)
  for name, mean, var in fitted:
    scaling_vals[name] = (mean, math.sqrt(var))

    if var == 0:
      print(name, ' has 0 variance')

  # One-hot columns pass through unchanged under (x - 0) / 1.
  scaling_vals.update((feature, (0, 1)) for feature in oh_features)

  return scaling_vals

In [38]:
# Scaling statistics are fit on the 'train' ids of each split. Each label is
# printed first so any zero-variance message appears under its dataset.
print('study 7')
scale_vals_study_7 = std_mean_values(
    train_ids=split_study_7['train'],
    fitbit_seq_dict=fitbit_seq_dicts_study_7,
    scale_features=all_features,
    oh_features=oh_features,
)
print('study 14')
scale_vals_study_14 = std_mean_values(
    train_ids=split_study_14['train'],
    fitbit_seq_dict=fitbit_seq_dicts_study_14,
    scale_features=all_features,
    oh_features=oh_features,
)
print('sequence 7')
scale_vals_sequence_7 = std_mean_values(
    train_ids=split_sequence_7['train'],
    fitbit_seq_dict=fitbit_seq_dicts_sequence_7,
    scale_features=all_features,
    oh_features=oh_features,
)
print('sequence 14')
scale_vals_sequence_14 = std_mean_values(
    train_ids=split_sequence_14['train'],
    fitbit_seq_dict=fitbit_seq_dicts_sequence_14,
    scale_features=all_features,
    oh_features=oh_features,
)
# The impute variants scale only base_features and have no one-hot columns.
print('study 7 impute')
scale_vals_study_7_impute = std_mean_values(
    train_ids=split_study_7_impute['train'],
    fitbit_seq_dict=fitbit_seq_dicts_impute_study_7,
    scale_features=base_features,
    oh_features=[],
)
print('study 14 impute')
scale_vals_study_14_impute = std_mean_values(
    train_ids=split_study_14_impute['train'],
    fitbit_seq_dict=fitbit_seq_dicts_impute_study_14,
    scale_features=base_features,
    oh_features=[],
)
print('sequence 7 impute')
scale_vals_sequence_7_impute = std_mean_values(
    train_ids=split_sequence_7_impute['train'],
    fitbit_seq_dict=fitbit_seq_dicts_impute_sequence_7,
    scale_features=base_features,
    oh_features=[],
)
print('sequence 14 impute')
scale_vals_sequence_14_impute = std_mean_values(
    train_ids=split_sequence_14_impute['train'],
    fitbit_seq_dict=fitbit_seq_dicts_impute_sequence_14,
    scale_features=base_features,
    oh_features=[],
)

study 7
out_of_range_zone_min_hr  has 0 variance
peak_zone_max_hr  has 0 variance
study 14
out_of_range_zone_min_hr  has 0 variance
peak_zone_max_hr  has 0 variance
sequence 7
out_of_range_zone_min_hr  has 0 variance
peak_zone_max_hr  has 0 variance
sequence 14
out_of_range_zone_min_hr  has 0 variance
peak_zone_max_hr  has 0 variance
study 7 impute
study 14 impute
sequence 7 impute
sequence 14 impute


In [18]:
# Columns reported above as having 0 variance in the training data; they are
# removed from the non-impute datasets after scaling.
drop = [
    'out_of_range_zone_min_hr',
    'peak_zone_max_hr',
]

In [31]:
def standardize_data(fitbit_seq_dict, scale_vals, drop):
  """Return a standardized copy of a nested sequence dictionary.

  Unlike a shallow ``dict.copy()``, this builds new inner dictionaries and
  works on a copy of every features frame, so the caller's data is never
  modified and the function can safely be re-run on the same input.

  Args:
    fitbit_seq_dict: mapping id -> inner mapping whose values are
      ``(features DataFrame, label)`` pairs.
    scale_vals: mapping column name -> ``(mean, std)``. Each listed column is
      replaced by ``(value - mean) / std``. A ``std`` of exactly 0 is treated
      as 1 (the column is only centred), which avoids division by zero; this
      mirrors how scikit-learn's ``StandardScaler`` handles constant columns.
    drop: column names removed from every features frame after scaling.

  Returns:
    New nested dict with the same keys, whose values are
    ``(standardized features DataFrame, label)`` pairs.

  Raises:
    KeyError: if a column named in ``scale_vals`` or ``drop`` is missing
      from a features frame.
  """
  standardized = {}
  for id_key, id_dict in tqdm(fitbit_seq_dict.items()):
    new_id_dict = {}
    for seq, seq_dict in id_dict.items():
      # Copy so the column assignments below never touch the caller's frame.
      feat = seq_dict[0].copy()

      for col, (mean, std) in scale_vals.items():
        divisor = std if std != 0 else 1
        feat[col] = (feat[col] - mean) / divisor

      feat = feat.drop(columns=drop)

      new_id_dict[seq] = (feat, seq_dict[1])
    standardized[id_key] = new_id_dict

  return standardized

In [39]:
# Standardize every dataset variant with its own scaling statistics.
final_study_7 = standardize_data(
    fitbit_seq_dict=fitbit_seq_dicts_study_7,
    scale_vals=scale_vals_study_7,
    drop=drop,
)
final_study_14 = standardize_data(
    fitbit_seq_dict=fitbit_seq_dicts_study_14,
    scale_vals=scale_vals_study_14,
    drop=drop,
)
final_sequence_7 = standardize_data(
    fitbit_seq_dict=fitbit_seq_dicts_sequence_7,
    scale_vals=scale_vals_sequence_7,
    drop=drop,
)
final_sequence_14 = standardize_data(
    fitbit_seq_dict=fitbit_seq_dicts_sequence_14,
    scale_vals=scale_vals_sequence_14,
    drop=drop,
)
# No columns are dropped from the impute variants.
final_study_7_impute = standardize_data(
    fitbit_seq_dict=fitbit_seq_dicts_impute_study_7,
    scale_vals=scale_vals_study_7_impute,
    drop=[],
)
final_study_14_impute = standardize_data(
    fitbit_seq_dict=fitbit_seq_dicts_impute_study_14,
    scale_vals=scale_vals_study_14_impute,
    drop=[],
)
final_sequence_7_impute = standardize_data(
    fitbit_seq_dict=fitbit_seq_dicts_impute_sequence_7,
    scale_vals=scale_vals_sequence_7_impute,
    drop=[],
)
final_sequence_14_impute = standardize_data(
    fitbit_seq_dict=fitbit_seq_dicts_impute_sequence_14,
    scale_vals=scale_vals_sequence_14_impute,
    drop=[],
)

  0%|          | 0/242 [00:00<?, ?it/s]

  0%|          | 0/242 [00:00<?, ?it/s]

  0%|          | 0/242 [00:00<?, ?it/s]

  0%|          | 0/242 [00:00<?, ?it/s]

In [41]:
# Persist every standardized dataset under /content/data ...
save_pkl(data=final_study_7, save_path='/content/data/final_study_7.pkl')
save_pkl(data=final_study_14, save_path='/content/data/final_study_14.pkl')
save_pkl(data=final_sequence_7, save_path='/content/data/final_sequence_7.pkl')
save_pkl(data=final_sequence_14, save_path='/content/data/final_sequence_14.pkl')
save_pkl(data=final_study_7_impute, save_path='/content/data/final_study_7_impute.pkl')
save_pkl(data=final_study_14_impute, save_path='/content/data/final_study_14_impute.pkl')
save_pkl(data=final_sequence_7_impute, save_path='/content/data/final_sequence_7_impute.pkl')
save_pkl(data=final_sequence_14_impute, save_path='/content/data/final_sequence_14_impute.pkl')

# ... then trigger a browser download for each saved file.
for _final_path in (
    '/content/data/final_study_7.pkl',
    '/content/data/final_study_14.pkl',
    '/content/data/final_sequence_7.pkl',
    '/content/data/final_sequence_14.pkl',
    '/content/data/final_study_14_impute.pkl',
    '/content/data/final_sequence_14_impute.pkl',
    '/content/data/final_study_7_impute.pkl',
    '/content/data/final_sequence_7_impute.pkl',
):
    files.download(_final_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>