In [1]:
# pip install umap-learn

In [2]:
import warnings
warnings.simplefilter('ignore')

In [3]:
import pandas as pd
from econml.dml import CausalForestDML, SparseLinearDML
from econml.dr import SparseLinearDRLearner, ForestDRLearner
from econml.metalearners import XLearner
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import numpy as np
from flaml import AutoML
from sklearn.base import BaseEstimator, clone
import warnings

import numpy as np
import matplotlib.pyplot as plt
import scipy
from scipy import stats
import scipy.special
from sklearn.linear_model import LassoCV, LinearRegression, ElasticNetCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.base import clone
import joblib
import flaml
from statsmodels.api import OLS
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd

import plotnine as p9
import shap
import math

import dill as pickle
import shap
import re
import umap

In [None]:
import numpy as np
from flaml import AutoML
from sklearn.base import BaseEstimator, clone
import warnings
warnings.simplefilter('ignore')
###################################
# AutoML models
###################################

# FLAML models don't return "self" at end of fit. We create this wrapper.

class AutoMLWrap(BaseEstimator):
    """Regressor wrapper whose ``fit`` returns ``self``.

    Parameters
    ----------
    model : estimator
        Template estimator. It is cloned on every ``fit`` call and the clone
        is what gets trained.
    automl : object
        Stored on the instance as-is; ``fit`` and ``predict`` do not use it.
    """

    def __init__(self, *, model, automl):
        self.automl = automl
        self.model = model

    def fit(self, X, y, **fit_params):
        """Train a fresh clone of ``model`` on ``(X, y)`` and return ``self``."""
        fresh = clone(self.model)
        self.model_ = fresh
        fresh.fit(X, y, **fit_params)
        return self

    def predict(self, X):
        """Return the trained clone's predictions for ``X``."""
        fitted = self.model_
        return fitted.predict(X)

# Custom r2 loss for regression, for more trustworthy learning curves.
def reg_r2(
        X_val, y_val, estimator, labels,
        X_train, y_train, weight_val=None, weight_train=None,
        *args,):
    """Custom metric: negative R^2 of ``estimator`` on the validation fold.

    Only ``X_val``, ``y_val`` and ``estimator`` are read; the remaining
    arguments are accepted and ignored.

    Returns
    -------
    tuple
        ``(-r2, {"val_loss": r2})`` — the first element is the quantity to
        minimise, the dict carries the un-negated R^2.
    """
    residuals = estimator.predict(X_val) - y_val
    mse = np.mean(residuals**2)
    centered = y_val - y_val.mean()
    total_variance = np.mean(centered**2)
    r_squared = 1-mse/total_variance
    return -1*r_squared, {"val_loss": r_squared}

def auto_reg(X, y, *, groups=None, n_splits=5, split_type='auto', time_budget=60, verbose=0, estimator_list='auto', log_file_name='flaml_log.txt'):
    """Run a FLAML regression search and return a factory for the winning model.

    Parameters
    ----------
    X, y : array-like
        Training features (converted with ``np.array``) and targets.
    groups : array-like, optional
        Forwarded to ``AutoML.fit`` only when not ``None``.
    n_splits, split_type, time_budget, verbose, estimator_list
        Forwarded to the ``AutoML`` constructor.
    log_file_name : str
        Forwarded to ``AutoML.fit``.

    Returns
    -------
    callable
        Zero-argument function; each call builds a new ``AutoMLWrap`` around a
        clone of the best model found for the best estimator.
    """
    features = np.array(X)
    search = AutoML(task='regression', time_budget=time_budget, early_stop=True,
                    eval_method='cv', n_splits=n_splits, split_type=split_type,
                    metric=reg_r2, verbose=verbose, estimator_list=estimator_list)

    # Only pass `groups` through when the caller supplied it.
    fit_options = {'log_file_name': log_file_name}
    if groups is not None:
        fit_options['groups'] = groups
    search.fit(features, y, **fit_options)

    winner = search.best_estimator

    def make_wrapper():
        template = clone(search.best_model_for_estimator(winner))
        return AutoMLWrap(model=template, automl=search)

    return make_wrapper


class AutoMLWrapCLF(BaseEstimator):
    """Classifier wrapper that returns clipped class probabilities.

    Parameters
    ----------
    model : estimator
        Template classifier; cloned on every ``fit`` call.
    automl : object
        Stored on the instance as-is; not used by the methods below.
    prop_lb : float
        Lower clipping bound for probabilities; the upper bound is
        ``1 - prop_lb``.
    """

    def __init__(self, *, model, automl, prop_lb):
        self.prop_lb = prop_lb
        self.automl = automl
        self.model = model

    def fit(self, X, y, **fit_params):
        """Train a fresh clone of ``model`` on ``(X, y)`` and return ``self``."""
        self.model_ = clone(self.model)
        self.model_.fit(X, y, **fit_params)
        return self

    def predict_proba(self, X):
        """Return ``model_.predict_proba(X)`` clipped to ``[prop_lb, 1 - prop_lb]``."""
        lower = self.prop_lb
        upper = 1-self.prop_lb
        return np.clip(self.model_.predict_proba(X), lower, upper)

    def predict(self, X):
        """Return the same clipped probabilities as ``predict_proba`` (not labels)."""
        return self.predict_proba(X)

# Custom r2 loss for classification, for more trustworthy learning curves.
def clf_r2(
        X_val, y_val, estimator, labels,
        X_train, y_train, weight_val=None, weight_train=None,
        *args,):
    """Custom metric: negative R^2 of the second ``predict_proba`` column vs ``y_val``.

    Only ``X_val``, ``y_val`` and ``estimator`` are read; the remaining
    arguments are accepted and ignored.

    Returns
    -------
    tuple
        ``(-r2, {"val_loss": r2})``.
    """
    positive_prob = estimator.predict_proba(X_val)[:, 1]
    mse = np.mean((positive_prob - y_val)**2)
    centered = y_val - y_val.mean()
    total_variance = np.mean(centered**2)
    r_squared = 1-mse/total_variance
    return -1*r_squared, {"val_loss": r_squared}

def clf_mod_log_loss(
    X_val, y_val, estimator, labels,
    X_train, y_train, weight_val=None, weight_train=None,
    *args,):
    """Custom metric: log loss with labels offset by 0.01 on both terms.

    Only ``X_val``, ``y_val`` and ``estimator`` are read; the remaining
    arguments are accepted and ignored.

    Probabilities are clipped to ``[eps, 1 - eps]`` (machine epsilon) before
    taking logs, so a classifier that outputs exactly 0 or 1 yields a large
    finite loss instead of ``inf``. Values strictly inside that range are
    unaffected.

    Returns
    -------
    tuple
        ``(loss, {"val_loss": loss})``.
    """
    preds = estimator.predict_proba(X_val)[:,1]

    # Cast to float64 first so that 1 - eps does not round back to 1.0
    # for lower-precision probability arrays.
    eps = np.finfo(float).eps
    preds = np.clip(np.asarray(preds, dtype=float), eps, 1 - eps)

    mod_log_loss = np.mean(-1* ( (.01 + y_val)*np.log(preds) + (1.01 - y_val)*np.log(1-preds)))

    return mod_log_loss, {"val_loss": mod_log_loss}

def auto_clf(
        X, y, *, groups=None, n_splits=5, split_type='auto', time_budget=60, verbose=0, estimator_list='auto', 
        log_file_name='flaml_log.txt', prop_lb=0.02):
    """Run a FLAML classification search and return a factory for the winning model.

    Parameters
    ----------
    X, y : array-like
        Training features (converted with ``np.array``) and labels.
    groups : array-like, optional
        Forwarded to ``AutoML.fit`` only when not ``None``.
    n_splits, split_type, time_budget, verbose, estimator_list
        Forwarded to the ``AutoML`` constructor; the search metric is
        ``'log_loss'``.
    log_file_name : str
        Forwarded to ``AutoML.fit``.
    prop_lb : float
        Passed to ``AutoMLWrapCLF`` as its probability clipping bound.

    Returns
    -------
    callable
        Zero-argument function; each call builds a new ``AutoMLWrapCLF`` around
        a clone of the best model found for the best estimator.
    """
    features = np.array(X)
    search = AutoML(task='classification', time_budget=time_budget, early_stop=True,
                    eval_method='cv', n_splits=n_splits, split_type=split_type,
                    metric='log_loss', verbose=verbose, estimator_list=estimator_list,
                   )

    # Only pass `groups` through when the caller supplied it.
    fit_options = {'log_file_name': log_file_name}
    if groups is not None:
        fit_options['groups'] = groups
    search.fit(features, y, **fit_options)

    winner = search.best_estimator

    def make_wrapper():
        template = clone(search.best_model_for_estimator(winner))
        return AutoMLWrapCLF(model=template, automl=search, prop_lb=prop_lb)

    return make_wrapper

In [4]:
# Load the analytic view (path is relative to the notebook's working directory)
df = pd.read_parquet('../../output/analytic_views/agg_cgm_msg_demog_raw_cgm.parquet')
print(len(df))

# Pre-train on training data
# Explicit copy: later cells add and overwrite columns on this frame, and
# assigning into a boolean-filtered selection is the pattern pandas flags
# with SettingWithCopyWarning.
df = df[df.data_split == 'train'].copy()
print(len(df))

281400
95460


In [6]:
# Treatment indicator is the received_message column
df['treated'] = df['received_message']

# Cast every bool-typed column to integers
binary_columns = df.select_dtypes(include=[bool]).columns
df[binary_columns] = df[binary_columns].astype(int)

# Reward (outcome): 100 * delta_in_range_fw_7d, clipped to its own
# 5th-95th percentile range
scaled_reward = 100 * df['delta_in_range_fw_7d']
lb, ub = scaled_reward.quantile([0.05, 0.95])
df['reward'] = scaled_reward.clip(lb, ub)
print(df['reward'].describe())

# Mean reward by treatment arm
df.groupby('treated')[['reward']].mean()

count    72042.000000
mean        -0.019374
std          9.564143
min        -18.276764
25%         -6.325950
50%         -0.017465
75%          6.223284
max         18.412388
Name: reward, dtype: float64


Unnamed: 0_level_0,reward
treated,Unnamed: 1_level_1
0.0,-0.080967
1.0,1.024199


In [7]:
# Mean of the treatment indicator
df['treated'].mean()

0.05037712130735387

# Calculate Doubly Robust (DR) scores to use for evaluating UMAP projections

In [None]:
# Threshold flags derived from the 7-day rolling columns. The two TIR flags
# additionally require time_worn_7dr > 0.5.
worn_enough = df['time_worn_7dr'].gt(0.5)
df['large_tir_drop'] = (df['in_range_7dr_7d_delta'].lt(-0.15) & worn_enough).astype(int)
df['low_tir'] = (df['in_range_7dr'].lt(0.65) & worn_enough).astype(int)
df['lows'] = df['low_7dr'].gt(0.04).astype(int)
df['very_lows'] = df['very_low_7dr'].gt(0.01).astype(int)

# Covariates for the outcome and propensity models
control_features = [
    'in_range_7dr_7d_delta',
    'time_worn_7dr',
    'in_range_7dr',
    'low_7dr',
    'very_low_7dr',
    'low_tir',
    'lows',
    'large_tir_drop',
    'very_lows',
    'using_pump',
]

# Keep only rows where every covariate and the reward are present
required_columns = [*control_features, 'reward']
df = df.dropna(subset=required_columns)
print(df.shape[0])

X = df[control_features].astype(float)   # covariates
Y = df['reward'].values                  # outcome
D = df['treated'].values                 # treatment indicator
groups = df['mrn'].values                # grouping key used for the CV splits

for arr in (X, Y, D):
    print(arr.shape)

In [None]:
# AutoML

TESTING = False

# time budget for auto-ml in seconds (advisable at least 120) [400 good from testing]
time_budget = 60 if not TESTING else 1
verbose = 1  # verbosity of auto-ml
n_splits = 10 # cross-fitting and cross-validation splits

# Find DR score outcome and treatment models with AutoML

# Settings shared by all three searches; estimator_list is passed per call so
# each search receives its own list object.
shared_search_kwargs = dict(
    n_splits=n_splits, split_type='auto', verbose=verbose, time_budget=time_budget,
)
is_control = D == 0
is_treated = D == 1

# Outcome model fitted on control rows only
model_reg_zero = auto_reg(
    X[is_control], Y[is_control], groups=groups[is_control],
    estimator_list=['rf'], **shared_search_kwargs)

# Outcome model fitted on treated rows only
model_reg_one = auto_reg(
    X[is_treated], Y[is_treated], groups=groups[is_treated],
    estimator_list=['rf'], **shared_search_kwargs)

# Treatment (propensity) model fitted on all rows
model_t = auto_clf(
    X, D, groups=groups,
    estimator_list=['rf'], **shared_search_kwargs)

In [None]:
# X-fit DR scores

# Folds are stratified on treatment and keep each group in a single fold.
cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=123)
splits = list(cv.split(X, D, groups=groups))

n = X.shape[0]
reg_zero_preds_t, reg_one_preds_t, reg_preds_t = np.zeros(n), np.zeros(n), np.zeros(n)

for train, test in splits:
    fold_X, fold_Y = X.iloc[train], Y[train]
    fold_control = D[train] == 0
    fold_treated = D[train] == 1

    # Separate outcome models for control and treated rows of the training fold
    reg_zero = model_reg_zero().fit(fold_X[fold_control], fold_Y[fold_control])
    reg_one = model_reg_one().fit(fold_X[fold_treated], fold_Y[fold_treated])

    # Out-of-fold predictions under each arm, plus the prediction for the arm
    # each held-out row actually received
    held_out_X, held_out_D = X.iloc[test], D[test]
    reg_zero_preds_t[test] = reg_zero.predict(held_out_X)
    reg_one_preds_t[test] = reg_one.predict(held_out_X)
    reg_preds_t[test] = reg_zero_preds_t[test] * (1 - held_out_D) + reg_one_preds_t[test] * held_out_D

# Out-of-fold propensities: the wrapper's predict() returns the probability
# matrix, so column 1 is taken as P(treated).
prop_matrix = cross_val_predict(model_t(), X, D, cv=splits)
prop_preds = prop_matrix[:, 1]

# aggressive clip bc we're ok with more bias and lower variance when using scores for UMAP K selection
# NOTE(review): prop_preds * (1 - prop_preds) cannot exceed 0.25, so a floor of
# .5 makes this denominator a constant .5 for every row — confirm that is intended.
overlap = np.clip(prop_preds * (1 - prop_preds), .5, np.inf)
dr_preds = (reg_one_preds_t - reg_zero_preds_t) + (Y - reg_preds_t) * (D - prop_preds) / overlap

# Intercept-only OLS on the scores with standard errors clustered by group
intercept_only = np.ones((len(dr_preds), 1))
dr_mean_fit = OLS(dr_preds, intercept_only).fit(cov_type='cluster', cov_kwds={'groups': groups})
display(dr_mean_fit.summary())

# Generate UMAP projections of raw glucose readings

In [None]:
# Columns whose name starts with "g" followed by a digit
# (presumably the raw glucose readings — verify against the data dictionary)
g_number_pattern = re.compile(r'^g\d')
columns_with_g_number = [name for name in df.columns if g_number_pattern.match(name)]
print(len(columns_with_g_number))
columns_with_g_number[:5]

In [None]:
# Seed UMAP so the embedding (and the plots built from it) is the same on every
# run. Note: umap-learn forces n_jobs=1 when random_state is set.
reducer = umap.UMAP(random_state=123)
# NOTE(review): the reducer is fitted on only 100 sampled rows here, while later
# cells use 10000 — confirm that is intended for this exploratory projection.
reducer.fit(df[columns_with_g_number].sample(n=100,random_state=123).to_numpy())
print('fit done')
embedding = reducer.transform(df[columns_with_g_number].to_numpy())
embedding.shape

In [None]:
# Attach the first two embedding columns to df as c1 and c2
for component_idx, component_name in enumerate(('c1', 'c2')):
    df[component_name] = embedding[:, component_idx]

In [None]:
# Plot components

# Fixed random sample of 1000 rows with time_worn_7dr > 0.5. This sample was
# previously built but the full frame was plotted instead.
plot_df = df.query('time_worn_7dr>0.5').sample(n=1000, random_state=123)

p9.options.figure_size = (12, 6)

(
    p9.ggplot(plot_df)
    + p9.aes(x="c1", y="c2", color="gri_14dr")
    + p9.geom_point(size=1, alpha=0.5) + p9.theme_bw()
)

In [None]:
# Same projection over all rows of df, coloured by time_worn_7dr
time_worn_plot = (
    p9.ggplot(df, p9.aes(x="c1", y="c2", color="time_worn_7dr"))
    + p9.geom_point(size=1, alpha=0.5)
    + p9.theme_bw()
)
time_worn_plot

# Test DR prediction error of different numbers of UMAP components

In [None]:
def get_umap_dims(n_comp, *, fit_sample_size=10000, random_state=123):
    """Embed the ``columns_with_g_number`` columns of ``df`` into ``n_comp`` UMAP dimensions.

    The reducer is fitted on a random sample of rows (sampling seed fixed at
    123) and then applied to every row of ``df``.

    Parameters
    ----------
    n_comp : int
        Number of UMAP components.
    fit_sample_size : int, default 10000
        Maximum number of rows used to fit the reducer; capped at ``len(df)``
        so smaller frames do not raise.
    random_state : int or None, default 123
        Seed passed to ``umap.UMAP`` so repeated calls give the same embedding.
        umap-learn forces ``n_jobs=1`` when a seed is set; pass ``None`` to
        restore the unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        The output of ``reducer.transform`` for all rows of ``df``.
    """
    readings = df[columns_with_g_number]
    n_fit = min(fit_sample_size, len(readings))
    reducer = umap.UMAP(n_components=n_comp, random_state=random_state)
    reducer.fit(readings.sample(n=n_fit, random_state=123).to_numpy())
    embedding = reducer.transform(readings.to_numpy())
    return embedding

# One embedding per candidate number of components (1 through 10)
args = range(1, 11)
umap_components = {}
for n_components in args:
    umap_components[n_components] = get_umap_dims(n_components)

In [None]:
# Calculate R-sq on DR scores for each number of components, fitting w AutoML
# 1) fit AutoMLs for each number of components
# 2) score models for each number of components
# One AutoML search per candidate embedding, regressing the DR scores on the
# embedding; each entry is a zero-argument model factory.
umap_regs = {}
for n_components in args:
    umap_regs[n_components] = auto_reg(
        umap_components[n_components], dr_preds,
        groups=groups, n_splits=n_splits, split_type='auto',
        verbose=verbose, time_budget=time_budget, estimator_list=['rf'],
    )

In [None]:
# Group-wise CV so rows sharing a group never span train and test folds
cv = GroupKFold(n_splits=n_splits)

# Mean out-of-fold R^2 of each selected model on the DR scores
umap_scores = {}
for n_components in args:
    fold_r2 = cross_val_score(
        umap_regs[n_components](), umap_components[n_components], dr_preds,
        groups=groups, cv=cv, scoring='r2',
    )
    umap_scores[n_components] = np.mean(fold_r2)

In [None]:
# Mean cross-validated R^2 on the DR scores, keyed by number of UMAP components
umap_scores

# Generate best performing K=4 UMAP components for entire DF (not just train)

In [None]:
# Train UMAP on training data
# Seeded so the saved components are reproducible across runs. Note: umap-learn
# forces n_jobs=1 when random_state is set.
reducer = umap.UMAP(n_components=4, random_state=123)
reducer.fit(df[columns_with_g_number].sample(n=10000,random_state=123).to_numpy())

In [None]:
# Apply UMAP to all data
# Re-read the unfiltered analytic view and project its g-number columns with
# the reducer fitted above.
full_df = pd.read_parquet('../../output/analytic_views/agg_cgm_msg_demog_raw_cgm.parquet')
full_readings = full_df[columns_with_g_number].to_numpy()
embedding = reducer.transform(full_readings)

In [None]:
# Save best performing UMAP components
# Give the embedding frame full_df's index so the column-wise concat lines up
# row by row even when full_df does not carry a default RangeIndex. Column
# names follow the embedding's actual width.
umap_df = pd.DataFrame(
    embedding,
    columns=[f'umap{i}' for i in range(embedding.shape[1])],
    index=full_df.index,
)
# Replace the g-number columns with the UMAP components
umap_df = pd.concat([full_df.drop(columns_with_g_number, axis=1), umap_df], axis=1)
umap_df

In [None]:
# Write the frame with UMAP components for the downstream pretraining step
umap_output_path = '../../output/pretraining/agg_cgm_msg_demog_umap.parquet'
umap_df.to_parquet(umap_output_path)