In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from scipy.stats import zscore
import os, pickle, time
from sklearn.model_selection import train_test_split

In [2]:
#local imports
from deeplearning.util import *
from deeplearning import loss_fxs, transforms
from deeplearning import callbacks
import deeplearning.metrics as metrics_api
from deeplearning import datasets
from deeplearning import fitting

In [3]:
# Pick the compute device: Apple MPS first, then CUDA, otherwise CPU.
# `torch.has_mps` is deprecated and only tells whether this PyTorch build was
# compiled with MPS support; `torch.backends.mps.is_available()` also checks
# that an MPS device can actually be used on this machine.
# The getattr guard keeps this working on PyTorch builds without the
# `torch.backends.mps` module.
if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: mps


In [4]:
# Load the raw table and recode the two string-valued binary columns
# (DRK_YN: Y/N, sex: Male/Female) as 1/0 codes in a single chained expression.
df = pd.read_csv("smoking_drinking_hyperparams.csv").assign(
    DRK_YN=lambda frame: frame['DRK_YN'].map({'Y': 1, 'N': 0}),
    sex=lambda frame: frame['sex'].map({'Male': 1, 'Female': 0}),
)

In [5]:
# Keyword arguments handed to make_model() when the network is built.
mod_args = dict(
    emb_szs=[],
    out_sz=2,
    layers=[3000, 850, 120],
    ps=[0.2, 0.2, 0.05],
    sigma=0.8,
)
# Scalar training hyperparameters (recorded in `tpd` further down).
n_iter = 0
# Peak learning rate forwarded to OneCycleLR. This must be a plain float: the
# previous trailing comma (`1e-5,`) silently turned it into the tuple (1e-5,).
max_lr = 1e-5
n_epochs = 2
pct_start = 0.3   # forwarded to OneCycleLR through lr_kwargs
grad_clip = 1.0   # passed to the fitter
num_workers = 0   # DataLoader worker processes
lams = 0.05       # wrapped in a tensor and passed as `lambdas` to CombinedInternalLosses
# Metrics that make_cbs() wraps in tracking / model-saving callbacks.
cb_metrics = [metrics_api.AUC(), metrics_api.Last()]

# DataLoader settings.
train_dl_shuffle = True
train_batch_size = val_batch_size = 128

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_df_pre, val_df_pre = train_test_split(df, test_size=0.2, random_state=42)

# Extra keyword arguments handed to the fitter as `opt_kwargs`.
opt_kw = dict(weight_decay=0.01)

In [6]:
# The label column; every other column of `df` is used as an input feature.
y_cols = ['DRK_YN']
x_cols = list(df.columns)
x_cols.remove('DRK_YN')
# Alternative split into categorical / continuous features (currently disabled):
# cat_cols = ['sex']
# cont_cols = x_cols.copy()
# cont_cols.remove('sex')

In [7]:
# Absolute output directory: <current working directory>/deeplearning/models/newmodel.
cwd = os.getcwd()
save_dir = os.path.join(cwd, 'deeplearning/models/newmodel')
# Directory creation is currently disabled:
# os.makedirs(save_dir, exist_ok=False)

In [8]:
# Keyword arguments for lr_sched_factory(); 'steps_per_epoch' is added later,
# once the training DataLoader exists.
lr_kwargs = dict(max_lr=max_lr, n_epochs=n_epochs, pct_start=pct_start)

In [9]:
def lr_sched_factory(opt, max_lr=None, n_epochs=None, steps_per_epoch=None, **kwargs):
    """Create a OneCycleLR scheduler for the optimizer ``opt``.

    Args:
        opt: optimizer whose learning rate is scheduled.
        max_lr: peak learning rate, forwarded as OneCycleLR's ``max_lr``.
        n_epochs: number of epochs, forwarded as OneCycleLR's ``epochs``.
        steps_per_epoch: forwarded unchanged to OneCycleLR.
        **kwargs: any further OneCycleLR keyword arguments (e.g. ``pct_start``).

    Returns:
        The constructed ``torch.optim.lr_scheduler.OneCycleLR`` instance.
    """
    scheduler_cls = torch.optim.lr_scheduler.OneCycleLR
    return scheduler_cls(
        opt,
        max_lr=max_lr,
        epochs=n_epochs,
        steps_per_epoch=steps_per_epoch,
        **kwargs,
    )

def make_cbs(metrics, val_dl):
    """Build the callback list for a fitter.

    For every metric, in order, two callbacks are produced: a
    ``callbacks.MetricTrackerCB`` for that metric (given the validation
    dataset's ``tfm_df``), immediately followed by a
    ``callbacks.SaveModelCBExt`` bound to that tracker, which saves under the
    module-level ``save_dir`` with ``every='best_only'``.

    Args:
        metrics: iterable of metric objects.
        val_dl: validation DataLoader; only ``val_dl.dataset.tfm_df`` is read.

    Returns:
        A flat list ``[tracker_0, saver_0, tracker_1, saver_1, ...]``.
    """
    name_prefix = ''
    checkpoint_name = f'{name_prefix}_epoch'
    cb_list = []
    for metric in metrics:
        tracker = callbacks.MetricTrackerCB(metric, d_out_val=val_dl.dataset.tfm_df)
        saver = callbacks.SaveModelCBExt(
            tracker, every='best_only', name=checkpoint_name, parent_dir=save_dir
        )
        cb_list += [tracker, saver]
    return cb_list

In [10]:
# Two-stage preprocessing: a PrelimPipeline built from the label columns,
# followed by a TabPipeline that treats every feature column as continuous
# (x_cat_cols=None).
preprocess_pipeline = transforms.Pipeline(
    [
        transforms.PrelimPipeline(y_cols),
        # Variant with separate categorical / continuous columns (currently disabled):
        # transforms.TabPipeline(x_cols, x_cont_cols=cont_cols, x_cat_cols=cat_cols, sk_kwargs={'handle_unknown':'ignore'}),
        transforms.TabPipeline(
            x_cols,
            x_cont_cols=x_cols,
            x_cat_cols=None,
            sk_kwargs={},
        ),
    ]
)

This will set `self.tr_mean` and `self.tr_std` based on this df. All subsequent calls to transform (i.e., for validation, test, or inference sets) should use `is_train_df=False`.


In [11]:
print('Preprocessing data...')

# Training split: flagged with is_train_df=True.
train_df = preprocess_pipeline(
    train_df_pre.copy(),
    inference_mode=False,
    is_train_df=True,
)

# Validation split: is_train_df is left at the pipeline's default.
# NOTE(review): presumably that default is False — confirm.
val_df = preprocess_pipeline(
    val_df_pre.copy(),
    inference_mode=False,
)

Preprocessing data...


In [12]:
# Settings shared by the training and validation datasets. Note that the same
# (empty) transforms.Pipeline() instance is handed to both.
unimodal_ds_configs = {
    'x_cols': x_cols,
    'pipeline': transforms.Pipeline(),
    'with_label': True,
}

train_ds = datasets.TabDfDataset(
    train_df,
    **unimodal_ds_configs,
    is_train_df=True,
    y_cols=y_cols,
)
val_ds = datasets.TabDfDataset(
    val_df,
    **unimodal_ds_configs,
    y_cols=y_cols,
)

In [13]:
# Training loader: shuffled mini-batches; the last incomplete batch is dropped.
# Pinned host memory only helps host-to-GPU copies on CUDA; on MPS/CPU it is
# unsupported and PyTorch emits a warning, so enable it only when on CUDA.
train_dl = DataLoader(
    train_ds,
    batch_size=train_batch_size,
    num_workers=num_workers,
    shuffle=train_dl_shuffle,
    pin_memory=(device == "cuda"),
    drop_last=True,
)

# Validation loader: the whole validation set is served as one batch.
# NOTE(review): `val_batch_size` is configured (and recorded in tpd) but is not
# used here — confirm that full-batch validation is intended.
val_dl = DataLoader(
    val_ds,
    batch_size=len(val_ds),
    num_workers=num_workers,
    shuffle=False,
    pin_memory=(device == "cuda"),
)

In [14]:
# Training-parameter dict: a snapshot of the run configuration that is pickled
# to save_dir at the end of training.
tpd = {
    'n_iter': n_iter,
    'max_lr': max_lr,
    'n_epochs': n_epochs,
    'pct_start': pct_start,
    'grad_clip': grad_clip,
    'num_workers': num_workers,
    'lams': lams,
    'cb_metrics': cb_metrics,
    'train_dl_shuffle': train_dl_shuffle,
    'train_batch_size': train_batch_size,
    'val_batch_size': val_batch_size,
    'model_args': mod_args,
    'opt_kw': opt_kw,
}

In [15]:
# Display the model arguments stored in the training-parameter dict.
tpd['model_args']

{'emb_szs': [],
 'out_sz': 2,
 'layers': [3000, 850, 120],
 'ps': [0.2, 0.2, 0.05],
 'sigma': 0.8}

In [17]:
# lr_sched_factory() forwards steps_per_epoch to OneCycleLR, so record the
# number of training batches per epoch.
steps_per_epoch = lr_kwargs['steps_per_epoch'] = len(train_dl)

# Wall-clock timing covers model construction and fitting.
start = time.time()

model = make_model(len(train_dl.dataset.x_cols), tpd['model_args'])
model.to(device)

# Evidential MSE loss, wrapped together with the model's feature_selector
# (weighted by `lams`) in a CombinedInternalLosses criterion.
criterion = loss_fxs.CombinedInternalLosses(
    model.feature_selector,
    lambdas=torch.Tensor([lams]),
    supervised_loss=loss_fxs.EvidentialMSELoss(),
)

fitters = [
    fitting.EvidentialFitter(
        model,
        train_dl,
        val_dl,
        criterion,
        grad_clip,
        device,
        lr_sched_factory,
        lr_kwargs=lr_kwargs,
        callbacks=make_cbs(cb_metrics, val_dl),
        opt_kwargs=opt_kw,
        quiet=True,
    )
]
predictors = [f.fit(n_epochs=n_epochs) for f in fitters]

end = time.time()

In [None]:
# Record the elapsed training time (in minutes, two decimals) and echo it.
tpd['total_train_time'] = f"{(end-start)/60:.2f} mins"
print(tpd['total_train_time'])

In [None]:
# Persist the training-parameter dict as tpd.pkl inside save_dir (the same
# directory that is handed to the model-saving callbacks).
# The os.makedirs call next to the save_dir definition is commented out, so make
# sure the directory exists before opening the file; exist_ok=True leaves an
# already-populated directory untouched.
os.makedirs(save_dir, exist_ok=True)
with open(os.path.join(save_dir, 'tpd.pkl'), 'wb') as f:
    pickle.dump(tpd, f)

# Dr. Heaton Method

In [None]:
# Reload the raw table for this section and recode the label as 1/0.
df = pd.read_csv("smoking_drinking_hyperparams.csv")
df['DRK_YN'] = df['DRK_YN'].map({'Y': 1, 'N': 0})

# Replace `sex` with integer dummy columns (prefix "sex"), appended after the
# remaining original columns.
df = pd.concat(
    [
        df.drop(columns='sex'),
        pd.get_dummies(df['sex'], prefix="sex", dtype=int),
    ],
    axis=1,
)

In [None]:
#convert to pytorch tensors
x_columns = df.columns.drop(['DRK_YN'])
x = torch.tensor(df[x_columns].values, dtype=torch.float32, device=device)
y = torch.tensor(df['DRK_YN'].values, dtype=torch.float32, device=device).view(-1, 1)

In [None]:
# 5-fold cross-validation of the make_model() network with the evidential MSE
# loss. Each fold trains a fresh model for at most EPOCHS epochs, evaluates on
# the held-out fold after every epoch, and stops early when EarlyStopping says so.
torch.manual_seed(42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# NOTE(review): `patience` is never passed to EarlyStopping(), which is built
# with its defaults below, so this value currently has no effect.
patience = 10
EPOCHS = 5

for fold, (train_idx, test_idx) in enumerate(kf.split(x), start=1):
    print(f"Fold #{fold}")

    x_train, x_test = x[train_idx], x[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # PyTorch DataLoader over this fold's training rows
    train_dataset = TensorDataset(x_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=108, shuffle=True)

    # Fresh model per fold, sized by the number of feature columns.
    # x and y were created on `device`, so the model has to be moved there as
    # well; otherwise the forward pass fails with a device mismatch on MPS/CUDA.
    model = make_model(x_train.shape[1], mod_args)
    model.to(device)

    es = EarlyStopping()
    optimizer = optim.Adam(model.parameters())
    # NOTE(review): the first training section uses the EvidentialMSELoss
    # instance itself as the criterion; here the result of calling
    # .edl_mse_loss() with no arguments is used — confirm it returns a callable
    # taking (output, target).
    loss_fn = loss_fxs.EvidentialMSELoss(evidence_activation_fx=loss_fxs.relu_evidence, device=device).edl_mse_loss()

    for epoch in range(1, EPOCHS + 1):
        # Training pass over the mini-batches
        model.train()
        for x_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(x_batch)
            # y_batch has shape (batch_size, 1) (see the reshape when y was built)
            loss = loss_fn(output, y_batch)
            loss.backward()
            optimizer.step()

        # Validation on the whole held-out fold in a single forward pass
        model.eval()
        with torch.no_grad():
            val_output = model(x_test)
            val_loss = loss_fn(val_output, y_test)

        if es(model, val_loss):
            break

    # Reported once per fold, after its last epoch
    print(f"Epoch {epoch}/{EPOCHS}, Validation Loss: {val_loss.item()}, {es.status}")

In [None]:
# A bare `raise` with no active exception raises RuntimeError. Presumably this
# is a deliberate stop so that "Run All" does not continue into the exploratory
# cells below — TODO confirm, and consider removing once those cells are cleaned up.
raise

In [None]:
# Display the current `df`.
# NOTE(review): `df` is reassigned by several cells in this notebook, so what is
# shown here depends on which of them ran last.
df

In [None]:
# NOTE(review): `df_heaton` is first assigned in a later cell (the read_csv from
# data.heatonresearch.com), so on a fresh kernel run top-to-bottom this cell
# raises NameError.
df_heaton

In [None]:
# Reload the raw table and split it into the 1/0-coded label and the feature frame.
# NOTE: `y` here replaces the tensor `y` built for the K-fold section above.
df = pd.read_csv("smoking_drinking_hyperparams.csv")

y = df['DRK_YN'].map({'Y': 1, 'N': 0})
X = df.drop("DRK_YN", axis=1)

In [None]:
# Column groups for the sklearn preprocessing below.
# NOTE(review): "SMK_stat_type_cd" appears in BOTH ordinal_cols and nominal_cols,
# so the ColumnTransformer will encode it twice — confirm which encoding is wanted.
ordinal_cols = ["hear_left", "hear_right", "urine_protein", "SMK_stat_type_cd"]

nominal_cols = ["sex", "SMK_stat_type_cd"]

# Not referenced by the ColumnTransformer itself; these columns reach the scaler
# through remainder='passthrough'.
numeric_cols = [
    "age", "height", "weight", "waistline",
    "sight_left", "sight_right",
    "SBP", "DBP", "BLDS",
    "tot_chole", "HDL_chole", "LDL_chole", "triglyceride",
    "hemoglobin", "serum_creatinine",
    "SGOT_AST", "SGOT_ALT", "gamma_GTP",
]

In [None]:
# TODO (from the original note): change the preprocessor to FIRST encode the
# ordinal/nominal columns, THEN standardize all columns using z-scores instead
# of a standard scaler.

# Single-step pipelines for the two kinds of categorical columns.
ordinal_transformer = Pipeline([('ordinal', OrdinalEncoder())])
nominal_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    [
        ('ord', ordinal_transformer, ordinal_cols),
        ('nom', nominal_transformer, nominal_cols),
    ],
    remainder='passthrough',  # stops the transformer from dropping the numeric columns
)

# NOTE(review): ZScoreScaler is neither defined nor explicitly imported in the
# visible cells; presumably it comes from `from deeplearning.util import *` — confirm.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('zscore_scaler', ZScoreScaler()),
])

In [None]:
# Inspect the SMK_stat_type_cd column, which is listed in both ordinal_cols and
# nominal_cols.
X['SMK_stat_type_cd']

In [None]:
# Fit the preprocessing pipeline on X and transform X in the same call.
# NOTE(review): the pipeline is fitted on the full feature frame here; no
# train/validation split is applied in this section.
X_transformed = pipeline.fit_transform(X)

In [None]:
# Display the transformed feature matrix.
X_transformed

In [None]:
# Download the jh-simple-dataset example CSV, treating 'NA' and '?' as missing
# values, then display it.
df_heaton = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA', '?'],
)
df_heaton

In [None]:
# Replace `job` with integer dummy columns (prefix "job"), appended after the
# remaining original columns, then display the result.
# NOTE: like the original, this cell is not re-runnable — once `job` has been
# dropped, running it again raises KeyError.
df_heaton = pd.concat(
    [
        df_heaton.drop(columns='job'),
        pd.get_dummies(df_heaton['job'], prefix="job", dtype=int),
    ],
    axis=1,
)
df_heaton