In [1]:
!git clone https://kekayan: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@github.com/kekayan/progNet-SAINT.git

Cloning into 'progNet-SAINT'...
remote: Enumerating objects: 78, done.[K
remote: Counting objects: 100% (78/78), done.[K
remote: Compressing objects: 100% (56/56), done.[K
remote: Total 78 (delta 36), reused 58 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (78/78), 971.48 KiB | 8.16 MiB/s, done.
Resolving deltas: 100% (36/36), done.


In [1]:
%cd progNet-SAINT/src/
%mkdir output

/content/progNet-SAINT/src
mkdir: cannot create directory ‘output’: File exists


In [2]:
import numpy as np
import pandas as pd

import torch
from torch import nn


import torch.optim as optim
from utils import count_parameters, classification_scores, mean_sq_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader


from models import SAINT
from augmentations import embed_data_mask
from augmentations import add_noise
from pretraining import SAINT_pretrain

In [3]:
df = pd.read_csv("../data/clinical_and_other_features.csv")
df2 = pd.read_csv('../data/clinical_and_other_features_filtered.csv')

In [4]:
df= df[df["Overall Near-complete Response:  Stricter Definition"].isna()]
df.reset_index(inplace=True, drop=True)
print(df.shape, df2.shape)

(610, 84) (312, 84)


In [5]:
opt_dict = {
    'd_task': 'clf',
    'dtask': 'clf',
    'task': 'multiclass',
    'batchsize': 32,
    'pt_aug': ['mixup', 'cutmix'],
    'pt_aug_lam': 0.1,
    'pretrain_epochs': 250, #50
    'nce_temp': 0.7,
    'lam0': 0.5,
    'lam1': 10,
    'lam2': 1,
    'lam3': 10,
    'pt_projhead_style': 'diff',
    'pt_tasks': ['contrastive','denoising'],
    'mixup_lam': 0.3,
    'ssl_samples': 312,
    'lr':0.0001,
    'train_noise_type':"missing",
    'train_noise_level':0.3,
    'save_path':"./output/model.pt"
}

class AttributeDict(dict):
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

opt = AttributeDict(opt_dict)

In [6]:
def data_split(X,y,nan_mask,indices):
    x_d = {
        'data': X.values[indices],
        'mask': nan_mask.values[indices]
    }

    if x_d['data'].shape != x_d['mask'].shape:
        raise'Shape of data not same as that of nan mask!'

    y_d = {
        'data': y[indices].reshape(-1, 1)
    }
    return x_d, y_d

In [7]:
class DataSetCatCon(Dataset):
    def __init__(self, X, Y, cat_cols,task='clf',continuous_mean_std=None):

        cat_cols = list(cat_cols)
        X_mask =  X['mask'].copy()
        X = X['data'].copy()
        con_cols = list(set(np.arange(X.shape[1])) - set(cat_cols))
        self.X1 = X[:,cat_cols].copy().astype(np.int64) #categorical columns
        self.X2 = X[:,con_cols].copy().astype(np.float32) #numerical columns
        self.X1_mask = X_mask[:,cat_cols].copy().astype(np.int64) #categorical columns
        self.X2_mask = X_mask[:,con_cols].copy().astype(np.int64) #numerical columns
        self.y = Y['data']#.astype(np.float32) if regression
        self.cls = np.zeros_like(self.y,dtype=int)
        self.cls_mask = np.ones_like(self.y,dtype=int)
        if continuous_mean_std is not None:
            mean, std = continuous_mean_std
            self.X2 = (self.X2 - mean) / std

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        # X1 has categorical data, X2 has continuous
        return np.concatenate((self.cls[idx], self.X1[idx])), self.X2[idx],self.y[idx], np.concatenate((self.cls_mask[idx], self.X1_mask[idx])), self.X2_mask[idx]

In [8]:
def prepare_dataset(df,p=[.65, .15, .2]):
  df1 = df.drop(['Overall Near-complete Response:  Looser Definition','Near-complete Response (Graded Measure)'],axis=1)
  df1.columns = df1.columns.str.strip()
  pathologic_response_to_neoadjuvant_therapy = ['Pathologic response to Neoadjuvant therapy: Pathologic stage (T) following neoadjuvant therapy',
        'Pathologic response to Neoadjuvant therapy:  Pathologic stage (N) following neoadjuvant therapy',
        'Pathologic response to Neoadjuvant therapy:  Pathologic stage (M) following neoadjuvant therapy']
  # df1.drop(pathologic_response_to_neoadjuvant_therapy, axis=1, inplace=True)
  X = df1.drop('Overall Near-complete Response:  Stricter Definition',axis=1)
  y = df1['Overall Near-complete Response:  Stricter Definition']
  cont_columns = ['Date of Birth (Days)', 'Days to Surgery (from the date of diagnosis)', 'Age at last contact in EMR f/u(days)(from the date of diagnosis) ,last time patient known to be alive, unless age of death is reported(in such case the age of death',
    'Age at mammo (days)', 'Days to distant recurrence(from the date of diagnosis)', 'Days to local recurrence (from the date of diagnosis)',
    'Days to death (from the date of diagnosis)', 'Days to last local recurrence free assessment (from the date of diagnosis)',
    ]
  categorical_columns = list(set(X.columns) - set(cont_columns))

  # convert categorical columns to str type
  X[categorical_columns] = X[categorical_columns].astype(str)

  cat_idxs = [X.columns.get_loc(c) for c in categorical_columns]
  con_idxs = [X.columns.get_loc(c) for c in cont_columns]
  X["Set"] = np.random.choice(["train", "valid", "test"], p = [.65, .15, .2], size=(X.shape[0],))

  train_indices = X[X.Set=="train"].index
  valid_indices = X[X.Set=="valid"].index
  test_indices = X[X.Set=="test"].index

  X = X.drop(columns=['Set'])
  temp = X.fillna("MissingValue")
#   creates a bert style mask for the missing values
  nan_mask = temp.ne("MissingValue").astype(int)

  cat_dims = []
  for col in categorical_columns:
      X[col] = X[col].fillna("MissingValue")
      l_enc = LabelEncoder()
      X[col] = l_enc.fit_transform(X[col].values)
      cat_dims.append(len(l_enc.classes_))

  for col in cont_columns:
      X[col] = pd.to_numeric(X[col], errors='coerce')
      X.fillna(X.loc[train_indices, col].mean(), inplace=True)
  y = y.values
  l_enc = LabelEncoder()
  y = l_enc.fit_transform(y)
  X_train, y_train = data_split(X,y,nan_mask,train_indices)
  X_valid, y_valid = data_split(X,y,nan_mask,valid_indices)
  X_test, y_test = data_split(X,y,nan_mask,test_indices)
  train_mean, train_std = np.array(X_train['data'][:,con_idxs],dtype=np.float32).mean(0), np.array(X_train['data'][:,con_idxs],dtype=np.float32).std(0)
  train_std = np.where(train_std < 1e-6, 1e-6, train_std)
  continuous_mean_std = np.array([train_mean,train_std]).astype(np.float32)
  train_ds = DataSetCatCon(X_train, y_train, cat_idxs,'clf',continuous_mean_std)
  trainloader = DataLoader(train_ds, batch_size=64, shuffle=True,num_workers=1)

  valid_ds = DataSetCatCon(X_valid, y_valid, cat_idxs,'clf', continuous_mean_std)
  validloader = DataLoader(valid_ds, batch_size=64, shuffle=False,num_workers=1)

  test_ds = DataSetCatCon(X_test, y_test, cat_idxs,'clf', continuous_mean_std)
  testloader = DataLoader(test_ds, batch_size=64, shuffle=False,num_workers=1)
  y_dim = len(np.unique(y_train['data'][:,0]))
  print('Number of classes in train:',y_dim)
  # in test
  print('Number of classes in test:',len(np.unique(y_test['data'][:,0]))
  )
  #in valid
  print('Number of classes in valid:',len(np.unique(y_valid['data'][:,0]))
  )


  cat_dims = np.append(np.array([1]),np.array(cat_dims)).astype(int) #Appending 1 for CLS token, this is later used to generate embeddings.

  return trainloader, validloader, testloader, cat_dims, con_idxs , cat_idxs, y_dim , continuous_mean_std , X_train, y_train, X_valid, y_valid, X_test, y_test, train_ds, valid_ds

In [9]:
trainloader, validloader, testloader, cat_dims, con_idxs , cat_idxs, y_dim , continuous_mean_std, X_train, y_train, X_valid, y_valid, X_test, y_test, train_ds, valid_ds = prepare_dataset(df,[.8, .2, 0])

Number of classes in train: 1
Number of classes in test: 1
Number of classes in valid: 1


In [10]:
y_dim = 4 # ssl will have unlabelled data

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
criterion = nn.CrossEntropyLoss().to(device)

In [13]:
model = SAINT(
categories = tuple(cat_dims),
num_continuous = len(con_idxs),
dim = 64,              # embedding dimension
dim_out = 1,
depth = 1,             # depth of the network (nr. of transformer blocks)
heads = 8,             # number of attention heads 8
attn_dropout = 0.1,
ff_dropout = 0.8,
mlp_hidden_mults = (4, 2),
cont_embeddings = 'MLP', # options: 'MLP', 'linear', 'hybrid' (MLP with continuous embeddings concatenated to the transformer block outputs)
attentiontype = 'colrow', # options: 'col', 'row', 'colrow', 'colrowv2'
final_mlp_style = 'sep',
y_dim = y_dim
)
model.to(device)

SAINT(
  (norm): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
  (simple_MLP): ModuleList(
    (0-7): 8 x simple_MLP(
      (layers): Sequential(
        (0): Linear(in_features=1, out_features=100, bias=True)
        (1): ReLU()
        (2): Linear(in_features=100, out_features=64, bias=True)
      )
    )
  )
  (transformer): RowColTransformer(
    (embeds): Embedding(1108, 64)
    (layers): ModuleList(
      (0): ModuleList(
        (0): PreNorm(
          (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (fn): Residual(
            (fn): Attention(
              (to_qkv): Linear(in_features=64, out_features=384, bias=False)
              (to_out): Linear(in_features=128, out_features=64, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
        (1): PreNorm(
          (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (fn): Residual(
            (fn): FeedForward(
              (net):

In [14]:
model = SAINT_pretrain(model,train_ds, valid_ds , opt, device=device)

Pretraining begins!
Model Checkpoint Saved!
Epoch: 0, Running Loss: 172655.7021484375 , Val Loss: 10552.115234375
Early Stopping Counter:  1
Epoch: 1, Running Loss: 157343.81103515625 , Val Loss: 13212.927734375
Model Checkpoint Saved!
Epoch: 2, Running Loss: 147862.611328125 , Val Loss: 9488.12109375
Model Checkpoint Saved!
Epoch: 3, Running Loss: 118935.58862304688 , Val Loss: 8743.982421875
Model Checkpoint Saved!
Epoch: 4, Running Loss: 73798.60559082031 , Val Loss: 2338.560791015625
Early Stopping Counter:  1
Epoch: 5, Running Loss: 73377.20483398438 , Val Loss: 6915.70556640625
Early Stopping Counter:  2
Epoch: 6, Running Loss: 80101.65258789062 , Val Loss: 4413.203125
Early Stopping Counter:  3
Epoch: 7, Running Loss: 60870.944580078125 , Val Loss: 4389.3388671875
Early Stopping Counter:  4
Epoch: 8, Running Loss: 64221.003662109375 , Val Loss: 2530.5146484375
Model Checkpoint Saved!
Epoch: 9, Running Loss: 47011.30236816406 , Val Loss: 726.5587768554688
Early Stopping Counter: 

In [15]:
# Labelled Data

In [16]:
trainloader, validloader, testloader, cat_dims, con_idxs , cat_idxs, y_dim , continuous_mean_std, X_train, y_train, X_valid, y_valid, X_test, y_test,_,_ = prepare_dataset(df2)

Number of classes in train: 4
Number of classes in test: 4
Number of classes in valid: 4


In [17]:
optimizer = optim.AdamW(model.parameters(),lr=0.0001, betas=(0.9,0.999))

In [18]:
modelsave_path='outputs'

In [19]:
print('We are in semi-supervised learning case')

train_bsize = min(opt.ssl_samples//4,opt.batchsize)

We are in semi-supervised learning case


In [20]:
train_ds = DataSetCatCon(X_train, y_train, cat_idxs,opt.dtask,continuous_mean_std)
trainloader = DataLoader(train_ds, batch_size=train_bsize, shuffle=True,num_workers=2)

## Fine-tuning the pretrained

In [21]:
# Start K-Fold Cross Validation
# Define the number of splits
n_splits = 4
best_valid_auroc = 0
best_valid_accuracy = 0
best_test_auroc = 0
best_test_accuracy = 0
best_valid_rmse = 100000

early_stop_counter = 0
early_stop_patience = 20

# fold_dict = {}

# Define the KFold object
kf = KFold(n_splits=n_splits, shuffle=True, random_state=21)

# Initialize lists to store the train and validation indices for each fold
train_indices_list = []
valid_indices_list = []

# Loop over the splits and get the train and validation indices for each fold
for train_indices, valid_indices in kf.split(X_train['data']):
    train_indices_list.append(train_indices)
    valid_indices_list.append(valid_indices)
best_test_accuracy_list = []
# Loop over the folds and train the model on each fold
for fold in range(n_splits):
    # Get the train and validation indices for this fold
    train_indices = train_indices_list[fold]
    valid_indices = valid_indices_list[fold]

    # Create the train and validation datasets and dataloaders for this fold
    train_ds = DataSetCatCon(X_train, y_train, cat_idxs,opt.dtask,continuous_mean_std)
    trainloader = DataLoader(train_ds, batch_size=train_bsize,num_workers=2, sampler=torch.utils.data.SubsetRandomSampler(train_indices))
    valid_ds = DataSetCatCon(X_train, y_train, cat_idxs,opt.dtask,continuous_mean_std)
    validloader = DataLoader(valid_ds, batch_size=train_bsize, shuffle=False,num_workers=2, sampler=torch.utils.data.SubsetRandomSampler(valid_indices))
    print(f'Training begins now for # {fold} Fold.')
    # Train the model on this fold
    for epoch in range(300):
        model.train()
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            for j in range(len(data)):
                print(f"Data[{j}] shape: {data[j].shape}, dtype: {data[j].dtype}")
            optimizer.zero_grad()
            # x_categ is the the categorical data, with y appended as last feature. x_cont has continuous data. cat_mask is an array of ones same shape as x_categ except for last column(corresponding to y's) set to 0s. con_mask is an array of ones same shape as x_cont.
            x_categ, x_cont, y_gts, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device),data[4].to(device)
            if opt.train_noise_type is not None and opt.train_noise_level>0:
                noise_dict = {
                    'noise_type' : opt.train_noise_type,
                    'lambda' : opt.train_noise_level
                }
                if opt.train_noise_type == 'cutmix':
                    x_categ, x_cont = add_noise(x_categ,x_cont, noise_params = noise_dict)
                elif opt.train_noise_type == 'missing':
                    cat_mask, con_mask = add_noise(cat_mask, con_mask, noise_params = noise_dict)
            # We are converting the data to embeddings in the next step

            _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model)
            reps = model.transformer(x_categ_enc, x_cont_enc)
            # select only the representations corresponding to y and apply mlp on it in the next step to get the predictions.
            y_reps = reps[:,0,:]

            y_outs = model.mlpfory(y_reps)
            if opt.task == 'regression':
                loss = criterion(y_outs,y_gts)
            else:
                loss = criterion(y_outs,y_gts.squeeze())
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        print(running_loss)
        if epoch%5==0:
            model.eval()
            with torch.no_grad():
                if opt.task in ['binary','multiclass']:
                    accuracy, auroc = classification_scores(model, validloader, device, opt.task)
                    # test_accuracy, test_auroc = classification_scores(model, testloader, device, opt.task)

                    print('[EPOCH %d] VALID ACCURACY: %.3f' %
                        (epoch + 1, accuracy ))
                    # print('[EPOCH %d] TEST ACCURACY: %.3f' %
                    #     (epoch + 1, test_accuracy ))

            if opt.task =='multiclass':
                if accuracy > best_valid_accuracy:
                    best_valid_accuracy = accuracy
                    early_stop_counter = 0
                    print("save model")
                    torch.save({'model': model, 'state_dict': model.state_dict(),'optimizer' : optimizer.state_dict()},modelsave_path+f"model-{fold}.pt")
                else:
                  early_stop_counter +=1
                  if early_stop_counter > early_stop_patience:
                    break

    model.eval()
    with torch.no_grad():
            accuracy, auroc = classification_scores(model, testloader, device, opt.task)
            print('TEST ACCURACY: %.3f' % accuracy)
            best_test_accuracy_list.append(accuracy)



# End K Fold
# Calculate the average of the best accuracy from each fold
average_best_valid_accuracy = sum(best_test_accuracy_list) / len(best_test_accuracy_list)
print('Average best validation accuracy from all folds:', average_best_valid_accuracy)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Data[1] shape: torch.Size([32, 8]), dtype: torch.float32
Data[2] shape: torch.Size([32, 1]), dtype: torch.int64
Data[3] shape: torch.Size([32, 74]), dtype: torch.int64
Data[4] shape: torch.Size([32, 8]), dtype: torch.int64
Data[0] shape: torch.Size([20, 74]), dtype: torch.int64
Data[1] shape: torch.Size([20, 8]), dtype: torch.float32
Data[2] shape: torch.Size([20, 1]), dtype: torch.int64
Data[3] shape: torch.Size([20, 74]), dtype: torch.int64
Data[4] shape: torch.Size([20, 8]), dtype: torch.int64
1.1003119349479675
Data[0] shape: torch.Size([32, 74]), dtype: torch.int64
Data[1] shape: torch.Size([32, 8]), dtype: torch.float32
Data[2] shape: torch.Size([32, 1]), dtype: torch.int64
Data[3] shape: torch.Size([32, 74]), dtype: torch.int64
Data[4] shape: torch.Size([32, 8]), dtype: torch.int64
Data[0] shape: torch.Size([32, 74]), dtype: torch.int64
Data[1] shape: torch.Size([32, 8]), dtype: torch.float32
Data[2] shape: torch.S

In [22]:
best_test_accuracy_list

[array(90.909096, dtype=float32),
 array(86.36364, dtype=float32),
 array(87.878784, dtype=float32),
 array(89.39394, dtype=float32)]

## Supervised Training

In [23]:
model = SAINT(
categories = tuple(cat_dims),
num_continuous = len(con_idxs),
dim = 64,              # embedding dimension
dim_out = 1,
depth = 1,             # depth of the network (nr. of transformer blocks)
heads = 8,             # number of attention heads 8
attn_dropout = 0.1,
ff_dropout = 0.8,
mlp_hidden_mults = (4, 2),
cont_embeddings = 'MLP', # options: 'MLP', 'linear', 'hybrid' (MLP with continuous embeddings concatenated to the transformer block outputs)
attentiontype = 'colrow', # options: 'col', 'row', 'colrow', 'colrowv2'
final_mlp_style = 'sep',
y_dim = y_dim
)
model.to(device)

SAINT(
  (norm): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
  (simple_MLP): ModuleList(
    (0-7): 8 x simple_MLP(
      (layers): Sequential(
        (0): Linear(in_features=1, out_features=100, bias=True)
        (1): ReLU()
        (2): Linear(in_features=100, out_features=64, bias=True)
      )
    )
  )
  (transformer): RowColTransformer(
    (embeds): Embedding(828, 64)
    (layers): ModuleList(
      (0): ModuleList(
        (0): PreNorm(
          (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (fn): Residual(
            (fn): Attention(
              (to_qkv): Linear(in_features=64, out_features=384, bias=False)
              (to_out): Linear(in_features=128, out_features=64, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
        (1): PreNorm(
          (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (fn): Residual(
            (fn): FeedForward(
              (net): 

In [24]:
# Start K-Fold for without pre-train
# Define the number of splits
n_splits = 4
best_valid_auroc = 0
best_valid_accuracy = 0
best_test_auroc = 0
best_test_accuracy = 0
best_valid_rmse = 100000

early_stop_counter = 0
early_stop_patience = 20

# fold_dict = {}

# Define the KFold object
kf = KFold(n_splits=n_splits, shuffle=True, random_state=21)

# Initialize lists to store the train and validation indices for each fold
train_indices_list = []
valid_indices_list = []

# Loop over the splits and get the train and validation indices for each fold
for train_indices, valid_indices in kf.split(X_train['data']):
    train_indices_list.append(train_indices)
    valid_indices_list.append(valid_indices)
best_valid_accuracy_list = []
# Loop over the folds and train the model on each fold
for fold in range(n_splits):
    # Get the train and validation indices for this fold
    train_indices = train_indices_list[fold]
    valid_indices = valid_indices_list[fold]

    # Create the train and validation datasets and dataloaders for this fold
    train_ds = DataSetCatCon(X_train, y_train, cat_idxs,opt.dtask,continuous_mean_std)
    trainloader = DataLoader(train_ds, batch_size=train_bsize,num_workers=2, sampler=torch.utils.data.SubsetRandomSampler(train_indices))
    valid_ds = DataSetCatCon(X_train, y_train, cat_idxs,opt.dtask,continuous_mean_std)
    validloader = DataLoader(valid_ds, batch_size=train_bsize, shuffle=False,num_workers=2, sampler=torch.utils.data.SubsetRandomSampler(valid_indices))
    print(f'Training begins now for # {fold} Fold.')
    # Train the model on this fold
    for epoch in range(300):
        model.train()
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            optimizer.zero_grad()
            # x_categ is the the categorical data, with y appended as last feature. x_cont has continuous data. cat_mask is an array of ones same shape as x_categ except for last column(corresponding to y's) set to 0s. con_mask is an array of ones same shape as x_cont.
            x_categ, x_cont, y_gts, cat_mask, con_mask = data[0].to(device), data[1].to(device),data[2].to(device),data[3].to(device),data[4].to(device)
            if opt.train_noise_type is not None and opt.train_noise_level>0:
                noise_dict = {
                    'noise_type' : opt.train_noise_type,
                    'lambda' : opt.train_noise_level
                }
                if opt.train_noise_type == 'cutmix':
                    x_categ, x_cont = add_noise(x_categ,x_cont, noise_params = noise_dict)
                elif opt.train_noise_type == 'missing':
                    cat_mask, con_mask = add_noise(cat_mask, con_mask, noise_params = noise_dict)
            # We are converting the data to embeddings in the next step
            _ , x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask,model)
            reps = model.transformer(x_categ_enc, x_cont_enc)
            # select only the representations corresponding to y and apply mlp on it in the next step to get the predictions.
            y_reps = reps[:,0,:]

            y_outs = model.mlpfory(y_reps)
            if opt.task == 'regression':
                loss = criterion(y_outs,y_gts)
            else:
                loss = criterion(y_outs,y_gts.squeeze())
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        print(running_loss)
        if epoch%5==0:
            model.eval()
            with torch.no_grad():
                if opt.task in ['binary','multiclass']:
                    accuracy, auroc = classification_scores(model, validloader, device, opt.task)
                    # test_accuracy, test_auroc = classification_scores(model, testloader, device, opt.task)

                    print('[EPOCH %d] VALID ACCURACY: %.3f' %
                        (epoch + 1, accuracy ))
                    # print('[EPOCH %d] TEST ACCURACY: %.3f' %
                    #     (epoch + 1, test_accuracy ))

            if opt.task =='multiclass':
                if accuracy > best_valid_accuracy:
                    best_valid_accuracy = accuracy
                    early_stop_counter = 0
                    print("save model")
                    torch.save({'model': model, 'state_dict': model.state_dict(),'optimizer' : optimizer.state_dict()},modelsave_path+f"model-{fold}.pt")
                else:
                  early_stop_counter +=1
                  if early_stop_counter > early_stop_patience:
                    break


    model.eval()
    with torch.no_grad():
            accuracy, auroc = classification_scores(model, testloader, device, opt.task)
            print('TEST ACCURACY: %.3f' % accuracy)
            best_valid_accuracy_list.append(accuracy)



# End K Fold
# Calculate the average of the best accuracy from each fold
average_best_valid_accuracy = sum(best_valid_accuracy_list) / len(best_valid_accuracy_list)
print('Average best validation accuracy from all folds:', average_best_valid_accuracy)

Training begins now for # 0 Fold.
7.028339862823486
[EPOCH 1] VALID ACCURACY: 6.000
save model
6.99616265296936
6.988369345664978
7.020027160644531
7.047019362449646
6.932610750198364
[EPOCH 6] VALID ACCURACY: 6.000
6.965574741363525
6.961134672164917
7.009928226470947
6.929276943206787
7.008740782737732
[EPOCH 11] VALID ACCURACY: 6.000
6.945749640464783
7.012830018997192
6.946619510650635
6.936102867126465
6.939149856567383
[EPOCH 16] VALID ACCURACY: 6.000
6.931910753250122
6.953510761260986
6.980416655540466
6.941442847251892
6.965010285377502
[EPOCH 21] VALID ACCURACY: 6.000
6.98518431186676
6.999499678611755
6.971749424934387
7.000017404556274
6.979048371315002
[EPOCH 26] VALID ACCURACY: 6.000
6.952268600463867
6.974586248397827
6.9552788734436035
6.983617663383484
6.891362428665161
[EPOCH 31] VALID ACCURACY: 6.000
6.92424750328064
6.990475535392761
7.008220195770264
6.994825720787048
6.952037572860718
[EPOCH 36] VALID ACCURACY: 6.000
6.886337637901306
6.977173686027527
6.933290958

In [25]:
best_valid_accuracy_list

[array(9.090909, dtype=float32),
 array(9.090909, dtype=float32),
 array(9.090909, dtype=float32),
 array(9.090909, dtype=float32)]