In [1]:
import sys
sys.path.append('/Users/kianyewngieng/PycharmProjects/my_learning/')

import pickle
with open('train_model_input.pickle', 'rb') as stream:
    train_model_input = pickle.load(stream)
with open('test_model_input.pickle', 'rb') as stream:
    test_model_input = pickle.load(stream)
with open('feature_max_idx.pickle', 'rb') as stream:
    feature_max_idx = pickle.load(stream)

In [3]:
import pandas as pd
import torch.nn as nn
import torch
import numpy as np

train_pd = pd.DataFrame.from_dict(train_model_input, orient='index').T.applymap(lambda x: x.detach().numpy())
test_pd = pd.DataFrame.from_dict(test_model_input, orient='index').T.applymap(lambda x: x.detach().numpy())

data_pd = pd.DataFrame.from_dict(train_model_input,orient='index').T.applymap(lambda x: x.detach().numpy())
data_pd.head()

Unnamed: 0,user_id,movie_id,hist_movie_id,hist_genres,hist_len,genres,gender,age,occupation,zip,label
0,5561,368,"[497, 170, 209, 2362, 70, 538, 2252, 217, 111,...","[1, 1, 4, 8, 8, 8, 8, 1, 3, 8, 1, 8, 1, 8, 7, ...",20,1,1,3.0,4,2311,0
1,4530,3681,"[399, 684, 103, 29, 27, 2879, 1081, 1430, 2971...","[5, 5, 3, 3, 6, 8, 13, 5, 8, 12, 8, 5, 5, 5, 3...",20,7,1,4.0,2,2913,0
2,2895,1487,"[30, 48, 187, 136, 125, 341, 118, 212, 406, 43...","[1, 1, 5, 8, 15, 5, 8, 5, 5, 8, 1, 10, 5, 1, 1...",20,8,2,3.0,12,309,0
3,4082,684,"[582, 1512, 876, 216, 291, 215, 167, 1614, 282...","[5, 8, 14, 5, 5, 1, 5, 5, 1, 11, 8, 5, 8, 8, 5...",20,5,2,3.0,21,2515,0
4,3294,3036,"[28, 69, 1177, 158, 260, 2240, 55, 145, 48, 50...","[1, 5, 5, 5, 11, 8, 1, 8, 1, 8, 5, 3, 5, 5, 5,...",20,6,2,2.0,5,2497,0


In [4]:
from dataclasses import dataclass
from collections import defaultdict, OrderedDict
import torch.nn as nn
import torch
import itertools

DEFAULT_GROUP_NAME = 'default_group'

@dataclass
class SparseFeat:
    """ Class to store metadata related to a categorical feature,
    that will be converted to pytorch tensor via torch.nn.Embedding()
    """
    name: str
    vocabulary_size: int
    embedding_dim: int
    embedding_name: str = None
    group_name: str = DEFAULT_GROUP_NAME
    dtype: str = torch.long
            
    def __post_init__(self):
        if self.embedding_name is None:
            self.embedding_name = self.name
        
@dataclass
class VarLenSparseFeat:
    """Class to store metadata related to a variable length feature (i.e. list)
    - Note: Must create a new SparseFeat with different name, if the feature is a variable length feature of a categorical column. Cannot reuse the instantiated sparse_feat because the embeddings are created separately.
    """
    sparse_feat: SparseFeat
    max_len: int
    combiner: str = 'mean'
    length_name: str = None

    @property
    def name(self):
        return self.sparse_feat.name
    
    @property
    def vocabulary_size(self):
        return self.sparse_feat.vocabulary_size
    
    @property
    def embedding_dim(self):
        return self.sparse_feat.embedding_dim
    
    @property
    def embedding_name(self):
        return self.sparse_feat.embedding_name
    
    @property
    def dtype(self):
        return self.sparse_feat.dtype
    
    @property
    def group_name(self):
        return self.sparse_feat.group_name
    
@dataclass
class DenseFeat:
    name: str
    dimension: int
    dtype: str = None
        
def build_feature_positions(feature_columns):
    """Based on the ordering of the feature columns, get the position index"""
    feature_positions = OrderedDict()
    start = 0
    for feat in feature_columns:
        feat_name = feat.name
        if isinstance(feat, SparseFeat):
            feature_positions[feat_name] = (start, start+1)
            start += 1
            
        elif isinstance(feat, DenseFeat):
            feature_positions[feat_name] = (start, start + feat.dimension)
            start += feat.dimension
            
        elif isinstance(feat, VarLenSparseFeat):
            feature_positions[feat_name] = (start, start + feat.max_len)
            start += feat.max_len
    
            if feat.length_name not in feature_positions and feat.length_name is not None:
                feature_positions[feat.length_name] = (start, start+1)
                start +=1 
        else:
            raise TypeError('Invalid feature columns type, got', type(feat))
    return feature_positions

def build_pytorch_dataset(df_pd, feature_columns):
    torch_df = {}
    for feat in feature_columns:
        feat_name = feat.name
        if isinstance(feat, SparseFeat) or isinstance(feat, DenseFeat):
            input_tensor = torch.tensor(df_pd[feat_name].values).reshape(-1,1)
            torch_df[feat_name] = input_tensor
            
        elif isinstance(feat, VarLenSparseFeat):
            input_tensor = torch.stack(list(map(lambda x: torch.tensor(x[:feat.max_len]), df_pd[feat_name].values)))
            torch_df[feat_name] = input_tensor
            
            if feat.length_name is not None and feat.length_name not in torch_df:
                torch_df[feat.length_name] = torch.tensor(df_pd[feat.length_name].values).reshape(-1,1)
        else:
            raise TypeError('Invalid feature columns type, got,', type(feat))
    torch_df = torch.cat(list(torch_df.values()), dim=-1)
    return torch_df


def build_embedding_dict(all_sparse_feature_columns, init_std=0.001, device='cpu'):
    """ Returns a dictionary with key as the feat.name and value of the instantiated nn.Embedding
    """
    embedding_dict = nn.ModuleDict(
    {feat.name: nn.Embedding(feat.vocabulary_size,
                            feat.embedding_dim) for feat in all_sparse_feature_columns})
    if init_std is not None:
        for tensor in embedding_dict.values():
            # nn.init is in_place
            nn.init.normal_(tensor.weight, mean=0, std=init_std)
            
    return embedding_dict.to(device)

class TensorLookup:
    """Wrapper for a torch.tensor dataframe to allow selection of data according to the corresponding column name
    Args:
        X (torch.Tensor): torch tensor that is created from
    """
    def __init__(self, X, feature_positions):
        self.X = X
        self.feature_positions = feature_positions
        
    def __getitem__(self, feature_names):
        if type(feature_names) == str:
            idx_lookup = self.feature_positions[feature_names]
            return self.X[:, idx_lookup[0]:idx_lookup[1]]
        
        if type(feature_names) == list:
            features_tuple = (
                self.X[:,self.feature_positions[feat_name][0]: 
                       self.feature_positions[feat_name][1]] for feat_name in feature_names
            )
            return tuple(features_tuple)
            
def embedding_lookup(X, 
                     feature_positions,
                     embedding_dict,
                     sparse_feature_columns,
                     return_feat_list=(),
                     mask_feat_list=(),
                     to_list=False):
    """returns a dictionary with key as the group name, value as the list of embedding output of shape (B,1, E)
    if specify to list, then we get a list of embedding output of shape (B,1, E)
    """
    group_embedding_dict = defaultdict(list)
    for feat in sparse_feature_columns:
        feat_name = feat.name
        embedding_name = feat.embedding_name
        if feat_name in return_feat_list or len(return_feat_list)==0:
            lookup_idx = feature_positions[feat_name]
            input_tensor =  X[:, lookup_idx[0]:lookup_idx[1]].long()
            embedding = embedding_dict[embedding_name](input_tensor)
            group_embedding_dict[feat.group_name].append(embedding)
    if to_list == True:
        return list(itertools.chain.from_iterable(group_embedding_dict.values()))
    return group_embedding_dict


def varlen_embedding_lookup(X, 
                     feature_positions,
                     embedding_dict,
                     varlen_feature_columns):
    """Returns a dictionary with key as the feat.name and value as the embedding output of shape (B,T,E)"""
    varlen_group_embedding_dict = {}
    for feat in varlen_feature_columns:
        feat_name = feat.name
        embedding_name = feat.embedding_name
        lookup_idx = feature_positions[feat_name]
        input_tensor =  X[:, lookup_idx[0]:lookup_idx[1]].long()
        embedding = embedding_dict[embedding_name](input_tensor)
        varlen_group_embedding_dict[feat_name] = embedding
    return varlen_group_embedding_dict

def varlen_embedding_pooled_lookup(X,
                                   feature_positions,
                                  embedding_dict,
                                  varlen_sparse_feature_columns,
                                   return_feat_list=(),
                                   to_list=False):
    """returns a dictionary with with key as the group name, 
    value as the list of embedding output of shape (B,1, E)"""
    group_embedding_dict = defaultdict(list)
    for feat in varlen_sparse_feature_columns:
        feat_name = feat.name
        embedding_name = feat.embedding_name
        if feat_name in return_feat_list or len(return_feat_list) == 0:
            lookup_idx = feature_positions[feat_name]
            input_tensor = X[:, lookup_idx[0]:lookup_idx[1]].long()
            seq_embedding = embedding_dict[embedding_name](input_tensor)
            
            if feat.length_name is None:
                lookup_idx = feature_positions[feat_name]
                seq_mask = X[:, lookup_idx[0]:lookup_idx[1]].long() != 0 # (B,T)
                emb = SequencePoolingLayer(mode=feat.combiner, supports_masking=True, device=seq_embedding.device)(seq_embedding, seq_mask)
            else:
                lookup_idx = feature_positions[feat.length_name]
                seq_length = X[:, lookup_idx[0]:lookup_idx[1]].long() # (B, 1)
                emb = SequentialPoolingLayer(mode=feat.combiner, supports_masking=False, device=seq_embedding.device)(seq_embedding, seq_length)
                
            group_embedding_dict[feat.group_name].append(emb)
    if to_list:
        return list(itertools.chain.from_iterable(group_embedding_dict.values()))
    return group_embedding_dict

def dense_lookup(X, feature_positions, dense_feat_columns):
    dense_out = []
    for feat in dense_feat_columns:
        feat_name = feat.name
        lookup_idx = feature_positions[feat_name]
        dense_out.append(X[:, lookup_idx[0]:lookup_idx[1]])
    return dense_out

class SequentialPoolingLayer(nn.Module):
    def __init__(self, mode='mean', supports_masking=False, device='cpu'):
        super().__init__()
        self.mode = mode
        self.supports_masking = supports_masking
        self.device = device
        self.to(device)
        
    def _sequence_mask(self, seq_emb, actual_seq_length):
        batch_size, max_len, emb_dim = seq_emb.shape
        max_length_tensor = torch.arange(0,max_len,1).reshape(1,-1).to(self.device) # (T,) -> (1, T)
        actual_length_of_seq = actual_seq_length.reshape(-1,1) # (B,) -> (B,1)
        mask = max_length_tensor < actual_length_of_seq # (1,T), (B,1) -> (B,T)
        mask = mask.unsqueeze(-1) # (B,T,1)
        return mask
    
    def forward(self, seq_emb, actual_seq_length):
        """
        Seq_emb: (B,T,E)
        actual_seq_length: (B,1) or (B,T) if self.supports_masking=True"""
        if self.supports_masking:
            mask = actual_seq_length.float().unsqueeze(-1) # (B, T) -> (B,T,1)
            actual_seq_length = torch.sum(mask, dim=1) # (B,T,1) -> (B,1)
        else:
            mask = self._sequence_mask(seq_emb, actual_seq_length) # (B, T)
        self.mask = mask
        
        if self.mode == 'max':
            # apply mask. make masked positions as negative as possible
            masked_seq_emb = seq_emb - (1 - mask.float()) * 1e9 # (B,T,C) , (B,T,1) -> (B,T,C)
            max_masked_seq_emb= torch.max(masked_seq_emb, dim=1)[0] # (B,T,C) -> (B,C)
            return max_masked_seq_emb.unsqueeze(1)
        else:
            masked_seq_emb = seq_emb * mask # (B,T,C) -> (B,T,1) -> (B,T,C)
            if self.mode == 'sum':
                sum_masked_seq_emb = torch.sum(masked_seq_emb, dim=1)# (B,T,C) -> (B,C)
                return sum_masked_seq_emb.unsqueeze(1)
            elif self.mode == 'mean':
                sum_masked_seq_emb = torch.sum(masked_seq_emb, dim=1) # (B,T, C) -> (B,C)
                mean_masked_seq_emb = sum_masked_seq_emb / actual_seq_length.reshape(-1,1) # (B,C) , (B,1) ->(B,C)
                return mean_masked_seq_emb.unsqueeze(1)
            else:
                raise ValueError(f'mode="{self.mode}" is not supported in {SequentialPoolingLayer.__class__.__name__}')

# Specify Metadata

In [7]:
lst_sparse_categorical_features = ['user_id','movie_id', 'genres','occupation','zip']
lst_varlen_sparse_categorical_features = ['hist_movie_id', 'hist_genres']
lst_numerical_features = ['age']

embedding_dim = 8
sparse_categorical_features = [SparseFeat(name=col, 
                                         vocabulary_size=feature_max_idx[col],
                                         embedding_dim=8,) for col in lst_sparse_categorical_features]

varlen_sparse_categorical_features =[VarLenSparseFeat(
    sparse_feat= SparseFeat(name=col,
                            vocabulary_size = feature_max_idx[col.split('hist_')[-1]],
                            embedding_dim = embedding_dim),
    max_len = 20,
    length_name='hist_len',
    combiner='mean') for col in lst_varlen_sparse_categorical_features]

dense_feats = [DenseFeat(name=col,dimension=1) for col in lst_numerical_features]

feature_columns = sparse_categorical_features + varlen_sparse_categorical_features + dense_feats
feature_positions = build_feature_positions(feature_columns)

torch_dataset = build_pytorch_dataset(data_pd, feature_columns)

In [18]:
def compute_input_dimension(sparse_categorical_features, varlen_sparse_categorical_features, dense_features):
#     sparse_fts = [ft for ft in feature_columns if isinstance(ft, SparseFeat) or isinstance(ft, VarLenSparseFeat)]
    # compute input dimension
    sparse_dims = [ft.embedding_dim for ft in sparse_categorical_features + varlen_sparse_categorical_features]
    dense_dims =[ft.dimension for ft in dense_features]
    input_dim = sum(sparse_dims + dense_dims)
    return input_dim

compute_input_dimension(sparse_categorical_features, varlen_sparse_categorical_features, dense_features=dense_feats)

57

#  Build training and testing data loaders

In [8]:
# create secondary label for gender
train_pd['gender_label'] = train_pd['gender'].apply(lambda x: 1 if x == 1 else 0)

test_pd['gender_label'] = test_pd['gender'].apply(lambda x: 1 if x == 1 else 0)

In [32]:
train_pt = build_pytorch_dataset(train_pd, feature_columns)
test_pt = build_pytorch_dataset(test_pd, feature_columns)

In [33]:
train_X = train_pt
train_y = torch.tensor(train_pd[['label']].values, dtype=torch.long)

test_X = test_pt
test_y = torch.tensor(test_pd[['label']].values, dtype=torch.long)

In [11]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        self.X= X
        self.y = y
        
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
    def __len__(self):
        return len(self.X)
    
train_dataset = Dataset(train_X, train_y)
test_dataset = Dataset(test_X, test_y)

BATCH_SIZE = 512
train_dataloader = torch.utils.data.DataLoader(dataset = train_dataset,
                                               batch_size= BATCH_SIZE, 
                                               shuffle=True)

test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset,
                                             batch_size = BATCH_SIZE,
                                             shuffle=False)

In [34]:
bx.shape

torch.Size([512, 47])

In [13]:
def save_checkpoint(model:nn.Module, model_dir: str):
    # model_dir must end with .pt extension
    torch.save(model.state_dict(), model_dir)

def resume_checkpoint(model:nn.Module, model_dir:str, device:torch.device):
    state_dict = torch.load(model_dir, map_location=device)
    model.load_state_dict(state_dict)

In [54]:
class EarlyStopping:
    """ https://github.com/Bjarten/early-stopping-pytorch/tree/master
    
    # instantiate EarlyStopping
    early_stop = EarlyStopping(patience=5,
                                delta=0, # change in performance
                                path='checkpoint.pt',
                                trace_func=print)
       
    n_epochs = 10
    device = torch.device('cpu')
    LEARNING_RATE = 1e-3
    optimizer = torch.optim.Adamw(model.parameters(), lr=LEARNING_RATE)
    
    # basic training log. Can add in things like l1 reg, l2 reg, etc
    training_logs = {
        'batch_size_train': train_dataloader.batch_size,
        'n_samples_train': len(train_dataset),
        'steps_per_epoch_train': len(train_dataloader),
        'batch_size_test': test_dataloader.batch_size,
        'n_samples_test': len(test_dataset),
        'steps_per_epoch_test': len(test_dataloader),
        'n_epochs': n_epochs,
        'device': str(device),
        'lr': LEARNING_RATE
    }
    
    for ep in range(n_epochs):
        losses = []
        val_losses = []
        
        model.train() # prep model for training
        for idx, (bX, by) in enumerate(train_dataloader):
            bX, by = bX.to(device), by.to(device)
            out, loss = model(bX, by)
            
            losses.append(loss.cpu().item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        # validatation loss on validation set
        model.eval() # # prep model for evaluation
        with torch.no_grad():
            # prep model for evalaution
            for idx, (bX, by) in enumerate(test_dataloader):
                bX, by = bX.to(device), by.to(device)
                out, loss = model(bX, by)

                val_losses.append(loss.cpu().item())
            
        # calculate training and validation loss
        avg_train_loss = np.mean(losses)
        avg_val_loss = np.mean(val_losses)
        
        # early stop using validation loss to prevent overfitting
        early_stop(avg_val_loss, model=model)
        if early_stop.early_stop:
            break
            
        print(f'[{ep}/{n_epochs}] [Train] avg loss: {avg_train_loss}, avg auc: {train_auc}')
        print(f'                   [Validation] avg_loss: {avg_val_loss}, avg auc: {val_auc_roc}')
        
   # load the last checkpoint with the best model
    resume_checkpoint(model, early_stop.path, device=device)

    """
    def __init__(self,
                 patience=7,
                 verbose= False,
                delta=0,
                path='checkpoint.pt',
                trace_func=print):
        
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.early_stop = False
        self.val_loss_min = None
        self.delta = delta
        self.path = path
        self.trace_func = trace_func
        
    def __call__(self, val_loss, model):
        # set first val_loss to best_score
        if self.val_loss_min is None:
#             print('val_loss None')
            self.val_loss_min = val_loss
            # save model
            save_checkpoint(model, self.path)
        
        # if validation loss is higher than the best score + delta, increment counter and early stop if exceed counter
        elif val_loss > self.val_loss_min + self.delta:
#             print('valid score higher')
            self.counter +=1
            self.trace_func(f'EarlyStopping Counter: {self.counter} out of {self.patience}')
            
            if self.counter >= self.patience:
                self.early_stop = True
                
        # current validation loss is the best. Reset counter and save the best model
        else:
#             print('valid score better')
            self.val_loss_min = val_loss
            self.counter = 0
            # save model
            save_checkpoint(model, self.path)  
            if self.verbose:
                self.trace_func(f'Validation loss decreased({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model...')

In [36]:
class Model(nn.Module):
    def __init__(self, 
                 feature_positions,
                 sparse_categorical_features,
                 varlen_sparse_categorical_features,
                 dense_features,
                 l1_reg=0, l2_reg=0,
                device='cpu'):
        super().__init__()
        self.feature_positions = feature_positions
        self.embedding_dict = build_embedding_dict(sparse_categorical_features + varlen_sparse_categorical_features, device=device)
        self.sparse_categorical_features = sparse_categorical_features
        self.varlen_sparse_categorical_features = varlen_sparse_categorical_features
        self.dense_features = dense_features
        
        in_dim = sum([feat.embedding_dim for feat in sparse_categorical_features] \
                               + [feat.embedding_dim for feat in varlen_sparse_categorical_features]\
                              + [feat.dimension for feat in dense_features])
        
        self.l1_reg = l1_reg
        self.l2_reg = l2_reg
        self.regularization_weight = []
        
        self.linear = nn.Sequential(nn.Linear(57,100),
                                    nn.ReLU(),
                                    nn.Linear(100,1),
                                    nn.Sigmoid())
        
        weights_for_regularization =filter(lambda x: 'weight' in x[0] and 'bn' not in [0], 
               self.linear.named_parameters())
        
        self.add_regularization_weight(weights_for_regularization, l2=self.l2_reg, l1=self.l1_reg)
        
    def add_regularization_weight(self, weight_list, l1=0.0, l2=0.0):
        if isinstance(weight_list, torch.nn.parameter.Parameter):
            weight_list = [weight_list]
        else:
            weight_list = list(weight_list) # for generators
        self.regularization_weight.append((weight_list, l1, l2))
        
    def get_regularization_loss(self):
        total_reg_loss = torch.zeros((1,))
        for weight_list, l1, l2 in self.regularization_weight:
            for w in weight_list:
                if isinstance(w, tuple):
                    parameter = w[1] # named_parameters
                else:
                    parameter = w
                if l1 > 0:
                    total_reg_loss += torch.sum(l1 * torch.abs(parameter))
                if l2 > 0:
                    total_reg_loss += torch.sum(l2 * parameter * parameter)
        return total_reg_loss
    
    def forward(self, X, y):
        
        sparse_embeddings = embedding_lookup(X,
                                             feature_positions=self.feature_positions,
                                             embedding_dict = self.embedding_dict,
                                             sparse_feature_columns = self.sparse_categorical_features,
                                             to_list=True)
        
        varlen_sparse_embeddings = varlen_embedding_pooled_lookup(X,
                                                                 feature_positions=feature_positions,
                                                                 embedding_dict = self.embedding_dict,
                                                                 varlen_sparse_feature_columns = self.varlen_sparse_categorical_features,
                                                                 to_list=True)
        
        categorical_features = torch.cat(sparse_embeddings + varlen_sparse_embeddings, dim=2).squeeze()
        
        dense_input = dense_lookup(X, 
                                   feature_positions= feature_positions,
                                   dense_feat_columns = self.dense_features)
        
        numerical_features = torch.cat(dense_input, dim=-1)
        
        # combine the inputs
        X = torch.cat([categorical_features, numerical_features], dim=-1).type(torch.float32) 
        
        out = self.linear(X)
        loss = torch.nn.functional.binary_cross_entropy(out.view(-1), y.view(-1))
        return out, loss
    
model = Model(feature_positions,
              sparse_categorical_features,
              varlen_sparse_categorical_features,
              dense_features=dense_feats,)
out, loss = model(bx.type(torch.float32), by.type(torch.float32))

torch.Size([512, 1])

# Without tensorboard

In [53]:
model = Model(feature_positions,
              sparse_categorical_features,
              varlen_sparse_categorical_features,
              dense_features=dense_feats,)

# instantiate EarlyStopping
early_stop = EarlyStopping(patience=5,
                           delta=0, # change in performance
                           path='checkpoint.pt',
                           trace_func=print,
                           verbose=True)

n_epochs = 10
device = torch.device('cpu')
LEARNING_RATE = 1e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# basic training log. Can add in things like l1 reg, l2 reg, etc
training_logs = {
    'batch_size_train': train_dataloader.batch_size,
    'n_samples_train': len(train_dataset),
    'steps_per_epoch_train': len(train_dataloader),
    'batch_size_test': test_dataloader.batch_size,
    'n_samples_test': len(test_dataset),
    'steps_per_epoch_test': len(test_dataloader),
    'n_epochs': n_epochs,
    'device': str(device),
    'lr': LEARNING_RATE
}

for ep in range(n_epochs):
    losses = []
    val_losses = []

    model.train() # prep model for training
    for idx, (bX, by) in enumerate(train_dataloader):
        bX, by = bX.type(torch.float32).to(device), by.type(torch.float32).to(device)
        out, loss = model(bX, by)

        losses.append(loss.cpu().item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # validatation loss on validation set
    model.eval() # # prep model for evaluation
    with torch.no_grad():
        # prep model for evalaution
        for idx, (bX, by) in enumerate(test_dataloader):
            bX, by = bX.type(torch.float32).to(device), by.type(torch.float32).to(device)
            out, loss = model(bX, by)

            val_losses.append(loss.cpu().item())

    # calculate training and validation loss
    avg_train_loss = np.mean(losses)
    avg_val_loss = np.mean(val_losses)

    # early stop using validation loss to prevent overfitting
    early_stop(avg_val_loss, model=model)
    if early_stop.early_stop:
        break

    print(f'[{ep}/{n_epochs}] [Train] avg loss: {avg_train_loss}') #,' avg auc: {train_auc}')
    print(f'                   [Validation] avg_loss: {avg_val_loss}') #,' avg auc: {val_auc_roc}')

# load the last checkpoint with the best model
resume_checkpoint(model, early_stop.path, device=device)

val_loss None
[0/10] [Train] avg loss: 0.6720972619950771
                   [Validation] avg_loss: 0.6708048433065414
valid score better
Validation loss decreased(0.669731 --> 0.669731). Saving model...
[1/10] [Train] avg loss: 0.6710621789097786
                   [Validation] avg_loss: 0.6697314232587814
valid score better
Validation loss decreased(0.668664 --> 0.668664). Saving model...
[2/10] [Train] avg loss: 0.670013889670372
                   [Validation] avg_loss: 0.6686636060476303
valid score better
Validation loss decreased(0.667593 --> 0.667593). Saving model...
[3/10] [Train] avg loss: 0.6689565144479275
                   [Validation] avg_loss: 0.6675928086042404
valid score better
Validation loss decreased(0.666523 --> 0.666523). Saving model...
[4/10] [Train] avg loss: 0.6679726727306843
                   [Validation] avg_loss: 0.6665227711200714
valid score better
Validation loss decreased(0.665462 --> 0.665462). Saving model...
[5/10] [Train] avg loss: 0.6669275462

# With TensorBoard

In [56]:
from torch.utils.tensorboard import SummaryWriter

model = Model(feature_positions,
              sparse_categorical_features,
              varlen_sparse_categorical_features,
              dense_features=dense_feats,)

# instantiate EarlyStopping
early_stop = EarlyStopping(patience=5,
                           delta=0, # change in performance
                           path='checkpoint.pt',
                           trace_func=print,
                           verbose=True)

n_epochs = 10
device = torch.device('cpu')
LEARNING_RATE = 1e-3
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# log_dir will contain all the information about whatis to be logged
writer = SummaryWriter(log_dir=f"pytorch_template/") # tensorboard write

# basic training log. Can add in things like l1 reg, l2 reg, etc
training_logs = {
    'batch_size_train': train_dataloader.batch_size,
    'n_samples_train': len(train_dataset),
    'steps_per_epoch_train': len(train_dataloader),
    'batch_size_test': test_dataloader.batch_size,
    'n_samples_test': len(test_dataset),
    'steps_per_epoch_test': len(test_dataloader),
    'n_epochs': n_epochs,
    'device': str(device),
    'lr': LEARNING_RATE
}


writer.add_text('training_logs', str(training_logs), 0)

for ep in range(n_epochs):
    losses = []
    val_losses = []

    model.train() # prep model for training
    for idx, (bX, by) in enumerate(train_dataloader):
        bX, by = bX.type(torch.float32).to(device), by.type(torch.float32).to(device)
        out, loss = model(bX, by)

        losses.append(loss.cpu().item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # validatation loss on validation set
    model.eval() # # prep model for evaluation
    with torch.no_grad():
        # prep model for evalaution
        for idx, (bX, by) in enumerate(test_dataloader):
            bX, by = bX.type(torch.float32).to(device), by.type(torch.float32).to(device)
            out, loss = model(bX, by)

            val_losses.append(loss.cpu().item())

    # calculate training and validation loss
    avg_train_loss = np.mean(losses)
    avg_val_loss = np.mean(val_losses)
    
    writer.add_scalar(tag='training/training_loss', scalar_value = avg_train_loss, global_step=ep,)
    writer.add_scalar(tag='training/validation_loss', scalar_value = avg_val_loss, global_step=ep,)
    
    # early stop using validation loss to prevent overfitting
    early_stop(avg_val_loss, model=model)
    if early_stop.early_stop:
        break

    print(f'[{ep}/{n_epochs}] [Train] avg loss: {avg_train_loss}') #,' avg auc: {train_auc}')
    print(f'    [Validation] avg_loss: {avg_val_loss}') #,' avg auc: {val_auc_roc}')

# load the last checkpoint with the best model
resume_checkpoint(model, early_stop.path, device=device)

[0/10] [Train] avg loss: 0.6072225123643875
    [Validation] avg_loss: 0.545238196849823
Validation loss decreased(0.476654 --> 0.476654). Saving model...
[1/10] [Train] avg loss: 0.5131176691502333
    [Validation] avg_loss: 0.47665419429540634
Validation loss decreased(0.454250 --> 0.454250). Saving model...
[2/10] [Train] avg loss: 0.4701571110635996
    [Validation] avg_loss: 0.45424970984458923
Validation loss decreased(0.448170 --> 0.448170). Saving model...
[3/10] [Train] avg loss: 0.45124273374676704
    [Validation] avg_loss: 0.44817017018795013
Validation loss decreased(0.441148 --> 0.441148). Saving model...
[4/10] [Train] avg loss: 0.431005647405982
    [Validation] avg_loss: 0.4411483556032181
Validation loss decreased(0.433791 --> 0.433791). Saving model...
[5/10] [Train] avg loss: 0.4022911563515663
    [Validation] avg_loss: 0.4337909370660782
Validation loss decreased(0.427999 --> 0.427999). Saving model...
[6/10] [Train] avg loss: 0.36380185931921005
    [Validation] 

In [None]:
writer.close()

!tensorboard --logdir=pytorch_template --port=8008

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.13.0 at http://localhost:8008/ (Press CTRL+C to quit)
E1023 20:04:54.646395 6123122688 directory_watcher.py:254] File pytorch_template/events.out.tfevents.1698062572.Kians-MacBook-Air.local.40712.0 updated even though the current file is pytorch_template/events.out.tfevents.1698062588.Kians-MacBook-Air.local.40712.1
