In [71]:
"""Train the model"""
"""This is from the pytorch_shuffle dir"""

import argparse
import logging
import os
import numpy as np
import tqdm
import torch
import torch.optim as optim
from tqdm import trange
from sklearn.utils import check_random_state
from sklearn.model_selection import train_test_split
import time
import pickle
import utils
import model.data_loader as dl
import model.dataset as dataset
from model import recNet as net
from sklearn.metrics import roc_curve, auc
from scipy import interp

In [130]:


#-------------------------------------------------------------------------------------------------------------
#/////////////////////    TRAINING AND EVALUATION FUNCTIONS     //////////////////////////////////////////////
#-------------------------------------------------------------------------------------------------------------
def train(model, optimizer, loss_fn, data_iterator, metrics, params, num_steps):
    """Train the model on `num_steps` batches
    Args:
        model: (torch.nn.Module) the neural network superclass
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        data_iterator: (generator) a generator that generates batches of data and labels
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        num_steps: (int) number of batches to train on, each of size params.batch_size
    """

    # set model to training mode
    model.train()

    # summary for current training loop and a running average object for loss
    summ = []
    loss_avg = utils.RunningAverage()
    
    ##-----------------------------
    # Use tqdm for progress bar
    t = trange(num_steps) 
    data_iterator_iter = iter(data_iterator)
    
    for i in t:
    
        time_before_batch=time.time() 
        
        # fetch the next training batch
        levels, children, n_inners, contents, n_level, labels_batch=next(data_iterator_iter)

        # shift tensors to GPU if available
        if params.cuda:
          levels = levels.cuda()
          children=children.cuda()
          n_inners=n_inners.cuda()
          contents=contents.cuda()
          n_level= n_level.cuda()
          labels_batch =labels_batch.cuda()
      
        # convert them to Variables to record operations in the computational graph
        levels=torch.autograd.Variable(levels)
        children=torch.autograd.Variable(children)
        n_inners=torch.autograd.Variable(n_inners)
        contents = torch.autograd.Variable(contents)
        n_level=torch.autograd.Variable(n_level)
        labels_batch = torch.autograd.Variable(labels_batch)    
    
        time_after_batch=time.time()
#         logging.info("Batch creation time" + str(time_after_batch-time_before_batch))
        
        ##-----------------------------
        # Feedforward pass through the NN
        output_batch = model(params, levels, children, n_inners, contents, n_level)
        
        
        # compute model output and loss
        labels_batch = labels_batch.float()  #Uncomment if using torch.nn.BCELoss() loss function
        output_batch=output_batch.view((params.batch_size)) # For 1 final neuron 
        loss = loss_fn(output_batch, labels_batch)
        
        print('labels_batch=',labels_batch)
        print('y_pred=',output_batch)

        # clear previous gradients, compute gradients of all variables wrt loss
        optimizer.zero_grad()
        loss.backward()

        # performs updates using calculated gradients
        optimizer.step()
        
        ##-----------------------------
        # Evaluate summaries only once in a while
        if i % params.save_summary_steps == 0:
            # extract data from torch Variable, move to cpu, convert to numpy arrays
            output_batch = output_batch.data.cpu().numpy()
            labels_batch = labels_batch.data.cpu().numpy()

            # compute all metrics on this batch
            summary_batch = {metric:metrics[metric](output_batch, labels_batch)
                             for metric in metrics}
            summary_batch['loss'] = loss.item()
            summ.append(summary_batch)
  
        # update the average loss
        loss_avg.update(loss.item())
        t.set_postfix(loss='{:05.3f}'.format(loss_avg())) #Uncomment once tqdm is installed
     
#     print('summ=',summ)    
    ##-----------------------------
    # compute mean of all metrics in summary
    metrics_mean = {metric:np.mean([x[metric] for x in summ]) for metric in summ[0]} 
    metrics_string = " ; ".join("{}: {:05.4f}".format(k, v) for k, v in metrics_mean.items())
    logging.info("- Train metrics: " + metrics_string)
    print('metrics_mean=',metrics_mean)
#     print('metrics_string=',metrics_string)
    return metrics_mean
    
#-------------------------------------------------------------------------------------------------------------
def evaluate(model, loss_fn, data_iterator, metrics, params, num_steps):
    """Evaluate the model on `num_steps` batches.
    Args:
        model: (torch.nn.Module) the neural network superclass
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        data_iterator: (generator) a generator that generates batches of data and labels
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        num_steps: (int) number of batches to train on, each of size params.batch_size
    """

    # set model to evaluation mode
    model.eval()

    # summary for current eval loop
    summ = []
    
    output_all=[]
    labels_all=[]
    ##-----------------------------
    # compute metrics over the dataset
    
    data_iterator_iter = iter(data_iterator)
    
    for _ in range(num_steps):
    
        # fetch the next evaluation batch
        levels, children, n_inners, contents, n_level, labels_batch=next(data_iterator_iter)

        # shift tensors to GPU if available
        if params.cuda:
          levels = levels.cuda()
          children=children.cuda()
          n_inners=n_inners.cuda()
          contents=contents.cuda()
          n_level= n_level.cuda()
          labels_batch =labels_batch.cuda()

        # convert them to Variables to record operations in the computational graph
        levels=torch.autograd.Variable(levels)
        children=torch.autograd.Variable(children)
        n_inners=torch.autograd.Variable(n_inners)
        contents = torch.autograd.Variable(contents)
        n_level=torch.autograd.Variable(n_level)
        labels_batch = torch.autograd.Variable(labels_batch)    

        ##-----------------------------
        # Feedforward pass through the NN
        output_batch = model(params, levels, children, n_inners, contents, n_level)


        # compute model output
        labels_batch = labels_batch.float() #Uncomment if using torch.nn.BCELoss() loss function
        output_batch=output_batch.view((params.batch_size)) # For 1 final neuron 
        loss = loss_fn(output_batch, labels_batch)
        print('labels for loss=',labels_batch)
#         print('y_pred=',output_batch)

        # extract data from torch Variable, move to cpu, convert to numpy arrays
        output_batch = output_batch.data.cpu().numpy()
        labels_batch = labels_batch.data.cpu().numpy()

        # Save labels and output prob of the current batch
        labels_all=np.concatenate((labels_all,labels_batch))        
        output_all=np.concatenate((output_all,output_batch))

        # compute all metrics on this batch
        summary_batch = {metric: metrics[metric](output_batch, labels_batch)
                         for metric in metrics}
#         summary_batch['loss'] = loss.data[0]
        summary_batch['loss'] = loss.item()
        summ.append(summary_batch)
        
    ##-----------------------------
    
    ##Get the bg rejection at 30% tag eff: 0.05 + 125*(1 - 0.05)/476=0.3). That's why we pick 125
    fpr, tpr, thresholds = roc_curve(labels_all, output_all,pos_label=1, drop_intermediate=False)
    base_tpr = np.linspace(0.05, 1, 476)
    inv_fpr = interp(base_tpr, tpr, 1. / fpr)[125]
#     print('inv_fpr at 30% tag eff=',inv_fpr)
    
    # compute mean of all metrics in summary
    metrics_mean = {metric:np.mean([x[metric] for x in summ]) for metric in summ[0]} 
    metrics_string = " ; ".join("{}: {:05.4f}".format(k, v) for k, v in metrics_mean.items())
    logging.info("- Eval metrics : " + metrics_string)
    return metrics_mean, inv_fpr


#-------------------------------------------------------------------------------------------------------------
def train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics, params, model_dir, step_size, restore_file):
    """Train the model and evaluate every epoch.
    Args:
        model: (torch.nn.Module) the neural network superclass
        train_data: array with levels, children, n_inners, contents, n_level and labels_batch lists
        val_data: array levels, children, n_inners, contents, n_level and labels_batch lists
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log files
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)
        
    best_val_acc = 0.0
#     best_val_acc = np.inf
    
    #Save loss, accuracy history
    history={'train_loss':[],'val_loss':[],'train_accuracy':[],'val_accuracy':[],'val_bg_reject':[]}
    
    ##------
    #Create lists to access the lenght below
    train_data=list(train_data)
    val_data=list(val_data)    
#    print('train data length=',len(train_data))
#    print('train data[0]=',train_data[0])
#    print('batch_size=',params.batch_size)
    num_steps_train=len(train_data)//params.batch_size
#    print('num_steps_train=',num_steps_train)
    num_steps_val=len(val_data)//params.batch_size
      
    # We truncate the dataset so that we get an integer number of batches    
    train_x=np.asarray([x for (x,y) in train_data][0:num_steps_train*params.batch_size])
    train_y=np.asarray([y for (x,y) in train_data][0:num_steps_train*params.batch_size])        
    val_x=np.asarray([x for (x,y) in val_data][0:num_steps_val*params.batch_size])
    val_y=np.asarray([y for (x,y) in val_data][0:num_steps_val*params.batch_size])
#    print('truncated train data length=',len(train_x))
#    print('truncated label length=',len(train_y))
    ##------
    # Create train and val datasets. Customized dataset class: dataset.TreeDataset that will create the batches by calling data_loader.batch_nyu_pad. 
    data_train=train_x
    labels_train=train_y
    data_val=val_x
    labels_val=val_y
    transform=data_loader.batch_nyu_pad
    batch_size=params.batch_size
    features=params.features
    print('transform=',transform)
    print('features=',features)
    shuffle_train=False
    shuffle_val=False
    train_data = dataset.TreeDataset(data_train,labels_train,transform,batch_size,features,shuffle_train)
    print('train data=',train_data)

    val_data = dataset.TreeDataset(data_val,labels_val,transform,batch_size,features,shuffle_val)
  
    ##------
    # Create the dataloader for the train and val sets (default Pytorch dataloader). Paralelize the batch generation with num_workers. BATCH SIZE SHOULD ALWAYS BE = 1 (batches are only loaded here as a single element, and they are created with dataset.TreeDataset).
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=1, shuffle=False,
                                               num_workers=4, pin_memory=True, collate_fn=dataset.customized_collate) 
                                               
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=1, shuffle=False,
                                               num_workers=4, pin_memory=True, collate_fn=dataset.customized_collate) 
    
    ##------
    # Train/evaluate for each epoch
    for epoch in range(params.num_epochs):
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))
        # Train one epoch
        print('model type and dir =', type(model), dir(model))
        print('optimizer =', optimizer)
        print('loss_fun =', loss_fn)
        print('train_loader type and dir =', type(train_loader),dir(train_loader))
        print('metrics =', metrics)
        print('params type and dir=', type(params),dir(params))
        print('num_ateps_train =', num_steps_train)
        train_metrics = train(model, optimizer, loss_fn, train_loader, metrics, params, num_steps_train)
            
        # Evaluate for one epoch on validation set
        val_metrics, inv_fpr = evaluate(model, loss_fn, val_loader, metrics, params, num_steps_val)      

          # Minimize the accuracy on the val set  
#         val_acc = val_metrics['accuracy']
#         is_best = val_acc >= best_val_acc

#         
#         # Minimize the loss on the val set
#         val_acc = val_metrics['loss']
#         is_best = val_acc <= best_val_acc
        
        
#         # Maximize the bg rejection at 30% tag eff on the val set
#         val_acc = inv_fpr
#         print('val_acc=',val_acc)
#         is_best = val_acc >= best_val_acc
        
#         # Save history
#         history['train_loss'].append(train_metrics['loss'])
#         history['val_loss'].append(val_metrics['loss'])
#         history['train_accuracy'].append(train_metrics['accuracy'])
#         history['val_accuracy'].append(val_metrics['accuracy'])
#         history['val_bg_reject'].append(inv_fpr)
        
#         scheduler.step()
#         step_size = step_size * decay
        
#         # Save weights
#         utils.save_checkpoint({'epoch': epoch + 1,
#                                'state_dict': model.state_dict(),
#                                'optim_dict' : optimizer.state_dict()}, 
#                                is_best=is_best,
#                                checkpoint=model_dir)
            
#         # If best_eval, best_save_path        
#         if is_best:
# #             logging.info("- Found new best accuracy")
# #             logging.info("- Found new lowest loss")
#             best_val_acc = val_acc
#             logging.info('- Found new best bg rejection = {}'.format(best_val_acc))
            
#             # Save best val metrics in a json file in the model directory
#             best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
#             utils.save_dict_to_json(val_metrics, best_json_path)

#         # Save latest val metrics in a json file in the model directory
#         last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
#         utils.save_dict_to_json(val_metrics, last_json_path)

#         # Save loss history in a json file in the model directory
#         # print('loss_hist=')
#         hist_json_path = os.path.join(model_dir, "metrics_history.json")
#         utils.save_dict_list_to_json(history, hist_json_path)    

    

In [131]:
algo='kt'
sample_type = 'ginkgo'
architecture='simpleRecNN'
model_dir = 'experiments/ginkgo_kt_48jets'
data_dir = '../data/preprocessed_trees/'
restore_file = None

print('Model dir=',model_dir)
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
##-------------------
# Set the logger
utils.set_logger(os.path.join(model_dir, 'train.log'))
# Load the parameters from json file
json_path = os.path.join(model_dir, 'params.json')
assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
params = utils.Params(json_path)

sample_filename = model_dir.split('/')[1]
logging.info('sample_filename={}'.format(sample_filename))
train_data=data_dir+'train_'+sample_filename+'.pkl'
val_data=data_dir+'dev_'+sample_filename+'.pkl'
test_data=data_dir+'test_'+sample_filename+'.pkl'

start_time = time.time() 

sample_filename=ginkgo_kt_48jets


Model dir= experiments/ginkgo_kt_48jets


In [132]:
##----------------------------------------------------------------------------------------------------------
###   TRAINING
##----------------------------------------------------------------------------------------------------------
data_loader=dl.DataLoader # Main class with the methods to load the raw data, create and preprocess the trees


# use GPU if available
params.cuda = torch.cuda.is_available()

# Set the random seed for reproducible experiments
#   torch.manual_seed(230)
#   if params.cuda: torch.cuda.manual_seed(230)
if params.cuda: torch.cuda.seed()
##-----------------------------
# Create the input data pipeline 
logging.info('---'*20)
logging.info("Loading the datasets...")

# Load data 
with open(train_data, "rb") as f: train_data=pickle.load(f)
with open(val_data, "rb") as f: val_data=pickle.load(f) 


logging.info("- done loading the datasets") 
logging.info('---'*20)  

------------------------------------------------------------
Loading the datasets...
- done loading the datasets
------------------------------------------------------------


In [133]:


##----------------------------------------------------------------------
## Architecture

# Define the model and optimizer

## a) Simple RecNN 
if architecture=='simpleRecNN': 
    print(params.cuda)
    model = net.PredictFromParticleEmbedding(params,make_embedding=net.GRNNTransformSimple).cuda() if params.cuda else net.PredictFromParticleEmbedding(params,make_embedding=net.GRNNTransformSimple) 

##----
## b) Gated RecNN
elif architecture=='gatedRecNN':
    model = net.PredictFromParticleEmbeddingGated(params,make_embedding=net.GRNNTransformGated).cuda() if params.cuda else net.PredictFromParticleEmbeddingGated(params,make_embedding=net.GRNNTransformGated) 

## c) Leaves/inner different weights -  RecNN 
elif architecture=='leaves_inner_RecNN': 
    model = net.PredictFromParticleEmbeddingLeaves(params,make_embedding=net.GRNNTransformLeaves).cuda() if params.cuda else net.PredictFromParticleEmbeddingLeaves(params,make_embedding=net.GRNNTransformLeaves) 

##----
## d) Network in network (NiN) - Simple RecNN
elif architecture=='NiNRecNN':
    model = net.PredictFromParticleEmbeddingNiN(params,make_embedding=net.GRNNTransformSimpleNiN).cuda() if params.cuda else net.PredictFromParticleEmbeddingNiN(params,make_embedding=net.GRNNTransformSimpleNiN)  

##-----
## e) Network in network (NiN) - Simple RecNN
elif architecture=='NiNRecNN2L3W':
    model = net.PredictFromParticleEmbeddingNiN2L3W(params,make_embedding=net.GRNNTransformSimpleNiN2L3W).cuda() if params.cuda else net.PredictFromParticleEmbeddingNiN2L3W(params,make_embedding=net.GRNNTransformSimpleNiN2L3W)  

##-----
## f) Network in network (NiN) - Gated RecNN
elif architecture=='NiNgatedRecNN':
    model = net.PredictFromParticleEmbeddingGatedNiN(params,make_embedding=net.GRNNTransformGatedNiN).cuda() if params.cuda else net.PredictFromParticleEmbeddingGatedNiN(params,make_embedding=net.GRNNTransformGatedNiN) 


##-----
## g) Network in network (NiN) -- NiN RecNN ReLU
elif architecture=='NiNRecNNReLU':
    model = net.PredictFromParticleEmbeddingNiNReLU(params,make_embedding=net.GRNNTransformSimpleNiNReLU).cuda() if params.cuda else net.PredictFromParticleEmbeddingNiNReLU(params,make_embedding=net.GRNNTransformSimpleNiNReLU) 


##----------------------------------------------------------------------
# Output number of parameters of the model
pytorch_total_params = sum(p.numel() for p in model.parameters())
pytorch_total_weights = sum(p.numel() for p in model.parameters() if p.requires_grad)

logging.info("Total parameters of the model= {}".format(pytorch_total_params))
logging.info("Total weights of the model= {}".format(pytorch_total_weights))

##----------------------------------------------------------------------
## Optimizer and loss function

logging.info("Model= {}".format(model))
logging.info("---"*20)  
logging.info("Building optimizer...")

step_size=params.learning_rate
decay=params.decay
#   optimizer = optim.SGD(model.parameters(), lr = 0.01, momentum=0.9)
optimizer = optim.Adam(model.parameters(), lr=step_size)#,eps=1e-05)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=decay)

# fetch loss function and metrics
loss_fn = torch.nn.BCELoss()
#   loss_fn = torch.nn.CrossEntropyLoss()
metrics = net.metrics

##----------------------
# Train the model
logging.info("Starting training for {} epoch(s)".format(params.num_epochs))

train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics, params, model_dir, step_size,restore_file)   

elapsed_time=time.time()-start_time
logging.info('Total time (minutes) ={}'.format(elapsed_time/60))










Total parameters of the model= 34113
Total weights of the model= 34113
Model= PredictFromParticleEmbedding(
  (fc_u): Linear(in_features=7, out_features=64, bias=True)
  (fc_h): Linear(in_features=192, out_features=64, bias=True)
  (transform): GRNNTransformSimple(
    (fc_u): Linear(in_features=7, out_features=64, bias=True)
    (fc_h): Linear(in_features=192, out_features=64, bias=True)
  )
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
)
------------------------------------------------------------
Building optimizer...
Starting training for 40 epoch(s)
Epoch 1/40


False
transform= <function DataLoader.batch_nyu_pad at 0x7f80529799d0>
features= 7
train data= <model.dataset.TreeDataset object at 0x7f8052a7b5e0>
model type and dir = <class 'model.recNet.PredictFromParticleEmbedding'> ['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply', '_backward_hooks', '_buffers', '_call_impl', '_forward_hooks', '_forward_pre_hooks', '_get_backward_hooks', '_get_name', '_is_full_backward_hook', '_load_from_state_dict', '_load_state_dict_pre_hooks', '_maybe_warn_non_full_backward_hook', '_modules', '_named_members', '_non_persistent_buffers_set', '_parameters', '_register_load_state_dict_pr

  0%|                                                     | 0/2 [00:05<?, ?it/s]


TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/Users/laurengreenspan/miniconda3/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
    data = fetcher.fetch(index)
  File "/Users/laurengreenspan/miniconda3/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/Users/laurengreenspan/miniconda3/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/Users/laurengreenspan/GitDLs/TreeNiNNew/code_ginkgo/recnn/model/dataset.py", line 44, in __getitem__
    levels, children, n_inners, contents, n_level= self.transform(self.data[index*self.batch_size:(index+1)*self.batch_size],self.features)
TypeError: 'int' object is not callable


In [None]:
#-------------------------------------------------------------------------------------------------------------
###///////////////////////////////////////////////////////////////////////////////////////////////////////////
#-------------------------------------------------------------------------------------------------------------
if __name__=='__main__':  

  ##----------------------------------------------------------------------------------------------------------
  # Global variables
  ##-------------------

  ##------------------------------------------------------------  
  parser = argparse.ArgumentParser()
  parser.add_argument('--data_dir', default='../data/preprocessed_trees/', help="Directory containing the input batches")

  parser.add_argument('--model_dir', default='experiments/ginkgo_kt_48_jets/run_0', help="Directory containing params.json")
  parser.add_argument('--restore_file', default=None,
                      help="Optional, name of the file in --model_dir containing weights to reload before \
                      training")  # 'best' or 'last'

  parser.add_argument('--jet_algorithm', help="jet algorithm")
  parser.add_argument('--architecture', default='simpleRecNN', help="RecNN architecture")
  parser.add_argument('--sample_type', default='ginkgo', help="sample type")