<a href="https://colab.research.google.com/github/ketanhdoshi/ml/blob/master/lib/app_lib.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Application utilities

**Todos**
*   Move Corpus Bleu out of this file?
*   Make set_data_path a regular method rather than a static method. Then you can initialise an App object, set all the data paths for the app once, and use them everywhere without having to pass them in

**Done**
*   DONE Make AppTabular child of AppBase
*   DONE Make ArchTabular child of ArchBase

### Imports

In [None]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [None]:
# Notebook display setting: with ast_node_interactivity set to "all", IPython shows the
# value of every top-level expression in a cell rather than only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import IPython.core.debugger as db

In [None]:
#export

from pathlib import Path
import math
from functools import partial
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
# Mount Google Drive inside the Colab VM so that data and notebooks stored there can be used
from google.colab import drive
drive.mount('/content/gdrive')
# NOTE(review): both paths below are relative, while the drive is mounted at the absolute
# path '/content/gdrive' - presumably the working directory is '/content'; confirm.
gd_path = 'gdrive/My Drive/Colab Data/fastai-v3'  # data folder - change to your project folder
gn_path = 'gdrive/My Drive/Colab Notebooks'  # notebooks folder - change to your project folder

# Put the 'exp' folder on the module search path. This is the folder that the Export
# section of this notebook writes the generated 'nb_*.py' module into, so the
# 'from nb_xxx import ...' statements in later cells can be resolved.
import sys
sys.path.insert(1, gn_path + '/exp')

In [None]:
#export

from nb_util import mse
from nb_hooks import Hooks, DebugActivationHook
from nb_arch import ArchBase
from nb_training import Trainer, CudaCB, ProgressCallback, MetricsGrp, LossMetricsCB, DebugTracker, DebugYhatLossCB
from nb_optimiser import HyperParams, Recorder, LRRangeFind

### Define Text Translation Data Bundle

In [None]:
#export

from torch.nn.utils.rnn import pad_sequence
from nb_data import DataBundle, CSVItemContainer, DfItemList, SentenceItemList, SentenceWordItemList, SentenceWordIdItemList, SortishSampler, SortSampler

# ----------------------------
# Collate function to convert a list of item tuples into a pair of padded tensors which
# are fed as input to the model
#
# 'samples' is a list of tuples ie. [(x1, y1), (x2, y2), ...] where 'xn' and 'yn' are
# both lists of word ids in the sentence. All the sentences could have different lengths, so
# after each 'xn' and 'yn' are converted to tensors, they are padded with 'pad_idx' to
# make them the same length.
#
# 'pad_first' selects where the padding goes: False (the default) pads at the end of
# each sentence, True pads at the beginning.
#
# Returns the tuple (px, py), each of shape (samples, max sequence length)
# ----------------------------
def seq_collate(samples, pad_idx=1, pad_first=False):
  # Convert one list of sentences into a single padded tensor
  def pad_batch(sentences):
    tensors = [torch.tensor(sentence) for sentence in sentences]
    if not pad_first:
      return pad_sequence(tensors, batch_first=True, padding_value=pad_idx)

    # pad_sequence is only used here to pad at the end, so to pad at the beginning we
    # reverse every sentence, pad at the end, and then reverse the padded batch back
    # along the sequence dimension.
    reversed_tensors = [t.flip(0) for t in tensors]
    padded = pad_sequence(reversed_tensors, batch_first=True, padding_value=pad_idx)
    return padded.flip(1)

  # 'x' and 'y' are padded independently, so they may end up with different lengths
  px = pad_batch([s[0] for s in samples])
  py = pad_batch([s[1] for s in samples])
  return px, py

#----------------------------------------------------
# The custom sampler functions need to take a sort-key-function as an extra argument. 
# The key function needs the 'data' argument pre-bound using a partial.
# 
# This is a standalone function rather than a lambda function, because pickle is not able to
# save lambda functions.
#----------------------------------------------------
def len_key_fn(i, data):
  """Sort key for the length-based samplers: the length of item `i` of `data`."""
  item = data[i]
  return len(item)

#----------------------------------------------------
# Text Translation from CSV data preparation pipeline
#----------------------------------------------------
class TextTranslationCSVDataBundle(DataBundle):
  # ----------------------------
  # Configure the data preparation pipeline for a translation CSV file and pass it
  # to the parent DataBundle:
  #   - load all rows from the CSV file at 'csv_path'
  #   - split them randomly, 80% for training and 20% for validation
  #   - take the 'x' sentences from the 'fr' column and the 'y' sentences from the 'en' column
  #   - convert both 'x' and 'y' from Sentences to Words to Word Ids
  # 'bs' is the batch size used by both the training and the valid/test data loaders.
  # ----------------------------
  def __init__(self, csv_path, bs):
    print ('--------- Text Translation DataBundle init', csv_path)

    # Builds a fresh Sentence -> Words -> Word Ids conversion chain on every call, so
    # that 'x' and 'y' each get their own independent list of settings
    def word_id_steps():
      return [
          {'target_cls': SentenceWordItemList, 'convert_procedure': 'SentenceToWord'},
          {'target_cls': SentenceWordIdItemList, 'convert_procedure': 'WordToWordId'}
      ]

    load_params = dict(source=CSVItemContainer, target_cls=DfItemList, csv_path=csv_path)
    split_params = dict(split_procedure='split_random', train_ratio=0.8, valid_ratio=0.2)
    extract_x_params = dict(extract_procedure='extract_colval', target_cls=SentenceItemList, col='fr', lang='fr')
    extract_y_params = dict(extract_procedure='extract_colval', target_cls=SentenceItemList, col='en')
    convert_x_params = word_id_steps()
    convert_y_params = word_id_steps()

    # Training and valid/test use different samplers, both keyed on sentence length
    train_dl_params = {'bs': bs, 'sampler_fn': SortishSampler, 'key_fn': len_key_fn, 'collate_fn': seq_collate}
    valid_dl_params = {'bs': bs, 'sampler_fn': SortSampler, 'key_fn': len_key_fn, 'collate_fn': seq_collate}
    dl_params = (train_dl_params, valid_dl_params)

    # Parked alternative without custom samplers. Currently not passed to the parent.
    WAIT_dl_params = (
        {'bs': bs, 'shuffle': False, 'collate_fn': seq_collate},    # for training
        {'bs': bs, 'shuffle': False, 'collate_fn': seq_collate}     # for valid/test
    )

    self.display_params = {'layout_procedure': 'display_texts'}
    super().__init__(load_params, split_params, extract_x_params, extract_y_params, convert_x_params, convert_y_params, dl_params=dl_params)

  # ----------------------------
  # Custom get_sampler() which overrides the one in the parent DataBundle. The sorting
  # samplers need a key function with its 'data' argument pre-bound via a partial.
  #
  # The sampler classes are hardcoded because the two take different arguments; the
  # 'sampler_fn' that is passed in is ignored. Training gets a SortishSampler (which
  # also takes the batch size), everything else gets a SortSampler.
  # ----------------------------
  def get_sampler(self, ds, in_train, bs, sampler_fn, key_fn, **kwargs):
    length_key = partial(key_fn, data=ds.x)
    if not in_train:
      return SortSampler(ds.x, key=length_key)
    return SortishSampler(ds.x, key=length_key, bs=bs)

### Bleu Score Metric

In [None]:
#export

from math import exp

class NGram():
    """Hashable wrapper around a sequence of token ids, usable as a Counter/dict key.

    `ngram` is the sequence of token ids and `max_n` is the base used to fold the ids
    into a single hash value (one "digit" per position). Two NGrams are equal when
    they have the same length and the same ids in the same order.
    """
    def __init__(self, ngram, max_n=5000):
        self.ngram,self.max_n = ngram,max_n

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types rather than
        # failing on a missing 'ngram' attribute
        if not isinstance(other, NGram): return NotImplemented
        if len(self.ngram) != len(other.ngram): return False
        # Element-wise comparison that works for lists, tuples and numpy arrays alike,
        # without depending on numpy being imported
        return all(a == b for a, b in zip(self.ngram, other.ngram))

    def __hash__(self):
        # Treat the ids as the digits of a number written in base 'max_n'
        return int(sum(o * self.max_n**i for i,o in enumerate(self.ngram)))

def get_grams(x, n, max_n=5000):
    """Return the n-grams of the sequence `x`.

    For n == 1 the sequence itself is returned unchanged. Otherwise every window of
    `n` consecutive items is wrapped in an NGram (built with `max_n`), giving
    len(x) - n + 1 n-grams, or an empty list when `x` is shorter than `n`.
    """
    if n == 1:
        return x
    num_windows = len(x) - n + 1
    return [NGram(x[start:start+n], max_n=max_n) for start in range(num_windows)]

def get_correct_ngrams(pred, targ, n, max_n=5000):
    """Count how many n-grams of `pred` also occur in `targ`.

    Each distinct predicted n-gram is credited at most as many times as it appears in
    the target (clipped counts). Returns the tuple (number of matching n-grams, total
    number of n-grams in `pred`).
    """
    # Imported here because no cell of this notebook imports Counter at module level
    from collections import Counter

    pred_grams = get_grams(pred, n, max_n=max_n)
    targ_grams = get_grams(targ, n, max_n=max_n)
    pred_cnt, targ_cnt = Counter(pred_grams), Counter(targ_grams)

    # A Counter returns 0 for n-grams that never occur in the target
    prec = sum(min(c, targ_cnt[g]) for g, c in pred_cnt.items())
    return prec, len(pred_grams)

def corpus_bleu(preds, targs, max_n=5000):
    """Compute a BLEU score over a batch of predictions and targets.

    The predicted ids are taken as the argmax over the last dimension of `preds`, and
    each row is compared with the matching row of `targs`. Matching n-gram counts for
    n = 1..4 are accumulated over all rows; the score is the geometric mean of the
    four precisions multiplied by a brevity penalty, which applies only when the
    predictions are shorter in total than the targets.
    """
    max_order = 4
    matched, totals = [0]*max_order, [0]*max_order
    pred_len = targ_len = 0

    pred_ids = preds.argmax(dim=-1).cpu().numpy()
    targ_ids = targs.cpu().numpy()
    for pred, targ in zip(pred_ids, targ_ids):
        pred_len += len(pred)
        targ_len += len(targ)
        for order in range(1, max_order + 1):
            hits, total = get_correct_ngrams(pred, targ, order, max_n=max_n)
            matched[order-1] += hits
            totals[order-1] += total

    # An n-gram order with no predicted n-grams at all contributes a precision of 0
    precisions = [(hits/total if (total > 0) else 0) for hits, total in zip(matched, totals)]

    if pred_len < targ_len:
        len_penalty = exp(1 - targ_len/pred_len)
    else:
        len_penalty = 1

    p1, p2, p3, p4 = precisions
    return len_penalty * ((p1*p2*p3*p4) ** 0.25)


In [None]:
# Not used at the moment
class CorpusBLEU(Callback):
    """Callback that accumulates BLEU statistics batch by batch over an epoch.

    `vocab_sz` is passed to the n-gram helpers as the hashing base (`max_n`). The
    lengths and n-gram counts are reset at the start of every epoch, updated after
    every batch, and turned into a single BLEU value at the end of the epoch.
    """
    def __init__(self, vocab_sz):
        self.vocab_sz = vocab_sz
        self.name = 'bleu'

    def on_epoch_begin(self, **kwargs):
        # Reset the running totals: lengths, and matching/total counts for n = 1..4
        self.pred_len,self.targ_len,self.n_precs,self.counts = 0,0,[0]*4,[0]*4

    def on_batch_end(self, last_output, last_target, **kwargs):
        # Predicted ids are the argmax over the last dimension of the output
        last_output = last_output.argmax(dim=-1)
        for pred,targ in zip(last_output.cpu().numpy(),last_target.cpu().numpy()):
            self.pred_len += len(pred)
            self.targ_len += len(targ)
            for i in range(4):
                c,t = get_correct_ngrams(pred, targ, i+1, max_n=self.vocab_sz)
                self.n_precs[i] += c
                self.counts[i] += t

    def on_epoch_end(self, last_metrics, **kwargs):
        # Use the totals accumulated on the instance. An n-gram order with no predicted
        # n-grams contributes a precision of 0, the same as in corpus_bleu().
        n_precs = [c/t if (t > 0) else 0 for c,t in zip(self.n_precs,self.counts)]
        # The brevity penalty only applies when the predictions are shorter than the targets
        len_penalty = exp(1 - self.targ_len/self.pred_len) if self.pred_len < self.targ_len else 1
        bleu = len_penalty * ((n_precs[0]*n_precs[1]*n_precs[2]*n_precs[3]) ** 0.25)
        # NOTE(review): add_metrics is not defined or imported anywhere in this notebook -
        # presumably it comes from the fastai callback API; confirm before using this class.
        return add_metrics(last_metrics, bleu)

### App Base class

In [None]:
#export

class AppBase():
  # ----------------------------
  # Base class for an application. It holds the data bundle ('db'), the architecture
  # ('_arch'), the loss function, the metrics callbacks and the debug settings, and
  # provides the training and Learning Rate Finder loops on top of them.
  #
  # 'loss_type' selects the loss function: 'bin_classif', 'multi_classif' or 'regression'.
  # 'metrics_cbs' is a list of extra metrics callbacks. A LossMetricsCB is always put
  # in front of them, and a MetricsGrp wrapping all of them is appended at the end.
  #
  # Raises ValueError for an unknown 'loss_type'.
  #
  # 'db' and '_arch' start out as None and have to be populated (eg. by a subclass)
  # before create_debug(debug_fwd=True), run_train() or lr_find() are called.
  # ----------------------------
  def __init__(self, loss_type='bin_classif', metrics_cbs=[]):
    self._arch = None
    self.db = None

    self.debug_cbs = []
    self.dtr = None
    self.hooks = None

    # Select the appropriate loss function for the type of problem
    if (loss_type == 'bin_classif'):
      # Binary classification problems
      self.loss_fn = F.binary_cross_entropy_with_logits
    elif (loss_type == 'multi_classif'):
      # Multi-class classification problems
      self.loss_fn = F.cross_entropy
    elif (loss_type == 'regression'):
      self.loss_fn = mse
    else:
      # Fail with a clear message rather than on a missing 'loss_fn' attribute
      raise ValueError(f"Unknown loss_type '{loss_type}': expected 'bin_classif', 'multi_classif' or 'regression'")

    # The default 'metrics_cbs' list is never modified, a new list is built instead
    metrics_cbs = [LossMetricsCB()] + metrics_cbs
    self.metrics_cbs = metrics_cbs + [MetricsGrp(metrics_cbs)]

  # ----------------------------
  # Create the debug settings and store them in 'dtr', 'hooks' and 'debug_cbs', which
  # are then picked up by run_train().
  #
  # 'use_dtr' enables the DebugTracker and the DebugYhatLossCB callback.
  #     'track_batches_per_epoch', 'disp_tb' and 'disp_pd' are passed to the DebugTracker
  #     and 'debug_bkwd' is passed to the DebugYhatLossCB.
  # 'debug_fwd' (only looked at when 'use_dtr' is set) adds DebugActivationHooks on the
  #     layers returned by the architecture's hook_groups(). The architecture and model
  #     must be created already.
  # 'abort_iter', if greater than 0, adds an AbortTrainCB created with that value.
  # ----------------------------
  def create_debug(self, use_dtr=False, track_batches_per_epoch=5, disp_tb=False, disp_pd=True, debug_bkwd=False, debug_fwd=False, abort_iter=0):
    dtr, hooks, debug_cbs = None, None, []
    if (use_dtr):
      dtr = DebugTracker(max_count=track_batches_per_epoch, disp=(disp_tb, disp_pd))
      debug_cbs += [dtr, DebugYhatLossCB(fwd=False, bkwd=debug_bkwd)]

      # Add hooks for the forward pass activations
      if (debug_fwd):
        # Arch and Model should be created already
        assert(self._arch and self._arch.model)
        arch = self._arch
        model = arch.model

        # Add Debug Hooks to the hook_layers and save a list of all the hooks
        hook_cls=[[partial(DebugActivationHook, do_print=False, model=model, dtr=dtr)]]
        hook_groups = arch.hook_groups()
        hooks = Hooks(hook_groups, hook_cls)

    if (abort_iter > 0):
      # Imported here because the exported imports of this module do not include
      # AbortTrainCB. It lives in nb_training along with the other callbacks.
      from nb_training import AbortTrainCB
      debug_cbs += [AbortTrainCB(abort_iter)]

    self.dtr, self.hooks, self.debug_cbs = dtr, hooks, debug_cbs

  # ----------------------------
  # Callbacks shared by run_train() and lr_find(): GPU first, then the Recorder and
  # progress tracking, then the application-specific callbacks. A new list with new
  # callback objects is built on every call.
  # NOTE: the device is hardcoded to the first CUDA device.
  # ----------------------------
  def _common_cbs(self, app_cbs):
    gpu_cbs = [CudaCB(device = torch.device('cuda',0))]
    track_cbs = [Recorder(), ProgressCallback()]
    return gpu_cbs + track_cbs + list(app_cbs)

  #----------------------------------------------------
  # Train the model for 'num_epochs' using AdamW with the given 'weight_decay'.
  #
  # 'split_lr' is the list of learning rates passed to HyperParams.set()
  # 'one_cycle' (must be a bool) selects the 'one_cycle' LR schedule, otherwise no
  #     schedule is used
  # 'app_cbs' are additional application-specific callbacks
  #
  # Returns the Trainer after fitting.
  #----------------------------------------------------
  def run_train(self, num_epochs=1, split_lr=[1e-3], weight_decay=0.2, one_cycle=False, app_cbs=[]):
    assert(isinstance(one_cycle, bool))

    train_dl = self.db.train_dl
    valid_dl = self.db.valid_dl
    model = self._arch.model

    opt_adamw = partial(optim.AdamW, betas=(0.9, 0.99), weight_decay=weight_decay)
    lr_sched='one_cycle' if one_cycle else None
    opt, hyper_cbs = HyperParams.set(model, module_groups=None, split_lr=split_lr, split=False, lr_sched=lr_sched, opt_func=opt_adamw)

    # The order of the callbacks is kept fixed: common, metrics, hyperparams, debug
    callbs = self._common_cbs(app_cbs) + self.metrics_cbs + hyper_cbs + self.debug_cbs

    loop = Trainer(train_dl, valid_dl, model, opt, self.loss_fn, callbs, dtr=self.dtr)
    loop.hooks = self.hooks

    loop.fit(num_epochs=num_epochs)
    return loop

  # ----------------------------
  # Learning Rate Finder. Runs 'num_iter' training iterations with the 'lrf' schedule
  # going from 'start_lr' to 'end_lr', using only the training data (there is no
  # validation data loader) and without any metrics or debug callbacks.
  #
  # Returns the Trainer after fitting.
  # ----------------------------
  def lr_find(self, start_lr, end_lr, num_iter, weight_decay=0.01, app_cbs=[]):
    train_dl = self.db.train_dl
    valid_dl = None
    model = self._arch.model

    opt_adamw = partial(optim.AdamW, betas=(0.9, 0.99), weight_decay=weight_decay)
    opt, hyper_cbs = HyperParams.set(model, module_groups=None, split_lr=[start_lr], split=False, lr_sched='lrf', opt_func=opt_adamw, start_lr=start_lr, end_lr=end_lr, num_iter=num_iter)

    lrf_cbs = [LRRangeFind(num_iter)]
    callbs = self._common_cbs(app_cbs) + lrf_cbs + hyper_cbs

    loop = Trainer(train_dl, valid_dl, model, opt, self.loss_fn, callbs, dtr=None)

    # Enough epochs to cover 'num_iter' batches
    num_epochs = int(math.ceil(num_iter / len(train_dl)))
    loop.fit(num_epochs=num_epochs)
    return loop

### Tabular To Be Sorted

In [None]:

  # DONE Rossman - load using prepare
  # DONE Rossman - run basic model
  # DONE Test this - Change all convert methods for all ItemLists to take 'items' as the first argument
  # DONE Change extract_select to have a flag - take all cat/cont or target. Also update the feature dict cat and cont cols accordingly
  # DONE Remove unnecessary stuff out of HomeCreditDB
  # DONE reduce_mem - should take flag to treat 'object' columns as categorical
  # DONE ROC_AUC Callback
  # DONE getobj() what about datatype of np array - if not float then make it float
  # DONE Document stack trace
  # DONE Document copysettingwitherror
  # DONE Home Credit - One result
  # DONE listing of cont/cat columns in prepare() and feature_dict. Naming of '_y' columns after join.
  # DONE AppTabular run_train should take a flag to enable/disable hooks and dtr
  # DONE Add aggregates for the 4 balance tables
  # DONE Add relative values for amounts
  # DONE Give two letter prefix to all columns from auxiliary tables so there is no name conflict
  # DONE Split MetricsCB into AverageMetricsCB
  # DONE AppTabular init() should take metrics as an argument
  # DONE Generic Metrics Callback base class that AverageMetrics and ROC inherit from
  # DONE Rossman - EDA
  # DONE Utility fn to create 'subset' version of dataset
  # DONE Rossman - One result, with correct accuracy/loss, take log if needed
  # DONE Use One Cycle
  # DONE Categorical columns missing values get set to -1. Check if we have any, and add +1 to all codes like in Fastai, so that missing becomes 0
  # DONE Run with all data as training, no validation, and check what training loss we get for the rows (including the last 20% which would have been validation)
  # DONE Calculate loss on the full accumulated yhat/yb for an epoch
  # Same Model, Diff data:
    # DONE Run train with 80% as normal. For validation, run the full 100% data. Here the model weights are fixed. Now see what loss and yhat we get for the first 80% data and for the final 20% data.
    # DONE Flip train and validation data
  # Same data, diff model:
    # Toward the end of training - check yhat/loss on the last 10 batches or so. Then use those same 10 batches in validation and check yhat/loss.
  # DONE Run the full data with only model forward and loss ie. no backward/opt, and see yhat and loss for first 80% and last 20%
  # DONE Pickle App and Reload it
  # DONE Torch.save Model and Reload it
  # DONE Run prepare with full data and pickle it, and reload it
  # DONE Use old MetricsCB
  # DONE Use no metrics and just check loss
  # DONE Use my adam_opt_func not Pytorch optim.Adam
  # DONE Train a model for 2000 batches and then torch.save the model and reload it, before starting debugging.
  # DONE Run with DTR and DebugYhat and check gradient change on the backward
  # DONE Add Val Pred and Loss in DebugYhat
  # DONE Check gradient with Rossman fastai.
  # NONEED Use my df, sorted by Date/Store with fastai - create Databunch and model
  # NONEED Use fastai df with my model
  # DONE Make data as SequentialSampler
  # DONE Check loss with both BCELoss and BCEWithLogitsLoss
  # DONE Still need to answer why outlier rows produce huge loss only in validation data, and not in training
  # NONEED Compare some outlier rows with non-outlier rows to see how they are different

  # ++++++++++ End-to-end run of Rossman with same results
  # DONE Compute y_range via a param and dont hardcode it
  # DONE Sequential Split should take a flag for 'test_rows' so that all sequential types are supported including by index, not just by random pct.
  # DONE Remove hard-coded CompetitionDistance in reduce_mem()
  # DONE Compute valid_idx automatically
  # DONE Correct size for validation dataset
  # DONE Make the optimiser AdamW usage generic in run_train() by passing in params for weight decay and betas.

  # ++++++++++ Full visibility into end-to-end runs and results
  # DONE Track training and validation loss - val loss not showing up in Tensorboard
  # DONE Track all hyperparameters - mom etc - 
  # NONEED Track metrics - already being outputted per epoch, not critical to include them in DTR for now
  # DONE Unique layer names
  # DONE Recorder and DTR should quickly output all info from a run to quickly view all important parameters
  # DONE Visualise all results in Tensorboard and/or Pandas matplotlib
  # DONE Also have a separate function to create the Debug/DTR with the right arguments. Then run_train() uses that DTR.

  # ++++++++++ Quicker turnaround during development by saving and loading pre-computed intermediate data state
  # DONE Incorporate subsets of different dataset sizes as part of normal workflow
  # DONE Load and save processed data from one run to the next
  # DONE Load and save prepared data from one run to the next
  # DONE Load and save model weights from one run to the next

  # ++++++++++ Ensure reproducibility and no degradation of results by comparing with previous runs
  # DONE Make generic utils to quickly compare data (prepared and processed and np array) from one run to the next
  # DONE Make generic utils to quickly compare results from one model execution run using the results DFs
  # DONE Generic utils/workflow to save good historical runs
  # Generic function for reproducible runs as a single top-level flag

  # ++++++++++ End-to-end run of Home Credit with same results
  # DONE Check if correct LR rate is used - implement LR Find
    # Recorder should plot a graph of Loss vs Log(LR), in addition to Loss vs iteration
    # In lr_find() val_dl is None which will cause exception if num_epochs > 1
    # Tune the exit condition in LRRangeFinder as per Fastai
  # DONE Generic way to pass in different Settings to create_arch() and run_train().
  # DONE Auto-try with different Settings values - for run_train(), create_arch() and later, for load_data()
  # DONE DTR should track (1) DONE epochs (2) DONE metrics per epoch (3) DONE Settings per run (4) DONE Timing per epoch
  # DONE Auto-way to easily store and compare results from these different runs
  # DONE Display results from different runs
  # DONE Smoothed moving average for training loss metric
  # DONE Timing of run is too slow - run with Fastai using my df and measure timing
  # DONE Have a flag for the HomeCreditDB to turn on/off add_col in check_missing(). Also missing threshold for dropping columns.

  # ++++++++++ Compare Fastai and KD results with RandomSampler - why is Fastai still better
  # DONE - removed truncation of last batch.
  # Try with drop_last=True and check if results improve

  # ++++++++++ Compare adam_opt_func timing with Pytorch optimiser.
  # Change my implementation to use in-place computations using func_()


  # ++++++++++ Generic Lib Features
  # ++++++++++
  # Auto detect GPU or not, and include Cuda Callback
  # Don't hardcode loop.cbs[1] to get the Recorder. Have a general way to get a callback
  # Fix HyperParams one cycle phases 0.3/0.7 and mom values .95/.85- create_OneCycleCB(split_lr, phases=[0.5, 0.5], mom_start=0.8, mom_mid=0.7, mom_end=0.8)
  # Hyperparam naming - beta vs momentum for AdamW - cleanup how it is implemented in OptimParamCB
  # DONE Hyperparam 'one_cycle' argument should be renamed to 'lr_sched' or something 
  # Trainer should take a Metrics param as a first class citizen rather than putting it in the Callbacks list
  # Increase momentum in BatchNorm constructor
  # Debug BatchNorm attributes - running_mean, running_var, learnable gamma and beta parameters can be accessed by displaying the weight and bias members of a batch norm layer
  # Include print_grad() with buffers in the DebugYhat
  # Generic way to pass in first_div and final_div params to Create One Cycle
  # Better filter criteria for epoch and batch for DTR
  # In run_settings(), include a Run Description which can be the filename for saving a run

  # ++++++++++ Tabular Features
  # ++++++++++
  # DONE Fill Missing adds an extra column to indicate that it was filled, and make the extra column categorical
  # DONE Add a check in impute_values to warn about a high % of NaN or inf values, above some threshold. Or make a separate check_missing() converter. 
  # The default for Tabular split_param should be random, not sequential.
  # Add a split_date() function
  # split_idxs() should take a function which gets passed the train and test rows and returns the idxs for splitting
  # Add a convert function to TabularItemList for ordinal category values
  # Then remove columns which have a high missing %age eg. PCT_CREDIT_Refused, Cancelled, PCT_ANNUITY_Refused etc.
  # db.make_subset() should be able to merge one related file at a time and create a subset for that file, and then loop to the 
  #     next related file rather than require a merge_all function which merges everything all at once. Look at the comments in
  #     make_subset() for more details.
  # DONE reduce_mem() downgrade of float values is dangerous - get rid of it
  # Feature Dict stuff
  # Read CSV with low_memory=False and/or giving a list of dtypes
  # Do EDA with bivariate data_target, correlation, stack_hist
  # Write a run_predict() method
  # Add EDA or Converter logic for converting columns to the correct data types, because the initial could be garbage
  # Normalise() - if std is 0, then what to do??
  # Comments in Tabular Item List
  # Normal Workflow - Break down the problem into smaller pieces
     # Run with base-data ie only 'train.csv' columns with no related.
     # Subset with say 10 batches - ie. 640 rows. Or with 10000 rows?
     # Test with only continuous columns first
     # Then add categorical columns
     # Test that fill missing and normalise give us correct results
     # Then add rollup of related, but only one table at first. Then slowly add more tables.
     # Then make a subset of 100 batches, then 1000 and 10000
  # https://stackoverflow.com/questions/55894132/how-to-correct-unstable-loss-and-accuracy-during-training-binary-classificatio
  # https://stats.stackexchange.com/questions/352036/what-should-i-do-when-my-neural-network-doesnt-learn
  #

  # ++++++++++ Rossman Features
  # ++++++++++
  # Rossman - fix the sort order of Date and Store in the prepare()
  # Rossman - add back Customer as a target column

  # ++++++++++ Home Credit Features
  # ++++++++++
  # Run with Fastai using my df and measure ROC AUC score
  # Verify that ROC AUC is implemented correctly - right now it just stays flat
  # Again compare scores and df with Aguiar
  # Add columns that group numerics into bins
  # Convert FLAG_xxx columns from Y/N to bool
  # Check if some numerics should be treated as categorical
  # Include some more columns where mode/missing data is Ok
  # Use automated feature polynomials
  # Sort HC by Curr ID and Rossman by Date -> Store

  # ++++++++++ Optional Util Features
  # ++++++++++
  # Create a inspect stack trace utility, and simulate an error by returning int array from getobj. Put a try-catch around the loop.fit
  # Make generic utils for Inspect Stack Trace and Memory Consumption in debug_lib
  # Make generic example for Line Profiler timing measurements
  # Cleanup stuff under Temp Obsolete and Random Stuff
  # torch.set_num_threads(2)
  # Dataloader num workers
  # Make a separate NpItemlist
  # In getobj or to_np(), split the categorical np array (np.int64) and continuous np array (np.float32) into two separate arrays
  # Possible to incorporate remove_inf() converter inside fill_missing() or impute_values()
  #
  # Reduce RAM - del variables and call gc.collect regularly, pass in preset datatypes while reading, read fewer rows/skip rows, 
  # NODIFF - merge with index columns
  # INCREASE - convert to Sparse type


In [None]:
#----------------------------------------------------
# Load weights that were saved under different key names into 'model'. The saved
# weights are matched to the model's state dict keys purely by position, ie. the
# first saved tensor goes to the first model key and so on, so that the keys end up
# corresponding to the KD model module names.
#----------------------------------------------------
def rename_wgt_keys(model, weights_path):
  saved_wgts = torch.load(weights_path)
  model_keys = model.state_dict().keys()
  # Pair each model key with the saved tensor at the same position
  renamed_wgts = dict(zip(model_keys, saved_wgts.values()))
  model.load_state_dict(renamed_wgts)

def print_grad(self, model, before=''):
    """Summarise every parameter and buffer of `model` in a DataFrame.

    Returns one row per state dict entry with its name, shape, requires_grad flag,
    is_leaf flag, mean value ('param') and mean gradient ('grad', 0 when the entry
    has no gradient). `self` and `before` are not used.
    """
    # Imported here because no cell of this notebook imports pandas at module level
    import pandas as pd

    all_params = []
    # Use state_dict() not parameters() as it also returns buffers, not just parameters.
    # keep_vars=True is needed because state_dict() otherwise returns detached tensors,
    # which never have a gradient and never require grad.
    for name, param in model.state_dict(keep_vars=True).items():
        grad = param.grad
        all_params.append({
            'name': name,
            'shape': tuple(param.data.size()),
            'requires': param.requires_grad,
            'leaf': param.is_leaf,
            'param': param.data.float().mean().item(),
            'grad': (grad.mean().item() if grad is not None else 0),
        })
    return pd.DataFrame(all_params)


In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from nb_util import accuracy, accuracy_thresh, mse, rmspe, save_pickle, load_pickle, DebugRand
from nb_training import Trainer, CancelFitException, Callback, MetricsGrp, LossMetricsCB, AverageMetricsCB, RocAucMetricsCB, AbortTrainCB, ProgressCallback, CudaCB, DebugTracker, DebugYhatLossCB
#from nb_optimiser import get_optimiser, Recorder, adam_opt_func, HyperParams
from nb_optimiser import get_optimiser, Recorder, adam_opt_func, HyperParams, LRRangeFind
#from nb_hooks import Hooks, Hook
from nb_hooks import Hooks, DebugActivationHook
from nb_util import test_near_zero

### Tabular Arch

In [None]:
#export

#----------------------------------------------------
# Tabular model with Entity Embedding for categorical features followed
# by Linear blocks
#
# 'n_cat' and 'n_cont' are the number of categorical and continuous input columns. The
# categorical columns come first in the input.
# 'emb_szs' is a list of tuples [(num_categories, embedding_dim), (), ...] with one
# tuple per categorical feature
# 'emb_p' is the dropout probability applied to the concatenated embeddings
# 'hidden_szs' is a list of hidden sizes for each linear block [hid1, hid2, ...]. It
# may be empty, in which case the model is just the output layer.
# 'hidden_ps' is a list of dropout probabilities, one per linear block
# 'out_sz' is the number of outputs
# 'out_range' is an optional (low, high) pair. When given, the output is passed through
# a sigmoid and scaled into that range.
#
# Raises ValueError if 'hidden_szs' and 'hidden_ps' have different lengths.
#----------------------------------------------------
class Tabular(nn.Module):
  def __init__(self, n_cat, n_cont, emb_szs, emb_p, hidden_szs, hidden_ps, out_sz, out_range):
    super().__init__()
    if len(hidden_szs) != len(hidden_ps):
      # Every linear block needs its own dropout probability. A mismatch would
      # otherwise silently build a network with the wrong layers.
      raise ValueError(f'hidden_szs has {len(hidden_szs)} entries but hidden_ps has {len(hidden_ps)}')

    self.n_cat, self.n_cont = n_cat, n_cont
    self.out_range = out_range

    # Embedding layers and Dropout for categorical features
    self.cat_emb = nn.ModuleList([nn.Embedding(ln,dim) for ln, dim in emb_szs])
    self.emb_drop = nn.Dropout(emb_p)

    # BatchNorm layers for continuous features
    self.cont_bn = nn.BatchNorm1d(n_cont)

    # The first linear block takes all the embeddings plus the continuous features
    total_emb_dim = sum(dim for _, dim in emb_szs)
    last_sz = total_emb_dim + n_cont

    # Linear block layers with Linear, ReLU, BatchNorm and Dropout
    layers = []
    for hidden_sz, drop_p in zip(hidden_szs, hidden_ps):
      layers += [nn.Linear(last_sz, hidden_sz), nn.ReLU(inplace=True), nn.BatchNorm1d(hidden_sz), nn.Dropout(drop_p)]
      last_sz = hidden_sz

    # Output layer, fed by the last hidden block (or directly by the input features
    # when there are no hidden blocks)
    layers += [nn.Linear(last_sz, out_sz)]

    # Wrap all the layers into Sequential
    self.layers = nn.Sequential(*layers)

  def forward(self, inp):
    # Separate categorical and continuous features. The initial columns are
    # categorical
    cat_inp = inp[:, :self.n_cat].long()
    cont_inp = inp[:, self.n_cat:].float()

    # Pass each categorical feature through its embedding and dropout
    if (self.n_cat > 0):
      emb_vals = [emb(cat_inp[:, i]) for i, emb in enumerate(self.cat_emb)]
      emb_val = self.emb_drop(torch.cat(emb_vals, dim=1))
    else:
      # Empty tensor, made float so that it has the same dtype as the continuous values
      emb_val = cat_inp.float()

    # Process the continuous features through batch norm
    if (self.n_cont > 0):
      cont_val = self.cont_bn(cont_inp)
    else:
      # Empty tensor
      cont_val = cont_inp

    # Concatenate the embedding and continuous values
    combined_val = torch.cat([emb_val, cont_val], dim=1)

    # Process the sequential linear layers
    output = self.layers(combined_val)

    # Optionally squash the output into the range (low, high)
    if (self.out_range):
      low, high = self.out_range[0], self.out_range[1]
      output = low + torch.sigmoid(output) * (high - low)
    return output

#----------------------------------------------------
# Create the Tabular architecture
#----------------------------------------------------
class ArchTabular(ArchBase):
  # ----------------------------
  # Create the Tabular model. Calculates the number and size of all the Entity Embedding
  # layers for the categorical variables
  #
  # 'cat_szs' is a list with the number of categories of each categorical variable. The
  # remaining arguments are passed through to the Tabular model unchanged.
  # The model is stored in 'self.model'.
  # ----------------------------
  def create_model(self, cat_szs, n_cont, emb_p, hidden_szs, hidden_ps, out_sz=1, out_range=None):

    # Embedding size grows with the number of categories, capped at 600
    def emb_sz_rule(cat_sz:int)->int: return min(600, round(1.6 * cat_sz**0.56))

    n_cat = len(cat_szs)
    emb_szs = [(cat_sz, emb_sz_rule(cat_sz)) for cat_sz in cat_szs]

    # Build the Tabular model
    self.model = Tabular(n_cat, n_cont, emb_szs, emb_p, hidden_szs, hidden_ps, out_sz, out_range)

  # ----------------------------
  # Layers to attach hooks to, returned as a list containing a single group: all the
  # embedding layers, the BatchNorm for the continuous features, and then every Linear
  # and BatchNorm layer of the linear blocks, ending with the output layer.
  #
  # The layers are selected by type rather than by position, so this works for any
  # number of hidden blocks.
  # ----------------------------
  def hook_groups(self):
    model = self.model
    block_layers = [layer for layer in model.layers if isinstance(layer, (nn.Linear, nn.BatchNorm1d))]
    hk_groups = [list(model.cat_emb) + [model.cont_bn] + block_layers]
    return hk_groups

### Tabular Application

In [None]:
#export

#----------------------------------------------------
# Tabular Application
#----------------------------------------------------
class AppTabular(AppBase):
  # ----------------------------
  # Load the data using the Tabular Data Bundle class given as 'app_db'.
  #
  # 'steps' selects what gets done:
  #   'load' - create the data bundle from the file paths (and any extra kwargs) and
  #            run its 'load' processing step
  #   'post_load' - run the 'post_load' processing step on the existing data bundle,
  #            and save the column sizes which are needed to create the architecture
  # ----------------------------
  def load_data(self, app_db, main_file_path, test_file_path, related_csv_paths, steps=['load', 'post_load'], **kwargs):
    if 'load' in steps:
      bundle = app_db(main_file_path, test_file_path, related_csv_paths, **kwargs)
      self.db = bundle
      bundle.process(steps=['load'])

    if 'post_load' in steps:
      self.db.process(steps=['post_load'])
      col_info = self.db.col_szs()
      self.n_cont, self.cat_szs, self.n_tgt, self.tgt_range = col_info

  # ----------------------------
  # Create the Tabular architecture and its model, sized using the column information
  # saved by load_data(). Returns the architecture.
  # ----------------------------
  def create_arch(self, emb_p=0.04, hidden_szs=[1000, 500], hidden_ps=[.001, .01]):
    arch = ArchTabular()
    self._arch = arch
    arch.create_model(
        self.cat_szs, self.n_cont,
        emb_p=emb_p, hidden_szs=hidden_szs, hidden_ps=hidden_ps,
        out_sz=self.n_tgt, out_range=self.tgt_range)
    return arch

  # ----------------------------
  # Folder which holds a subset of the application data, ie.
  # <gd_path>/data/<app_dir>_<subset_sz>
  # ----------------------------
  @staticmethod
  def subset_path(app_dir, gd_path, subset_sz):
    return Path(gd_path)/'data'/f'{app_dir}_{subset_sz}'

  # ----------------------------
  # Work out all the paths used by an application.
  #
  # 'data_files' is a dict with the file names of the 'main' file, the 'test' file
  # and a list of 'related' files.
  #
  # Returns the tuple (root_path, download_path, data_path, debug_path, hist_path,
  # main_file_path, test_file_path, related_csv_paths)
  # ----------------------------
  @staticmethod
  def set_data_path (app_dir, gd_path, subset_sz, data_files):
    root_path = Path.cwd()
    download_path = root_path/app_dir

    subset_data_path = AppTabular.subset_path(app_dir, gd_path, subset_sz)
    debug_path, hist_path = subset_data_path/'debug', subset_data_path/'hist'

    # With the 'full' data the files are taken from the download path, and the subset
    # data path is used only for the debug and history data.
    data_path = download_path if (subset_sz == 'full') else subset_data_path

    main_file_path = data_path/data_files['main']
    test_file_path = data_path/data_files['test']
    related_csv_paths = [data_path/related_file for related_file in data_files['related']]

    return (root_path, download_path, data_path, debug_path, hist_path, main_file_path, test_file_path, related_csv_paths)

### Export

In [None]:
!wget https://raw.githubusercontent.com/ketanhdoshi/ml/master/lib/nb_export.py

In [None]:
# Convert this notebook (lib/app_lib.ipynb under gn_path) into a script in the
# 'exp' folder under gn_path, using the helper from nb_export.
from nb_export import notebook2scriptSingle
notebook2scriptSingle(gn_path + '/lib/app_lib.ipynb', gn_path + '/exp')

Converted gdrive/My Drive/Colab Notebooks/lib/app_lib.ipynb to gdrive/My Drive/Colab Notebooks/exp/nb_app.py


### Junk

In [None]:

  def load_weights(self, weights_path, map_location=None):
    """Load a saved state dict from disk into self.model.

    Args:
      weights_path: file previously written with torch.save() of a state dict
        (see save_weights).
      map_location: forwarded to torch.load(). The default of None keeps the
        previous behaviour; pass e.g. 'cpu' to load a checkpoint whose tensors
        were saved from a different device.
    """
    state_dict = torch.load(weights_path, map_location=map_location)
    self.model.load_state_dict(state_dict)

  def save_weights(self, weights_path):
    """Write self.model's state dict to weights_path using torch.save().

    Only the state dict is written, not the model object itself.
    """
    state_dict = self.model.state_dict()
    torch.save(state_dict, weights_path)

#----------------------------------------------------
# Tabular Application
#----------------------------------------------------
class AppTabular():
  """Tabular application: ties together a data bundle, an ArchTabular model,
  a loss function, metrics callbacks and optional debug callbacks, and runs
  training or a learning-rate range test over them.

  Typical order of use: load_data() -> create_arch() -> (optionally)
  create_debug() -> run_train() / lr_find().
  """

  def __init__(self, loss_type='bin_classif', metrics_cbs=None):
    """
    Args:
      loss_type: one of 'bin_classif', 'multi_classif' or 'regression';
        selects the loss function stored on self.loss_fn.
      metrics_cbs: optional list of additional metrics callbacks. A
        LossMetricsCB is always placed first, and a MetricsGrp wrapping all
        of them is appended.

    Raises:
      ValueError: if loss_type is not one of the supported values.
    """
    self._arch = None
    self.db = None

    # Debug state defaults, so that run_train() can be called without a prior
    # create_debug() call.
    self.dtr, self.hooks, self.debug_cbs = None, None, []

    # Select the appropriate loss function for the type of problem
    if (loss_type == 'bin_classif'):
      # Binary classification problems
      self.loss_fn = F.binary_cross_entropy_with_logits
    elif (loss_type == 'multi_classif'):
      # Multi-class classification problems
      self.loss_fn = F.cross_entropy
    elif (loss_type == 'regression'):
      self.loss_fn = mse
    else:
      raise ValueError(
        f"Unknown loss_type {loss_type!r}; expected 'bin_classif', "
        "'multi_classif' or 'regression'"
      )

    metrics_cbs = [LossMetricsCB()] + list(metrics_cbs or [])
    self.metrics_cbs = metrics_cbs + [MetricsGrp(metrics_cbs)]

  # ----------------------------
  # Load the data using the Tabular Data Bundle
  # ----------------------------
  def load_data(self, app_db, main_file_path, test_file_path, related_csv_paths, steps=['load', 'post_load'], **kwargs):
    """Create the data bundle and/or run its post-load processing.

    Args:
      app_db: callable that builds the data bundle from the three path
        arguments plus **kwargs.
      steps: phases to run. 'load' builds the bundle into self.db and runs
        its 'load' step; 'post_load' runs its 'post_load' step and caches the
        values returned by col_szs() on this object.
    """
    if ('load' in steps):
      self.db = app_db(main_file_path, test_file_path, related_csv_paths, **kwargs)
      self.db.process(steps=['load'])

    if ('post_load' in steps):
      self.db.process(steps=['post_load'])
      self.n_cont, self.cat_szs, self.n_tgt, self.tgt_range = self.db.col_szs()

  # ----------------------------
  # Create the architecture
  # ----------------------------
  def create_arch(self, emb_p=0.04, hidden_szs=[1000, 500], hidden_ps=[.001, .01]):
    """Build an ArchTabular model from the sizes cached by load_data().

    Returns:
      The ArchTabular instance, also stored on self._arch.
    """
    self._arch = ArchTabular()
    self._arch.create_model(self.cat_szs, self.n_cont,
                            emb_p=emb_p, hidden_szs=hidden_szs, hidden_ps=hidden_ps,
                            out_sz=self.n_tgt, out_range=self.tgt_range)
    return self._arch

  # ----------------------------
  # Create the debug settings
  # ----------------------------
  def create_debug(self, use_dtr=False, track_batches_per_epoch=5, disp_tb=False, disp_pd=True, debug_bkwd=False, debug_fwd=False, abort_iter=0):
    """Configure the debug tracker, activation hooks and debug callbacks.

    The results are stored on self.dtr, self.hooks and self.debug_cbs and are
    picked up by run_train().

    Args:
      use_dtr: create a DebugTracker and a DebugYhatLossCB.
      track_batches_per_epoch: passed to DebugTracker as max_count.
      disp_tb, disp_pd: passed to DebugTracker as the 'disp' pair.
      debug_bkwd: passed to DebugYhatLossCB as 'bkwd'.
      debug_fwd: add DebugActivationHook hooks on the architecture's hook
        groups. Only honoured when use_dtr is also True, and requires
        create_arch() to have been called.
      abort_iter: when > 0, add an AbortTrainCB(abort_iter) callback.
    """
    dtr, hooks, debug_cbs = None, None, []
    if (use_dtr):
      dtr = DebugTracker(max_count=track_batches_per_epoch, disp=(disp_tb, disp_pd))
      debug_cbs += [dtr, DebugYhatLossCB(fwd=False, bkwd=debug_bkwd)]

      # Add hooks for the forward pass activations
      if (debug_fwd):
        # Arch and Model should be created already
        assert(self._arch and self._arch.model)
        arch = self._arch
        model = arch.model

        # Add Debug Hooks to the hook_layers and save a list of all the hooks
        hook_cls=[[partial(DebugActivationHook, do_print=False, model=model, dtr=dtr)]]
        hook_groups = arch.hook_groups()
        hooks = Hooks(hook_groups, hook_cls)

    if (abort_iter > 0):
      # NOTE(review): AbortTrainCB does not appear in the imports visible at
      # the top of this notebook - confirm it is in scope before relying on
      # abort_iter.
      debug_cbs += [AbortTrainCB(abort_iter)]

    self.dtr, self.hooks, self.debug_cbs = dtr, hooks, debug_cbs

  # ----------------------------
  # Train the model
  # ----------------------------
  def run_train(self, num_epochs=1, split_lr=[1e-3], weight_decay=0.2, one_cycle=False):
    """Train the model with AdamW and return the Trainer.

    Args:
      num_epochs: number of epochs passed to Trainer.fit().
      split_lr: learning rate list passed to HyperParams.set().
      weight_decay: AdamW weight decay.
      one_cycle: when True, request the 'one_cycle' LR schedule from
        HyperParams.set(); otherwise no schedule is requested.

    Returns:
      The Trainer instance after fit() has completed.

    Raises:
      TypeError: if one_cycle is not a bool.
    """
    if not isinstance(one_cycle, bool):
      raise TypeError(f"one_cycle must be a bool, got {type(one_cycle).__name__}")

    train_dl = self.db.train_dl
    valid_dl = self.db.valid_dl

    # Loss function
    loss_func = self.loss_fn

    # Model
    arch = self._arch
    model = arch.model

    opt_adamw = partial(optim.AdamW, betas=(0.9, 0.99), weight_decay=weight_decay)
    lr_sched='one_cycle' if one_cycle else None
    opt, hyper_cbs = HyperParams.set(model, module_groups=None, split_lr=split_lr, split=False, lr_sched=lr_sched, opt_func=opt_adamw)

    gpu_cbs = [CudaCB(device = torch.device('cuda',0))]
    track_cbs = [Recorder(), ProgressCallback()]
    callbs = gpu_cbs + track_cbs
    callbs += self.metrics_cbs + hyper_cbs + self.debug_cbs

    loop = Trainer(train_dl, valid_dl, model, opt, loss_func, callbs, dtr=self.dtr)
    loop.hooks = self.hooks

    loop.fit(num_epochs=num_epochs)
    return loop

  # ----------------------------
  # Learning Rate Finder
  # ----------------------------
  def lr_find(self, start_lr, end_lr, num_iter):
    """Run a learning-rate range test on the training data.

    No validation data, metrics callbacks or debug callbacks are used.

    Args:
      start_lr, end_lr, num_iter: forwarded to HyperParams.set() together
        with lr_sched='lrf'; num_iter is also given to LRRangeFind.

    Returns:
      The Trainer instance after fit() has completed.
    """
    train_dl = self.db.train_dl
    valid_dl = None

    # Loss function
    loss_func = self.loss_fn

    # Model
    arch = self._arch
    model = arch.model

    opt_adamw = partial(optim.AdamW, betas=(0.9, 0.99), weight_decay=0.01)
    opt, hyper_cbs = HyperParams.set(model, module_groups=None, split_lr=[start_lr], split=False, lr_sched='lrf', opt_func=opt_adamw, start_lr=start_lr, end_lr=end_lr, num_iter=num_iter)

    gpu_cbs = [CudaCB(device = torch.device('cuda',0))]
    track_cbs = [Recorder(), ProgressCallback()]
    lrf_cbs = [LRRangeFind(num_iter)]
    callbs = gpu_cbs + track_cbs
    callbs += lrf_cbs + hyper_cbs

    loop = Trainer(train_dl, valid_dl, model, opt, loss_func, callbs, dtr=None)

    # Enough whole epochs to cover num_iter batches
    num_epochs = int(math.ceil(num_iter / len(train_dl)))
    loop.fit(num_epochs=num_epochs)
    return loop

  @staticmethod
  def subset_path(app_dir, gd_path, subset_sz):
    """Return <gd_path>/data/<app_dir>_<subset_sz> as a Path (not created)."""
    subset_root_path = Path(gd_path)/'data'
    subset_data_path = subset_root_path/f'{app_dir}_{subset_sz}'
    return subset_data_path

  @staticmethod
  def set_data_path (app_dir, gd_path, subset_sz, data_files):
    """Compute all the paths the application needs for a given data subset.

    Args:
      app_dir: application folder name, joined to the current working
        directory to form the download path.
      gd_path: root under which the per-subset folders live.
      subset_sz: subset identifier; 'full' reads data from the download path.
      data_files: mapping with keys 'main', 'test' (file names) and 'related'
        (iterable of file names).

    Returns:
      Tuple of (root_path, download_path, data_path, debug_path, hist_path,
      main_file_path, test_file_path, related_csv_paths).
    """
    root_path = Path.cwd()
    download_path = root_path/app_dir

    subset_data_path = AppTabular.subset_path(app_dir, gd_path, subset_sz)
    debug_path = subset_data_path/'debug'
    hist_path = subset_data_path/'hist'

    if (subset_sz == 'full'):
      # In this case data is taken from the download path, and subset data path is
      # used only for debug data.
      data_path = download_path
    else:
      data_path = subset_data_path

    main_file_path = data_path/data_files['main']
    test_file_path = data_path/data_files['test']
    related_csv_paths = [data_path/file for file in data_files['related']]

    return (root_path, download_path, data_path, debug_path, hist_path, main_file_path, test_file_path, related_csv_paths)