In [1]:
!ls

'Lesson 1-Stripped [Code only].ipynb'  'Lesson 4-Stripped [Code only].ipynb'
'Lesson 2-Stripped[Code only].ipynb'   'Lesson 5-Stripped [Code Only].ipynb'
'Lesson 3-Stripped [Code only].ipynb'


In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
#from fastai.nlp import *
import os
import re
import torch
import tqdm as tq
import collections
import torch.nn as nn
import numpy as np
import torch.optim as optim
import torch.nn.functional as F
import collections.abc as collections_abc

from distutils.version import LooseVersion
from torch.autograd import Variable
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tnrange, tqdm

from collections.abc import Iterable, Iterator
from timeit import default_timer as timer
from abc import abstractmethod
from glob import glob
from torch.utils.data import Dataset
from torch.utils.data.sampler import SequentialSampler, RandomSampler, BatchSampler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
string_classes = (str, bytes)


def get_tensor(batch, pin, half=False):
    if isinstance(batch, (np.ndarray, np.generic)):
        batch = T(batch, half=half, cuda=False).contiguous()
        if pin: batch = batch.pin_memory()
        return to_gpu(batch)
    elif isinstance(batch, string_classes):
        return batch
    elif isinstance(batch, collections_abc.Mapping):
        return {k: get_tensor(sample, pin, half) for k, sample in batch.items()}
    elif isinstance(batch, collections_abc.Sequence):
        return [get_tensor(sample, pin, half) for sample in batch]
    raise TypeError(f"batch must contain numbers, dicts or lists; found {type(batch)}")


class DataLoader(object):
    def __init__(self, dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None, pad_idx=0,
                 num_workers=None, pin_memory=False, drop_last=False, pre_pad=True, half=False,
                 transpose=False, transpose_y=False):
        self.dataset,self.batch_size,self.num_workers = dataset,batch_size,num_workers
        self.pin_memory,self.drop_last,self.pre_pad = pin_memory,drop_last,pre_pad
        self.transpose,self.transpose_y,self.pad_idx,self.half = transpose,transpose_y,pad_idx,half

        if batch_sampler is not None:
            if batch_size > 1 or shuffle or sampler is not None or drop_last:
                raise ValueError('batch_sampler is mutually exclusive with '
                                 'batch_size, shuffle, sampler, and drop_last')

        if sampler is not None and shuffle:
            raise ValueError('sampler is mutually exclusive with shuffle')

        if batch_sampler is None:
            if sampler is None:
                sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)
            batch_sampler = BatchSampler(sampler, batch_size, drop_last)

        if num_workers is None:
            self.num_workers = num_cpus()

        self.sampler = sampler
        self.batch_sampler = batch_sampler

    def __len__(self): return len(self.batch_sampler)

    def jag_stack(self, b):
        if len(b[0].shape) not in (1,2): return np.stack(b)
        ml = max(len(o) for o in b)
        if min(len(o) for o in b)==ml: return np.stack(b)
        res = np.zeros((len(b), ml), dtype=b[0].dtype) + self.pad_idx
        for i,o in enumerate(b):
            if self.pre_pad: res[i, -len(o):] = o
            else:            res[i,  :len(o)] = o
        return res

    def np_collate(self, batch):
        b = batch[0]
        if isinstance(b, (np.ndarray, np.generic)): return self.jag_stack(batch)
        elif isinstance(b, (int, float)): return np.array(batch)
        elif isinstance(b, string_classes): return batch
        elif isinstance(b, collections_abc.Mapping):
            return {key: self.np_collate([d[key] for d in batch]) for key in b}
        elif isinstance(b, collections_abc.Sequence):
            return [self.np_collate(samples) for samples in zip(*batch)]
        raise TypeError(("batch must contain numbers, dicts or lists; found {}".format(type(b))))

    def get_batch(self, indices):
        res = self.np_collate([self.dataset[i] for i in indices])
        if self.transpose:   res[0] = res[0].T
        if self.transpose_y: res[1] = res[1].T
        return res

    def __iter__(self):
        if self.num_workers==0:
            for batch in map(self.get_batch, iter(self.batch_sampler)):
                yield get_tensor(batch, self.pin_memory, self.half)
        else:
            with ThreadPoolExecutor(max_workers=self.num_workers) as e:
                # avoid py3.6 issue where queue is infinite and can result in memory exhaustion
                for c in chunk_iter(iter(self.batch_sampler), self.num_workers*10):
                    for batch in e.map(self.get_batch, c):
                        yield get_tensor(batch, self.pin_memory, self.half)



In [5]:
class ModelData():
    """Encapsulates DataLoaders and Datasets for training, validation, test."""
    def __init__(self, path, trn_dl, val_dl, test_dl=None):
        self.path,self.trn_dl,self.val_dl,self.test_dl = path,trn_dl,val_dl,test_dl

    @classmethod
    def from_dls(cls, path,trn_dl,val_dl,test_dl=None):
        #trn_dl,val_dl = DataLoader(trn_dl),DataLoader(val_dl)
        #if test_dl: test_dl = DataLoader(test_dl)
        return cls(path, trn_dl, val_dl, test_dl)

    @property
    def is_reg(self): return self.trn_ds.is_reg
    @property
    def is_multi(self): return self.trn_ds.is_multi
    @property
    def trn_ds(self): return self.trn_dl.dataset
    @property
    def val_ds(self): return self.val_dl.dataset
    @property
    def test_ds(self): return self.test_dl.dataset
    @property
    def trn_y(self): return self.trn_ds.y
    @property
    def val_y(self): return self.val_ds.y

In [6]:
def set_train_mode(m):
    if (hasattr(m, 'running_mean') and (getattr(m,'bn_freeze',False)
              or not getattr(m,'trainable',False))): m.eval()
    elif (getattr(m,'drop_freeze',False) and hasattr(m, 'p')
          and ('drop' in type(m).__name__.lower())): m.eval()
    else: m.train()
    
def batch_sz(x, seq_first=False):
    if is_listy(x): x = x[0]
    return x.shape[1 if seq_first else 0]

In [7]:
class Stepper():
    def __init__(self, m, opt, crit, clip=0, reg_fn=None, fp16=False, loss_scale=1):
        self.m,self.opt,self.crit,self.clip,self.reg_fn = m,opt,crit,clip,reg_fn
        self.fp16 = fp16
        self.reset(True)
        if self.fp16: self.fp32_params = copy_model_to_fp32(m, opt)
        self.loss_scale = loss_scale

    def reset(self, train=True):
        if train: apply_leaf(self.m, set_train_mode)
        else: self.m.eval()
        if hasattr(self.m, 'reset'):
            self.m.reset()
            if self.fp16: self.fp32_params = copy_model_to_fp32(self.m, self.opt)

    def step(self, xs, y, epoch):
        xtra = []
        output = self.m(*xs)
        if isinstance(output,tuple): output,*xtra = output
        if self.fp16: self.m.zero_grad()
        else: self.opt.zero_grad() 
        loss = raw_loss = self.crit(output, y)
        if self.loss_scale != 1: assert(self.fp16); loss = loss*self.loss_scale
        if self.reg_fn: loss = self.reg_fn(output, xtra, raw_loss)
        loss.backward()
        if self.fp16: update_fp32_grads(self.fp32_params, self.m)
        if self.loss_scale != 1:
            for param in self.fp32_params: param.grad.data.div_(self.loss_scale)
        if self.clip:   # Gradient clipping
            if IS_TORCH_04: nn.utils.clip_grad_norm_(trainable_params_(self.m), self.clip)
            else:           nn.utils.clip_grad_norm(trainable_params_(self.m), self.clip)
        if 'wd' in self.opt.param_groups[0] and self.opt.param_groups[0]['wd'] != 0: 
            #Weight decay out of the loss. After the gradient computation but before the step.
            for group in self.opt.param_groups:
                lr, wd = group['lr'], group['wd']
                for p in group['params']:
                    if p.grad is not None: p.data = p.data.add(-wd * lr, p.data)
        self.opt.step()
        if self.fp16: 
            copy_fp32_to_model(self.m, self.fp32_params)
            torch.cuda.synchronize()
        return torch_item(raw_loss.data)
    
    def evaluate(self, xs, y):
        preds = self.m(*xs)
        if isinstance(preds,tuple): preds=preds[0]
        return preds, self.crit(preds, y)

In [8]:
def validate(stepper, dl, metrics, epoch, seq_first=False, validate_skip = 0):
    if epoch < validate_skip: return [float('nan')] + [float('nan')] * len(metrics)
    batch_cnts,loss,res = [],[],[]
    stepper.reset(False)
    with torch.no_grad():
        t = tqdm(iter(dl), leave=False, total=len(dl), miniters=0, desc='Validation')
        for (*x,y) in t:
            y = VV(y)
            preds, l = stepper.evaluate(VV(x), y)
            batch_cnts.append(batch_sz(x, seq_first=seq_first))
            loss.append(to_np(l))
            res.append([to_np(f(datafy(preds), datafy(y))) for f in metrics])
    return [np.average(loss, 0, weights=batch_cnts)] + list(np.average(np.stack(res), 0, weights=batch_cnts))

def append_stats(ep_vals, epoch, values, decimals=6):
    ep_vals[epoch]=list(np.round(values, decimals))
    return ep_vals

def print_stats(epoch, values, visualize, prev_val=[], decimals=6):
    layout = "{!s:^10}" + " {!s:10}" * len(values)
    values = [epoch] + list(np.round(values, decimals))
    sym = ""
    if visualize:
        if epoch == 0:                                             pass        
        elif values[1] > prev_val[0] and values[2] > prev_val[1]:  sym = " △ △"
        elif values[1] > prev_val[0] and values[2] < prev_val[1]:  sym = " △ ▼"            
        elif values[1] < prev_val[0] and values[2] > prev_val[1]:  sym = " ▼ △"            
        elif values[1] < prev_val[0] and values[2] < prev_val[1]:  sym = " ▼ ▼"
    print(layout.format(*values) + sym)

In [9]:
def fit(model, data, n_epochs, opt, crit, metrics=None, callbacks=None, stepper=Stepper,
        swa_model=None, swa_start=None, swa_eval_freq=None, visualize=False, **kwargs):
    """ Fits a model

    Arguments:
       model (model): any pytorch module
           net = to_gpu(net)
       data (ModelData): see ModelData class and subclasses (can be a list)
       opts: an optimizer. Example: optim.Adam. 
       If n_epochs is a list, it needs to be the layer_optimizer to get the optimizer as it changes.
       n_epochs(int or list): number of epochs (or list of number of epochs)
       crit: loss function to optimize. Example: F.cross_entropy
    """

    seq_first = kwargs.pop('seq_first', False)
    all_val = kwargs.pop('all_val', False)
    get_ep_vals = kwargs.pop('get_ep_vals', False)
    validate_skip = kwargs.pop('validate_skip', 0)
    metrics = metrics or []
    callbacks = callbacks or []
    avg_mom=0.98
    batch_num,avg_loss=0,0.
    for cb in callbacks: cb.on_train_begin()
    names = ["epoch", "trn_loss", "val_loss"] + [f.__name__ for f in metrics]
    if swa_model is not None:
        swa_names = ['swa_loss'] + [f'swa_{f.__name__}' for f in metrics]
        names += swa_names
        # will use this to call evaluate later
        swa_stepper = stepper(swa_model, None, crit, **kwargs)

    layout = "{!s:10} " * len(names)
    if not isinstance(n_epochs, Iterable): n_epochs=[n_epochs]
    if not isinstance(data, Iterable): data = [data]
    if len(data) == 1: data = data * len(n_epochs)
    for cb in callbacks: cb.on_phase_begin()
    model_stepper = stepper(model, opt.opt if hasattr(opt,'opt') else opt, crit, **kwargs)
    ep_vals = collections.OrderedDict()
    tot_epochs = int(np.ceil(np.array(n_epochs).sum()))
    cnt_phases = np.array([ep * len(dat.trn_dl) for (ep,dat) in zip(n_epochs,data)]).cumsum()
    phase = 0
    for epoch in tnrange(tot_epochs, desc='Epoch'):
        if phase >= len(n_epochs): break #Sometimes cumulated errors make this append.
        model_stepper.reset(True)
        cur_data = data[phase]
        if hasattr(cur_data, 'trn_sampler'): cur_data.trn_sampler.set_epoch(epoch)
        if hasattr(cur_data, 'val_sampler'): cur_data.val_sampler.set_epoch(epoch)
        num_batch = len(cur_data.trn_dl)
        t = tqdm(iter(cur_data.trn_dl), leave=False, total=num_batch, miniters=0)
        if all_val: val_iter = IterBatch(cur_data.val_dl)

        for (*x,y) in t:
            batch_num += 1
            for cb in callbacks: cb.on_batch_begin()
            loss = model_stepper.step(V(x),V(y), epoch)
            avg_loss = avg_loss * avg_mom + loss * (1-avg_mom)
            debias_loss = avg_loss / (1 - avg_mom**batch_num)
            t.set_postfix(loss=debias_loss, refresh=False)
            stop=False
            los = debias_loss if not all_val else [debias_loss] + validate_next(model_stepper,metrics, val_iter)
            for cb in callbacks: stop = stop or cb.on_batch_end(los)
            if stop: return
            if batch_num >= cnt_phases[phase]:
                for cb in callbacks: cb.on_phase_end()
                phase += 1
                if phase >= len(n_epochs):
                    t.close()
                    break
                for cb in callbacks: cb.on_phase_begin()
                if isinstance(opt, LayerOptimizer): model_stepper.opt = opt.opt
                if cur_data != data[phase]:
                    t.close()
                    break

        if not all_val:
            vals = validate(model_stepper, cur_data.val_dl, metrics, epoch, seq_first=seq_first, validate_skip = validate_skip)
            stop=False
            for cb in callbacks: stop = stop or cb.on_epoch_end(vals)
            if swa_model is not None:
                if (epoch + 1) >= swa_start and ((epoch + 1 - swa_start) % swa_eval_freq == 0 or epoch == tot_epochs - 1):
                    fix_batchnorm(swa_model, cur_data.trn_dl)
                    swa_vals = validate(swa_stepper, cur_data.val_dl, metrics, epoch, validate_skip = validate_skip)
                    vals += swa_vals

            if epoch > 0: 
                print_stats(epoch, [debias_loss] + vals, visualize, prev_val)
            else:
                print(layout.format(*names))
                print_stats(epoch, [debias_loss] + vals, visualize)
            prev_val = [debias_loss] + vals
            ep_vals = append_stats(ep_vals, epoch, [debias_loss] + vals)
        if stop: break
    for cb in callbacks: cb.on_train_end()
    if get_ep_vals: return vals, ep_vals
    else: return vals

def torch_item(x): return x.item() if hasattr(x,'item') else x[0]

In [10]:
def accuracy_multi(preds, targs, thresh):
    return ((preds>thresh).float()==targs).float().mean()

def sum_geom(a,r,n): return a*n if r==1 else math.ceil(a*(1-r**n)/(1-r))

def one_hot(a,c): return np.eye(c)[a]

def calc_r(y_i, x, y):
    return np.log(calc_pr(y_i, x, y, True) / calc_pr(y_i, x, y, False))

def calc_pr(y_i, x, y, b):
    idx = np.argwhere((y==y_i)==b)
    ct = x[idx[:,0]].sum(0)+1
    tot = ((y==y_i)==b).sum()+1
    return ct/tot
def num_cpus():
    try:
        return len(os.sched_getaffinity(0))
    except AttributeError:
        return os.cpu_count()

IS_TORCH_04 = LooseVersion(torch.__version__) >= LooseVersion('0.4')
USE_GPU = torch.cuda.is_available()
def to_gpu(x, *args, **kwargs):
    '''puts pytorch variable to gpu, if cuda is available and USE_GPU is set to true. '''
    return x.cuda(*args, **kwargs) if USE_GPU else x

def T(a, half=False, cuda=True):
    """
    Convert numpy array into a pytorch tensor. 
    """
    if not torch.is_tensor(a):
        a = np.array(np.ascontiguousarray(a))
        if a.dtype in (np.int8, np.int16, np.int32, np.int64):
            a = torch.LongTensor(a.astype(np.int64))
        elif a.dtype in (np.float32, np.float64):
            a = to_half(a) if half else torch.FloatTensor(a)
        else: raise NotImplementedError(a.dtype)
    if cuda: a = to_gpu(a)
    return a

def create_variable(x, volatile, requires_grad=False):
    if type (x) != Variable:
        if IS_TORCH_04: x = Variable(T(x), requires_grad=requires_grad)
        else:           x = Variable(T(x), requires_grad=requires_grad, volatile=volatile)
    return x

def V_(x, requires_grad=False, volatile=False):
    '''equivalent to create_variable, which creates a pytorch tensor'''
    return create_variable(x, volatile=volatile, requires_grad=requires_grad)

def V(x, requires_grad=False, volatile=False):
    '''creates a single or a list of pytorch tensors, depending on input x. '''
    return map_over(x, lambda o: V_(o, requires_grad, volatile))

def VV_(x): 
    '''creates a volatile tensor, which does not require gradients. '''
    return create_variable(x, True)

def VV(x):
    '''creates a single or a list of pytorch tensors, depending on input x. '''
    return map_over(x, VV_)

def to_np(v):
    '''returns an np.array object given an input of np.array, list, tuple, torch variable or tensor.'''
    if isinstance(v, float): return np.array(v)
    if isinstance(v, (np.ndarray, np.generic)): return v
    if isinstance(v, (list,tuple)): return [to_np(o) for o in v]
    if isinstance(v, Variable): v=v.data
    if torch.cuda.is_available():
        if is_half_tensor(v): v=v.float()
    if isinstance(v, torch.FloatTensor): v=v.float()
    return v.cpu().numpy()

def is_half_tensor(v):
    return isinstance(v, torch.cuda.HalfTensor)

def accuracy_thresh(thresh):
    return lambda preds,targs: accuracy_multi(preds, targs, thresh)

def children(m): return m if isinstance(m, (list, tuple)) else list(m.children())

def is_listy(x): return isinstance(x, (list,tuple))

def map_over(x, f): return [f(o) for o in x] if is_listy(x) else f(x)

def is_iter(x): return isinstance(x, collections_abc.Iterable)

def listify(x, y):
    if not is_iter(x): x=[x]
    n = y if type(y)==int else len(y)
    if len(x)==1: x = x * n
    return x

def delistify(x): return x[0] if is_listy(x) else x

def datafy(x):
    if is_listy(x): return [o.data for o in x]
    else:           return x.data

def trainable_params_(m):
    '''Returns a list of trainable parameters in the model'''
    return [p for p in m.parameters() if p.requires_grad]

def chain_params(p):
    if is_listy(p):
        return list(chain(*[trainable_params_(o) for o in p]))
    return trainable_params_(p)

def apply_leaf(m, f):
    c = children(m)
    if isinstance(m, nn.Module): f(m)
    if len(c)>0:
        for l in c: apply_leaf(l,f)
        
def chunk_iter(iterable, chunk_size):
    '''A generator that yields chunks of iterable, chunk_size at a time. '''
    while True:
        chunk = []
        try:
            for _ in range(chunk_size): chunk.append(next(iterable))
            yield chunk
        except StopIteration:
            if chunk: yield chunk
            break

In [11]:
class BOW_Dataset(Dataset):
    def __init__(self, bow, y, max_len):
        self.bow,self.max_len = bow,max_len
        self.c = int(y.max())+1
        self.n,self.vocab_size = bow.shape
        self.y = one_hot(y,self.c).astype(np.float32)
        x = self.bow.sign()
        self.r = np.stack([calc_r(i, x, y).A1 for i in range(self.c)]).T

    def __getitem__(self, i):
        row = self.bow.getrow(i)

        num_row_entries = row.indices.shape[0]
        indices = (row.indices + 1).astype(np.int64)
        data = (row.data).astype(np.int64)

        if num_row_entries < self.max_len:
            # If short, pad
            indices = np.pad(indices, (self.max_len - num_row_entries, 0), mode='constant')
            data = np.pad(data, (self.max_len - num_row_entries, 0), mode='constant')
        else:
            # If long, truncate
            indices, data = indices[-self.max_len:], data[-self.max_len:]

        return indices, data, min(self.max_len, num_row_entries), self.y[i]

    def __len__(self): return len(self.bow.indptr)-1


class TextClassifierData(ModelData):
    @property
    def c(self): return self.trn_ds.c

    @property
    def r(self):
        return torch.Tensor(np.concatenate([np.zeros((1,self.c)), self.trn_ds.r]))

    def get_model(self, f, **kwargs):
        m = to_gpu(f(self.trn_ds.vocab_size, self.c, **kwargs))
        m.r.weight.data = to_gpu(self.r)
        m.r.weight.requires_grad = False
        model = BasicModel(m)
        return BOW_Learner(self, model, metrics=[accuracy_thresh(0.5)], opt_fn=optim.Adam)

    def dotprod_nb_learner(self, **kwargs): return self.get_model(DotProdNB, **kwargs)
    def nb_learner(self, **kwargs): return self.get_model(SimpleNB, **kwargs)

    @classmethod
    def from_bow(cls, trn_bow, trn_y, val_bow, val_y, sl):
        trn_ds = BOW_Dataset(trn_bow, trn_y, sl)
        val_ds = BOW_Dataset(val_bow, val_y, sl)
        trn_dl = DataLoader(trn_ds, 64, True)
        val_dl = DataLoader(val_ds, 64, False)
        return cls('.', trn_dl, val_dl)

In [12]:
class Learner():
    def __init__(self, data, models, opt_fn=None, tmp_name='tmp', models_name='models', metrics=None, clip=None, crit=None):
        """
        Combines a ModelData object with a nn.Module object, such that you can train that
        module.
        data (ModelData): An instance of ModelData.
        models(module): chosen neural architecture for solving a supported problem.
        opt_fn(function): optimizer function, uses SGD with Momentum of .9 if none.
        tmp_name(str): output name of the directory containing temporary files from training process
        models_name(str): output name of the directory containing the trained model
        metrics(list): array of functions for evaluating a desired metric. Eg. accuracy.
        clip(float): gradient clip chosen to limit the change in the gradient to prevent exploding gradients Eg. .3
        """
        self.data_,self.models,self.metrics,self.clip = data,models,metrics,clip
        self.sched=None
        self.wd_sched = None
        self.opt_fn = opt_fn or SGD_Momentum(0.9)
        self.tmp_path = tmp_name if os.path.isabs(tmp_name) else os.path.join(self.data.path, tmp_name)
        self.models_path = models_name if os.path.isabs(models_name) else os.path.join(self.data.path, models_name)
        os.makedirs(self.tmp_path, exist_ok=True)
        os.makedirs(self.models_path, exist_ok=True)
        self.crit = crit if crit else self._get_crit(data)
        self.reg_fn = None
        self.fp16 = False

    @classmethod
    def from_model_data(cls, m, data, **kwargs):
        self = cls(data, BasicModel(to_gpu(m)), **kwargs)
        self.unfreeze()
        return self

    def __getitem__(self,i): return self.children[i]

    @property
    def children(self): return children(self.model)

    @property
    def model(self): return self.models.model

    @property
    def data(self): return self.data_

    def summary(self): return model_summary(self.model, [torch.rand(3, 3, self.data.sz,self.data.sz)])

    def __repr__(self): return self.model.__repr__()
    
    def lsuv_init(self, needed_std=1.0, std_tol=0.1, max_attempts=10, do_orthonorm=False):         
        x = V(next(iter(self.data.trn_dl))[0])
        self.models.model=apply_lsuv_init(self.model, x, needed_std=needed_std, std_tol=std_tol,
                            max_attempts=max_attempts, do_orthonorm=do_orthonorm, 
                            cuda=USE_GPU and torch.cuda.is_available())

    def set_bn_freeze(self, m, do_freeze):
        if hasattr(m, 'running_mean'): m.bn_freeze = do_freeze

    def bn_freeze(self, do_freeze):
        apply_leaf(self.model, lambda m: self.set_bn_freeze(m, do_freeze))

    def freeze_to(self, n):
        c=self.get_layer_groups()
        for l in c:     set_trainable(l, False)
        for l in c[n:]: set_trainable(l, True)

    def freeze_all_but(self, n):
        c=self.get_layer_groups()
        for l in c: set_trainable(l, False)
        set_trainable(c[n], True)
        
    def freeze_groups(self, groups):
        c = self.get_layer_groups()
        self.unfreeze()
        for g in groups:
            set_trainable(c[g], False)
            
    def unfreeze_groups(self, groups):
        c = self.get_layer_groups()
        for g in groups:
            set_trainable(c[g], True)

    def unfreeze(self): self.freeze_to(0)

    def get_model_path(self, name): return os.path.join(self.models_path,name)+'.h5'
    
    def save(self, name): 
        save_model(self.model, self.get_model_path(name))
        if hasattr(self, 'swa_model'): save_model(self.swa_model, self.get_model_path(name)[:-3]+'-swa.h5')
                       
    def load(self, name): 
        load_model(self.model, self.get_model_path(name))
        if hasattr(self, 'swa_model'): load_model(self.swa_model, self.get_model_path(name)[:-3]+'-swa.h5')

    def set_data(self, data): self.data_ = data

    def get_cycle_end(self, name):
        if name is None: return None
        return lambda sched, cycle: self.save_cycle(name, cycle)

    def save_cycle(self, name, cycle): self.save(f'{name}_cyc_{cycle}')
    def load_cycle(self, name, cycle): self.load(f'{name}_cyc_{cycle}')

    def half(self):
        if self.fp16: return
        self.fp16 = True
        if type(self.model) != FP16: self.models.model = FP16(self.model)
    def float(self):
        if not self.fp16: return
        self.fp16 = False
        if type(self.model) == FP16: self.models.model = self.model.module
        self.model.float()

    def fit_gen(self, model, data, layer_opt, n_cycle, cycle_len=None, cycle_mult=1, cycle_save_name=None, best_save_name=None,
                use_clr=None, use_clr_beta=None, metrics=None, callbacks=None, use_wd_sched=False, norm_wds=False,             
                wds_sched_mult=None, use_swa=False, swa_start=1, swa_eval_freq=5, **kwargs):

        if cycle_save_name:
            assert use_clr or use_clr_beta or cycle_len, "cycle_save_name argument requires either of the following arguments use_clr, use_clr_beta, cycle_len"

        if callbacks is None: callbacks=[]
        if metrics is None: metrics=self.metrics

        if use_wd_sched:
            # This needs to come before CosAnneal() because we need to read the initial learning rate from
            # layer_opt.lrs - but CosAnneal() alters the layer_opt.lrs value initially (divides by 100)
            if np.sum(layer_opt.wds) == 0:
                print('fit() warning: use_wd_sched is set to True, but weight decay(s) passed are 0. Use wds to '
                      'pass weight decay values.')
            batch_per_epoch = len(data.trn_dl)
            cl = cycle_len if cycle_len else 1
            self.wd_sched = WeightDecaySchedule(layer_opt, batch_per_epoch, cl, cycle_mult, n_cycle,
                                                norm_wds, wds_sched_mult)
            callbacks += [self.wd_sched]

        if use_clr is not None:
            clr_div,cut_div = use_clr[:2]
            moms = use_clr[2:] if len(use_clr) > 2 else None
            cycle_end = self.get_cycle_end(cycle_save_name)
            assert cycle_len, "use_clr requires cycle_len arg"
            self.sched = CircularLR(layer_opt, len(data.trn_dl)*cycle_len, on_cycle_end=cycle_end, div=clr_div, cut_div=cut_div,
                                    momentums=moms)
        elif use_clr_beta is not None:
            div,pct = use_clr_beta[:2]
            moms = use_clr_beta[2:] if len(use_clr_beta) > 3 else None
            cycle_end = self.get_cycle_end(cycle_save_name)
            assert cycle_len, "use_clr_beta requires cycle_len arg"
            self.sched = CircularLR_beta(layer_opt, len(data.trn_dl)*cycle_len, on_cycle_end=cycle_end, div=div,
                                    pct=pct, momentums=moms)
        elif cycle_len:
            cycle_end = self.get_cycle_end(cycle_save_name)
            cycle_batches = len(data.trn_dl)*cycle_len
            self.sched = CosAnneal(layer_opt, cycle_batches, on_cycle_end=cycle_end, cycle_mult=cycle_mult)
        elif not self.sched: self.sched=LossRecorder(layer_opt)
        callbacks+=[self.sched]

        if best_save_name is not None:
            callbacks+=[SaveBestModel(self, layer_opt, metrics, best_save_name)]

        if use_swa:
            # make a copy of the model to track average weights
            self.swa_model = copy.deepcopy(model)
            callbacks+=[SWA(model, self.swa_model, swa_start)]

        n_epoch = int(sum_geom(cycle_len if cycle_len else 1, cycle_mult, n_cycle))
        return fit(model, data, n_epoch, layer_opt.opt, self.crit,
            metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, fp16=self.fp16,
            swa_model=self.swa_model if use_swa else None, swa_start=swa_start, 
            swa_eval_freq=swa_eval_freq, **kwargs)

    def get_layer_groups(self): return self.models.get_layer_groups()

    def get_layer_opt(self, lrs, wds):
        return LayerOptimizer(self.opt_fn, self.get_layer_groups(), lrs, wds)

    def fit(self, lrs, n_cycle, wds=None, **kwargs):
        self.sched = None
        layer_opt = self.get_layer_opt(lrs, wds)
        return self.fit_gen(self.model, self.data, layer_opt, n_cycle, **kwargs)

    def warm_up(self, lr, wds=None):
        layer_opt = self.get_layer_opt(lr/4, wds)
        self.sched = LR_Finder(layer_opt, len(self.data.trn_dl), lr, linear=True)
        return self.fit_gen(self.model, self.data, layer_opt, 1)

    def lr_find(self, start_lr=1e-5, end_lr=10, wds=None, linear=False, **kwargs):
        self.save('tmp')
        layer_opt = self.get_layer_opt(start_lr, wds)
        self.sched = LR_Finder(layer_opt, len(self.data.trn_dl), end_lr, linear=linear)
        self.fit_gen(self.model, self.data, layer_opt, 1, **kwargs)
        self.load('tmp')

    def lr_find2(self, start_lr=1e-5, end_lr=10, num_it = 100, wds=None, linear=False, stop_dv=True, **kwargs):
        self.save('tmp')
        layer_opt = self.get_layer_opt(start_lr, wds)
        self.sched = LR_Finder2(layer_opt, num_it, end_lr, linear=linear, metrics=self.metrics, stop_dv=stop_dv)
        self.fit_gen(self.model, self.data, layer_opt, num_it//len(self.data.trn_dl) + 1, all_val=True, **kwargs)
        self.load('tmp')

    def predict(self, is_test=False, use_swa=False):
        dl = self.data.test_dl if is_test else self.data.val_dl
        m = self.swa_model if use_swa else self.model
        return predict(m, dl)

    def predict_with_targs(self, is_test=False, use_swa=False):
        dl = self.data.test_dl if is_test else self.data.val_dl
        m = self.swa_model if use_swa else self.model
        return predict_with_targs(m, dl)

    def predict_dl(self, dl): return predict_with_targs(self.model, dl)[0]

    def predict_array(self, arr):
        """
        Args:
            arr: a numpy array to be used as input to the model for prediction purposes
        Returns:
            a numpy array containing the predictions from the model
        """
        if not isinstance(arr, np.ndarray): raise OSError(f'Not valid numpy array')
        self.model.eval()
        return to_np(self.model(to_gpu(V(T(arr)))))

    def TTA(self, n_aug=4, is_test=False):
        """ Predict with Test Time Augmentation (TTA)

        Additional to the original test/validation images, apply image augmentation to them
        (just like for training images) and calculate the mean of predictions. The intent
        is to increase the accuracy of predictions by examining the images using multiple
        perspectives.


            n_aug: a number of augmentation images to use per original image
            is_test: indicate to use test images; otherwise use validation images

        Returns:
            (tuple): a tuple containing:

                log predictions (numpy.ndarray): log predictions (i.e. `np.exp(log_preds)` will return probabilities)
                targs (numpy.ndarray): target values when `is_test==False`; zeros otherwise.
        """
        dl1 = self.data.test_dl     if is_test else self.data.val_dl
        dl2 = self.data.test_aug_dl if is_test else self.data.aug_dl
        preds1,targs = predict_with_targs(self.model, dl1)
        preds1 = [preds1]*math.ceil(n_aug/4)
        preds2 = [predict_with_targs(self.model, dl2)[0] for i in tqdm(range(n_aug), leave=False)]
        return np.stack(preds1+preds2), targs

    def fit_opt_sched(self, phases, cycle_save_name=None, best_save_name=None, stop_div=False, data_list=None, callbacks=None, 
                      cut = None, use_swa=False, swa_start=1, swa_eval_freq=5, **kwargs):
        if data_list is None: data_list=[]
        if callbacks is None: callbacks=[]
        layer_opt = LayerOptimizer(phases[0].opt_fn, self.get_layer_groups(), 1e-2, phases[0].wds)
        if len(data_list) == 0: nb_batches = [len(self.data.trn_dl)] * len(phases)
        else: nb_batches = [len(data.trn_dl) for data in data_list] 
        self.sched = OptimScheduler(layer_opt, phases, nb_batches, stop_div)
        callbacks.append(self.sched)
        metrics = self.metrics
        if best_save_name is not None:
            callbacks+=[SaveBestModel(self, layer_opt, metrics, best_save_name)]
        if use_swa:
            # make a copy of the model to track average weights
            self.swa_model = copy.deepcopy(self.model)
            callbacks+=[SWA(self.model, self.swa_model, swa_start)]
        n_epochs = [phase.epochs for phase in phases] if cut is None else cut
        if len(data_list)==0: data_list = [self.data]
        return fit(self.model, data_list, n_epochs,layer_opt, self.crit,
            metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, fp16=self.fp16,
            swa_model=self.swa_model if use_swa else None, swa_start=swa_start, 
            swa_eval_freq=swa_eval_freq, **kwargs)

    def _get_crit(self, data): return F.mse_loss

In [13]:
class BasicModel():
    def __init__(self,model,name='unnamed'): self.model,self.name = model,name
    def get_layer_groups(self, do_fc=False): return children(self.model)

class SingleModel(BasicModel):
    def get_layer_groups(self): return [self.model]

class SimpleNet(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList([
            nn.Linear(layers[i], layers[i + 1]) for i in range(len(layers) - 1)])

    def forward(self, x):
        x = x.view(x.size(0), -1)
        for l in self.layers:
            l_x = l(x)
            x = F.relu(l_x)
        return F.log_softmax(l_x, dim=-1)

In [14]:
def opt_params(parm, lr, wd):
    return {'params': chain_params(parm), 'lr':lr, 'weight_decay':wd}

class LayerOptimizer():
    def __init__(self, opt_fn, layer_groups, lrs, wds=None):
        if not isinstance(layer_groups, (list,tuple)): layer_groups=[layer_groups]
        lrs = listify(lrs, layer_groups)
        if wds is None: wds=0.
        wds = listify(wds, layer_groups)
        self.layer_groups,self.lrs,self.wds = layer_groups,lrs,wds
        self.opt = opt_fn(self.opt_params())

    def opt_params(self):
        assert len(self.layer_groups) == len(self.lrs), f'size mismatch, expected {len(self.layer_groups)} lrs, but got {len(self.lrs)}'
        assert len(self.layer_groups) == len(self.wds), f'size mismatch, expected {len(self.layer_groups)} wds, but got {len(self.wds)}'
        params = list(zip(self.layer_groups,self.lrs,self.wds))
        return [opt_params(*p) for p in params]

    @property
    def lr(self): return self.lrs[-1]

    @property
    def mom(self):
        if 'betas' in self.opt.param_groups[0]:
            return self.opt.param_groups[0]['betas'][0]
        else:
            return self.opt.param_groups[0]['momentum']

    def set_lrs(self, lrs):
        lrs = listify(lrs, self.layer_groups)
        set_lrs(self.opt, lrs)
        self.lrs=lrs

    def set_wds_out(self, wds):
        wds = listify(wds, self.layer_groups)
        set_wds_out(self.opt, wds)
        set_wds(self.opt, [0] * len(self.layer_groups))
        self.wds=wds

    def set_wds(self, wds):
        wds = listify(wds, self.layer_groups)
        set_wds(self.opt, wds)
        set_wds_out(self.opt, [0] * len(self.layer_groups))
        self.wds=wds
    
    def set_mom(self,momentum):
        if 'betas' in self.opt.param_groups[0]:
            for pg in self.opt.param_groups: pg['betas'] = (momentum, pg['betas'][1])
        else:
            for pg in self.opt.param_groups: pg['momentum'] = momentum
    
    def set_beta(self,beta):
        if 'betas' in self.opt.param_groups[0]:
            for pg in self.opt.param_groups: pg['betas'] = (pg['betas'][0],beta)
        elif 'alpha' in self.opt.param_groups[0]:
            for pg in self.opt.param_groups: pg['alpha'] = beta

    def set_opt_fn(self, opt_fn):
        if type(self.opt) != type(opt_fn(self.opt_params())):
            self.opt = opt_fn(self.opt_params())

def zip_strict_(l, r):
    assert len(l) == len(r), f'size mismatch, expected lengths {len(l)}, but got {len(l)} and {len(r)} instead.'
    return zip(l, r)

def set_lrs(opt, lrs):
    lrs = listify(lrs, opt.param_groups)
    for pg,lr in zip_strict_(opt.param_groups,lrs): pg['lr'] = lr

def set_wds_out(opt, wds):
    wds = listify(wds, opt.param_groups)
    for pg,wd in zip_strict_(opt.param_groups,wds): pg['wd'] = wd

def set_wds(opt, wds):
    wds = listify(wds, opt.param_groups)
    for pg,wd in zip_strict_(opt.param_groups,wds): pg['weight_decay'] = wd

In [15]:
class BOW_Learner(Learner):
    def __init__(self, data, models, **kwargs):
        super().__init__(data, models, **kwargs)

    def _get_crit(self, data): return F.l1_loss

In [16]:
class Callback:
    def on_train_begin(self): pass
    def on_batch_begin(self): pass
    def on_phase_begin(self): pass
    def on_epoch_end(self, metrics): pass
    def on_phase_end(self): pass
    def on_batch_end(self, metrics): pass
    def on_train_end(self): pass

class LoggingCallback(Callback):
    '''
    A class useful for maintaining status of a long-running job.
    e.g.: learn.fit(0.01, 1, callbacks = [LoggingCallback(save_path="/tmp/log")])
    '''
    def __init__(self, save_path):
        super().__init__()
        self.save_path=save_path
    def on_train_begin(self):
        self.batch = 0
        self.epoch = 0
        self.phase = 0
        self.f = open(self.save_path, "a", 1)
        self.log("\ton_train_begin")
    def on_batch_begin(self):
        self.log(str(self.batch)+"\ton_batch_begin")
    def on_phase_begin(self):
        self.log(str(self.phase)+"\ton_phase_begin")
    def on_epoch_end(self, metrics):
        self.log(str(self.epoch)+"\ton_epoch_end: "+str(metrics))
        self.epoch += 1
    def on_phase_end(self):
        self.log(str(self.phase)+"\ton_phase_end")
        self.phase+=1
    def on_batch_end(self, metrics):
        self.log(str(self.batch)+"\ton_batch_end: "+str(metrics))
        self.batch += 1
    def on_train_end(self):
        self.log("\ton_train_end")
        self.f.close()
    def log(self, string):
        self.f.write(time.strftime("%Y-%m-%dT%H:%M:%S")+"\t"+string+"\n")
        
class LossRecorder(Callback):
    '''
    Saves and displays loss functions and other metrics. 
    Default sched when none is specified in a learner. 
    '''
    def __init__(self, layer_opt, save_path='', record_mom=False, metrics=[]):
        super().__init__()
        self.layer_opt=layer_opt
        self.init_lrs=np.array(layer_opt.lrs)
        self.save_path, self.record_mom, self.metrics = save_path, record_mom, metrics

    def on_train_begin(self):
        self.losses,self.lrs,self.iterations,self.epochs,self.times = [],[],[],[],[]
        self.start_at = timer()
        self.val_losses, self.rec_metrics = [], []
        if self.record_mom:
            self.momentums = []
        self.iteration = 0
        self.epoch = 0

    def on_epoch_end(self, metrics):
        self.epoch += 1
        self.epochs.append(self.iteration)
        self.times.append(timer() - self.start_at)
        self.save_metrics(metrics)

    def on_batch_end(self, loss):
        self.iteration += 1
        self.lrs.append(self.layer_opt.lr)
        self.iterations.append(self.iteration)
        if isinstance(loss, list):
            self.losses.append(loss[0])
            self.save_metrics(loss[1:])
        else: self.losses.append(loss)
        if self.record_mom: self.momentums.append(self.layer_opt.mom)

    def save_metrics(self,vals):
        self.val_losses.append(delistify(vals[0]))
        if len(vals) > 2: self.rec_metrics.append(vals[1:])
        elif len(vals) == 2: self.rec_metrics.append(vals[1])

    def plot_loss(self, n_skip=10, n_skip_end=5):
        '''
        plots loss function as function of iterations. 
        When used in Jupyternotebook, plot will be displayed in notebook. Else, plot will be displayed in console and both plot and loss are saved in save_path. 
        '''
        if not in_ipynb(): plt.switch_backend('agg')
        plt.plot(self.iterations[n_skip:-n_skip_end], self.losses[n_skip:-n_skip_end])
        if not in_ipynb():
            plt.savefig(os.path.join(self.save_path, 'loss_plot.png'))
            np.save(os.path.join(self.save_path, 'losses.npy'), self.losses[10:])

    def plot_lr(self):
        '''Plots learning rate in jupyter notebook or console, depending on the enviroment of the learner.'''
        if not in_ipynb():
            plt.switch_backend('agg')
        if self.record_mom:
            fig, axs = plt.subplots(1,2,figsize=(12,4))
            for i in range(0,2): axs[i].set_xlabel('iterations')
            axs[0].set_ylabel('learning rate')
            axs[1].set_ylabel('momentum')
            axs[0].plot(self.iterations,self.lrs)
            axs[1].plot(self.iterations,self.momentums)   
        else:
            plt.xlabel("iterations")
            plt.ylabel("learning rate")
            plt.plot(self.iterations, self.lrs)
        if not in_ipynb():
            plt.savefig(os.path.join(self.save_path, 'lr_plot.png'))

In [17]:
class LR_Updater(LossRecorder):
    '''
    Abstract class where all Learning Rate updaters inherit from. (e.g., CirularLR)
    Calculates and updates new learning rate and momentum at the end of each batch. 
    Have to be extended. 
    '''
    def on_train_begin(self):
        super().on_train_begin()
        self.update_lr()
        if self.record_mom:
            self.update_mom()

    def on_batch_end(self, loss):
        res = super().on_batch_end(loss)
        self.update_lr()
        if self.record_mom:
            self.update_mom()
        return res

    def update_lr(self):
        new_lrs = self.calc_lr(self.init_lrs)
        self.layer_opt.set_lrs(new_lrs)
    
    def update_mom(self):
        new_mom = self.calc_mom()
        self.layer_opt.set_mom(new_mom)

    @abstractmethod
    def calc_lr(self, init_lrs): raise NotImplementedError
    
    @abstractmethod
    def calc_mom(self): raise NotImplementedError

In [18]:
class CosAnneal(LR_Updater):
    ''' Learning rate scheduler that implements a cosine annealation schedule. '''
    def __init__(self, layer_opt, nb, on_cycle_end=None, cycle_mult=1):
        self.nb,self.on_cycle_end,self.cycle_mult = nb,on_cycle_end,cycle_mult
        super().__init__(layer_opt)

    def on_train_begin(self):
        self.cycle_iter,self.cycle_count=0,0
        super().on_train_begin()

    def calc_lr(self, init_lrs):
        if self.iteration<self.nb/20:
            self.cycle_iter += 1
            return init_lrs/100.

        cos_out = np.cos(np.pi*(self.cycle_iter)/self.nb) + 1
        self.cycle_iter += 1
        if self.cycle_iter==self.nb:
            self.cycle_iter = 0
            self.nb *= self.cycle_mult
            if self.on_cycle_end: self.on_cycle_end(self, self.cycle_count)
            self.cycle_count += 1
        return init_lrs / 2 * cos_out

In [19]:
class DotProdNB(nn.Module):
    def __init__(self, nf, ny, w_adj=0.4, r_adj=10):
        super().__init__()
        self.w_adj,self.r_adj = w_adj,r_adj
        self.w = nn.Embedding(nf+1, 1, padding_idx=0)
        self.w.weight.data.uniform_(-0.1,0.1)
        self.r = nn.Embedding(nf+1, ny)

    def forward(self, feat_idx, feat_cnt, sz):
        w = self.w(feat_idx)
        r = self.r(feat_idx)
        x = ((w+self.w_adj)*r/self.r_adj).sum(1)
        return F.softmax(x, dim=-1)

In [20]:
PATH = "../../nb5_data/aclImdb/"
names = ['neg','pos']

In [21]:
def texts_labels_from_folders(path, folders):
    texts,labels = [],[]
    for idx,label in enumerate(folders):
        for fname in glob(os.path.join(path, label, '*.*')):
            texts.append(open(fname, 'r').read())
            labels.append(idx)
    return texts, np.array(labels).astype(np.int64)

In [22]:
trn,trn_y = texts_labels_from_folders(f'{PATH}train',names)
val,val_y = texts_labels_from_folders(f'{PATH}test',names)

In [23]:
trn[0]

'A retired diplomat, played nicely by Michael York, goes to Russia to get revenge on the Russian gangster that murdered the diplomat\'s policeman son. There the diplomat meets an exceptionally strong and decent Russian cop who helps him bring the Russian gangster to justice.<br /><br />I remembered the old action flicks of the 1980s that always portray the Russians as evil bad guys out to undermine the righteous U.S. government. It\'s interesting to see this time the Russian guy as a hero.<br /><br />Not a great flick, it\'s really typically a "B" action flick. Michael York lends some class to this mediocre movie. Alexander Nevsky, who plays the Russian cop is kind of "blah" but surprisingly has some chemistry with Michael York. Face it, Michael York is such a good actor that he\'d have chemistry with anyone he\'s doing a scene with. Disappointingly, the handsome Adrian Paul gets killed within the first 15 minutes into the movie. Now, if Adrian Paul was in this movie longer, it would\'

In [24]:
trn_y[0]

0

In [25]:
re_tok = re.compile('([!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~“”¨«»®´·º½¾¿¡§£₤‘’])')

In [26]:
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [27]:
veczr = CountVectorizer(tokenizer=tokenize)

In [28]:
trn_term_doc = veczr.fit_transform(trn)
val_term_doc = veczr.transform(val)

In [29]:
trn_term_doc

<25000x75132 sparse matrix of type '<class 'numpy.int64'>'
	with 3749745 stored elements in Compressed Sparse Row format>

In [30]:
trn_term_doc[0]

<1x75132 sparse matrix of type '<class 'numpy.int64'>'
	with 166 stored elements in Compressed Sparse Row format>

In [31]:
vocab = veczr.get_feature_names(); vocab[5000:5005]

['aussie', 'aussies', 'austen', 'austeniana', 'austens']

In [32]:
w0 = set([o.lower() for o in trn[0].split(' ')]); w0

{'"b"',
 '"blah"',
 '/><br',
 '/>i',
 '/>not',
 '/>rent',
 '15',
 '1980s',
 'a',
 'about',
 'above',
 'acting',
 'action',
 'actor',
 'adrian',
 'alexander',
 'all',
 'always',
 'an',
 'and',
 'anyone',
 'are',
 'as',
 'at',
 'average',
 'bad',
 'bad.',
 'be',
 'been',
 'better,',
 'blonde',
 'bring',
 'but',
 'by',
 'can',
 'chemistry',
 'class',
 'cop',
 'could',
 "could've",
 'decent',
 'decent.',
 'diplomat',
 "diplomat's",
 'diplomat,',
 'disappointingly,',
 "doesn't",
 'doing',
 'else',
 'evil',
 'exceptionally',
 'face',
 'faster,',
 'first',
 'flick',
 'flick,',
 'flick.',
 'flicks',
 'for',
 'gangster',
 'get',
 'gets',
 'goes',
 'good',
 'government.',
 'great',
 'guy',
 'guy.',
 'guys',
 'handsome',
 'has',
 'have',
 'he',
 "he'd",
 "he's",
 'helps',
 'hero.<br',
 'hey,',
 'him',
 'i',
 'if',
 'in',
 'interesting',
 'into',
 'is',
 'it',
 "it's",
 'it,',
 'justice.<br',
 'killed',
 'kind',
 'lends',
 'like',
 'longer,',
 'look',
 'looks',
 'mediocre',
 'meets',
 'michael',
 

In [33]:
len(w0)

172

In [34]:
veczr.vocabulary_['absurd']

1297

In [35]:
trn_term_doc[0,1297]

0

In [36]:
trn_term_doc[0,5000]

0

In [37]:
def pr(y_i):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [38]:
x=trn_term_doc
y=trn_y

r = np.log(pr(1)/pr(0))
b = np.log((y==1).mean() / (y==0).mean())

In [39]:
pre_preds = val_term_doc @ r.T + b
preds = pre_preds.T>0
(preds==val_y).mean()

0.81656

In [40]:
x=trn_term_doc.sign()
r = np.log(pr(1)/pr(0))

pre_preds = val_term_doc.sign() @ r.T + b
preds = pre_preds.T>0
(preds==val_y).mean()

0.83016

~~#TODO: Original nb had dual =True, check why~~

In [41]:
x.shape

(25000, 75132)

Setting *__dual=True__* because here number of features > number of samples. Setting this to True provides a lower bound to the solution of the primal (minimization) problem. Sklean recommends to set *__dual=False__* when n_samples > n_features.

In [42]:
m = LogisticRegression(C=1e8, dual=True, solver='liblinear')
m.fit(x, y)
preds = m.predict(val_term_doc)
(preds==val_y).mean()



0.83268

In [43]:
m = LogisticRegression(C=1e8, dual=True, solver='liblinear')
m.fit(trn_term_doc.sign(), y)
preds = m.predict(val_term_doc.sign())
(preds==val_y).mean()



0.8552

~~#TODO: In original notebook, this is higher, `0.84x`~~

In [44]:
m = LogisticRegression(C=0.1, dual=True, solver='liblinear')
m.fit(x, y)
preds = m.predict(val_term_doc)
(preds==val_y).mean()

0.84872

In [45]:
m = LogisticRegression(C=0.1, dual=True, solver='liblinear')
m.fit(trn_term_doc.sign(), y)
preds = m.predict(val_term_doc.sign())
(preds==val_y).mean()

0.88404

In [46]:
veczr =  CountVectorizer(ngram_range=(1,3), tokenizer=tokenize, max_features=800000)
trn_term_doc = veczr.fit_transform(trn)
val_term_doc = veczr.transform(val)

In [47]:
trn_term_doc.shape

(25000, 800000)

In [48]:
vocab = veczr.get_feature_names()

In [49]:
vocab[200000:200005]

['by vast', 'by vengeance', 'by vengeance .', 'by vera', 'by vera miles']

In [50]:
y=trn_y
x=trn_term_doc.sign()
val_x = val_term_doc.sign()

In [51]:
r = np.log(pr(1) / pr(0))
b = np.log((y==1).mean() / (y==0).mean())

In [52]:
m = LogisticRegression(C=0.1, dual=True, solver='liblinear')
m.fit(x, y);

preds = m.predict(val_x)
(preds.T==val_y).mean()

0.905

In [53]:
r.shape, r

((1, 800000),
 matrix([[-0.05468386, -0.16100472, -0.24783616, ...,  1.09861229,
          -0.69314718, -0.69314718]]))

In [54]:
np.exp(r)

matrix([[0.94678442, 0.85128806, 0.7804878 , ..., 3.        , 0.5       ,
         0.5       ]])

In [55]:
x_nb = x.multiply(r)
m = LogisticRegression(C=0.1, dual=True, solver='liblinear')
m.fit(x_nb, y);

val_x_nb = val_x.multiply(r)
preds = m.predict(val_x_nb)
(preds.T==val_y).mean()

0.91768

In [56]:
sl=2000

~~# Note: Need to make things work after this~~

In [57]:
# Here is how we get a model from a bag of words
md = TextClassifierData.from_bow(trn_term_doc, trn_y, val_term_doc, val_y, sl)

In [58]:
learner = md.dotprod_nb_learner()
learner.fit(0.02, 1, wds=1e-6, cycle_len=1)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/391 [00:00<?, ?it/s]

Validation:   0%|          | 0/391 [00:00<?, ?it/s]

epoch      trn_loss   val_loss   <lambda>   
    0      0.022909   0.120032   0.91632   


[0.1200320093870163, 0.9163199999809265]

In [59]:
learner.fit(0.02, 2, wds=1e-6, cycle_len=1)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/391 [00:00<?, ?it/s]

Validation:   0%|          | 0/391 [00:00<?, ?it/s]

epoch      trn_loss   val_loss   <lambda>   
    0      0.02172    0.113879   0.92112   


  0%|          | 0/391 [00:00<?, ?it/s]

Validation:   0%|          | 0/391 [00:00<?, ?it/s]

    1      0.011636   0.11232    0.92068   


[0.11231955892562866, 0.920680000038147]

In [60]:
learner.fit(0.02, 2, wds=1e-6, cycle_len=1)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/391 [00:00<?, ?it/s]

Validation:   0%|          | 0/391 [00:00<?, ?it/s]

epoch      trn_loss   val_loss   <lambda>   
    0      0.016484   0.111565   0.92152   


  0%|          | 0/391 [00:00<?, ?it/s]

Validation:   0%|          | 0/391 [00:00<?, ?it/s]

    1      0.00962    0.11013    0.92132   


[0.11012963932037353, 0.92132]