# Download the basic blenderbot model


In [1]:
# Launch ParlAI in interactive mode with the blender_90M model file from the model zoo
# (the log below shows the dictionary and model parameters being loaded).
!parlai i -mf zoo:blender/blender_90M/model

14:12:31 | [33mLoading model with `--beam-block-full-context false`[0m
14:12:31 | loading dictionary from /home/ckjellson/code/emely-models/ParlAI/data/models/blender/blender_90M/model.dict
14:12:32 | num words = 54944
14:12:32 | TransformerGenerator: full interactive mode on.
14:12:32 | [33mDEPRECATED: XLM should only be used for backwards compatibility, as it involves a less-stable layernorm operation.[0m
14:12:33 | Total parameters: 87,508,992 (87,508,992 trainable)
14:12:33 | Loading existing model params from /home/ckjellson/code/emely-models/ParlAI/data/models/blender/blender_90M/model
14:12:33 | Opt:
14:12:33 |     activation: gelu
14:12:33 |     adafactor_eps: '[1e-30, 0.001]'
14:12:33 |     adam_eps: 1e-08
14:12:33 |     add_p1_after_newln: False
14:12:33 |     aggregate_micro: False
14:12:33 |     allow_missing_init_opts: False
14:12:33 |     attention_dropout: 0.0
14:12:33 |     batchsize: 16
14:12:33 |     beam_block_full_context: False
14:12:33 |     beam_block_list_fi

In [6]:
from parlai.core.agents import create_agent
from parlai.agents.emely.emely import EmelyAgent
from parlai.core.opt import Opt
from pathlib import Path
import torch

In [7]:
# Locate the blender_90M checkpoint directory relative to the current working directory.
model_path = Path.cwd().joinpath('data/models/blender/blender_90M/')
assert model_path.is_dir()

# Start from the options file stored next to the checkpoint.
opt_path = model_path.joinpath('model.opt')
opt = Opt.load(opt_path)

# Values layered on top of the stored options, applied one key at a time in this order.
opt_overrides = {
    # Change opts
    'skip_generation': False,
    'init_model': model_path.joinpath('model').as_posix(),
    'no_cuda': True,  # Cloud run doesn't offer gpu support
    # Inference options
    'inference': 'greedy',  # 'beam'
    'beam_size': 1,
}
for option_name, option_value in opt_overrides.items():
    opt[option_name] = option_value

## This is how we use Emely currently

EmelyAgent subclasses *TransformerGeneratorAgent* and has the extra method observe_and_act(text)

EmelyAgent has many attributes (and methods), but importantly:

- EmelyAgent.model: the pytorch model of the transformer
    - model.encoder: the transformer encoder
    - model.decoder: the transformer decoder
    
    
- EmelyAgent.history: the conversation history, which is used to build the context that is passed through the model when generating a reply

- EmelyAgent.observe(): observes a new message and adds to the history
- EmelyAgent.self_observe(): observes its own response and adds to the history

- EmelyAgent.act(): creates a new response based on the history. high level function
- EmelyAgent.eval_step(): model inference used to get the raw model output
- EmelyAgent._generate(): used to generate a response from the raw model output. Calls beam search or topk sampling


The parlai *TransformerGeneratorAgent* object has a history, from which the context is built and passed through the model to generate a reply. But due to Emely handling several conversations simultaneously, we send the entire dialog history we want the model to act on (this is usually the last 6-8 messages) and the method observe_and_act() builds the history and then calls act()


#### Text format. Messages are separated by \n  
text = 'hey there\nHi my name is Emely, how are you today?\ngood thanks. What do you work with?'

In [8]:
# Build the Emely agent from the options configured in `opt`.
emely_agent = EmelyAgent(opt)
# Option to quantize the model in torch to speed up a little bit
# emely_agent.model = torch.quantization.quantize_dynamic(emely_agent.model, {torch.nn.Linear}, dtype=torch.qint8) 

14:06:39 | loading dictionary from /home/ckjellson/code/emely-models/ParlAI/data/models/blender/blender_90M/model.dict
14:06:39 | num words = 54944
14:06:40 | Total parameters: 87,508,992 (87,508,992 trainable)
14:06:40 | Loading existing model params from /home/ckjellson/code/emely-models/ParlAI/data/models/blender/blender_90M/model


#### Try passing text through emely

In [9]:
# Sample dialog: two human messages with one Emely reply in between.
# Each turn ends with a newline separator except the last one.
text = (
    "Hi Emely, how are you?\n"
    "I'm good thanks! What do you do for work?\n"
    "I write code and I drink coffe"
)
emely_agent.observe_and_act(text)


"that ' s cool ! i ' ve never tried coffee . i ' d love to try it though ."

In [7]:
from torchinfo import summary

# Build the layer-by-layer parameter overview of the agent's torch model, then print it.
model_overview = summary(emely_agent.model)
print(model_overview)

Layer (type:depth-idx)                        Param #
TransformerGeneratorModel                     --
├─Embedding: 1-1                              28,131,328
├─TransformerEncoder: 1-2                     --
│    └─Dropout: 2-1                           --
│    └─Embedding: 2-2                         (recursive)
│    └─Embedding: 2-3                         262,144
│    └─LayerNorm: 2-4                         1,024
│    └─ModuleList: 2-5                        --
│    │    └─TransformerEncoderLayer: 3-1      3,152,384
│    │    └─TransformerEncoderLayer: 3-2      3,152,384
│    │    └─TransformerEncoderLayer: 3-3      3,152,384
│    │    └─TransformerEncoderLayer: 3-4      3,152,384
│    │    └─TransformerEncoderLayer: 3-5      3,152,384
│    │    └─TransformerEncoderLayer: 3-6      3,152,384
│    │    └─TransformerEncoderLayer: 3-7      3,152,384
│    │    └─TransformerEncoderLayer: 3-8      3,152,384
├─TransformerDecoder: 1-3                     --
│    └─Dropout: 2-6             

## PDB debug of observe_and_act()

#### the observe_and_act() method
1. Build history from text
2. act(). During act the history is vectorized and passed through the model. This method is quite deep and is good to debug to get an understanding of how a reply is generated

In [14]:
from IPython.core.debugger import set_trace

def debug():
    """Call emely_agent.observe_and_act() on the notebook-level `text` twice.

    Uncomment the set_trace() line to stop in ipdb before the first call and
    step through observe_and_act().

    Returns:
        The value returned by the second observe_and_act(text) call, so the
        reply is shown as the cell output instead of being discarded.
    """
    #set_trace()
    # The same text is sent twice on purpose, exactly as before; only the
    # second reply is handed back to the caller.
    emely_agent.observe_and_act(text)
    return emely_agent.observe_and_act(text)

debug()

> [0;32m<ipython-input-14-cae48e672ceb>[0m(5)[0;36mdebug[0;34m()[0m
[0;32m      3 [0;31m[0;32mdef[0m [0mdebug[0m[0;34m([0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      4 [0;31m    [0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 5 [0;31m    [0mreply[0m [0;34m=[0m [0memely_agent[0m[0;34m.[0m[0mobserve_and_act[0m[0;34m([0m[0mtext[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      6 [0;31m    [0mreply[0m [0;34m=[0m [0memely_agent[0m[0;34m.[0m[0mobserve_and_act[0m[0;34m([0m[0mtext[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      7 [0;31m[0;34m[0m[0m
[0m


ipdb>  5


5
--KeyboardInterrupt--

KeyboardInterrupt: Interrupted by user


ipdb>  rgsef




# Notes and scraps below

Unstructured notes and comments/code

### batch_act(observations)
- observations: list(Message)
- observation: Message
        {'id': 'localHuman', 'text': 'Hi', 'episode_done': False, 'label_candidates': None, 'full_text': 'Hi', 'text_vec':      tensor([792]), 'full_text_vec': [792], 'context_original_length': 1, 'context_truncate_rate': False, 'context_truncated_length': 0}
        
        
batch = self.batchify(observations) -> batch: Batch
num_observations = len(observations)

eval_step(self, batch)
batch: Batch
Batch({
  _context_original_length: LongTensor[1],
  _context_truncate_rate: LongTensor[1],
  _context_truncated_length: LongTensor[1],
  _label_original_length: None,
  _label_truncate_rate: None,
  _label_truncated_length: None,
  batchsize: 1,
  candidate_vecs: None,
  candidates: None,
  image: None,
  is_training: False,
  label_lengths: None,
  label_vec: None,
  labels: None,
  observations: None (use --debug to include),
  rewards: None,
  text_lengths: None,
  text_vec: LongTensor[1, 1],
  valid_indices: LongTensor[1],
})

bsz = batch.text_vec.size(0) # Batch size (first dimension of text_vec)
self.model.eval()
prefix_tokens = self.get_prefix_tokens(batch)
beam_preds_scores, beams = self._generate(
    batch, self.beam_size, maxlen, prefix_tokens=prefix_tokens
)
preds, scores = zip(*beam_preds_scores)

### _generate()

encoder_states = model.encoder(*self._encoder_input(batch)) - ## WHEN/HOW WAS self._encoder_input(batch)  
encoder_states[0].shape
torch.Size([1, 1, 512])

beams = [
                self._treesearch_factory(dev)
                .set_batch_context(batch_context_list, batch_idx)
                .set_block_list(self.beam_block_list)
                for batch_idx in range(batchsize)
            ] TODO: figure out what happens here
            
#### self._treesearch_factory -> TopKSampling / BeamSearch

return TopKSampling(
                self.opt['topk'],
                beam_size,
                min_length=self.beam_min_length,
                block_ngram=self.beam_block_ngram,
                context_block_ngram=self.beam_context_block_ngram,
                length_penalty=self.opt.get('beam_length_penalty', 0.65),
                padding_token=self.NULL_IDX,
                bos_token=self.START_IDX,
                eos_token=self.END_IDX,
                device=device,
            )

In [10]:
from parlai.core.message import Message
from parlai.core.torch_agent import History
from parlai.core.worlds import validate

# Assemble a message by hand, one field at a time, in the same order a human turn would carry.
sample_message = Message()
message_fields = {
    'text': text,
    'id': 'localHuman',  # 'TransformerGenerator'
    'episode_done': False,
    'label_candidates': None,
}
for field_name, field_value in message_fields.items():
    sample_message[field_name] = field_value
validate(sample_message)

# Feed the message to the agent, then keep a handle on its history for inspection.
emely_agent.observe(sample_message)
history = emely_agent.history
#emely_agent.vectorize(message, history)

In [11]:
# List the attribute and method names available on the agent's history object.
dir(history)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_person_tokens',
 '_global_end_token',
 '_update_raw_strings',
 '_update_strings',
 '_update_vecs',
 'add_p1_after_newln',
 'add_person_tokens',
 'add_reply',
 'delimiter',
 'delimiter_tok',
 'dict',
 'field',
 'get_history_str',
 'get_history_vec',
 'get_history_vec_list',
 'history_raw_strings',
 'history_strings',
 'history_vecs',
 'max_len',
 'p1_token',
 'p2_token',
 'parse',
 'reset',
 'reversed',
 'size',
 'split_on_newln',
 'temp_history',
 'update_history']

In [5]:
from time import perf_counter, time

N_RUNS = 100  # number of observe_and_act() calls included in the measurement

# perf_counter() is the clock intended for measuring elapsed time; time() follows
# the system wall clock, which can be adjusted while the loop is running.
start_time = perf_counter()

for _ in range(N_RUNS):
    emely_agent.observe_and_act(text)

# Total seconds spent on all N_RUNS calls; shown as the cell output.
elapsed = perf_counter() - start_time
elapsed

38.942471981048584

57 sec with beam = 10 (100 calls)
39 sec with greedy (100 calls)

In [13]:
# Set the already-constructed agent's options to greedy decoding with a single beam.
# NOTE(review): the notes earlier in this notebook show _generate() being called with
# self.beam_size, so changing opt['beam_size'] after the agent is built may not change
# the beam size actually used — confirm.
emely_agent.opt['inference'] = 'greedy'
emely_agent.opt['beam_size'] = 1