In [1]:
! pip install trax

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting trax
  Downloading trax-1.4.1-py2.py3-none-any.whl (637 kB)
[K     |████████████████████████████████| 637 kB 16.4 MB/s 
Collecting funcsigs
  Downloading funcsigs-1.0.2-py2.py3-none-any.whl (17 kB)
Collecting tensorflow-text
  Downloading tensorflow_text-2.11.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 20.0 MB/s 
Collecting tensorflow<2.12,>=2.11.0
  Downloading tensorflow-2.11.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
[K     |████████████████████████████████| 588.3 MB 6.5 kB/s 
Collecting tensorboard<2.12,>=2.11
  Downloading tensorboard-2.11.0-py3-none-any.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 13.1 MB/s 
Collecting flatbuffers>=2.0
  Downloading flatbuffers-22.11.23-py2.py3-none-any.whl (26 kB)
Collecting tensorflow-estimator<2.12,>=2.11.0
  Downl

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import json
import random
import numpy as np
from termcolor import colored

import trax
from trax import layers as tl
from trax.supervised import training
import pandas as pd


In [4]:
# filename of the MultiWOZ dialogue dataset
DATA_FILE = 'all_conversations.csv'

# vocabulary filename
VOCAB_FILE = 'en_32k.subword'


In [27]:
df = pd.read_csv('./TM-3-2020/full-train-dataset.csv')
last_id = df['conversation_id'][0]
tmp = ''
untokenized_data = []
for _, row in df.iterrows():
    # print(last_id)
    if row['conversation_id'] == last_id:
        tmp += 'User: ' + str(row['user']) + \
            ' Assistant: ' + str(row['assistant'])
    else:
        untokenized_data.append(tmp)
        tmp = ''
    last_id = row['conversation_id']
untokenized_data.append(tmp)


In [28]:
len(untokenized_data)


4000

In [None]:
# print the first element to check if it's the same as the one we got before
print(untokenized_data[0])


User: hi....am buying a ticket [date.showing] so we go and see a movie at [name.theater] Assistant: No problem. Is there a particular type of movie you’re looking for?User: hhhmmmmm not at all. i dont have any in mind for now Assistant: Sure. I can help with that. Let me listings at A[name.theater]User: sure you can but i want to see the movie at [name.theater] Assistant: Oh, sorry about that. So you're interested in [name.genre] films at [name.theater], right?User: yeah Assistant: OK. I show one [name.genre] movie playing at [name.theater] [name.movie]. Remaining showtimes are [time.showing], [time.showing] and [time.showing]. Does any of those work?User: yeah but [time.showing] will be perfect for me Assistant: Great. And how many tickets?User: myself and two other persons are going to see a movie Assistant: All right. Let me confirm that you'd like [num.tickets] tickets for [name.movie] at [name.theater] [date.showing] at [time.showing]. Is that all correct?User: yeah Assistant: Is 

In [None]:
# shuffle the list we generated above
random.shuffle(untokenized_data)

# define a cutoff (5% of the total length for this assignment)
# convert to int because we will use it as a list index
cut_off = int(len(untokenized_data) * .05)
# cut_off=1
# slice the list. the last elements after the cut_off value will be the eval set. the rest is for training.
train_data, eval_data = untokenized_data[:-
                                         cut_off], untokenized_data[-cut_off:]

print(f'number of conversations in the data set: {len(untokenized_data)}')
print(f'number of conversations in train set: {len(train_data)}')
print(f'number of conversations in eval set: {len(eval_data)}')


number of conversations in the data set: 23761
number of conversations in train set: 22573
number of conversations in eval set: 1188


In [5]:
def stream(data):
    while True:
        d = random.choice(data)
        yield (d, d)


In [None]:
# trax allows us to use combinators to generate our data pipeline
data_pipeline = trax.data.Serial(
    # randomize the stream
    trax.data.Shuffle(),

    # tokenize the data
    trax.data.Tokenize(vocab_file=VOCAB_FILE),

    # filter too long sequences
    trax.data.FilterByLength(2048),

    # bucket by length
    trax.data.BucketByLength(boundaries=[128, 256,  512, 1024],
                             batch_sizes=[16,    8,    4,   2, 1]),

    # add loss weights but do not add it to the padding tokens (i.e. 0)
    trax.data.AddLossWeights(id_to_mask=0)
)

train_stream = data_pipeline(stream(train_data))
eval_stream = data_pipeline(stream(eval_data))


In [None]:
# the stream generators will yield (input, target, weights). let's just grab the input for inspection
inp, _, _ = next(train_stream)

# print the shape. format is (batch size, token length)
print("input shape: ", inp.shape)

# detokenize the first element
print(trax.data.detokenize(inp[0], vocab_file=VOCAB_FILE))


input shape:  (4, 512)
User: Can I purchase movie tickets for [date.showing] please? Assistant: Sure, which movie are you interested in seeing [date.showing]?User: [name.movie] please Assistant: Ok, and which theater would you like to go to?User: [name.theater] Assistant: Ok! What time would work best for you. Available times for [date.showing] are [time.showing] and [time.showing].User: [time.showing] would work Assistant: Great! Lastly, how many tickets are you going to need.User: Just [num.tickets] tickets please, my friends are coming back from college and want to see a movie. Assistant: Perfect. To confirm, you would like to purchase [num.tickets] movie tickets to [name.movie] at [name.theater] at [time.showing].User: Wait, I wanted to go to the [time.showing] movie. Assistant: My apologies, this movie has filled up. Would you like the [time.showing] movie instead?User: Sure that is fine. Assistant: To confirm, you would like to purchase [num.tickets] movie tickets to [name.movie]

In [6]:
# UNQ_C2
# GRADED FUNCTION: reversible_layer_forward
def reversible_layer_forward(x, f, g):
    """
    Args: 
        x (np.array): an input vector or matrix
        f (function): a function which operates on a vector/matrix
        g (function): a function which operates on a vector/matrix
    Returns: 
        y (np.array): an output vector or matrix whose form is determined by 'x', f and g
    """
    # split the input vector into two (* along the last axis because it is the depth dimension)
    x1, x2 = np.split(x, 2, axis=-1)

    ### START CODE HERE ###

    # get y1 using equation 3
    y1 = x1 + f(x2)

    # get y2 using equation 4
    y2 = x2 + g(y1)

    # concatenate y1 and y2 along the depth dimension. be sure output is of type np.ndarray
    y = np.concatenate([y1, y2], axis=-1)

    ### END CODE HERE ###
    return y


In [7]:
# UNQ_C3
# GRADED FUNCTION: reversible_layer_reverse
def reversible_layer_reverse(y, f, g):
    """
    Args: 
        y (np.array): an input vector or matrix
        f (function): a function which operates on a vector/matrix of the form of 'y'
        g (function): a function which operates on a vector/matrix of the form of 'y'
    Returns: 
        y (np.array): an output vector or matrix whose form is determined by 'y', f and g
    """

    # split the input vector into two (* along the last axis because it is the depth dimension)
    y1, y2 = np.split(y, 2, axis=-1)

    ### START CODE HERE ###

    # compute x2 using equation 5
    x2 = y2 - g(y1)

    # compute x1 using equation 6
    x1 = y1 - f(x2)

    # concatenate x1 and x2 along the depth dimension
    x = np.concatenate([x1, x2], axis=-1)

    ### END CODE HERE ###
    return x


In [8]:
# UNQ_C4
# GRADED FUNCTION
def ReformerLM(vocab_size=33000, n_layers=2, mode='train', attention_type=tl.SelfAttention):

    ### START CODE HERE ###
    # initialize an instance of Trax's ReformerLM class
    model = tl.Serial(
        trax.models.reformer.ReformerLM(
            # set vocab size
            vocab_size=vocab_size,
            # set number of layers
            n_layers=n_layers,
            # set mode
            mode=mode,
            # set attention type
            attention_type=attention_type
        ), tl.LogSoftmax()
    )
    ### END CODE HERE ###
    return model  # tl.Serial(model, tl.LogSoftmax(),)


In [None]:
# display the model
temp_model = ReformerLM('train')
print(str(temp_model))

# free memory
#del temp_model


Serial[
  Serial[
    Serial[
      ShiftRight(1)
    ]
    Embedding_train_512
    Dropout
    Serial[
      PositionalEncoding
    ]
    Dup_out2
    ReversibleSerial_in2_out2[
      ReversibleHalfResidualDecoderAttn_in2_out2[
        Serial[
          LayerNorm
        ]
        SelfAttention
      ]
      ReversibleSwap_in2_out2
      ReversibleHalfResidualDecoderFF_in2_out2[
        Serial[
          LayerNorm
          Dense_2048
          Dropout
          Serial[
            FastGelu
          ]
          Dense_512
          Dropout
        ]
      ]
      ReversibleSwap_in2_out2
      ReversibleHalfResidualDecoderAttn_in2_out2[
        Serial[
          LayerNorm
        ]
        SelfAttention
      ]
      ReversibleSwap_in2_out2
      ReversibleHalfResidualDecoderFF_in2_out2[
        Serial[
          LayerNorm
          Dense_2048
          Dropout
          Serial[
            FastGelu
          ]
          Dense_512
          Dropout
        ]
      ]
      ReversibleSwa

In [None]:
# UNQ_C5
# GRADED FUNCTION: train_model
def training_loop(ReformerLM, train_gen, eval_gen, output_dir="./models/model_"):

    lr_schedule = trax.lr.warmup_and_rsqrt_decay(
        n_warmup_steps=1000, max_value=0.01)

    train_task = training.TrainTask(

        labeled_data=train_gen,

        loss_layer=tl.CrossEntropyLoss(),

        optimizer=trax.optimizers.Adam(0.01),

        lr_schedule=lr_schedule,

        n_steps_per_checkpoint=10
    )
    eval_task = training.EvalTask(

        labeled_data=eval_gen,

        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()]

    loop=training.Loop(ReformerLM(mode='train'),
                         train_task,
                         eval_tasks=[eval_task], output_dir='.models/model_full_data_1')
    return loop


In [None]:
# we will now test your function
loop = training_loop(ReformerLM, train_stream, eval_stream)
loop.run(500)


In [12]:
def tokenize(sentence, vocab_file):
    return list(trax.data.tokenize(iter([sentence]), vocab_file=vocab_file))[0]


def detokenize(tokens, vocab_file):
    return trax.data.detokenize(tokens, vocab_file=vocab_file)


In [13]:
# UNQ_C6
# GRADED FUNCTION
def ReformerLM_output_gen(ReformerLM, start_sentence, vocab_file, temperature, tokenize=tokenize):
    """
    Args:
        ReformerLM:  the Reformer language model you just trained
        start_sentence (string): starting sentence of the conversation
        vocab_file (string): vocabulary filename
        vocab_dir (string): directory of the vocabulary file
        temperature (float): parameter for sampling ranging from 0.0 to 1.0.
            0.0: same as argmax, always pick the most probable token
            1.0: sampling from the distribution (can sometimes say random things)

    Returns:
        generator: yields the next symbol generated by the model
    """

    ### START CODE HERE ###

    # Create input tokens using the the tokenize function
    input_tokens = tokenize(start_sentence, vocab_file=vocab_file)

    # Add batch dimension to array. Convert from (n,) to (1, n)
    input_tokens_with_batch = np.array(input_tokens)[None, :]

    # call the autoregressive_sample_stream function from trax
    output_gen = trax.supervised.decoding.autoregressive_sample_stream(
        # model
        ReformerLM,
        # inputs will be the tokens with batch dimension
        inputs=input_tokens_with_batch,
        # temperature
        temperature=temperature
    )

    ### END CODE HERE ###

    return output_gen


In [14]:
shape11 = trax.shapes.ShapeDtype((1, 1), dtype=np.int32)


def attention(*args, **kwargs):
    kwargs['predict_mem_len'] = 120  # max length for predictions
    kwargs['predict_drop_len'] = 120  # never drop old stuff
    return tl.SelfAttention(*args, **kwargs)


model = ReformerLM(
    # vocab_size=33000,
    # n_layers=6,
    mode='predict',
    attention_type=attention,
)


In [15]:
model.init_from_file('./models/model_full_data/model.pkl.gz',
                     weights_only=True,
                     input_signature=shape11)

STARTING_STATE = model.state


In [74]:
def generate_dialogue(ReformerLM, model_state, start_sentence, vocab_file,  max_len, temperature):
    
    delimiter_1 = 'User: '
    delimiter_2 = 'Assistant: '
    
    sentence = ''
    
    counter = 0
    
    result = [tokenize(': ', vocab_file=vocab_file)]
    
    ReformerLM.state = model_state
    
    output = ReformerLM_output_gen(
        ReformerLM, start_sentence, vocab_file=VOCAB_FILE, temperature=temperature)
    
    print(start_sentence.split(delimiter_2)[0].strip())
    
    for o in output:

        result.append(o)

        sentence = detokenize(np.concatenate(
            result, axis=0), vocab_file=VOCAB_FILE)        

        if sentence.endswith(delimiter_1):
            sentence = sentence.split(delimiter_1)[0]
            print(f'{delimiter_2}{sentence}')
            sentence = ''
            result.clear()

        elif sentence.endswith(delimiter_2):
            sentence = sentence.split(delimiter_2)[0]
            print(f'{delimiter_1}{sentence}')
            sentence = ''
            result.clear()

        counter += 1

        if counter > max_len:
            break


In [None]:
sample_sentence = 'User: Hi, I want to order movie tickets for a movie for tomorrow Assistant: '
generate_dialogue(ReformerLM=model, model_state=STARTING_STATE,
                  start_sentence=sample_sentence, vocab_file=VOCAB_FILE, max_len=120, temperature=0.2)


In [36]:
sample_sentence = 'Yes, that is right. '
generate_dialogue(ReformerLM=model, model_state=STARTING_STATE,
                  start_sentence=sample_sentence, vocab_file=VOCAB_FILE, max_len=30, temperature=0.2)


Yes, that is right.
User: : 


In [35]:
sample_sentence = 'User: I want to buy [num.tickets] movie tickets for [name.movie] Assistant: '
generate_dialogue(ReformerLM=model, model_state=STARTING_STATE,
                  start_sentence=sample_sentence, vocab_file=VOCAB_FILE, max_len=120, temperature=0.2)


User: I want to buy [num.tickets] movie tickets for [name.movie]
Assistant: : I have [name.


In [None]:
sample_sentence = 'User: I am interested in seeing the movie [name.movie].  Assistant: '
generate_dialogue(ReformerLM=model, model_state=STARTING_STATE,
                  start_sentence=sample_sentence, vocab_file=VOCAB_FILE, max_len=120, temperature=0.2)


User: I am interested in seeing the movie [name.movie].


In [71]:

sample_sentence = "I would like to buy [num.tickets] tickets to see [name.movie]."
results = generate_dialogue(ReformerLM=model, model_state=STARTING_STATE, start_sentence=sample_sentence +' Assistant: ', vocab_file=VOCAB_FILE, max_len=80, temperature=0.2)


I would like to buy [num.tickets] tickets to see [name.movie].
Assistant: : I have [name.


In [73]:
from Levenshtein import ratio
ratio(['You would like to purchase [num.tickets] ticket to see [name.movie]'], [
      'I have [name.theater]  [name.theater]  [name.movie]  [name.theater]  [name.'], processor=lambda s: s[0])


0.275