Chatbot based on Pre-Trained GPT - [article](https://towardsdatascience.com/beginners-guide-to-building-a-singlish-ai-chatbot-7ecff8255ee), [notebook](https://github.com/chuachinhon/practical_nlp)

### Imports

In [None]:
import os
import codecs
import json
import numpy as np
import pandas as pd
import re

from sklearn.model_selection import train_test_split

### Download data

In [None]:
# The corpus is a collection of SMS messages by Singaporean students at a local university. The 
# language is 'Singlish', or colloquial Singaporean English. It's a mish-mash of several languages 
# and local slang.

# Download the raw corpus from GitHub and save it in the working directory as singlish.json.
!wget 'https://github.com/chuachinhon/practical_nlp/blob/master/data/singlish.json?raw=true' -O singlish.json

### Extract and clean data

In [None]:
# The SMSes are nested pretty deeply in the original json file. Next few cells are aimed at 
# extracting the data into a dataframe format

# Parse one JSON document per non-blank line. The context manager closes the file handle
# deterministically, and the explicit encoding keeps the read independent of the platform default.
with open('singlish.json', 'r', encoding='utf-8') as corpus_file:
    raw = [json.loads(line) for line in corpus_file if line.strip()]

In [None]:
# Flatten the parsed JSON records into a frame and preview the first rows.
df_raw = pd.json_normalize(data=raw)

df_raw.head(n=5)

Unnamed: 0,smsCorpus.@date,smsCorpus.@version,smsCorpus.message
0,2015.03.09,1.2,"[{'@id': 10120, 'text': {'$': 'Bugis oso near ..."


In [None]:
# Turn each corpus entry's list of message records into its own frame, then stack the frames,
# keyed by the corpus date, and move that date key out of the index into a regular column.
message_frames = [pd.DataFrame(messages) for messages in df_raw["smsCorpus.message"]]

raw_messages = pd.concat(
    message_frames,
    keys=df_raw["smsCorpus.@date"],
    sort=False,
)
raw_messages = raw_messages.reset_index(level="smsCorpus.@date")


In [None]:
# Each 'text' entry is a dict-like object; its '$' key is read as the message body.
text_entries = raw_messages['text']
raw_messages['sms_text'] = [entry.get('$') for entry in text_entries]

In [None]:
# Flatten each nested metadata column into its own frame (same call for all four columns).
source, destination, profile, collection = (
    pd.json_normalize(raw_messages[nested_col], meta='@id')
    for nested_col in ('source', 'destination', 'messageProfile', 'collectionMethod')
)


In [None]:
# Place the message frame and the four flattened metadata frames side by side.
sms_raw = pd.concat(
    [raw_messages, source, destination, profile, collection],
    axis="columns",
)

In [None]:
# Columns kept from the combined frame (names are the keys exactly as they appear in the data).
cols = [
    "@id",
    "userProfile.userID.$",
    "sms_text",
    "userProfile.country.$",
    "userProfile.age.$",
    "userProfile.gender.$",
    "srcNumber.$",
    "phoneModel.@manufactuer",
    "phoneModel.@smartphone",
    "userProfile.frequency.$",
]

# Work on an independent copy so later column assignments do not touch sms_raw.
sms = sms_raw.loc[:, cols].copy()


In [None]:
# Force every message to a Python string so the regex-based cleaning below can run on it.
sms['sms_text'] = sms['sms_text'].astype(str)

# simple function to clean the text and remove non-ascii characters
def clean_text(text):
    """Normalise one SMS into single-space-separated ASCII word tokens.

    Steps, in order: drop non-ASCII characters, remove URLs, turn every
    non-word character (punctuation and all whitespace, including newlines)
    into a space, remove tokens that consist only of digits, then collapse
    runs of spaces and trim the ends.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The cleaned message; an empty string if nothing is left.
    """
    text = text.encode("ascii", errors="ignore").decode("ascii")  # remove non-ascii, Chinese characters
    text = re.sub(r"http\S+", "", text)  # strip URLs while their punctuation is still intact
    text = re.sub(r"\W", " ", text)  # punctuation, newlines and tabs all become plain spaces
    # Lookarounds match a standalone number without consuming the neighbouring spaces, so
    # back-to-back numbers ("123 456") are all removed instead of every second one surviving.
    text = re.sub(r"(?<!\S)\d+(?!\S)", " ", text)
    return re.sub(r" +", " ", text).strip()  # collapse multiple spaces into one and trim

# Clean every message, then discard rows whose cleaned text is missing.
sms["clean_text"] = sms['sms_text'].map(clean_text)

sms = sms.dropna(axis=0, how='any', subset=['clean_text'])

In [None]:
# adding a word count col for filtering

# Count whitespace-separated tokens. Unlike "number of spaces + 1", an empty cleaned
# message is counted as 0 words rather than 1.
sms['word_count'] = sms['clean_text'].str.split().str.len()

In [None]:
# Keep only the identifier, user, country and text-related columns from here on.

cols = [
    "@id",
    "userProfile.userID.$",
    "userProfile.country.$",
    "sms_text",
    "clean_text",
    "word_count",
]

sms = sms.loc[:, cols].copy()


In [None]:
# Give the three JSON-derived columns readable names; the remaining columns keep theirs.

readable_names = {
    "@id": "data_id",
    "userProfile.userID.$": "user_id",
    "userProfile.country.$": "country",
}

sms = sms.rename(columns=readable_names)


In [None]:
# Sanity check: (rows, columns) of the frame after column selection and renaming.
sms.shape

(55835, 6)

In [None]:
# Preview the first rows to compare the raw text with the cleaned text.
sms.head()

Unnamed: 0,data_id,user_id,country,sms_text,clean_text,word_count
0,10120,51,SG,Bugis oso near wat...,Bugis oso near wat,4
1,10121,51,SG,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,20
2,10122,51,SG,I dunno until when... Lets go learn pilates...,I dunno until when Lets go learn pilates,8
3,10123,51,SG,Den only weekdays got special price... Haiz......,Den only weekdays got special price Haiz Cant ...,25
4,10124,51,SG,Meet after lunch la...,Meet after lunch la,4


In [None]:
# Keep messages longer than 3 words that were sent by users in Singapore
# (the country is recorded either as 'SG' or as 'Singapore').

long_enough = sms['word_count'] > 3
from_singapore = sms['country'].isin(['SG', 'Singapore'])

sms = sms[long_enough & from_singapore].copy().reset_index()

### Prepare Conversation Data - with prior message context and reply

In [None]:
# Each training row holds the current SMS (the 'response') followed by the n SMSes that precede
# it, newest first, which the model receives as "context".

n = 7

clean_texts = sms['clean_text']

# For position i, walk backwards over i, i-1, ..., i-n, giving n + 1 messages per row.
contexted = [
    [clean_texts[j] for j in range(i, i - 1 - n, -1)]
    for i in range(n, len(clean_texts))
]

In [None]:
# Column layout: the response, the most recent context, then the older contexts numbered 0..n-2.
columns = ['response', 'context', *('context/' + str(i) for i in range(n - 1))]

df = pd.DataFrame.from_records(contexted, columns=columns)

In [None]:
# Sanity check: one row per response, one column per message in the window.
df.shape

(29353, 8)

In [None]:
# Preview the response/context layout.
df.head()

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
0,Hey pple or for nights Excellent location wif ...,nights We nt staying at port step liao Too ex,m walking in citylink now faster come down Me ...,Meet after lunch la,Den only weekdays got special price Haiz Cant ...,I dunno until when Lets go learn pilates,Go until jurong point crazy Available only in ...,Bugis oso near wat
1,Yun ah the ubi one say if wan call by tomorrow...,Hey pple or for nights Excellent location wif ...,nights We nt staying at port step liao Too ex,m walking in citylink now faster come down Me ...,Meet after lunch la,Den only weekdays got special price Haiz Cant ...,I dunno until when Lets go learn pilates,Go until jurong point crazy Available only in ...
2,Hey tmr maybe can meet you at yck,Yun ah the ubi one say if wan call by tomorrow...,Hey pple or for nights Excellent location wif ...,nights We nt staying at port step liao Too ex,m walking in citylink now faster come down Me ...,Meet after lunch la,Den only weekdays got special price Haiz Cant ...,I dunno until when Lets go learn pilates
3,Oh i asked for fun Haha take care,Hey tmr maybe can meet you at yck,Yun ah the ubi one say if wan call by tomorrow...,Hey pple or for nights Excellent location wif ...,nights We nt staying at port step liao Too ex,m walking in citylink now faster come down Me ...,Meet after lunch la,Den only weekdays got special price Haiz Cant ...
4,We are supposed to meet to discuss abt our tri...,Oh i asked for fun Haha take care,Hey tmr maybe can meet you at yck,Yun ah the ubi one say if wan call by tomorrow...,Hey pple or for nights Excellent location wif ...,nights We nt staying at port step liao Too ex,m walking in citylink now faster come down Me ...,Meet after lunch la


In [None]:
# Split the df into a training and a validation set (80/20, fixed seed for repeatability)

train_df, validate_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Confirm the sizes of the two splits.
train_df.shape, validate_df.shape

((23482, 8), (5871, 8))

In [None]:
# Persist both splits as CSV files in the working directory, without the index column.
for split_frame, csv_path in ((train_df, 'train_df.csv'), (validate_df, 'validate_df.csv')):
    split_frame.to_csv(csv_path, index=False)

### Install Hugging Face transformers library for pre-trained DialoGPT model

The original code to train and use this model in the following cells came from:
* [project 1](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) - ([blog](https://nathancooper.io/i-am-a-nerd/chatbot/deep-learning/gpt2/2020/05/12/chatbot-part-1.html)) - Open-Dialog Chatbots for Learning New Languages [Part 1]
* [project 2](https://colab.research.google.com/drive/15wa925dj7jvdvrz8_z3vU7btqAFQLVlG) - Make your own Rick Sanchez (bot) with Transformers and DialoGPT fine-tuning

I haven't tried to understand the code in detail because it is specific to Hugging Face's API and was probably adapted from their tutorial examples.

More [details](https://huggingface.co/transformers/model_doc/dialogpt.html) on DialoGPT model (originally by Microsoft, incorporated into transformers library by Hugging Face)

In [None]:
# NOTE(review): this install is unpinned, while later cells use names such as `tokenizer.max_len`
# and `transformers.AdamW`; presumably these only exist in older transformers releases -
# confirm and pin the version this notebook was run with.
! pip -q install transformers

[K     |████████████████████████████████| 778kB 4.5MB/s 
[K     |████████████████████████████████| 1.1MB 12.5MB/s 
[K     |████████████████████████████████| 3.0MB 22.9MB/s 
[K     |████████████████████████████████| 890kB 46.6MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
import glob
import logging
import numpy as np
import pandas as pd
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple
import torch

from pathlib import Path

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)

try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

# Configs
# Module-level logger used by the dataset, training and evaluation helpers below.
logger = logging.getLogger(__name__)

# Keys of the LM-head mapping, and the `model_type` string of each of them.
MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

### Model Configuration

In [None]:
# If you are using a free Colab account, switch to DialoGPT-small instead of DialoGPT-medium or large
# If you encounter GPU out of memory issues on Colab, reduce the batch-size.
# The model checkpoints will take up considerable space on your G-drive. I changed it to save every 80K-steps
# If you have more storage space, feel free to adjust the checkpoints interval

class Args:
    """Plain container for the settings read by the dataset, training and evaluation code.

    Every setting is an instance attribute assigned in ``__init__``; change a value on the
    instance (or edit it here) before running the training cells.
    """

    def __init__(self):
        # --- model, tokenizer and storage locations ---
        self.output_dir = 'output-small'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-small'
        self.config_name = 'microsoft/DialoGPT-small'
        self.tokenizer_name = 'microsoft/DialoGPT-small'
        self.cache_dir = 'cached'
        self.block_size = 64

        # --- which phases to run ---
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False

        # --- batching and optimisation ---
        self.per_gpu_train_batch_size = 1
        self.per_gpu_eval_batch_size = 1
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 3
        self.max_steps = -1
        self.warmup_steps = 0

        # --- logging and checkpointing (intervals are counted in optimisation steps) ---
        self.logging_steps = 80000
        self.save_steps = 80000
        self.save_total_limit = None
        self.eval_all_checkpoints = False

        # --- runtime behaviour ---
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 66
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1'


# Shared settings instance for the cells that follow.
args = Args()

### Convert data for DialoGPT model

In [None]:
# GPT2 is a language model that gives the probability of some word given the context.
#   eg. How much wood could a woodchuck chuck, if a woodchuck could [blank]
# The model then gives some probability to what the next word will be, which it uses to select the word. 
# The selected word is then added back to our sentence and we repeat the whole process again. 
#   eg. How much wood could a woodchuck chuck, if a woodchuck could chuck [blank]
# This is called an autoregressive model, where the result at one time step depends on the result 
# at a previous time step.
#
# To train the language model we need a bunch of example sentences, or chunks of text.
# Then we hide the last word, and use these sentences with the missing word as our inputs 
# and the last words as the target.
#
# DialoGPT is a model that repurposes this generator, GPT2, to behave as a chatbot.
# Consider a sample conversation between two speakers, where we include some special tokens that 
# signify when one of the speakers has finished talking, which is called a 'turn'.
#   eg. Hi, how are you? [end_of_turn] I'm good, what about you? [end_of_turn] Not so good, lots of long nights at work. [end_of_turn] Darn, that sucks :( [end_of_conversation]
# We can then treat this example like the previous language model....
#   eg. Hi, how are you? [end_of_turn] [blank]
# ... and use the same logic to use GPT2 to guess the next word in this conversation...
#   eg. Hi, how are you? [end_of_turn] I'm [blank]
# ... and keep feeding back the prediction of our model.

In [None]:
# Convert the dataframes into Pytorch Dataset and Dataloader for input to the model. The dataframes
# contain multiple historical dialogs (with a response and multiple previous contexts). Each dialog is
# converted into a single conversation string that is separated by a special token that tells our model 
# when a person is finished speaking. These are then tokenized with the Transformers API.

def construct_conv(row, tokenizer, eos = True):
    """Encode one dialog row as a single flat list of token ids.

    Each element of ``row`` is encoded with ``tokenizer.encode`` and followed by
    ``tokenizer.eos_token_id``. The encoded turns are then emitted in reverse row order,
    so the last element of ``row`` comes first and the first element comes last.

    Args:
        row: Iterable of utterance strings.
        tokenizer: Object providing ``encode`` and ``eos_token_id``.
        eos: Accepted for interface compatibility; not used.

    Returns:
        A flat ``list`` of token ids.
    """
    encoded_turns = []
    for utterance in row:
        encoded_turns.append(tokenizer.encode(utterance) + [tokenizer.eos_token_id])

    conv = []
    for turn in reversed(encoded_turns):
        conv.extend(turn)
    return conv

class ConversationDataset(Dataset):
    """Dataset of tokenized conversations, one flat token-id sequence per dataframe row.

    Rows are encoded with ``construct_conv`` and the resulting list is pickled to a cache file
    inside ``args.cache_dir``. The cache is reused on later constructions unless
    ``args.overwrite_cache`` is set.

    Args:
        tokenizer: Tokenizer used to encode every utterance.
        args: Settings object; ``cache_dir``, ``model_type`` and ``overwrite_cache`` are read.
        df: DataFrame whose rows are conversations (every cell is one utterance).
        block_size: Only used to build the cache file name; sequences are not truncated to it.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):

        # Prefer `model_max_length`; fall back to the older `max_len` attribute for tokenizer
        # versions that do not provide it.
        max_len = getattr(tokenizer, "model_max_length", None)
        if max_len is None:
            max_len = tokenizer.max_len
        block_size = block_size - (max_len - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        if directory:
            # The cache file is written below, so its directory has to exist.
            os.makedirs(directory, exist_ok=True)
        # The row count is part of the file name so that differently sized frames (e.g. the
        # training and validation sets) never read each other's cached examples.
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + str(len(df))
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            # NOTE: pickle can execute arbitrary code on load; only reuse cache files that
            # this notebook wrote itself.
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            for _, row in df.iterrows():
                conv = construct_conv(row, tokenizer)
                self.examples.append(conv)

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Return the number of encoded conversations."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return conversation ``item`` as a 1-D ``torch.long`` tensor of token ids."""
        return torch.tensor(self.examples[item], dtype=torch.long)

### Train and Evaluate Functions

In [None]:
# Take a batch of examples from our dataloader and use it both as our inputs and labels. 
# We do this because GPT2 uses the context to predict the next token. This prediction is 
# then added to the original context and fed back in as the new context for generating the next token.
#
# To evaluate our model, we use the metric perplexity, which is a simple, but powerful metric. 
# Perplexity is a measure of how unsure the model is in its choice of the next token. 
# The more unsure our model is, the higher its perplexity.
#
# Although we use DialoGPT-small here due to resource constraints, DialoGPT-medium or large
# give substantially better perplexity scores.

# Caching and storing of data/checkpoints

def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    """Build a ``ConversationDataset`` from the validation frame if ``evaluate`` is truthy,
    otherwise from the training frame."""
    if evaluate:
        selected_df = df_val
    else:
        selected_df = df_trn
    return ConversationDataset(tokenizer, args, selected_df)


def set_seed(args):
    """Seed Python's ``random``, NumPy and PyTorch with ``args.seed``.

    When ``args.n_gpu`` is positive, all CUDA generators are seeded as well.
    """
    seed = args.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    """Delete the oldest checkpoints so that at most ``args.save_total_limit`` remain.

    Does nothing when the limit is unset, zero or negative, or when the number of existing
    checkpoints does not exceed it. Checkpoint order comes from ``_sorted_checkpoints``.
    """
    limit = args.save_total_limit
    if not limit or limit <= 0:
        return

    existing = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    surplus = len(existing) - limit
    if surplus <= 0:
        return

    # The list is sorted oldest-first, so the surplus is taken from the front.
    for stale_checkpoint in existing[:surplus]:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(stale_checkpoint))
        shutil.rmtree(stale_checkpoint)

In [None]:
def train(
    args,
    train_dataset,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    df_trn=None,
    df_val=None,
) -> Tuple[int, float]:
    """Train the model.

    Runs the optimisation loop over ``train_dataset`` using each batch as both input and
    labels, with gradient accumulation, gradient clipping, a linear warmup/decay schedule,
    TensorBoard logging every ``args.logging_steps`` steps and checkpointing every
    ``args.save_steps`` steps.

    Args:
        args: Settings object. ``args.n_gpu`` and ``args.device`` must already be set;
            ``args.train_batch_size`` is written here, and ``args.num_train_epochs`` is
            overwritten when ``args.max_steps`` is positive.
        train_dataset: Dataset yielding 1-D token-id tensors.
        model: Model to fine-tune.
        tokenizer: Tokenizer matching ``model``; saved alongside each checkpoint.
        df_trn: Training dataframe, forwarded to ``evaluate`` (optional).
        df_val: Validation dataframe, forwarded to ``evaluate``. Needed only when
            ``args.evaluate_during_training`` is set; without it evaluation is skipped.

    Returns:
        ``(global_step, average_loss)`` where the average is the accumulated loss divided by
        the number of optimisation steps, or ``0.0`` if no step was taken.

    Raises:
        ImportError: If ``args.fp16`` is set but apex is not installed.
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Pad to the longest sequence in the batch, using the pad token id when one is defined.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))
    # add_special_tokens_(model, tokenizer)


    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            # The path does not end in "-<step>", so this is a fresh fine-tuning run.
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            # Batches longer than 1024 tokens are skipped rather than truncated.
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        if df_val is None:
                            # evaluate() needs the validation frame; without it, skip instead of crashing.
                            logger.warning(
                                "evaluate_during_training is set but no df_val was passed to train(); "
                                "skipping evaluation."
                            )
                        else:
                            results = evaluate(args, model, tokenizer, df_trn, df_val)
                            for key, value in results.items():
                                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    # get_last_lr() returns the rate set by the most recent scheduler.step().
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # Guard against an empty run (no batches, or every batch skipped) to avoid dividing by zero.
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, average_loss

# Evaluation of model

def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Dict:
    """Compute perplexity on the validation set and write it to ``eval_results.txt``.

    The validation examples are built with ``load_and_cache_examples(..., evaluate=True)``.
    The mean language-modelling loss over all batches is exponentiated to get perplexity.

    Args:
        args: Settings object. ``output_dir``, ``per_gpu_eval_batch_size``, ``n_gpu`` and
            ``device`` are read; ``eval_batch_size`` is written.
        model: Model to evaluate; it is switched to eval mode.
        tokenizer: Tokenizer used to build and pad the examples.
        df_trn: Training dataframe (only forwarded to ``load_and_cache_examples``).
        df_val: Validation dataframe.
        prefix: Optional sub-directory of ``args.output_dir`` for the results file; also used
            in the log messages.

    Returns:
        ``{"perplexity": tensor}``. The value is NaN if the dataloader produced no batches.
    """
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    # The results file lives in <output_dir>/<prefix>/, so create that whole path up front.
    results_dir = os.path.join(eval_output_dir, prefix)
    os.makedirs(results_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        # Pad to the longest sequence in the batch, using the pad token id when one is defined.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True
    )

    # multi-gpu evaluate (do not wrap again if the caller already passed a DataParallel model)
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    if nb_eval_steps > 0:
        eval_loss = eval_loss / nb_eval_steps
    else:
        # No batches (empty set, or fewer examples than one batch with drop_last): report NaN
        # instead of dividing by zero.
        logger.warning("Evaluation dataloader produced no batches; perplexity is undefined.")
        eval_loss = float("nan")
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(results_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

### Main Runner - Train the Model

In [None]:
# Main runner

def main(df_trn, df_val):
    """Fine-tune the pretrained causal LM on ``df_trn`` and evaluate it on ``df_val``.

    The run is driven entirely by a fresh ``Args()`` instance: it decides whether
    to resume from a checkpoint, whether to train and/or evaluate, and where the
    model, tokenizer and training arguments are written.

    Args:
        df_trn: Training data, forwarded unchanged to ``load_and_cache_examples``
            and ``evaluate``.
        df_val: Validation data, forwarded unchanged to ``load_and_cache_examples``
            and ``evaluate``.

    Returns:
        dict: Evaluation metrics keyed as ``"<metric>_<global_step>"``. The step
        suffix is empty when a single checkpoint is evaluated; the dict is empty
        when evaluation is skipped.

    Raises:
        ValueError: If ``should_continue`` is set but no checkpoint is found, or if
            the output directory is non-empty while training without
            ``overwrite_output_dir`` / ``should_continue``.
    """
    args = Args()

    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            # Resume from the last entry returned by _sorted_checkpoints.
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU & distributed training.
    # Fall back to the CPU when no CUDA device is present; unconditionally asking
    # for "cuda" makes `model.to(args.device)` fail on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()  # 0 on a CPU-only machine
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForCausalLM.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            # Every directory under output_dir that holds a weights file counts as a checkpoint.
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # The step suffix is only taken from the path when several checkpoints are compared.
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelForCausalLM.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results

In [None]:
# Load the training and validation frames from CSV files in the working directory.
train_df, validate_df = (
    pd.read_csv(csv_path) for csv_path in ('train_df.csv', 'validate_df.csv')
)

# Kept as the cell's last expression so the returned metrics dict is displayed.
main(train_df, validate_df)

08/20/2020 13:37:49 - INFO - filelock -   Lock 140713799870784 acquired on cached/c3a09526c725b854c685b72cf60c50f1fea9b0e4d6227fa41573425ef4bd4bc6.4c1d7fc2ac6ddabeaf0c8bec2ffc7dc112f668f5871a06efcff113d2797ec7d5.lock
08/20/2020 13:37:49 - INFO - transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/config.json not found in cache or force_download set to True, downloading to /content/cached/tmpf8ncpqn3


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=554.0, style=ProgressStyle(description_…

08/20/2020 13:37:49 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/config.json in cache at cached/c3a09526c725b854c685b72cf60c50f1fea9b0e4d6227fa41573425ef4bd4bc6.4c1d7fc2ac6ddabeaf0c8bec2ffc7dc112f668f5871a06efcff113d2797ec7d5
08/20/2020 13:37:49 - INFO - transformers.file_utils -   creating metadata file for cached/c3a09526c725b854c685b72cf60c50f1fea9b0e4d6227fa41573425ef4bd4bc6.4c1d7fc2ac6ddabeaf0c8bec2ffc7dc112f668f5871a06efcff113d2797ec7d5
08/20/2020 13:37:49 - INFO - filelock -   Lock 140713799870784 released on cached/c3a09526c725b854c685b72cf60c50f1fea9b0e4d6227fa41573425ef4bd4bc6.4c1d7fc2ac6ddabeaf0c8bec2ffc7dc112f668f5871a06efcff113d2797ec7d5.lock
08/20/2020 13:37:49 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/config.json from cache at cached/c3a09526c725b854c685b72cf60c50f1fea9b0e4d6227fa41573425e




08/20/2020 13:37:50 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/config.json from cache at cached/c3a09526c725b854c685b72cf60c50f1fea9b0e4d6227fa41573425ef4bd4bc6.4c1d7fc2ac6ddabeaf0c8bec2ffc7dc112f668f5871a06efcff113d2797ec7d5
08/20/2020 13:37:50 - INFO - transformers.configuration_utils -   Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "vocab_size": 50257
}

08/20/2020 13:37:50 - 

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…

08/20/2020 13:37:51 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/vocab.json in cache at cached/78725a31b87003f46d5bffc3157ebd6993290e4cfb7002b5f0e52bb0f0d9c2dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
08/20/2020 13:37:51 - INFO - transformers.file_utils -   creating metadata file for cached/78725a31b87003f46d5bffc3157ebd6993290e4cfb7002b5f0e52bb0f0d9c2dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
08/20/2020 13:37:51 - INFO - filelock -   Lock 140713498389304 released on cached/78725a31b87003f46d5bffc3157ebd6993290e4cfb7002b5f0e52bb0f0d9c2dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71.lock





08/20/2020 13:37:51 - INFO - filelock -   Lock 140713498390200 acquired on cached/570e31eddfc57062e4d0c5b078d44f97c0e5ac48f83a2958142849b59df6bbe6.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda.lock
08/20/2020 13:37:51 - INFO - transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/merges.txt not found in cache or force_download set to True, downloading to /content/cached/tmplidns9c7


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…

08/20/2020 13:37:52 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/merges.txt in cache at cached/570e31eddfc57062e4d0c5b078d44f97c0e5ac48f83a2958142849b59df6bbe6.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
08/20/2020 13:37:52 - INFO - transformers.file_utils -   creating metadata file for cached/570e31eddfc57062e4d0c5b078d44f97c0e5ac48f83a2958142849b59df6bbe6.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
08/20/2020 13:37:52 - INFO - filelock -   Lock 140713498390200 released on cached/570e31eddfc57062e4d0c5b078d44f97c0e5ac48f83a2958142849b59df6bbe6.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda.lock





08/20/2020 13:37:53 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/vocab.json from cache at cached/78725a31b87003f46d5bffc3157ebd6993290e4cfb7002b5f0e52bb0f0d9c2dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
08/20/2020 13:37:53 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/merges.txt from cache at cached/570e31eddfc57062e4d0c5b078d44f97c0e5ac48f83a2958142849b59df6bbe6.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
08/20/2020 13:37:53 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/added_tokens.json from cache at None
08/20/2020 13:37:53 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=351265583.0, style=ProgressStyle(descri…

08/20/2020 13:37:58 - INFO - transformers.file_utils -   storing https://cdn.huggingface.co/microsoft/DialoGPT-small/pytorch_model.bin in cache at cached/9eab12d0b721ee394e9fe577f35d9b8b22de89e1d4f6a89b8a76d6e1a82bceae.906a78bee3add2ff536ac7ef16753bb3afb3a1cf8c26470f335b7c0e46a21483
08/20/2020 13:37:58 - INFO - transformers.file_utils -   creating metadata file for cached/9eab12d0b721ee394e9fe577f35d9b8b22de89e1d4f6a89b8a76d6e1a82bceae.906a78bee3add2ff536ac7ef16753bb3afb3a1cf8c26470f335b7c0e46a21483
08/20/2020 13:37:58 - INFO - filelock -   Lock 140713899190536 released on cached/9eab12d0b721ee394e9fe577f35d9b8b22de89e1d4f6a89b8a76d6e1a82bceae.906a78bee3add2ff536ac7ef16753bb3afb3a1cf8c26470f335b7c0e46a21483.lock
08/20/2020 13:37:58 - INFO - transformers.modeling_utils -   loading weights file https://cdn.huggingface.co/microsoft/DialoGPT-small/pytorch_model.bin from cache at cached/9eab12d0b721ee394e9fe577f35d9b8b22de89e1d4f6a89b8a76d6e1a82bceae.906a78bee3add2ff536ac7ef16753bb3afb3a1cf




08/20/2020 13:38:03 - INFO - transformers.modeling_utils -   All model checkpoint weights were used when initializing GPT2LMHeadModel.

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
08/20/2020 13:38:16 - INFO - __main__ -   Training/evaluation parameters <__main__.Args object at 0x7ffa7c072f28>
08/20/2020 13:38:16 - INFO - __main__ -   Creating features from dataset file at cached
08/20/2020 13:38:45 - INFO - __main__ -   Saving features into cached file cached/gpt2_cached_lm_512
08/20/2020 13:38:45 - INFO - __main__ -   ***** Running training *****
08/20/2020 13:38:45 - INFO - __main__ -     Num examples = 23482
08/20/2020 13:38:45 - INFO - __main__ -     Num Epochs = 3
08/20/2020 13:38:45 - INFO - __main__ -     Instantaneous batch size per GPU = 1
08/20/2020 13:38:45 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 1
08/20/2020 13:38:45 - INFO - __main__ -     Gradient Accumula

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=23482.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=23482.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=23482.0, style=ProgressStyle(description_…

08/20/2020 15:37:17 - INFO - __main__ -    global_step = 70440, average loss = 2.8329429477771075
08/20/2020 15:37:17 - INFO - __main__ -   Saving model checkpoint to output-small
08/20/2020 15:37:17 - INFO - transformers.configuration_utils -   Configuration saved in output-small/config.json






08/20/2020 15:37:18 - INFO - transformers.modeling_utils -   Model weights saved in output-small/pytorch_model.bin
08/20/2020 15:37:19 - INFO - transformers.configuration_utils -   loading configuration file output-small/config.json
08/20/2020 15:37:19 - INFO - transformers.configuration_utils -   Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "vocab_size": 50257
}

08/20/2020 15:37:19 - INFO - transformers.modeling_utils -   loading weights file output-small/pytorch_model.bin
08/20/202

HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=5871.0, style=ProgressStyle(description_…

08/20/2020 15:39:17 - INFO - __main__ -   ***** Eval results  *****
08/20/2020 15:39:17 - INFO - __main__ -     perplexity = tensor(5.6225)





{'perplexity_': tensor(5.6225)}

### Chat with Bot

In [None]:
# Now that we have our model trained, we can have our first conversation with it!

# Number of user/bot exchanges before the loop ends.
MAX_TURNS = 10
# Cap passed to `generate` as `max_length`; it bounds the whole sequence
# (accumulated chat history plus the new reply), measured in tokens.
MAX_HISTORY_TOKENS = 500

# Tokenizer comes from the stock DialoGPT-small checkpoint; the model weights are
# loaded from 'output-small'.
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelForCausalLM.from_pretrained('output-small')

# Let's chat for MAX_TURNS turns
for step in range(MAX_TURNS):
    # encode the new user input, add the eos_token and return a PyTorch tensor
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    # (on the first turn there is no history yet, so the input stands alone)
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generate a response while limiting the total chat history to MAX_HISTORY_TOKENS tokens;
    # the result holds the history followed by the bot's reply
    chat_history_ids = model.generate(
        bot_input_ids, max_length=MAX_HISTORY_TOKENS,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature = 0.8
    )

    # pretty print only the newly generated tokens (everything after the prompt)
    print("Beng Bot: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

08/20/2020 15:40:04 - INFO - filelock -   Lock 140713768626944 acquired on /root/.cache/torch/transformers/c3a09526c725b854c685b72cf60c50f1fea9b0e4d6227fa41573425ef4bd4bc6.4c1d7fc2ac6ddabeaf0c8bec2ffc7dc112f668f5871a06efcff113d2797ec7d5.lock
08/20/2020 15:40:04 - INFO - transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/config.json not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmp7gql89p1


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=554.0, style=ProgressStyle(description_…

08/20/2020 15:40:04 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/config.json in cache at /root/.cache/torch/transformers/c3a09526c725b854c685b72cf60c50f1fea9b0e4d6227fa41573425ef4bd4bc6.4c1d7fc2ac6ddabeaf0c8bec2ffc7dc112f668f5871a06efcff113d2797ec7d5
08/20/2020 15:40:04 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/c3a09526c725b854c685b72cf60c50f1fea9b0e4d6227fa41573425ef4bd4bc6.4c1d7fc2ac6ddabeaf0c8bec2ffc7dc112f668f5871a06efcff113d2797ec7d5
08/20/2020 15:40:04 - INFO - filelock -   Lock 140713768626944 released on /root/.cache/torch/transformers/c3a09526c725b854c685b72cf60c50f1fea9b0e4d6227fa41573425ef4bd4bc6.4c1d7fc2ac6ddabeaf0c8bec2ffc7dc112f668f5871a06efcff113d2797ec7d5.lock
08/20/2020 15:40:04 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/config.json fr




08/20/2020 15:40:04 - INFO - filelock -   Lock 140713768628064 acquired on /root/.cache/torch/transformers/78725a31b87003f46d5bffc3157ebd6993290e4cfb7002b5f0e52bb0f0d9c2dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71.lock
08/20/2020 15:40:04 - INFO - transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/vocab.json not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpjc72swj7


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…

08/20/2020 15:40:05 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/vocab.json in cache at /root/.cache/torch/transformers/78725a31b87003f46d5bffc3157ebd6993290e4cfb7002b5f0e52bb0f0d9c2dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
08/20/2020 15:40:05 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/78725a31b87003f46d5bffc3157ebd6993290e4cfb7002b5f0e52bb0f0d9c2dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
08/20/2020 15:40:05 - INFO - filelock -   Lock 140713768628064 released on /root/.cache/torch/transformers/78725a31b87003f46d5bffc3157ebd6993290e4cfb7002b5f0e52bb0f0d9c2dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71.lock





08/20/2020 15:40:05 - INFO - filelock -   Lock 140713768547440 acquired on /root/.cache/torch/transformers/570e31eddfc57062e4d0c5b078d44f97c0e5ac48f83a2958142849b59df6bbe6.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda.lock
08/20/2020 15:40:05 - INFO - transformers.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/merges.txt not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmp2qar__vt


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…

08/20/2020 15:40:06 - INFO - transformers.file_utils -   storing https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/merges.txt in cache at /root/.cache/torch/transformers/570e31eddfc57062e4d0c5b078d44f97c0e5ac48f83a2958142849b59df6bbe6.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
08/20/2020 15:40:06 - INFO - transformers.file_utils -   creating metadata file for /root/.cache/torch/transformers/570e31eddfc57062e4d0c5b078d44f97c0e5ac48f83a2958142849b59df6bbe6.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
08/20/2020 15:40:06 - INFO - filelock -   Lock 140713768547440 released on /root/.cache/torch/transformers/570e31eddfc57062e4d0c5b078d44f97c0e5ac48f83a2958142849b59df6bbe6.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda.lock





08/20/2020 15:40:07 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/vocab.json from cache at /root/.cache/torch/transformers/78725a31b87003f46d5bffc3157ebd6993290e4cfb7002b5f0e52bb0f0d9c2dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
08/20/2020 15:40:07 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/merges.txt from cache at /root/.cache/torch/transformers/570e31eddfc57062e4d0c5b078d44f97c0e5ac48f83a2958142849b59df6bbe6.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
08/20/2020 15:40:07 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/microsoft/DialoGPT-small/added_tokens.json from cache at None
08/20/2020 15:40:07 - INFO - transformers.tokenization_utils_base -   loading file https://s3.amazonaws.com/m

>> User:eh where u
Beng Bot: where u going today
>> User:ok wanna go bugis to makan
Beng Bot: wat time u coming back
>> User:what u wanna makan
Beng Bot: can u give me ur tel no
>> User:free how can?
Beng Bot: can help me ask uncle he wan fetch u
>> User:ok see you later bro
Beng Bot: hey i will be late oversleep
>> User:q
Beng Bot: hey will u be late over icq
>> User:bye
Beng Bot: hi i will call u when i reach dhoby ghaut
>> User:what is your name
Beng Bot: wat is your position at dhoby Ghaut
>> User:i am hungry
Beng Bot: t been captured yet
>> User:time to sleep now
Beng Bot: where you want to go
