# Training Block

In [1]:
"""
Fine-tuning the library models for language modeling on WikiText-2 (GPT, GPT-2, BERT, RoBERTa).
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
using a masked language modeling (MLM) loss.
"""

from __future__ import absolute_import, division, print_function

import argparse
import glob
import logging
import os
import pickle
import random
import re
import shutil

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
from torch.utils.data.distributed import DistributedSampler

try:
    from torch.utils.tensorboard import SummaryWriter
except:
    from tensorboardX import SummaryWriter
    
from tqdm import tqdm, trange

from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)


logger = logging.getLogger(__name__)


MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)}


class TextDataset(Dataset):
    def __init__(self, tokenizer, file_path='train', block_size=512):
        assert os.path.isfile(file_path)
        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(directory, f'cached_lm_{block_size}_{filename}')

        if os.path.exists(cached_features_file):
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, 'rb') as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()

            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

            for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size
                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
            # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should loook for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, 'wb') as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item])


def load_and_cache_examples(args, tokenizer, evaluate=False):
    dataset = TextDataset(tokenizer, file_path=args['eval_data_file'] if evaluate else args['train_data_file'], block_size=args['block_size'])
    return dataset


def set_seed(args):
    random.seed(args['seed'])
    np.random.seed(args['seed'])
    torch.manual_seed(args['seed'])
    if args['n_gpu'] > 0:
        torch.cuda.manual_seed_all(args['seed'])


def mask_tokens(inputs, tokenizer, args):
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
    masked_indices = torch.bernoulli(torch.full(labels.shape, args['mlm_probability'])).bool()
    labels[~masked_indices] = -1  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels


def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args['local_rank'] in [-1, 0]:
        tb_writer = SummaryWriter()

    args['train_batch_size'] = args['per_gpu_train_batch_size'] * max(1, args['n_gpu'])
    train_sampler = RandomSampler(train_dataset) if args['local_rank'] == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args['train_batch_size'])

    if args['max_steps'] > 0:
        t_total = args['max_steps']
        args['num_train_epochs'] = args['max_steps'] // (len(train_dataloader) // args['gradient_accumulation_steps']) + 1
    else:
        t_total = len(train_dataloader) // args['gradient_accumulation_steps'] * args['num_train_epochs']

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
#     scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args['warmup_steps'], t_total=t_total)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args['warmup_steps'], num_training_steps = t_total)
    if args['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args['fp16_opt_level'])

    # multi-gpu training (should be after apex fp16 initialization)
    if args['n_gpu'] > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args['local_rank'] != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args['local_rank']],
                                                          output_device=args['local_rank'],
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args['num_train_epochs'])
    logger.info("  Instantaneous batch size per GPU = %d", args['per_gpu_train_batch_size'])
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                   args['train_batch_size'] * args['gradient_accumulation_steps'] * (torch.distributed.get_world_size() if args['local_rank'] != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args['num_train_epochs']), desc="Epoch", disable=args['local_rank'] not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args['local_rank'] not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = mask_tokens(batch, tokenizer, args) if args['mlm'] else (batch, batch)
            inputs = inputs.to(args['device'])
            labels = labels.to(args['device'])
            model.train()
            outputs = model(inputs, masked_lm_labels=labels) if args['mlm'] else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args['n_gpu'] > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']

            if args['fp16']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args['gradient_accumulation_steps'] == 0:
                if args['fp16']:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args['max_grad_norm'])
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args['local_rank'] in [-1, 0] and args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                    # Log metrics
                    if args['local_rank'] == -1 and args['evaluate_during_training']:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args['logging_steps'], global_step)
                    logging_loss = tr_loss

                if args['local_rank'] in [-1, 0] and args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args['output_dir'], 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args['max_steps'] > 0 and global_step > args['max_steps']:
                epoch_iterator.close()
                break
        if args['max_steps'] > 0 and global_step > args['max_steps']:
            train_iterator.close()
            break

    if args['local_rank'] in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step


def evaluate(args, model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args['output_dir']

    results = {}
    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if not os.path.exists(eval_output_dir) and args['local_rank'] in [-1, 0]:
        os.makedirs(eval_output_dir)

    args['eval_batch_size'] = args['per_gpu_eval_batch_size'] * max(1, args['n_gpu'])
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args['local_rank'] == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args['eval_batch_size'])

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = batch.to(args['device'])

        with torch.no_grad():
            outputs = model(batch, masked_lm_labels=batch) if args['mlm'] else model(batch, labels=batch)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {
        "perplexity": perplexity
    }

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return results


def main(args):

    if args['model_type'] in ["bert", "roberta"] and not args['mlm']:
        raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
                         "flag (masked language modeling).")
    if args['eval_data_file'] is None and args['do_eval']:
        raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
                         "or remove the --do_eval argument.")

    if os.path.exists(args['output_dir']) and os.listdir(args['output_dir']) and args['do_train'] and not args['overwrite_output_dir']:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args['output_dir']))

    # Setup distant debugging if needed
    if args['server_ip'] and args['server_port']:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args['server_ip'], args['server_port']), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args['local_rank'] == -1 or args['no_cuda']:
        device = torch.device("cuda" if torch.cuda.is_available() and not args['no_cuda'] else "cpu")
        args['n_gpu'] = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args['local_rank'])
        device = torch.device("cuda", args['local_rank'])
        torch.distributed.init_process_group(backend='nccl')
        args['n_gpu'] = 1
    args['device'] = device

    # Setup logging
    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
                        level = logging.INFO if args['local_rank'] in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                    args['local_rank'], device, args['n_gpu'], bool(args['local_rank'] != -1), args['fp16'])

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args['local_rank'] not in [-1, 0]:
        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]
    config = config_class.from_pretrained(args['config_name'] if args['config_name'] else args['model_name_or_path'])

    #for new models
    tokenizer = tokenizer_class.from_pretrained(args['tokenizer_name'] if args['tokenizer_name'] else args['model_name_or_path'], do_lower_case=args['do_lower_case'])

#    might be needed if loading a finetuned model?
#    tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
    
    if args['block_size'] <= 0:
#         args['block_size'] = tokenizer.max_len  # Our input block size will be the max possible for the model
        args['block_size'] = tokenizer.max_len_single_sentence
    args['block_size'] = min(args['block_size'], tokenizer.max_len_single_sentence)

#    for new models
    model = model_class.from_pretrained(args['model_name_or_path'], from_tf=bool('.ckpt' in args['model_name_or_path']), config=config)

#   for finetuned models    
#    model = model_class.from_pretrained(args['output_dir'])
    model.to(args['device'])

    if args['local_rank'] == 0:
        torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args['do_train']:
        if args['local_rank'] not in [-1, 0]:
            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache

        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)

        if args['local_rank'] == 0:
            torch.distributed.barrier()

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)


    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args['do_train'] and (args['local_rank'] == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args['output_dir']) and args['local_rank'] in [-1, 0]:
            os.makedirs(args['output_dir'])

        logger.info("Saving model checkpoint to %s", args['output_dir'])
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args['output_dir'])
        tokenizer.save_pretrained(args['output_dir'])

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args['output_dir'], 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args['output_dir'])
        tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case'])
        model.to(args['device'])


    # Evaluation
    results = {}
    if args['do_eval'] and args['local_rank'] in [-1, 0]:
        checkpoints = [args['output_dir']]
        if args['eval_all_checkpoints']:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args['output_dir'] + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args['device'])
            result = evaluate(args, model, tokenizer, prefix=global_step)
            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    return result

In [4]:
config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]
tokenizer = tokenizer_class.from_pretrained(args['tokenizer_name'] if args['tokenizer_name'] else args['model_name_or_path'], do_lower_case=args['do_lower_case'])

In [2]:
args = {}

## Required parameters
args['train_data_file']='IconicShoesTrain.txt'

#if loading a checkpoint, make 'output_dir' the checkpoint folder itself, not the overall output folder'
args['output_dir']='IconicShoesOutput'
args['eval_data_file']='IconicShoesEval.txt'

args['model_type']="gpt2"
args['model_name_or_path']="gpt2"
#args['model_name_or_path']="HF5000Output/checkpoint7"

args['mlm']=False
args['mlm_probability']=0.15

args['config_name']=""
args['tokenizer_name']="gpt2"
args['cache_dir']=""
args['block_size']=-1
args['do_train']=True
args['do_eval']=True
args['evaluate_during_training']=True
args['do_lower_case']=True

args['per_gpu_train_batch_size']=1
args['per_gpu_eval_batch_size']=1
args['gradient_accumulation_steps']=1
args['learning_rate']=5e-5
args['weight_decay']=0.0
args['adam_epsilon']=1e-8
args['max_grad_norm']=1.0
args['num_train_epochs']=300.0
args['max_steps']=-1
args['warmup_steps']=0

args['logging_steps']=20000
args['save_steps']=5000
args['eval_all_checkpoints']=False
args['no_cuda']=False
args['overwrite_output_dir']=True
args['overwrite_cache']=True
args['seed']=42
args['fp16']=True
args['fp16_opt_level']='O1'
args['local_rank']=-1
args['server_ip']=''
args['server_port']=''

# Train

In [5]:
main(args)

02/05/2020 20:42:03 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at /home/jupyter/.cache/torch/transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.699bbd1c449e9861456f359d6daa51bd523ac085b4b531ab0aad5a55d091e942
02/05/2020 20:42:03 - INFO - transformers.configuration_utils -   Model config {
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "finetuning_task": null,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_epsilon": 1e-05,
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "resid_pdrop": 0.1,
  "summary_activat

02/05/2020 20:42:42 - INFO - transformers.modeling_utils -   loading weights file IconicShoesOutput/checkpoint-2/pytorch_model.bin
02/05/2020 20:42:45 - INFO - __main__ -   Loading features from cached file cached_lm_1024_IconicShoesEval.txt
02/05/2020 20:42:45 - INFO - __main__ -   ***** Running evaluation 2 *****
02/05/2020 20:42:45 - INFO - __main__ -     Num examples = 170
02/05/2020 20:42:45 - INFO - __main__ -     Batch size = 1
Evaluating: 100%|██████████| 170/170 [00:11<00:00, 14.80it/s]
02/05/2020 20:42:57 - INFO - __main__ -   ***** Eval results 2 *****
02/05/2020 20:42:57 - INFO - __main__ -     perplexity = tensor(3.8511)
02/05/2020 20:42:57 - INFO - transformers.configuration_utils -   loading configuration file IconicShoesOutput/checkpoint-25/config.json
02/05/2020 20:42:57 - INFO - transformers.configuration_utils -   Model config {
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "finetuning_task": null,
  "id2label": {
    "0"

02/05/2020 20:43:56 - INFO - transformers.modeling_utils -   loading weights file IconicShoesOutput/checkpoint-45000/pytorch_model.bin
02/05/2020 20:44:00 - INFO - __main__ -   Loading features from cached file cached_lm_1024_IconicShoesEval.txt
02/05/2020 20:44:00 - INFO - __main__ -   ***** Running evaluation 45000 *****
02/05/2020 20:44:00 - INFO - __main__ -     Num examples = 170
02/05/2020 20:44:00 - INFO - __main__ -     Batch size = 1
Evaluating: 100%|██████████| 170/170 [00:11<00:00, 14.82it/s]
02/05/2020 20:44:11 - INFO - __main__ -   ***** Eval results 45000 *****
02/05/2020 20:44:11 - INFO - __main__ -     perplexity = tensor(5.4846)
02/05/2020 20:44:11 - INFO - transformers.configuration_utils -   loading configuration file IconicShoesOutput/checkpoint-5000/config.json
02/05/2020 20:44:11 - INFO - transformers.configuration_utils -   Model config {
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "finetuning_task": null,
  "id2lab

02/05/2020 20:45:10 - INFO - transformers.modeling_utils -   loading weights file IconicShoesOutput/checkpoint-65000/pytorch_model.bin
02/05/2020 20:45:14 - INFO - __main__ -   Loading features from cached file cached_lm_1024_IconicShoesEval.txt
02/05/2020 20:45:14 - INFO - __main__ -   ***** Running evaluation 65000 *****
02/05/2020 20:45:14 - INFO - __main__ -     Num examples = 170
02/05/2020 20:45:14 - INFO - __main__ -     Batch size = 1
Evaluating: 100%|██████████| 170/170 [00:11<00:00, 14.80it/s]
02/05/2020 20:45:25 - INFO - __main__ -   ***** Eval results 65000 *****
02/05/2020 20:45:25 - INFO - __main__ -     perplexity = tensor(6.4034)
02/05/2020 20:45:25 - INFO - transformers.configuration_utils -   loading configuration file IconicShoesOutput/checkpoint-70000/config.json
02/05/2020 20:45:25 - INFO - transformers.configuration_utils -   Model config {
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "finetuning_task": null,
  "id2la

02/05/2020 20:46:24 - INFO - transformers.modeling_utils -   loading weights file IconicShoesOutput/checkpoint-90000/pytorch_model.bin
02/05/2020 20:46:27 - INFO - __main__ -   Loading features from cached file cached_lm_1024_IconicShoesEval.txt
02/05/2020 20:46:27 - INFO - __main__ -   ***** Running evaluation 90000 *****
02/05/2020 20:46:27 - INFO - __main__ -     Num examples = 170
02/05/2020 20:46:27 - INFO - __main__ -     Batch size = 1
Evaluating: 100%|██████████| 170/170 [00:11<00:00, 14.82it/s]
02/05/2020 20:46:39 - INFO - __main__ -   ***** Eval results 90000 *****
02/05/2020 20:46:39 - INFO - __main__ -     perplexity = tensor(7.2354)
02/05/2020 20:46:39 - INFO - transformers.configuration_utils -   loading configuration file IconicShoesOutput/checkpoint-95000/config.json
02/05/2020 20:46:39 - INFO - transformers.configuration_utils -   Model config {
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "finetuning_task": null,
  "id2la

{}

In [3]:
args['do_train']=False
args['eval_all_checkpoints']=True

# Generation

In [6]:
""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/Transformer-XL/XLNet)
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import logging
from tqdm import trange

import torch
import torch.nn.functional as F
import numpy as np

from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig

from transformers import GPT2LMHeadModel, GPT2Tokenizer

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop

ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config,)), ())

MODEL_CLASSES = {
    'gpt2': (GPT2LMHeadModel, GPT2Tokenizer)}

def set_seed(args):
    np.random.seed(args['seed'])
    torch.manual_seed(args['seed'])
    if args['n_gpu'] > 0:
        torch.cuda.manual_seed_all(args['seed'])


def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (batch size x vocabulary size)
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # scatter sorted tensors to original indexing
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits


def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.9, is_xlnet=False, repetition_penalty=1.0, device='cpu'):
    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0).repeat(num_samples, 1)
    generated = context
    with torch.no_grad():
        for _ in trange(length):

            inputs = {'input_ids': generated}
            
            outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states)
            next_token_logits = outputs[0][0, -1, :] / (temperature if temperature > 0 else 1.)

                
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            if temperature == 0: #greedy sampling:
                # Return indices of the top num_samples logits (i.e. equivalent to argmax if num_samples = 1)
                next_token = torch.topk(filtered_logits, num_samples)[1]
            else:
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=num_samples, replacement=True)
            generated = torch.cat((generated, next_token.unsqueeze(1)), dim=1)
    return generated
                     

def generate(args):   

    args['device'] = torch.device("cuda" if torch.cuda.is_available() and not args['no_cuda'] else "cpu")
    args['n_gpu'] = torch.cuda.device_count()

    set_seed(args)

    args['model_type'] = args['model_type'].lower()
    model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]
    tokenizer = tokenizer_class.from_pretrained('gpt2')
    model = model_class.from_pretrained(args['model_name_or_path'])
    model.to(args['device'])
    model.eval()

    if args['length'] < 0 and model.config.max_position_embeddings > 0:
        args['length'] = model.config.max_position_embeddings
    elif 0 < model.config.max_position_embeddings < args['length']:
        args['length'] = model.config.max_position_embeddings  # No generation bigger than model size 
    elif args['length'] < 0:
        args['length'] = MAX_LENGTH  # avoid infinite loop

#    print(args)
    

    while True:
        raw_text = args['prompt'] if args['prompt'] else input("Model prompt >>> ")

        if args['model_type'] in ["transfo-xl", "xlnet"]:
            # Models with memory likes to have a long prompt for short inputs.
            raw_text = (args['padding_text'] if args['padding_text'] else PADDING_TEXT) + raw_text
        context_tokens = tokenizer.encode(raw_text, add_special_tokens=False)
        outputs = sample_sequence(
            model=model,
            context=context_tokens,
            length=args['length'],
            temperature=args['temperature'],
            top_k=args['top_k'],
            top_p=args['top_p'],
            device=args['device'],
        )
        outputs = outputs[:, len(context_tokens):].tolist()
        text_candidates = []
        for out in outputs:
            text = tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=False)
            text = text[: text.find('<eos>')]
            text_candidates.append(text)
            print(text)
        if args['prompt']:
            break
    return text_candidates

# Elwood Generation Test

In [None]:
import pandas as pd

prompts = pd.read_csv('Elwood_Tester.txt', sep="<eos>", header=None)
prompts = prompts[0]
prompts.head()

In [None]:
#Descriptions pulled from other websites, with categories added

prompts[0]= "<bos> <category> Womens Clothing, Skirts <features> Structured crop top that features a textured woven Aztec fabric. \
The crop neckline sits high on the collar and fit slightly loose through the bust and body. \
The back of the crop features an open cross back design that is created through two panels weaved through each \
other then fall into a geometric peak at the hem. Crop is fully lined and is intended to pull-over the head for wear. \
The Summer Eyes Crop is designed to be worn with the Summer Swag Skirt. <title> Asilio Summer Swag Skirt \t <long> "

prompts[1]= "<bos> <category> Womens Clothing, jackets <features> Leopard is the new black, and velvet is the new velour. This jacket is a lovable feline \
accessory with quilted paneling and a cozy high neckline. Like any Unreal Fur garment, the Huff & Puff Jacket can be dressed \
up or down for any occasion where cold weather is a guest. The Huff & Puff Jacket features a concealed zipper, front pockets, \
and a high padded neckline. Our model is wearing an AU size Small. Available in extended sizes. \
Length: 62.5 CM Leopard print velvet outer shell Matte polyester lining Concealed zipper with button closures Long sleeves \
Elasticized cuffs <title> Unreal Fur Womens Huff & Puff Jacket Leopard \t <long> "

prompts[2]= "<bos> <category> Womens Clothing, Pants <features> Master contemporary-casual style in the Panther Pleat Pants, straight through the leg, with elastic waist \
and drawstring tie, this pieces answers weekend cool needs. Cut in an exclusive print with organic pleat effect that elevates this classic \
cut into a standout style. Length: 85cm 85% Rayon 15% Nylon. Dry textured finish Straight leg, cropped length Elastic waist with drawstring tie \
In house, Sydney HQ designed print, exclusive to THIRD FORM Roll hem Pleated fabric Our model is 175cm tall, wearing a size S/AU 8 and has a 65cm waist. \
<title> Third Form Pantha Pleat Pant Black Panther \t <long> "

prompts[3] = "<bos> <category> Womens Clothing and Accessories, Shoes <features> White sneaker. Leather upper and lining. Non-leather sole. Approx 5cm sole. \
Heel is approx 6cm. Size 8 is an EU39. This shoe runs true to size. From runway to real life, the sneaker trend is a thing and \
our ALIAS MAE Arlo Sneakers are no exception. Features a platform base and lace up across the vamp. <title> Alias Mae Arlo White Sneakers \t <long> "

prompts[4]= "<bos> <category> Womens Accessories, Jewelry, Earrings <features> Lucy Chain Hoops. The Lucy Chain hoops are a medium sized hoop in soft brushed finish. \
Perfect for everyday wear or layered up with other styled hoops.  <title> Jolie & Deen Womens Lucy Chain Hoops Earrings - Silver \t <long> "

prompts[5]= "<bos> <category> Baby & Kids, Kids Clothes, Girls Clothes, Shorts <features> Mini Rollers are a slim, low waist, relaxed fit denim short. Slim fit across hip. Raw dramatic curve through hem. Made from a rigid denim and features our signature button closure. Storm Buoy is a faded blue denim wash. Fabric: 100% cotton. \
Machine wash cold. Tumble dry to retain a soft hand feel & to maintain original fit. \
See the 'Lower Waist' measurement on our kids size chart to determine the best size. <title> Oneteaspoon Kids Rollers Denim Shorts Storm Buoy \t <long> "

prompts[6]= "<bos> <category> Mens Clothing and Accessories, Tops & T-shirts <features> Our Bass Pima Tee is a regular fit in 180g washed slub cotton jersey. We've created a relaxed fitted silhouette that's ideal for layering. Added with grinning stitch detailing through the back centre seam for a more unique feel. \
<title> Neuw Mens Bass Tee Military \t <long>"

prompts[7]= "<bos> <category> Mens Clothing and Accessories, Bottoms, Jeans <features> New from Nena And Pasadena this season, the Hellcat Elastic Ankle Jean is packed full of style and comfort. With classic moto styling, ribbed paneling and heavy distressing down front legs and tight elastic cuffs at both ankles, these pants are a must-have. Available now at Culture Kings while stocks last.\
- Moto-inspired long pants - Ribbed paneling and heavy distressing down front legs - Tight elastic cuffs at both ankles \
- Belt loops with button closure zip fly - 5-pocket design - 98% cotton, 2% elastane - True to size \
- Model is wearing size Medium/32 - Model usually wears - Tops: size Large, Pants: size Medium, Shoes: size 10 <title> Nena Pasadena Mens Hellcat Elastic Ankle Jeans Utah \t <long> "

prompts[8]= "<bos> <category> Mens Clothing and Accessories, Outerwear, Casual Jackets <features> The style is based on classic sherpa styles worn in Australia through the 70's,80's & 90's. \
The buttons used are shank not snap for extra strength. - Our Model is 189cm tall and wears a size: M \
- Composition: 100% Cotton, 100% Polyester Lining - Colour: Tan Cord - Long sleeves - Side pockets \
- Button through front <title> Rollas Mens Old Mate Sherpa Coat Tan Cord \t <long> "

prompts[9]= "<bos> <category> Mens Clothing and Accessories, athletic gear, bottoms <features> The Core Trackpant is a regular fit trackpant, featuring ribbed cuffs, \
side seam pockets and an Elwood branded print on thigh. These go-to trackies will become your season long favourite, \
made from 100% Cotton Fleece and finished with a garment soft-wash for a cosy feel, these tracksuit pants are sure to become a favourite <title> Elwood Mens Core Trackpant Dark Navy Mens \t  <long> "

In [None]:
# Now with the elwood descriptions instead of scraping

prompts[0]= "<bos> <category> Womens Clothing and Accessories, skirts <features> Standout highlight Aztec woven skirt by Asilio. \
The SUMMER SWAG SKIRT is a structured midi length skirt with cross front split styling and is made from a luxurious woven Aztec Fabric. \
Looks stunning when worn with a Black or White blouse and heels and is perfect for Spring Racing Festival or any highlight summery event. \
<title> Asilio Summer Swag Skirt \t <long> "

prompts[1]= "<bos> <category> Womens Clothing and Accessories, Jackets <features> Leopard is the new black, and velvet is the new velour. This jacket is a lovable feline accessory with quilted panelling, and a cozy high neckline. Like any Unreal Fur garment, the Huff & Puff Jacket can be dressed up or down for any occasion where cold weather is a guest. \
The Huff & Puff Jacket features a concealed zipper, front pockets, and a high padded neckline. \
Our model is wearing an AU size Small. Length: 62.5 CM Leopard print velvet outer shell Matte polyester lining \
Concealed zipper with button closures Long sleeves Elasticised cuffs <title> Unreal Fur Womens Huff & Puff Jacket Leopard \t <long> "

prompts[2]= "<bos> <category> Womens Clothing and Accessories, Pants <features> Australian designer pants in an exclusive Black White Panther Print from Third Form. \
Master contemporary-casual style in the Panther Pleat Pants, straight through the leg, with elastic waist and drawstring tie, this pieces answers weekend cool needs. \
Cut in an exclusive print with organic pleat effect that elevates this classic cut into a standout style. Length: 85cm \
85% Rayon 15% Nylon. Dry textured finish Straight leg, cropped length Elastic waist with drawstring tie In house, Sydney HQ designed print, exclusive to THIRD FORM \
Roll hem Pleated fabric Our model is 175cm tall, wearing a size S/AU 8 and has a 65cm waist. \
<title> Third Form Pantha Pleat Pant Black Panther \t <long> "

prompts[3] = "<bos> <category> Womens Clothing and Accessories, Shoes, Sneakers <features> White sneaker. Leather upper and lining. Non-leather sole. Approx 5cm sole. \
Heel is approx 6cm. Size 8 is an EU39. This shoe runs true to size. From runway to real life, the sneaker trend is a thing and \
our ALIAS MAE Arlo Sneakers are no exception. Features a platform base and lace up across the vamp. <title> Alias Mae Arlo White Sneakers \t <long> "

prompts[4]= "<bos> <category> Womens Accessories, Jewelry <features> Lucy Chain Hoops. The Lucy Chain hoops are a medium sized hoop in soft brushed finish. \
Perfect for everyday wear or layered up with other styled hoops.  <title> Jolie & Deen Womens Lucy Chain Hoops Earrings - Silver \t <long> "

prompts[5]= "<bos> <category> Baby & Kids, Kids Clothes, Girls Clothes, Shorts <features> HAND MADE DENIM STREETWEAR FOR KIDS by OneTeaspoon. \
Your Kids will be the coolest kids on the block in our all new OneTeaspoon Kids Collection. \
Mini Rollers are a slim, low waist, relaxed fit denim short. Slim fit across hip. Raw dramatic curve through hem. Made from a rigid denim and features our signature button closure. Storm Buoy is a faded blue denim wash. Fabric: 100% cotton. \
Hand Made and funky as you'd expect from OneTeaspoon. <title> Oneteaspoon Kids Rollers Denim Shorts Storm Buoy \t <long> "

prompts[6]= "<bos> <category> Mens Clothing and Accessories, Tops & T-shirts <features> Premium Pima Cotton Basic tee by Neuw ...a wardrobe essential item. \
The Bass Tee is a Neuw Denim classic and is cut to a regular fitting silhouette, providing an easy-wearing shape. \
It's crafted from quality Pima Cotton for a unique, soft and long lasting hand feel. And is finished with a classic crew neckline, \
fitted cap sleeves and double stitched side seams for durability. It's the perfect long-lasting piece for your Neuw wardrobe. \
Style it with your favourite denim wash and a leather or denim jacket for a timeless look. \
<title> Neuw Mens Bass Tee Military \t <long>"

prompts[7]= "<bos> <category> Mens Clothing and Accessories, Bottoms, Jeans <features> \
Introducing the new Top of the Range HELLCAT Pant by Nena Pasadena in KENTUCKY BLUE Denim, destined to be a best selling fit and style. \
The HELLCAT features a whole heap of distinctive style highlights incl : DROP CROTCH GUSSET \
SLIM ELASTIC CUFF ANKLES PIN TUCKED RIB DETAIL DISTRESSED KNEE OVERLAY 5 POCKET STYLING \
The Kentucky Blue colourway featured here, is the perfect washed mid blue denim that is always in demand, year in, year out, and looks great with any combination of tees and sneakers. \
<title> Nena Pasadena Mens Hellcat Elastic Ankle Jeans Utah \t <long> "

prompts[8]= "<bos> <category> Mens Clothing and Accessories, Outerwear, Casual Jackets <features> \
Mens Rollas Cordury and sherpa lined jacket.....ultra warm and a nod to the vintage styles of yesteryear. \
The style is based on classic sherpa styles worn in Australia through the 70's,80's & 90's. \
The buttons used are shank not snap for extra strength. Our Model is 189cm tall and wears a size: M \
- Composition: 100% Cotton, 100% Polyester Lining - Colour: Tan Cord - Long sleeves - Side pockets \
- Button through front <title> Rollas Mens Old Mate Sherpa Coat Tan Cord \t <long> "

prompts[9]= "<bos> <category> Mens Clothing and Accessories, athletic gear, bottoms <features> \
Just in by Elwood.. The Core trackpant. This is a regular fit with ribbed cuff, side seam pockets and soft plastisol thigh print. \
Made from 100% Cotton fleece and finished with a garment soft wash. <title> Elwood Mens Core Trackpant Dark Navy Mens \t  <long> "


In [None]:
prompts= prompts[:10]
prompts[1]

In [None]:
testerlist = []
for i in range(len(prompts)):
    inputs= prompts[i]
    testerlist.append([inputs])

In [None]:
args = {}

args["model_type"]='gpt2'
args['model_name_or_path']= 'OldIconicOutput/checkpoint3'
args['prompt']=""
args['padding_text']= ''#WHAT'S MY PADDING TEXT?
args['length']=200
args['temperature']=1.0
args['top_k']=0
args['top_p']=0.9
args['no_cuda']=False
args['seed']=1

In [None]:
#Elwood test block

with open("Elwood_Tester.txt", "w") as text_file:
    for i in range(len(testerlist)):
        args['prompt']=testerlist[i][0]
        prompt = args['prompt']
        line = f"Prompt:\n{prompt}\n\n"
        print(line, file=text_file)
        for o in range(41, 45):
            args['seed']=o
            generated = generate(args)
            line = f"\nGenerated description {o}:\n{generated}\n\n"
            print(line, file=text_file)

# Testing out TF-IDF scoring

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ['This is the first document.','This document is the second document.','And this is the third one.',
'Is this the first document?']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

print(X.shape)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

with open("Elwood_Multi_TFIDF_Test.txt", "w") as text_file:
    for i in range(len(testerlist)):
        descriptions = []
        args['prompt']=testerlist[i][0]
        prompt = args['prompt']
        descriptions.append(prompt)
        line = f"Prompt:\n{prompt}\n\n"
        print(line, file=text_file)
        for o in range(20):
            args['seed']=o
            generated = generate(args)
            descriptions.append(generated[0])
            print(o)
        vectorizer = TfidfVectorizer()
        tfidf = vectorizer.fit_transform(descriptions)                                                                                                                                                                                                                    
        pairwise_similarity = tfidf * tfidf.T 
        arr = pairwise_similarity.toarray()
        np.fill_diagonal(arr, np.nan)
    
        input_idx = descriptions.index(prompt)
        result_idx = np.nanargmax(arr[input_idx])
    
        generated = descriptions[result_idx]
        line = f"\nGenerated description 1:\n{generated}\n\n"
        print(line, file=text_file)
        arr[input_idx,result_idx] = np.nan
        
        result_idx = np.nanargmax(arr[input_idx])
        generated = descriptions[result_idx]
        line = f"\nGenerated description 2:\n{generated}\n\n"
        print(line, file=text_file)
        arr[input_idx,result_idx] = np.nan
        
        result_idx = np.nanargmax(arr[input_idx])
        generated = descriptions[result_idx]
        line = f"\nGenerated description 3:\n{generated}\n\n"
        print(line, file=text_file)
        arr[input_idx,result_idx] = np.nan
        
        result_idx = np.nanargmax(arr[input_idx])
        generated = descriptions[result_idx]
        line = f"\nGenerated description 4:\n{generated}\n\n"
        print(line, file=text_file)
        arr[input_idx,result_idx] = np.nan

# Normal Generation

In [7]:
import pandas as pd

prompts = pd.read_csv('IconicShoesTester.txt', sep="<eos>", header=None)
prompts = prompts[0]
prompts.head()

  This is separate from the ipykernel package so we can avoid doing imports until


0    <bos> <category> Women/Shoes/Sandals <features...
1    <bos> <category> Women/Shoes/Sandals <features...
2    <bos> <category> Women/Shoes/Boots <features> ...
3    <bos> <category> Women/Shoes/Sandals <features...
4    <bos> <category> Women/Shoes/Flats <features> ...
Name: 0, dtype: object

In [8]:
testerlist = []
for i in range(len(prompts)):
    inputs, description = prompts[i].split('<description>')
    testerlist.append([inputs + ' <description> ', description])

In [12]:
testerlist[0][0]

['<bos> <category> Women/Shoes/Sandals <features>  Suede upper leather lined- Black hue- Open almond toe- Fixed vamp strap- Crossed midfoot straps- Pocket heel- Braided espadrille midsole- Crepe rubber sole- Slip-on design- 2.8cm heel <brand> OFFICE <model> Hallie \t <description> ',
 ' The Hallie sandals from OFFICE exude a sanguine nonchalance with their crossed suede strapping and woven espadrille midsole ']

In [None]:
args = {}

args["model_type"]='gpt2'
args['model_name_or_path']= 'IconicShoesOutput/checkpoint-1'
args['prompt']=""
args['padding_text']= ''#WHAT'S MY PADDING TEXT?
args['length']=200
args['temperature']=1.0
args['top_k']=0
args['top_p']=0.9
args['no_cuda']=False
args['seed']=1

In [11]:
#check multiple checkpoints
checkpoints = ['IconicShoesOutput/checkpoint-1', 'IconicShoesOutput/checkpoint-15', 'IconicShoesOutput/checkpoint-2', 'IconicShoesOutput/checkpoint-25', 'IconicShoesOutput/checkpoint-30000', 'IconicShoesOutput/checkpoint-35000', 'IconicShoesOutput/checkpoint-40000', 'IconicShoesOutput/checkpoint-45000', 'IconicShoesOutput/checkpoint-5000', 'IconicShoesOutput/checkpoint-50000', 'IconicShoesOutput/checkpoint-55000', 'IconicShoesOutput/checkpoint-60000', 'IconicShoesOutput/checkpoint-65000', 'IconicShoesOutput/checkpoint-70000', 'IconicShoesOutput/checkpoint-75000', 'IconicShoesOutput/checkpoint-80000', 'IconicShoesOutput/checkpoint-85000', 'IconicShoesOutput/checkpoint-90000', 'IconicShoesOutput/checkpoint-95000']

In [14]:
with open("IconicCheckpointTest.txt", "w") as text_file:
    for point in checkpoints:
        args['prompt']= testerlist[0]
        args['model_name_or_path']= point
        line = f'\n{point}\n<prompt> \n{testerlist[0][0]}'
        for i in range(40,44):
            args['seed']=i
            generated = generate(args)
            line += f'\n\n<generated>{i}: \n{generated[0]}\n\n'
        print(line, file=text_file)

02/05/2020 21:09:27 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json from cache at /home/jupyter/.cache/torch/transformers/f2808208f9bec2320371a9f5f891c184ae0b674ef866b79c58177067d15732dd.1512018be4ba4e8726e41b9145129dc30651ea4fec86aa61f4b9f40bf94eac71
02/05/2020 21:09:27 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt from cache at /home/jupyter/.cache/torch/transformers/d629f792e430b3c76a1291bb2766b0a047e36fae0588f9dbc1ae51decdff691b.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda
02/05/2020 21:09:27 - INFO - transformers.configuration_utils -   loading configuration file IconicShoesOutput/checkpoint-1/config.json
02/05/2020 21:09:27 - INFO - transformers.configuration_utils -   Model config {
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "finetuning_task": null,
  "id2

KeyError: 'length'

In [None]:
for tester in testerlist:
    args['prompt']= tester[0]
#         line = f'<prompt> \n{args['prompt']} \n\n<actual description> \n{tester[1]}'
    for i in range(40,44):
        args['seed']=i
        generated = generate(args)
        print(generated[0])
#             line += f'\n\n<generated>{i}: \n{generated}\n\n'
#         print(line, file=text_file)

In [None]:
with open("IconicTestRun.txt", "w") as text_file:
    for tester in testerlist:
        args['prompt']= tester[0]
        line = f'<prompt> \n{tester[0]} \n\n<actual description> \n{tester[1]}'
        for i in range(40,44):
            args['seed']=i
            generated = generate(args)
            line += f'\n\n<generated>{i}: \n{generated[0]}\n\n'
        print(line, file=text_file)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

with open("Iconic_TFIDF_Test.txt", "w") as text_file:
    for i in range(len(testerlist)):
        descriptions = []
        args['prompt']=testerlist[i][0]
        prompt = args['prompt']
        descriptions.append(prompt)
        line = f"Prompt:\n{prompt}\n\n"
        print(line, file=text_file)
        for o in range(15):
            args['seed']=o
            generated = generate(args)
            descriptions.append(generated[0])
            print(o)
        vectorizer = TfidfVectorizer()
        tfidf = vectorizer.fit_transform(descriptions)                                                                                                                                                                                                                    
        pairwise_similarity = tfidf * tfidf.T 
        arr = pairwise_similarity.toarray()
        np.fill_diagonal(arr, np.nan)
    
        input_idx = descriptions.index(prompt)
        result_idx = np.nanargmax(arr[input_idx])
        generated = descriptions[result_idx]
        line = f"\nGenerated description {o}:\n{generated}\n\n"
        print(line, file=text_file)