In [37]:
import transformers 
import os
import json
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from typing import Dict, List, Tuple
from torch.nn.utils.rnn import pad_sequence
from tqdm import trange
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)

torch.cuda.set_device(0)



In [38]:
class Args():
    def __init__(self):
        self.output_dir = '../HULK/Counterspeech/models/gpt2-on-debate'
#         self.model_type = 'gpt2-medium'
#         self.model_name_or_path = 'dialog-gpt-createdebate'
#         self.config_name = 'dialog-gpt-createdebate'
#         self.tokenizer_name = 'dialog-gpt-createdebate'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'gpt2'
        self.config_name = 'gpt2'
        self.tokenizer_name = 'gpt2'
        self.cache_dir = 'cached'
        self.block_size = 512
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-6
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 10
        self.max_steps = -1
        self.warmup_steps = 0
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None
        self.eval_all_checkpoints = False
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 56
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1'

args = Args()

In [39]:
import json
# Opening JSON file
with open('../HULK/Counterspeech//selected_arguments.json') as json_file:
    data = json.load(json_file)

In [40]:
total_data_sentences=[]
for key in data:
    total_data_sentences+=data[key]['selected_arguments']

In [41]:
X_train, X_test_dev = train_test_split(total_data_sentences, test_size=0.2, random_state=42, shuffle=True)
X_test, X_dev = train_test_split(X_test_dev, test_size=0.5, random_state=42, shuffle=True)

In [42]:
!pip install torchsummary



In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # PyTorch v0.4.0
model = Net().to(device)

summary(model, (1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 10, 24, 24]             260
            Conv2d-2             [-1, 20, 8, 8]           5,020
         Dropout2d-3             [-1, 20, 8, 8]               0
            Linear-4                   [-1, 50]          16,050
            Linear-5                   [-1, 10]             510
Total params: 21,840
Trainable params: 21,840
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.06
Params size (MB): 0.08
Estimated Total Size (MB): 0.15
----------------------------------------------------------------


In [43]:
path='../HULK/Saved_models/'

In [44]:
from transformers import AutoModelWithLMHead, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("gpt2",cache_dir=path)
# model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-small",cache_dir=path)

model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    cache_dir=path
)
    




    #model.to(args.device)

In [13]:
model.cuda()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [45]:
freeze_layer_count=6

for param in model.transformer.wpe.parameters():
        param.requires_grad = False
for param in model.transformer.wte.parameters():
        param.requires_grad = False

        
if freeze_layer_count != -1:
    # if freeze_layer_count == -1, we only freeze the embedding layer
    # otherwise we freeze the first `freeze_layer_count` encoder layers
    for layer in model.transformer.h[:freeze_layer_count]:
        for param in layer.parameters():
            param.requires_grad = False

In [46]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [47]:
def construct_conv(dict_reply_pair, tokenizer, eos = True, block_size=256):
    flatten = lambda l: [item for sublist in l for item in sublist]
    conv = list([tokenizer.encode(dict_reply_pair['initiator_message'],truncation=True,max_length=int((block_size/2)-1))+ 
                 [tokenizer.eos_token_id] + 
                tokenizer.encode(dict_reply_pair['reply_message'],truncation=True,max_length=int((block_size/2)-1))+
                [tokenizer.eos_token_id]])
    conv = flatten(conv)
    return conv

In [48]:
def fix_the_random(seed_val = 42):
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

In [49]:
class ConversationDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, text_list, block_size=512):

        self.examples = []
        for element in text_list:
            conv = construct_conv(element, tokenizer)
            self.examples.append(conv)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

In [50]:
def load_and_cache_examples(args, tokenizer, train_text, val_text, evaluate=False):
    return ConversationDataset(tokenizer, args, val_text if evaluate else train_text)

In [51]:
def train(args, train_dataset, eval_dataset,model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))
    # add_special_tokens_(model, tokenizer)

    # Prepare optimizer and schedule (linear warmup and decay)
    
    #The optimizer allows us to apply different hyperpameters for specific parameter groups. 
    #For example, we can apply weight decay to all parameters other than bias and layer normalization terms:
    
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    
    tr_loss, logging_loss = 0.0, 0.0
    
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    eval_best = 100000
    eval_ = []
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
                
        eval_.append(evaluate(args, model, tokenizer, train_dataset, eval_dataset)[1])
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
        if eval_[-1]<eval_best:
            os.makedirs(args.output_dir, exist_ok=True)

            # Save a trained model, configuration and tokenizer using `save_pretrained()`.
            # They can then be reloaded using `from_pretrained()`
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Take care of distributed/parallel training
            model_to_save.save_pretrained(args.output_dir)
            tokenizer.save_pretrained(args.output_dir)

            # Good practice: save your training arguments together with the trained model
            torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
            eval_best = eval_[-1]

    return global_step, tr_loss / global_step, eval_






# Evaluation of some model
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True
    )

    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    return result, perplexity

In [52]:
def main(df_trn, df_val, test_text=None, model = None):
    args = Args()
    
    # Setup CUDA, GPU & distributed training
    #device = torch.device("cuda")
    device = torch.device("cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    
    config = AutoConfig.from_pretrained(args.config_name,cache_dir=path)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name,cache_dir=path)
    if model ==None:
        model = AutoModelForCausalLM.from_pretrained(
            args.model_name_or_path,
            from_tf=False,
            config=config,
            cache_dir=path
        )
        model.to(args.device)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)
        global_step, tr_loss, eval_ = train(args, train_dataset, df_val, model, tokenizer)
        
    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)
    test_eval=0
    if test_text!=None:
        test_eval = evaluate(args, model, tokenizer, df_trn, test_text)[1]
    return eval_, test_eval, model

In [None]:
eval_, test_eval, model = main(X_train, X_dev,X_test)





Epoch:   0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A




Iteration:   0%|          | 0/1928 [00:00<?, ?it/s][A[A[A[A[A




Iteration:   0%|          | 1/1928 [00:13<7:08:22, 13.34s/it][A[A[A[A[A




Iteration:   0%|          | 2/1928 [00:23<6:33:58, 12.27s/it][A[A[A[A[A




Iteration:   0%|          | 3/1928 [00:42<7:41:26, 14.38s/it][A[A[A[A[A




Iteration:   0%|          | 4/1928 [00:58<7:56:23, 14.86s/it][A[A[A[A[A




Iteration:   0%|          | 5/1928 [01:12<7:52:16, 14.74s/it][A[A[A[A[A




Iteration:   0%|          | 6/1928 [01:28<8:01:26, 15.03s/it][A[A[A[A[A




Iteration:   0%|          | 7/1928 [01:41<7:38:57, 14.34s/it][A[A[A[A[A




Iteration:   0%|          | 8/1928 [01:53<7:16:47, 13.65s/it][A[A[A[A[A




Iteration:   0%|          | 9/1928 [02:10<7:51:48, 14.75s/it][A[A[A[A[A




Iteration:   1%|          | 10/1928 [02:19<6:57:11, 13.05s/it][A[A[A[A[A




Iteration:   1%|          | 11/1928 [02:36<7:29:33, 

Iteration:  10%|█         | 197/1928 [44:46<6:10:53, 12.86s/it][A[A[A[A[A




Iteration:  10%|█         | 198/1928 [44:53<5:18:29, 11.05s/it][A[A[A[A[A




Iteration:  10%|█         | 199/1928 [45:05<5:21:52, 11.17s/it][A[A[A[A[A




Iteration:  10%|█         | 200/1928 [45:25<6:38:34, 13.84s/it][A[A[A[A[A




Iteration:  10%|█         | 201/1928 [45:33<5:48:12, 12.10s/it][A[A[A[A[A




Iteration:  10%|█         | 202/1928 [45:43<5:34:31, 11.63s/it][A[A[A[A[A




Iteration:  11%|█         | 203/1928 [45:53<5:19:41, 11.12s/it][A[A[A[A[A




Iteration:  11%|█         | 204/1928 [46:07<5:41:11, 11.87s/it][A[A[A[A[A




Iteration:  11%|█         | 205/1928 [46:26<6:45:36, 14.12s/it][A[A[A[A[A




Iteration:  11%|█         | 206/1928 [46:42<6:57:06, 14.53s/it][A[A[A[A[A




Iteration:  11%|█         | 207/1928 [46:53<6:29:06, 13.57s/it][A[A[A[A[A




Iteration:  11%|█         | 208/1928 [47:05<6:14:08, 13.05s/it][A[A[A[A[A




Iter

Iteration:  20%|██        | 391/1928 [1:25:11<4:29:57, 10.54s/it][A[A[A[A[A




Iteration:  20%|██        | 392/1928 [1:25:22<4:36:49, 10.81s/it][A[A[A[A[A




Iteration:  20%|██        | 393/1928 [1:25:42<5:49:06, 13.65s/it][A[A[A[A[A




Iteration:  20%|██        | 394/1928 [1:25:54<5:29:59, 12.91s/it][A[A[A[A[A




Iteration:  20%|██        | 395/1928 [1:26:04<5:13:54, 12.29s/it][A[A[A[A[A




Iteration:  21%|██        | 396/1928 [1:26:16<5:05:32, 11.97s/it][A[A[A[A[A




Iteration:  21%|██        | 397/1928 [1:26:35<6:04:30, 14.29s/it][A[A[A[A[A




Iteration:  21%|██        | 398/1928 [1:26:56<6:49:01, 16.04s/it][A[A[A[A[A




Iteration:  21%|██        | 399/1928 [1:27:06<6:06:53, 14.40s/it][A[A[A[A[A




Iteration:  21%|██        | 400/1928 [1:27:22<6:19:23, 14.90s/it][A[A[A[A[A




Iteration:  21%|██        | 401/1928 [1:27:32<5:39:34, 13.34s/it][A[A[A[A[A




Iteration:  21%|██        | 402/1928 [1:27:49<6:05:35, 14.37s/it]

Iteration:  30%|███       | 583/1928 [2:01:41<3:29:55,  9.36s/it][A[A[A[A[A




Iteration:  30%|███       | 584/1928 [2:01:50<3:32:32,  9.49s/it][A[A[A[A[A




Iteration:  30%|███       | 585/1928 [2:02:03<3:53:17, 10.42s/it][A[A[A[A[A




Iteration:  30%|███       | 586/1928 [2:02:12<3:44:10, 10.02s/it][A[A[A[A[A




Iteration:  30%|███       | 587/1928 [2:02:22<3:45:40, 10.10s/it][A[A[A[A[A




Iteration:  30%|███       | 588/1928 [2:02:35<4:00:32, 10.77s/it][A[A[A[A[A




Iteration:  31%|███       | 589/1928 [2:02:49<4:21:15, 11.71s/it][A[A[A[A[A




Iteration:  31%|███       | 590/1928 [2:03:03<4:39:55, 12.55s/it][A[A[A[A[A




Iteration:  31%|███       | 591/1928 [2:03:13<4:21:19, 11.73s/it][A[A[A[A[A




Iteration:  31%|███       | 592/1928 [2:03:27<4:33:53, 12.30s/it][A[A[A[A[A




Iteration:  31%|███       | 593/1928 [2:03:36<4:17:23, 11.57s/it][A[A[A[A[A




Iteration:  31%|███       | 594/1928 [2:03:44<3:49:21, 10.32s/it]

Iteration:  40%|████      | 775/1928 [2:39:18<4:26:51, 13.89s/it][A[A[A[A[A




Iteration:  40%|████      | 776/1928 [2:39:31<4:17:56, 13.43s/it][A[A[A[A[A




Iteration:  40%|████      | 777/1928 [2:39:43<4:11:58, 13.13s/it][A[A[A[A[A




Iteration:  40%|████      | 778/1928 [2:39:54<4:00:19, 12.54s/it][A[A[A[A[A




Iteration:  40%|████      | 779/1928 [2:40:05<3:52:25, 12.14s/it][A[A[A[A[A




Iteration:  40%|████      | 780/1928 [2:40:17<3:49:48, 12.01s/it][A[A[A[A[A




Iteration:  41%|████      | 781/1928 [2:40:31<3:58:13, 12.46s/it][A[A[A[A[A




Iteration:  41%|████      | 782/1928 [2:40:48<4:27:08, 13.99s/it][A[A[A[A[A




Iteration:  41%|████      | 783/1928 [2:40:59<4:08:31, 13.02s/it][A[A[A[A[A




Iteration:  41%|████      | 784/1928 [2:41:07<3:41:51, 11.64s/it][A[A[A[A[A




Iteration:  41%|████      | 785/1928 [2:41:18<3:34:24, 11.25s/it][A[A[A[A[A




Iteration:  41%|████      | 786/1928 [2:41:25<3:13:14, 10.15s/it]

Iteration:  50%|█████     | 967/1928 [3:19:09<3:28:15, 13.00s/it][A[A[A[A[A




Iteration:  50%|█████     | 968/1928 [3:19:22<3:25:33, 12.85s/it][A[A[A[A[A




Iteration:  50%|█████     | 969/1928 [3:19:36<3:29:30, 13.11s/it][A[A[A[A[A




Iteration:  50%|█████     | 970/1928 [3:19:46<3:16:51, 12.33s/it][A[A[A[A[A




Iteration:  50%|█████     | 971/1928 [3:20:00<3:24:54, 12.85s/it][A[A[A[A[A




Iteration:  50%|█████     | 972/1928 [3:20:10<3:09:39, 11.90s/it][A[A[A[A[A




Iteration:  50%|█████     | 973/1928 [3:20:20<3:01:50, 11.42s/it][A[A[A[A[A




Iteration:  51%|█████     | 974/1928 [3:20:31<3:00:16, 11.34s/it][A[A[A[A[A




Iteration:  51%|█████     | 975/1928 [3:20:52<3:46:16, 14.25s/it][A[A[A[A[A




Iteration:  51%|█████     | 976/1928 [3:21:04<3:33:40, 13.47s/it][A[A[A[A[A




Iteration:  51%|█████     | 977/1928 [3:21:14<3:16:58, 12.43s/it][A[A[A[A[A




Iteration:  51%|█████     | 978/1928 [3:21:23<2:58:58, 11.30s/it]

Iteration:  60%|██████    | 1157/1928 [4:00:12<2:51:04, 13.31s/it][A[A[A[A[A




Iteration:  60%|██████    | 1158/1928 [4:00:24<2:48:24, 13.12s/it][A[A[A[A[A




Iteration:  60%|██████    | 1159/1928 [4:00:42<3:05:14, 14.45s/it][A[A[A[A[A




Iteration:  60%|██████    | 1160/1928 [4:00:55<3:01:18, 14.16s/it][A[A[A[A[A




Iteration:  60%|██████    | 1161/1928 [4:01:11<3:06:20, 14.58s/it][A[A[A[A[A




Iteration:  60%|██████    | 1162/1928 [4:01:23<2:58:04, 13.95s/it][A[A[A[A[A




Iteration:  60%|██████    | 1163/1928 [4:01:39<3:04:55, 14.50s/it][A[A[A[A[A




Iteration:  60%|██████    | 1164/1928 [4:01:53<3:00:30, 14.18s/it][A[A[A[A[A




Iteration:  60%|██████    | 1165/1928 [4:02:05<2:52:03, 13.53s/it][A[A[A[A[A




Iteration:  60%|██████    | 1166/1928 [4:02:17<2:48:49, 13.29s/it][A[A[A[A[A




Iteration:  61%|██████    | 1167/1928 [4:02:29<2:43:02, 12.86s/it][A[A[A[A[A




Iteration:  61%|██████    | 1168/1928 [4:02:39<2:31:29

Iteration:  70%|██████▉   | 1347/1928 [4:39:51<1:47:09, 11.07s/it][A[A[A[A[A




Iteration:  70%|██████▉   | 1348/1928 [4:40:07<2:00:46, 12.49s/it][A[A[A[A[A




Iteration:  70%|██████▉   | 1349/1928 [4:40:18<1:57:22, 12.16s/it][A[A[A[A[A




Iteration:  70%|███████   | 1350/1928 [4:40:29<1:55:10, 11.96s/it][A[A[A[A[A




Iteration:  70%|███████   | 1351/1928 [4:40:38<1:46:00, 11.02s/it][A[A[A[A[A




Iteration:  70%|███████   | 1352/1928 [4:40:48<1:42:34, 10.68s/it][A[A[A[A[A




Iteration:  70%|███████   | 1353/1928 [4:41:00<1:46:29, 11.11s/it][A[A[A[A[A




Iteration:  70%|███████   | 1354/1928 [4:41:17<2:03:26, 12.90s/it][A[A[A[A[A




Iteration:  70%|███████   | 1355/1928 [4:41:40<2:31:49, 15.90s/it][A[A[A[A[A




Iteration:  70%|███████   | 1356/1928 [4:41:52<2:19:06, 14.59s/it][A[A[A[A[A




Iteration:  70%|███████   | 1357/1928 [4:42:04<2:12:30, 13.92s/it][A[A[A[A[A




Iteration:  70%|███████   | 1358/1928 [4:42:22<2:24:24

Iteration:  80%|███████▉  | 1537/1928 [5:17:07<1:19:09, 12.15s/it][A[A[A[A[A




Iteration:  80%|███████▉  | 1538/1928 [5:17:20<1:18:57, 12.15s/it][A[A[A[A[A




Iteration:  80%|███████▉  | 1539/1928 [5:17:34<1:23:59, 12.95s/it][A[A[A[A[A




Iteration:  80%|███████▉  | 1540/1928 [5:17:42<1:14:14, 11.48s/it][A[A[A[A[A




Iteration:  80%|███████▉  | 1541/1928 [5:17:51<1:08:01, 10.55s/it][A[A[A[A[A




Iteration:  80%|███████▉  | 1542/1928 [5:18:02<1:08:59, 10.72s/it][A[A[A[A[A




Iteration:  80%|████████  | 1543/1928 [5:18:15<1:13:21, 11.43s/it][A[A[A[A[A




Iteration:  80%|████████  | 1544/1928 [5:18:31<1:21:02, 12.66s/it][A[A[A[A[A




Iteration:  80%|████████  | 1545/1928 [5:18:39<1:12:30, 11.36s/it][A[A[A[A[A




Iteration:  80%|████████  | 1546/1928 [5:18:47<1:06:10, 10.39s/it][A[A[A[A[A




Iteration:  80%|████████  | 1547/1928 [5:18:57<1:04:32, 10.16s/it][A[A[A[A[A




Iteration:  80%|████████  | 1548/1928 [5:19:03<56:35, 

Iteration:  90%|████████▉ | 1729/1928 [5:53:49<49:02, 14.79s/it][A[A[A[A[A




Iteration:  90%|████████▉ | 1730/1928 [5:54:02<47:23, 14.36s/it][A[A[A[A[A




Iteration:  90%|████████▉ | 1731/1928 [5:54:12<42:26, 12.92s/it][A[A[A[A[A




Iteration:  90%|████████▉ | 1732/1928 [5:54:27<44:29, 13.62s/it][A[A[A[A[A




Iteration:  90%|████████▉ | 1733/1928 [5:54:36<39:29, 12.15s/it][A[A[A[A[A




Iteration:  90%|████████▉ | 1734/1928 [5:54:45<36:10, 11.19s/it][A[A[A[A[A




Iteration:  90%|████████▉ | 1735/1928 [5:54:55<34:46, 10.81s/it][A[A[A[A[A




Iteration:  90%|█████████ | 1736/1928 [5:55:07<35:46, 11.18s/it][A[A[A[A[A




Iteration:  90%|█████████ | 1737/1928 [5:55:22<39:17, 12.34s/it][A[A[A[A[A




Iteration:  90%|█████████ | 1738/1928 [5:55:31<35:59, 11.37s/it][A[A[A[A[A




Iteration:  90%|█████████ | 1739/1928 [5:55:41<34:18, 10.89s/it][A[A[A[A[A




Iteration:  90%|█████████ | 1740/1928 [5:55:50<32:52, 10.49s/it][A[A[A[A

Iteration: 100%|█████████▉| 1923/1928 [6:31:58<01:05, 13.18s/it][A[A[A[A[A




Iteration: 100%|█████████▉| 1924/1928 [6:32:13<00:54, 13.62s/it][A[A[A[A[A




Iteration: 100%|█████████▉| 1925/1928 [6:32:25<00:39, 13.11s/it][A[A[A[A[A




Iteration: 100%|█████████▉| 1926/1928 [6:32:38<00:26, 13.19s/it][A[A[A[A[A




Iteration: 100%|█████████▉| 1927/1928 [6:32:50<00:12, 12.84s/it][A[A[A[A[A




Iteration: 100%|██████████| 1928/1928 [6:33:06<00:00, 12.23s/it][A[A[A[A[A





Evaluating:   0%|          | 0/241 [00:00<?, ?it/s][A[A[A[A[A




Evaluating:   0%|          | 1/241 [00:02<10:02,  2.51s/it][A[A[A[A[A




Evaluating:   1%|          | 2/241 [00:04<09:42,  2.44s/it][A[A[A[A[A




Evaluating:   1%|          | 3/241 [00:07<09:38,  2.43s/it][A[A[A[A[A




Evaluating:   2%|▏         | 4/241 [00:09<09:54,  2.51s/it][A[A[A[A[A




Evaluating:   2%|▏         | 5/241 [00:12<09:46,  2.49s/it][A[A[A[A[A




Evaluating:   2%|▏         | 

Evaluating:  82%|████████▏ | 197/241 [10:17<02:13,  3.04s/it][A[A[A[A[A




Evaluating:  82%|████████▏ | 198/241 [10:18<01:52,  2.61s/it][A[A[A[A[A




Evaluating:  83%|████████▎ | 199/241 [10:22<01:59,  2.84s/it][A[A[A[A[A




Evaluating:  83%|████████▎ | 200/241 [10:25<02:05,  3.07s/it][A[A[A[A[A




Evaluating:  83%|████████▎ | 201/241 [10:30<02:18,  3.46s/it][A[A[A[A[A




Evaluating:  84%|████████▍ | 202/241 [10:34<02:25,  3.73s/it][A[A[A[A[A




Evaluating:  84%|████████▍ | 203/241 [10:37<02:14,  3.55s/it][A[A[A[A[A




Evaluating:  85%|████████▍ | 204/241 [10:39<01:54,  3.11s/it][A[A[A[A[A




Evaluating:  85%|████████▌ | 205/241 [10:43<01:52,  3.11s/it][A[A[A[A[A




Evaluating:  85%|████████▌ | 206/241 [10:46<01:47,  3.09s/it][A[A[A[A[A




Evaluating:  86%|████████▌ | 207/241 [10:49<01:49,  3.21s/it][A[A[A[A[A




Evaluating:  86%|████████▋ | 208/241 [10:52<01:45,  3.20s/it][A[A[A[A[A




Evaluating:  87%|████████▋ |

In [None]:
!gpustat

In [None]:
run = neptune.init(project='Hatespeech-CNERG/Counterspeech-generation')