In [None]:
import transformers 
import os
import json
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from typing import Dict, List, Tuple
from torch.nn.utils.rnn import pad_sequence
from tqdm import trange
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)

# Select the first GPU as the default CUDA device. The call is guarded so the
# import cell also runs on a machine without CUDA instead of raising.
if torch.cuda.is_available():
    torch.cuda.set_device(0)



In [None]:
class Args:
    """Plain attribute container holding the fine-tuning configuration.

    All values are set as instance attributes in ``__init__`` so that the
    object can be mutated later (``train``/``evaluate``/``main`` attach
    ``device``, ``n_gpu`` and batch sizes to it) and pickled with the
    checkpoint. Several fields (for example ``cache_dir``, ``block_size``,
    ``seed`` and the ``fp16`` options) are not read by the cells reviewed
    here.
    """

    def __init__(self):
        # One Hugging Face hub id drives the model, config and tokenizer.
        checkpoint = 'microsoft/DialoGPT-medium'

        # --- locations -------------------------------------------------
        self.output_dir = '../HULK/Counterspeech/models/createdebate_model'
        self.model_type = checkpoint
        self.model_name_or_path = checkpoint
        self.config_name = checkpoint
        self.tokenizer_name = checkpoint
        self.cache_dir = 'cached'

        # --- data / run mode -------------------------------------------
        self.block_size = 512
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False

        # --- batching --------------------------------------------------
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 1

        # --- optimisation ----------------------------------------------
        self.learning_rate = 5e-6
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 10
        self.max_steps = -1  # values <= 0 mean "use num_train_epochs"
        self.warmup_steps = 0

        # --- logging / checkpointing -----------------------------------
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None
        self.eval_all_checkpoints = False

        # --- environment -----------------------------------------------
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 56
        self.local_rank = -1  # -1 selects the non-distributed sampler in train()
        self.fp16 = False
        self.fp16_opt_level = 'O1'


# Notebook-level default configuration instance.
args = Args()

In [None]:
# Concatenate the 'selected_arguments' list of every entry in `dict_urls`
# into one flat list of examples.
# NOTE(review): `dict_urls` is not defined in any cell visible here; it must
# already exist in the kernel before this cell runs.
total_data_sentences = [
    argument
    for url_key in dict_urls
    for argument in dict_urls[url_key]['selected_arguments']
]

In [None]:
# First split: 80% of the examples for training, 20% held out.
X_train, X_test_dev = train_test_split(
    total_data_sentences,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Second split: the held-out 20% is halved into test and dev sets.
X_test, X_dev = train_test_split(
    X_test_dev,
    test_size=0.5,
    random_state=42,
    shuffle=True,
)


In [None]:
!pip install torchsummary

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary

class Net(nn.Module):
    """Small CNN: two 5x5 convolutions followed by two linear layers.

    ``forward`` returns log-probabilities over 10 classes. The hard-coded
    flatten width of 320 equals 20 channels x 4 x 4, which is what a
    1x28x28 input produces after the two conv + 2x2 max-pool stages.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as before so parameter
        # initialisation consumes the random stream identically.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=10, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=10, out_channels=20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(in_features=320, out_features=50)
        self.fc2 = nn.Linear(in_features=50, out_features=10)

    def forward(self, x):
        """Map an image batch to per-class log-probabilities (dim=1)."""
        # Stage 1: conv -> 2x2 max-pool -> ReLU
        stage1 = F.relu(F.max_pool2d(self.conv1(x), 2))

        # Stage 2: conv -> channel dropout -> 2x2 max-pool -> ReLU
        conv2_out = self.conv2_drop(self.conv2(stage1))
        stage2 = F.relu(F.max_pool2d(conv2_out, 2))

        # Flatten to rows of 320 features for the classifier head.
        flat = stage2.view(-1, 320)

        hidden = F.relu(self.fc1(flat))
        # Functional dropout must be told explicitly whether we are training.
        hidden = F.dropout(hidden, training=self.training)
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Note: this binds the notebook-level name `model` to the small CNN.
model = Net().to(device)

# Print torchsummary's per-layer table for an input of shape (1, 28, 28).
summary(model, (1, 28, 28))

In [None]:
# Directory passed as `cache_dir=` when downloading the pretrained
# tokenizer, config and model.
path = '../HULK/Saved_models/'

In [None]:
from transformers import AutoModelWithLMHead, AutoTokenizer
import torch

# Load the pretrained DialoGPT-small tokenizer and causal-LM model, caching
# the downloaded files under `path`.
# NOTE(review): this cell loads the *small* checkpoint while `Args` points at
# DialoGPT-medium, and the later `main(...)` call does not pass `model=`, so
# it loads its own model rather than using this object - confirm the intent.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small", cache_dir=path)
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small", cache_dir=path)
    




    #model.to(args.device)

In [None]:
# Move the model to the GPU when CUDA is available; on a CPU-only machine the
# model is left where it is instead of raising. Both branches evaluate to the
# model, so the cell still displays it as its output.
model.cuda() if torch.cuda.is_available() else model

In [None]:
# Number of leading transformer blocks to freeze. With -1 only the embedding
# tables are frozen.
freeze_layer_count = 6

# The position (wpe) and token (wte) embeddings are always frozen.
for embedding in (model.transformer.wpe, model.transformer.wte):
    for param in embedding.parameters():
        param.requires_grad = False

# Additionally freeze the first `freeze_layer_count` blocks in
# `model.transformer.h`.
if freeze_layer_count != -1:
    frozen_blocks = model.transformer.h[:freeze_layer_count]
    for block in frozen_blocks:
        for param in block.parameters():
            param.requires_grad = False

In [None]:
# Bare expression: the notebook renders the model's repr as the cell output.
model

In [None]:
def construct_conv(dict_reply_pair, tokenizer, eos = True, block_size=256):
    """Encode one initiator/reply pair as a single flat list of token ids.

    The layout is ``<initiator ids> EOS <reply ids> EOS``. Each message is
    encoded with ``truncation=True`` and ``max_length=int(block_size/2 - 1)``.

    Args:
        dict_reply_pair: mapping with the keys ``'initiator_message'`` and
            ``'reply_message'``.
        tokenizer: object exposing ``encode(text, truncation=..., max_length=...)``
            and ``eos_token_id``.
        eos: accepted for compatibility; not used by the body.
        block_size: total token budget shared by both messages.

    Returns:
        A list of token ids.
    """
    # Half of the budget per message, minus one slot for its trailing EOS.
    per_message_limit = int((block_size / 2) - 1)

    initiator_ids = tokenizer.encode(
        dict_reply_pair['initiator_message'],
        truncation=True,
        max_length=per_message_limit,
    )
    reply_ids = tokenizer.encode(
        dict_reply_pair['reply_message'],
        truncation=True,
        max_length=per_message_limit,
    )

    end_of_turn = [tokenizer.eos_token_id]
    return initiator_ids + end_of_turn + reply_ids + end_of_turn

In [None]:
def fix_the_random(seed_val = 42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed_val: integer seed applied to ``random``, ``numpy.random``,
            ``torch`` and all CUDA devices.
    """
    # `random` is not imported by the notebook's import cell, so the original
    # body raised NameError; import it here to keep the function self-contained.
    import random

    # Trade cuDNN autotuning for reproducible kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

In [None]:
class ConversationDataset(Dataset):
    """Dataset of tokenised initiator/reply pairs.

    Every element of ``text_list`` is passed to ``construct_conv`` once, at
    construction time, and the resulting token-id lists are kept in memory.

    Args:
        tokenizer: tokenizer forwarded to ``construct_conv``.
        args: accepted for signature compatibility; not used.
        text_list: iterable of reply-pair mappings understood by
            ``construct_conv``.
        block_size: token budget per example. It is now forwarded to
            ``construct_conv``; previously it was silently ignored and
            ``construct_conv`` always fell back to its own default of 256.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, text_list, block_size=512):
        self.examples = [
            construct_conv(element, tokenizer, block_size=block_size)
            for element in text_list
        ]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        # Examples have different lengths; padding is left to the collate function.
        return torch.tensor(self.examples[item], dtype=torch.long)

In [None]:
def load_and_cache_examples(args, tokenizer, train_text, val_text, evaluate=False):
    """Build a ``ConversationDataset`` for either the training or validation texts.

    ``val_text`` is used when ``evaluate`` is truthy, ``train_text`` otherwise.
    Despite the name, nothing is cached or read from disk here.
    """
    if evaluate:
        selected_texts = val_text
    else:
        selected_texts = train_text
    return ConversationDataset(tokenizer, args, selected_texts)

In [None]:
def _save_checkpoint(args, model, tokenizer):
    """Write the model, tokenizer and training args to ``args.output_dir``."""
    os.makedirs(args.output_dir, exist_ok=True)
    # Unwrap a parallel/distributed wrapper (anything exposing `.module`).
    model_to_save = model.module if hasattr(model, "module") else model
    # save_pretrained() output can be reloaded later with from_pretrained().
    model_to_save.save_pretrained(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    # Keep the training arguments next to the weights.
    torch.save(args, os.path.join(args.output_dir, "training_args.bin"))


def train(args, train_dataset, eval_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float, List]:
    """Fine-tune ``model`` and keep the checkpoint with the best validation perplexity.

    After every epoch the model is evaluated with ``evaluate``; whenever the
    perplexity improves, the model, tokenizer and ``args`` are written to
    ``args.output_dir``.

    Args:
        args: configuration object. Reads ``per_gpu_train_batch_size``,
            ``n_gpu``, ``local_rank``, ``max_steps``,
            ``gradient_accumulation_steps``, ``num_train_epochs``,
            ``weight_decay``, ``learning_rate``, ``adam_epsilon``,
            ``warmup_steps``, ``max_grad_norm``, ``device`` and
            ``output_dir``. Sets ``train_batch_size`` and, when
            ``max_steps > 0``, overwrites ``num_train_epochs``.
        train_dataset: dataset yielding 1-D LongTensors of token ids.
        eval_dataset: validation texts; forwarded unchanged to ``evaluate``,
            which builds its own dataset from them.
        model: model to train; its token embeddings are resized to
            ``len(tokenizer)``.
        tokenizer: tokenizer matching ``model``.

    Returns:
        ``(global_step, average_training_loss, per_epoch_perplexities)``.

    Raises:
        ValueError: if ``train_dataset`` does not yield a single full batch.
    """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Right-pad to the longest sequence in the batch; pad_sequence pads
        # with 0 when the tokenizer defines no pad token.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last=True
    )
    if len(train_dataloader) == 0:
        # Fail early with a clear message rather than with a ZeroDivisionError
        # after all epochs have run without a single optimisation step.
        raise ValueError(
            "train_dataset yields no full batch of size %d; nothing to train on." % args.train_batch_size
        )

    if args.max_steps > 0:
        t_total = args.max_steps
        # max(1, ...) guards the division when accumulation exceeds the batch count.
        steps_per_epoch = max(1, len(train_dataloader) // args.gradient_accumulation_steps)
        args.num_train_epochs = args.max_steps // steps_per_epoch + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # unwrap distributed/parallel wrappers
    model.resize_token_embeddings(len(tokenizer))

    # Two parameter groups: weight decay is disabled for names matching `no_decay`.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    # Linear warmup followed by linear decay over t_total optimisation steps.
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    global_step = 0
    tr_loss = 0.0

    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    # Infinity guarantees that the first finite perplexity produces a checkpoint.
    eval_best = float("inf")
    eval_ = []
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Language-model objective: the batch serves as both input and target.
            # NOTE(review): padded positions are part of `labels` too and no
            # attention mask is passed - consider masking them; confirm.
            inputs, labels = (batch, batch)
            # Skip over-long batches (1024 is presumably the model's context limit - confirm).
            if inputs.shape[1] > 1024:
                continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()  # evaluate() switches to eval mode, so re-enable training mode
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # the first output element is the loss when labels are given

            if args.n_gpu > 1:
                loss = loss.mean()  # average across devices for multi-GPU training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # update the learning-rate schedule
                model.zero_grad()
                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        eval_.append(evaluate(args, model, tokenizer, train_dataset, eval_dataset)[1])
        # Save before the max_steps check so that a run ended by max_steps
        # still leaves its best checkpoint on disk.
        if eval_[-1] < eval_best:
            _save_checkpoint(args, model, tokenizer)
            eval_best = eval_[-1]
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    # No optimisation step may have happened (e.g. every batch was skipped).
    average_loss = tr_loss / global_step if global_step > 0 else 0.0
    return global_step, average_loss, eval_






# Evaluation of some model
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Tuple[Dict, torch.Tensor]:
    """Compute the language-model perplexity of ``model`` on ``df_val``.

    Args:
        args: configuration object. Reads ``output_dir``,
            ``per_gpu_eval_batch_size``, ``n_gpu`` and ``device``; sets
            ``eval_batch_size``.
        model: model to evaluate; it is switched to eval mode.
        tokenizer: tokenizer used to encode the examples and to pad batches.
        df_trn: only forwarded to ``load_and_cache_examples``, which ignores
            it when ``evaluate=True``.
        df_val: texts to evaluate on.
        prefix: accepted for compatibility; not used.

    Returns:
        ``(result, perplexity)`` where ``result`` is ``{"perplexity": perplexity}``
        and ``perplexity`` is ``exp`` of the mean per-batch loss, as a tensor.

    Raises:
        ValueError: if ``df_val`` yields no examples.
    """
    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(args.output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # Right-pad to the longest sequence in the batch; pad_sequence pads
        # with 0 when the tokenizer defines no pad token.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    # drop_last=False: every example must be scored. Dropping the final
    # partial batch silently excluded examples and divided by zero whenever
    # the evaluation set was smaller than one batch.
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last=False
    )
    if len(eval_dataloader) == 0:
        raise ValueError("evaluate() received no evaluation examples; cannot compute perplexity.")

    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # Language-model objective: the batch serves as both input and target.
        inputs, labels = (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    return result, perplexity

In [None]:
def main(df_trn, df_val, test_text=None, model = None):
    """Fine-tune a causal LM on ``df_trn`` and optionally score ``test_text``.

    Args:
        df_trn: training texts.
        df_val: validation texts used to select the best checkpoint.
        test_text: optional held-out texts; when given, their perplexity is
            returned as ``test_eval``.
        model: optional already-loaded model. When ``None`` the checkpoint
            named by ``Args.model_name_or_path`` is loaded.

    Returns:
        ``(eval_, test_eval, model)``: the per-epoch validation perplexities
        (empty list when training is disabled), the test perplexity (``0``
        when ``test_text`` is ``None``), and the final model.
    """
    args = Args()

    # Honour args.no_cuda and keep n_gpu consistent with the chosen device;
    # previously the device was hard-coded to CPU while n_gpu (which scales
    # the batch sizes) still counted the visible GPUs.
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    args.device = torch.device("cuda" if use_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count() if use_cuda else 0

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=path)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=path)
    if model is None:
        model = AutoModelForCausalLM.from_pretrained(
            args.model_name_or_path,
            from_tf=False,
            config=config,
            cache_dir=path
        )
    # Also move a caller-supplied model, so it sits on the same device as the batches.
    model.to(args.device)

    # Defined up front so the return statement is valid when do_train is False.
    eval_ = []
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)
        _, _, eval_ = train(args, train_dataset, df_val, model, tokenizer)

        # Reload the best checkpoint written by train(); the in-memory model
        # holds the weights of the last epoch, which need not be the best.
        model = AutoModelForCausalLM.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    test_eval = 0
    if test_text is not None:
        test_eval = evaluate(args, model, tokenizer, df_trn, test_text)[1]
    return eval_, test_eval, model

In [None]:
# Fine-tune on the training split, select the checkpoint on the dev split and
# score the test split. Note that this rebinds the notebook-level `model` to
# the model returned by main().
eval_, test_eval, model = main(
    df_trn=X_train,
    df_val=X_dev,
    test_text=X_test,
)

In [None]:
# Run the `gpustat` command in a shell and show its output in the cell
# (the command must be installed on the machine).
!gpustat

In [None]:
# Presumably starts experiment tracking for the given Neptune project and
# keeps the returned handle in `run`.
# NOTE(review): `neptune` is not imported in any cell visible here - confirm
# it is imported elsewhere, otherwise this line raises NameError.
run = neptune.init(project='Hatespeech-CNERG/Counterspeech-generation')