In [1]:
!pip install transformers



In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

import torch
from torch.utils.data import Dataset

import pandas as pd

2024-05-06 19:34:41.223236: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-06 19:34:41.223353: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-06 19:34:41.390774: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-medium')
model = AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-medium').to(device)

tokenizer.pad_token = tokenizer.eos_token
eos = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:
def create_prompt(hate_speech, target, category, counterspeech=""):
    prompt =  f"Hi, do you want to play a game?{eos}Sure, how do we play?{eos}"
    prompt += f"I'll give you a hate speech against the {target} community. "
    prompt += f"In response, you'll give me a {category} based counter speech.{eos}"
    prompt += f"Ok. Sounds fun.{eos}"
    prompt += f"Here's you hate speech: {hate_speech} {eos}"
    prompt += f"Here's my counterspeech: {counterspeech}"
    
    return prompt

In [5]:
def generate_counterspeech(hate_speech, target, category):
    prompt = create_prompt(hate_speech, target, category)
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    output_ids = model.generate(input_ids, max_length=512, pad_token_id=tokenizer.eos_token_id)
    output = tokenizer.decode(output_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
    
    return output

In [6]:
def inference(df_name):
    df = pd.read_csv(df_name)
    
    generate = lambda row: generate_counterspeech(row["Hate Speech"], row["Target"], row["Category"])
    df["Counterspeech"] = df.apply(generate, axis=1)
    
    return df

In [7]:
class SpeechDataset(Dataset):
    
    def __init__(self, df):
        self.df = df
        self.len = len(df)
    
    
    def __len__(self):
        return self.len
    
    
    def __getitem__(self, index):
        hate_speech = self.df["Hate Speech"][index]
        target = self.df["Target"][index]
        category = self.df["Category"][index]
        counterspeech = self.df["Counterspeech"][index]
        
        prompt = create_prompt(hate_speech, target, category, counterspeech)
        
        encoding = tokenizer.encode(
            prompt,
            max_length = 256,
            truncation = True,
            padding = "max_length",
            return_tensors = "pt",
        ).squeeze()
        
        return {"input_ids": encoding, "labels": encoding}
        

In [8]:
train_dataset = SpeechDataset(pd.read_csv("/kaggle/input/counterspeech/train.csv"))
val_dataset = SpeechDataset(pd.read_csv("/kaggle/input/counterspeech/val.csv"))

In [9]:
training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    evaluation_strategy="epoch",
    warmup_steps=100,
    learning_rate=1e-4,
    per_device_train_batch_size=7,
    per_device_eval_batch_size=10,
    weight_decay=0.01,
    num_train_epochs=10,
    report_to=[],
    save_total_limit = 2,    
)

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.422953
2,0.540000,0.393708
3,0.327000,0.393097
4,0.327000,0.410342
5,0.230900,0.436472
6,0.174600,0.479075
7,0.174600,0.50018
8,0.133100,0.522713
9,0.111400,0.539429
10,0.111400,0.546242




TrainOutput(global_step=3420, training_loss=0.23402658205980445, metrics={'train_runtime': 6374.5027, 'train_samples_per_second': 7.5, 'train_steps_per_second': 0.537, 'total_flos': 2.220059010269184e+16, 'train_loss': 0.23402658205980445, 'epoch': 10.0})

In [11]:
test_df = inference("/kaggle/input/counterspeech/test.csv")

print(test_df)

                                           Hate Speech    Target    Category  \
0                 Migrants are a drain on the society.  MIGRANTS    Question   
1            Migrants put a strain on public services.  MIGRANTS       Facts   
2    Women are subhuman, they are weak and childlik...     WOMEN    Positive   
3    We should return to the times when it was ok t...  DISABLED       Humor   
4    I would like to ask all these people supportin...  MIGRANTS    Question   
..                                                 ...       ...         ...   
672  Many Muslims are unemployed, poor and lack edu...   MUSLIMS       Facts   
673           Muslims are building Mosques everywhere!   MUSLIMS       Facts   
674  Jews are subhuman, they steal our jobs, they t...      JEWS    Question   
675                         It is OK to abuse a woman.     WOMEN  Denouncing   
676  Immigrants are invading our country. They are ...  MIGRANTS       Facts   

                                       

In [12]:
test_df.to_csv('/kaggle/working/output.csv', index=False)