In [1]:
!pip install transformers



In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

import torch
from torch.utils.data import Dataset

import pandas as pd

2024-05-07 14:32:28.432329: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-07 14:32:28.432453: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-07 14:32:28.617754: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')

tokenizer.pad_token = tokenizer.eos_token
eos = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [4]:
class SpeechDataset(Dataset):
    
    def __init__(self, df):
        self.df = df
        self.len = len(df)
    
    
    def __len__(self):
        return self.len
    
    
    def __getitem__(self, index):
        hate_speech = self.df["Hate Speech"][index]
        counterspeech = self.df["Counterspeech"][index]
        
        prompt = f"{hate_speech}{eos}{counterspeech}"
        
        encoding = tokenizer.encode(
            prompt,
            max_length = 256,
            truncation = True,
            padding = "max_length",
            return_tensors = "pt",
        ).squeeze()
        
        return {"input_ids": encoding, "labels": encoding}
        

In [5]:
class Generator:
    
    def __init__(self, train_df, val_df, category):
        self.model =  AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-small').to(device)
        self.train_dataset = SpeechDataset(train_df)
        self.val_dataset = SpeechDataset(val_df)
        self.category = category
    
    
    def train(self, num_epochs=5):
        
        training_args = TrainingArguments(
            output_dir=f"/kaggle/working/results_{self.category}",
            evaluation_strategy="epoch",
            warmup_steps=100,
            learning_rate=1e-4,
            per_device_train_batch_size=7,
            per_device_eval_batch_size=10,
            weight_decay=0.01,
            num_train_epochs=num_epochs,
            report_to=[],
            save_total_limit = 2,    
        )
        
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset
        )
        
        trainer.train()

In [6]:
def generate_counterspeech(generators, hate_speech, category):
    generator = generators[category]
    prompt = f"{hate_speech}{eos}"
    
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    output_ids = generator.model.generate(input_ids, max_length=512, pad_token_id=tokenizer.eos_token_id)
    output = tokenizer.decode(output_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
    
    return output

In [7]:
categories = ['Positive', 'Denouncing', 'Facts', 'Question', 'Humor']

In [8]:
train_df = pd.read_csv("/kaggle/input/counterspeech/train.csv")
val_df = pd.read_csv("/kaggle/input/counterspeech/val.csv")

In [9]:
generators = {}

for category in categories:
    train_subdf = train_df[train_df['Category'] == category].reset_index(drop=True)
    val_subdf = val_df[val_df['Category'] == category].reset_index(drop=True)
    generators[category] = Generator(train_subdf, val_subdf, category)

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [10]:
generators["Positive"].train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.583925
2,No log,0.518924
3,No log,0.500387
4,No log,0.502196
5,No log,0.507721


In [11]:
generators["Denouncing"].train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.61218
2,No log,0.55032
3,No log,0.539259
4,No log,0.540316
5,No log,0.546652


In [12]:
generators["Facts"].train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.555629
2,No log,0.526838
3,No log,0.525235
4,0.558400,0.530224
5,0.558400,0.535212




In [13]:
generators["Question"].train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.47935
2,No log,0.441322
3,No log,0.436994
4,No log,0.439511
5,No log,0.44465


In [14]:
generators["Humor"].train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.665685
2,No log,0.538867
3,No log,0.507049
4,No log,0.500488
5,No log,0.50405


In [15]:
def inference(df_name, generators):
    df = pd.read_csv(df_name)
    
    generate = lambda row: generate_counterspeech(generators, row["Hate Speech"], row["Category"])
    df["Counterspeech"] = df.apply(generate, axis=1)
    
    return df

In [16]:
test_df = inference("/kaggle/input/counterspeech/test.csv", generators)

print(test_df)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

                                           Hate Speech    Target    Category  \
0                 Migrants are a drain on the society.  MIGRANTS    Question   
1            Migrants put a strain on public services.  MIGRANTS       Facts   
2    Women are subhuman, they are weak and childlik...     WOMEN    Positive   
3    We should return to the times when it was ok t...  DISABLED       Humor   
4    I would like to ask all these people supportin...  MIGRANTS    Question   
..                                                 ...       ...         ...   
672  Many Muslims are unemployed, poor and lack edu...   MUSLIMS       Facts   
673           Muslims are building Mosques everywhere!   MUSLIMS       Facts   
674  Jews are subhuman, they steal our jobs, they t...      JEWS    Question   
675                         It is OK to abuse a woman.     WOMEN  Denouncing   
676  Immigrants are invading our country. They are ...  MIGRANTS       Facts   

                                       

In [17]:
test_df.to_csv('/kaggle/working/output.csv', index=False)