In [1]:
!pip install transformers



In [2]:
!pip install peft

Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.10.0


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

import torch
from torch.utils.data import Dataset

from peft import get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit

import pandas as pd

2024-05-07 18:05:10.294707: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-07 18:05:10.294809: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-07 18:05:10.465753: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-medium')
model =  AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-medium').to(device)

tokenizer.pad_token = tokenizer.eos_token
eos = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
class SpeechDataset(Dataset):
    
    def __init__(self, df):
        self.df = df
        self.len = len(df)
    
    
    def __len__(self):
        return self.len
    
    
    def __getitem__(self, index):
        hate_speech = self.df["Hate Speech"][index]
        counterspeech = self.df["Counterspeech"][index]
        
        prompt = f"{hate_speech}{eos}{counterspeech}"
        
        encoding = tokenizer.encode(
            prompt,
            max_length = 256,
            truncation = True,
            padding = "max_length",
            return_tensors = "pt",
        ).squeeze()
        
        return {"input_ids": encoding, "labels": encoding}
        

In [6]:
class Generator:
    
    def __init__(self, train_df, val_df, category):
        generation_config = PromptTuningConfig(
            task_type=TaskType.CAUSAL_LM,  # This type indicates the model will generate text.
            prompt_tuning_init=PromptTuningInit.RANDOM,  # The added virtual tokens are initializad with random numbers
            num_virtual_tokens= 4,  # Number of virtual tokens to be added and trained.
            tokenizer_name_or_path='microsoft/DialoGPT-medium',  # The pre-trained model.
        )
        self.model =  get_peft_model(model, generation_config)
        self.train_dataset = SpeechDataset(train_df)
        self.val_dataset = SpeechDataset(val_df)
        self.category = category
    
    
    def train(self, num_epochs=5):
        
        training_args = TrainingArguments(
            output_dir=f"/kaggle/working/results_{self.category}",
            evaluation_strategy="epoch",
            warmup_steps=100,
            learning_rate=0.05,
            per_device_train_batch_size=7,
            per_device_eval_batch_size=10,
            weight_decay=0.01,
            num_train_epochs=num_epochs,
            report_to=[],
            save_total_limit = 2,    
        )
        
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset
        )
        
        trainer.train()

In [7]:
def generate_counterspeech(generators, hate_speech, category):
    generator = generators[category]
    prompt = f"{hate_speech}{eos}"
    
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    output_ids = generator.model.generate(input_ids, max_length=512, pad_token_id=tokenizer.eos_token_id)
    output = tokenizer.decode(output_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
    
    return output

In [8]:
categories = ['Positive', 'Denouncing', 'Facts', 'Question', 'Humor']

In [9]:
train_df = pd.read_csv("/kaggle/input/counterspeech/train.csv")
val_df = pd.read_csv("/kaggle/input/counterspeech/val.csv")

In [10]:
generators = {}

for category in categories:
    train_subdf = train_df[train_df['Category'] == category].reset_index(drop=True)
    val_subdf = val_df[val_df['Category'] == category].reset_index(drop=True)
    generators[category] = Generator(train_subdf, val_subdf, category)

In [11]:
generators["Positive"].train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,1.036034
2,No log,0.825678
3,No log,0.698808
4,No log,0.682798
5,No log,0.67697


In [12]:
generators["Denouncing"].train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,1.075723
2,No log,0.938406
3,No log,0.820317
4,No log,0.781808
5,No log,0.765681


In [13]:
generators["Facts"].train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,1.079688
2,No log,0.895277
3,No log,0.834502
4,1.070400,0.787219
5,1.070400,0.771063




In [14]:
generators["Question"].train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.916881
2,No log,0.725975
3,No log,0.675044
4,No log,0.63559
5,No log,0.624893


In [15]:
generators["Humor"].train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,1.042778
2,No log,0.990948
3,No log,0.818818
4,No log,0.756486
5,No log,0.744654


In [16]:
def inference(df_name, generators):
    df = pd.read_csv(df_name)
    
    generate = lambda row: generate_counterspeech(generators, row["Hate Speech"], row["Category"])
    df["Counterspeech"] = df.apply(generate, axis=1)
    
    return df

In [17]:
test_df = inference("/kaggle/input/counterspeech/test.csv", generators)

print(test_df)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

                                           Hate Speech    Target    Category  \
0                 Migrants are a drain on the society.  MIGRANTS    Question   
1            Migrants put a strain on public services.  MIGRANTS       Facts   
2    Women are subhuman, they are weak and childlik...     WOMEN    Positive   
3    We should return to the times when it was ok t...  DISABLED       Humor   
4    I would like to ask all these people supportin...  MIGRANTS    Question   
..                                                 ...       ...         ...   
672  Many Muslims are unemployed, poor and lack edu...   MUSLIMS       Facts   
673           Muslims are building Mosques everywhere!   MUSLIMS       Facts   
674  Jews are subhuman, they steal our jobs, they t...      JEWS    Question   
675                         It is OK to abuse a woman.     WOMEN  Denouncing   
676  Immigrants are invading our country. They are ...  MIGRANTS       Facts   

    Counterspeech  
0                  

In [18]:
test_df.to_csv('/kaggle/working/output.csv', index=False)