In [1]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

In [None]:
import torch
from torch.utils.data import Dataset
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model

# Define the tokenizer and model globally
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-medium')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Define the model globally and specify the 'medium' model
global_model = AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-medium')
global_model.to(device)

class SpeechDataset(Dataset):
    def __init__(self, df, tokenizer, device):
        self.df = df
        self.tokenizer = tokenizer
        self.device = device

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        hate_speech = self.df.iloc[index]["Hate Speech"]
        counterspeech = self.df.iloc[index]["Counterspeech"]
        prompt = f"{hate_speech} {self.tokenizer.eos_token} {counterspeech}"
        encoding = self.tokenizer(prompt, max_length=256, padding="max_length", truncation=True, return_tensors="pt")
        return {"input_ids": encoding["input_ids"].squeeze().to(self.device), 
                "attention_mask": encoding["attention_mask"].squeeze().to(self.device), 
                "labels": encoding["input_ids"].squeeze().to(self.device)}

class Generator:
    def __init__(self, train_df, val_df, category, tokenizer, device):
        self.model = global_model  # Use the globally defined model
        config = LoraConfig(
            r=16,  # Number of attention heads
            lora_alpha=32,  # Alpha scaling
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )
        self.model = get_peft_model(self.model, config)  # Apply PEFT configuration
        self.train_dataset = SpeechDataset(train_df, tokenizer, device)
        self.val_dataset = SpeechDataset(val_df, tokenizer, device)
        self.category = category
        self.tokenizer = tokenizer
        self.device = device

    def train(self, num_epochs=3):
        training_args = TrainingArguments(
            output_dir=f"./results_{self.category}",
            evaluation_strategy="epoch",
            warmup_steps=100,
            learning_rate=1e-3,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            weight_decay=0.01,
            num_train_epochs=num_epochs,
            logging_dir='./logs',
            fp16=True,
            gradient_accumulation_steps=4,
            max_steps=200,
            logging_steps=1
        )
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset,
            data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False)
        )
        trainer.train()

def generate_counterspeech(generators, hate_speech, category):
    generator = generators[category]
    prompt = f"{hate_speech}{generator.tokenizer.eos_token}"
    input_ids = generator.tokenizer.encode(prompt, return_tensors='pt').to(generator.device)
    output_ids = generator.model.generate(input_ids, max_length=512, pad_token_id=generator.tokenizer.eos_token_id)
    output = generator.tokenizer.decode(output_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
    return output

# Load your data
train_df = pd.read_csv("/kaggle/input/counterspeech/train.csv")
val_df = pd.read_csv("/kaggle/input/counterspeech/val.csv")

categories = ['Positive', 'Denouncing', 'Facts', 'Question', 'Humor']
generators = {}

for category in categories:
    train_subdf = train_df[train_df['Category'] == category].reset_index(drop=True)
    val_subdf = val_df[val_df['Category'] == category].reset_index(drop=True)
    generator = Generator(train_subdf, val_subdf, category, tokenizer, device)
    generators[category] = generator  

for category in categories:
    generators[category].train()

def inference(df_name, generators):
    df = pd.read_csv(df_name)
#     df = df.head(10)
    generate = lambda row: generate_counterspeech(generators, row["Hate Speech"], row["Category"])
    df["Counterspeech"] = df.apply(generate, axis=1)
    return df

test_df = inference("/kaggle/input/counterspeech/test.csv", generators)
print(test_df)
test_df.to_csv('/kaggle/working/output.csv', index=False)


2024-05-08 06:16:59.919374: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-08 06:16:59.919469: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-08 06:17:00.041103: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

max_steps is given, it will override any value given in num_train_epochs
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss
0,5.6434,5.086633
2,3.2279,3.260252
4,3.1958,3.080157


max_steps is given, it will override any value given in num_train_epochs


Epoch,Training Loss,Validation Loss
1,3.3523,3.271511
2,3.4376,3.220133
3,3.2251,3.151061
4,3.2487,3.103521


max_steps is given, it will override any value given in num_train_epochs


Epoch,Training Loss,Validation Loss
0,2.93,2.839057
1,3.1548,2.78365


max_steps is given, it will override any value given in num_train_epochs


Epoch,Training Loss,Validation Loss
0,3.0996,2.734032
2,2.5229,2.677843


max_steps is given, it will override any value given in num_train_epochs


Epoch,Training Loss,Validation Loss
1,3.4347,3.33865
2,3.3335,3.286829
3,3.4126,3.240785
4,3.1422,3.237777
5,2.9518,3.22405
6,2.5697,3.231275


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
