In [None]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    get_linear_schedule_with_warmup, 
    LlamaForSequenceClassification,
    LlamaTokenizer,
    pipeline
)
from torch.utils.data import DataLoader, Dataset, random_split, Subset
import torch
from tqdm import trange, tqdm
from torch.optim import AdamW
import pandas as pd


class MyDataset(Dataset):
    def __init__(self, contexts, labels):
        self.contexts = contexts
        self.labels = labels

    def __len__(self):
        return len(self.contexts)

    def __getitem__(self, idx):
        return self.contexts[idx], self.labels[idx]

device = torch.device("cuda:1")
conan = pd.read_csv("/data/shared/hate_speech_dataset/CONAN.csv")

hate_df = conan[['hateSpeech']].drop_duplicates()
hate_df = hate_df.rename(columns={'hateSpeech': 'sentence'})
hate_df = hate_df.reset_index(drop=True)
hate_df['label'] = 1

nonhate_df = conan[['counterSpeech']].drop_duplicates()
nonhate_df = nonhate_df.rename(columns={'counterSpeech': 'sentence'})
nonhate_df = nonhate_df.reset_index(drop=True)
nonhate_df['label'] = 0

df = pd.concat([hate_df, nonhate_df], ignore_index=True)

contexts = df['sentence'].tolist()
labels = df['label'].tolist()

dataset = MyDataset(contexts, labels)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
batch_size = 32
epochs = 3


train_dataset = Subset(dataset, indices=range(0, train_size))
val_dataset = Subset(dataset, indices=range(train_size, train_size + val_size))

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Initialize Model and Tokenizer
# model_path = "cardiffnlp/twitter-roberta-base-hate-latest"
# model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
model_path = "/data/shared/llama2/llama/7B-Chat/"
model = LlamaForSequenceClassification.from_pretrained(model_path).to(device)
# model = LlamaForSequenceClassification.from_pretrained(model_path, load_in_8bit=True).to(device)
tokenizer = LlamaTokenizer.from_pretrained(model_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
lr=1e-5
optimizer = AdamW(model.parameters(), lr=lr)

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)

# Fine-tuning Loop
for epoch in trange(epochs, desc="Epoch"):
    model.train()

    # Training Loop
    for batch in tqdm(train_dataloader, desc="Training"):
        texts, labels = batch
        texts = list(texts)
        texts = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
        labels = torch.tensor(labels).to(device)

        optimizer.zero_grad()
        outputs = model(**texts, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Validation Loop
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validation"):
            texts, labels = batch
            texts = list(texts)
            texts = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
            labels = torch.tensor(labels).to(device)

            outputs = model(**texts, labels=labels)
            val_loss += outputs.loss.item()

    print(f"Epoch: {epoch+1}, Validation Loss: {val_loss / len(val_dataloader)}")

model_name = f"{model_path}-lr{lr}-epoch{epoch}"
model.save_pretraine


# CardiffNLP Finetune

In [1]:
import sys
import os
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import argparse


def train_hate_model(args):
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    conan = pd.read_csv("/data/shared/hate_speech_dataset/CONAN.csv")
    hate_df = conan[['hateSpeech']].drop_duplicates()
    hate_df = hate_df.rename(columns={'hateSpeech': 'sentence'})
    hate_df = hate_df.reset_index(drop=True)
    hate_df['labels'] = 1

    nonhate_df = conan[['counterSpeech']].drop_duplicates()
    nonhate_df = nonhate_df.rename(columns={'counterSpeech': 'sentence'})
    nonhate_df = nonhate_df.reset_index(drop=True)
    nonhate_df['labels'] = 0
    train_df = pd.concat([hate_df, nonhate_df], ignore_index=True)
    
    compare_datasets = "/data/jzheng36/hatemoderate/hatemoderate/fine_tune/cardiffnlp.pkl"

    model_args = ClassificationArgs()
    model_args.learning_rate = args.learning_rate
    model_args.num_train_epochs = args.n_epoch
    model_args.train_batch_size = 32
    model_args.eval_batch_size = 32
    model_args.n_gpu = 4
    model_args.output_dir = "{}_lr={}_epoch={}_hatemoderate".format(args.model_name.replace("/", "-"), args.learning_rate, args.n_epoch)
    model_args.overwrite_output_dir = True
    model_args.save_best_model = True
    model_args.use_multiprocessing = False
    model_args.use_multiprocessing_for_evaluation = False
    model_args.evaluate_during_training = False


    model = ClassificationModel(args.model_type, args.model_name, num_labels=2, args=model_args)

    cardiffnlp_datasets = pd.read_pickle(compare_datasets)
    cardiffnlp_datasets = cardiffnlp_datasets.rename(columns={"label": "labels"})
    cardiffnlp_datasets = cardiffnlp_datasets[cardiffnlp_datasets['split'] != 'test']

    columns = ["text", "labels"]

    train_df = train_df.rename(columns={"sentence": "text"}).sample(frac=1)
    if args.include == True:
        train_df = pd.concat([train_df[columns], cardiffnlp_datasets[columns]])
    else:
        train_df = pd.concat([cardiffnlp_datasets[columns]])



    model.train_model(train_df=train_df)



if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train a hate speech classification model.")
    parser.add_argument("--model_name", type=str, help="The name or path of the pre-trained model.")
    parser.add_argument("--learning_rate", type=float, default=5e-6, help="The learning rate to use for training. Default: 5e-6.")
    parser.add_argument("--n_epoch", type=int, default=3, help="The number of epochs to train for. Default: 3.")
    parser.add_argument("--model_type", type=str, default="roberta", help="The type of the model (e.g., 'roberta', 'bert'). Default: 'roberta'.")
    parser.add_argument("--include", action="store_true", default=True, help="Whether to include the hatemoderate dataset in training. Default: True.")
    parser.add_argument("--no-include", action="store_false", dest="include",
                        help="Do not include the hatemoderate dataset in training.")
    args = parser.parse_args()
    train_hate_model(args)



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda-11.6/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.0
CUDA SETUP: Detected CUDA version 116
CUDA SETUP: Loading binary /data/installation/anaconda3/envs/lora/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda116_nocublaslt.so...


  warn(msg)
  warn(msg)
  warn(f"Failed to load image Python extension: {e}")
usage: ipykernel_launcher.py [-h] [--model_name MODEL_NAME]
                             [--learning_rate LEARNING_RATE]
                             [--n_epoch N_EPOCH] [--model_type MODEL_TYPE]
                             [--include] [--no-include]
ipykernel_launcher.py: error: unrecognized arguments: -f /home/jzheng36/.local/share/jupyter/runtime/kernel-a5f38b97-ca85-4db1-8b01-3c69a43a1e85.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
