In [1]:
# !pip install datasets
# !pip install evaluate
# ! pip install -U accelerate
# ! pip install -U transformers

In [25]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fri May 17 20:26:54 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.86.10              Driver Version: 535.86.10    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           On  | 00000000:1C:00.0 Off |                    0 |
| N/A   35C    P0              78W / 300W |  19467MiB / 32768MiB |     26%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2-32GB           On  | 00000000:1D:00.0 Off |  

In [1]:
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
from bs4 import BeautifulSoup
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import copy
from tqdm import tqdm
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Label-name -> class-index lookup; the inverse map is derived from it so the
# two can never drift apart.
label2id = {"Not_hate": 0, "Hate": 1}
id2label = {index: name for name, index in label2id.items()}

In [4]:
# Single checkpoint name shared by the tokenizer and the classifier.
checkpoint = "microsoft/deberta-xlarge-mnli"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    label2id=label2id,
    id2label=id2label,
    # presumably required because the checkpoint's classification head does not
    # have the 2 outputs requested here — TODO confirm
    ignore_mismatched_sizes=True,
)

# Load model directly
# from transformers import AutoTokenizer, AutoModelForMaskedLM

# tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
# model = AutoModelForMaskedLM.from_pretrained("FacebookAI/xlm-roberta-base")

In [5]:
# Load the dataset from a local directory.
# NOTE(review): absolute, user-specific path — consider a configurable data
# directory so the notebook can run on another machine.
data = load_dataset("/home/sslashinin/kovakimyan/diploma/dataset/super_toxic/")

In [6]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    The substitutions are case-sensitive and applied one after another in the
    order listed below.
    """
    # Order matters: the irregular "won't" / "can't" forms must be rewritten
    # before the generic "n't" rule gets a chance to split them.
    rules = (
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

def cleanPunc(sentence):
    """Strip or blank out punctuation in ``sentence``.

    ``? ! ' " #`` and ``|`` are deleted; ``. , ) ( /`` become a single space.
    The result is then stripped of surrounding whitespace and any remaining
    newline is turned into a space.
    """
    # Inside a character class "|" is a literal, so pipes are removed as well.
    without_marks = re.compile(r'[?|!|\'|"|#]').sub(r'', sentence)
    # "\|" is an escaped pipe, so a backslash itself is NOT matched here.
    spaced_out = re.compile(r'[.|,|)|(|\|/]').sub(r' ', without_marks)
    return spaced_out.strip().replace("\n", " ")

# Filler words removed in addition to NLTK's English stop-word list.
_EXTRA_STOP_WORDS = frozenset([
    'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
    'nine', 'ten', 'may', 'also', 'across', 'among', 'beside', 'however',
    'yet', 'within',
])

# Filled on first use so the stop-word list is built once, not once per token.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the combined (NLTK + extra) stop-word set, building it only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english')) | _EXTRA_STOP_WORDS
    return _STOP_WORDS_CACHE


def clear_sentance(sentance):
    """Normalize one raw text into a lower-cased, stop-word-free string.

    Steps, in order: drop URLs, strip HTML markup, expand contractions
    (``decontracted``), remove punctuation (``cleanPunc``), drop any token
    containing a digit, replace every run of non-letters with a space, then
    lower-case the words and discard stop words.

    Args:
        sentance: The raw text to clean.

    Returns:
        The cleaned text, words separated by single spaces.
    """
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    sentance = cleanPunc(sentance)
    # Raw string: "\S" / "\d" are regex escapes, not Python string escapes.
    sentance = re.sub(r"\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    # Filter against the extended set. Previously the set was built but the
    # filter looked words up in a freshly rebuilt stopwords.words('english')
    # list for every token, so the extra words were never removed.
    stop_words = _english_stop_words()
    lowered = (word.lower() for word in sentance.split())
    return ' '.join(word for word in lowered if word not in stop_words)

In [7]:
# No earlier cell defines `dataset`; only `data` (the load_dataset result) exists,
# so on a fresh kernel this cell raised NameError.
# NOTE(review): assumes the examples live in the default "train" split — confirm.
dataset = data["train"]

# Clean every raw text, preserving row order so the list can be added as a column.
preprocessed_text = [clear_sentance(sentance) for sentance in tqdm(dataset['text'])]

  sentance = BeautifulSoup(sentance, 'lxml').get_text()
100%|██████████| 5000/5000 [00:54<00:00, 92.44it/s] 


In [8]:
# add_column returns a new Dataset and leaves `dataset` as it was, so the
# defensive deepcopy that used to precede this call is unnecessary.
new_dataset = dataset.add_column("preprocessed_text", preprocessed_text)

In [10]:
# Fixed seed so the 70/30 split — and every metric computed on it — is the same
# on each run of the notebook.
SPLIT_SEED = 42

# train_test_split / remove_columns return new objects, so no copy is needed.
en_dataset = new_dataset

en_dataset_split = en_dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)

# Only the cleaned text is tokenized later; drop the raw column from both splits.
for split_name in ("train", "test"):
    en_dataset_split[split_name] = en_dataset_split[split_name].remove_columns("text")

en_dataset_split

DatasetDict({
    train: Dataset({
        features: ['label', 'preprocessed_text'],
        num_rows: 3500
    })
    test: Dataset({
        features: ['label', 'preprocessed_text'],
        num_rows: 1500
    })
})

In [11]:
def preprocess_function(examples):
    """Tokenize the ``preprocessed_text`` field of ``examples`` with truncation on.

    Uses the module-level ``tokenizer`` and returns its output unchanged.
    """
    texts = examples["preprocessed_text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize both splits; with batched=True preprocess_function receives groups of
# examples per call rather than one example at a time.
tokenized_dataset = en_dataset_split.map(preprocess_function, batched=True)

tokenized_dataset

Map: 100%|██████████| 3500/3500 [00:00<00:00, 3698.12 examples/s]
Map: 100%|██████████| 1500/1500 [00:00<00:00, 5895.09 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'preprocessed_text', 'input_ids', 'attention_mask'],
        num_rows: 3500
    })
    test: Dataset({
        features: ['label', 'preprocessed_text', 'input_ids', 'attention_mask'],
        num_rows: 1500
    })
})

In [12]:
# Padding collator built from the tokenizer; handed to the Trainer as data_collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
# Accuracy metric loaded through the `evaluate` library; used by compute_metrics.
accuracy = evaluate.load("accuracy")

In [14]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` unpacks into (model outputs, reference labels). The predicted
    class is the arg-max along axis 1 of the outputs; the result is whatever the
    module-level ``accuracy`` metric returns.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(references=references, predictions=predicted_ids)

In [15]:
!export WANDB_DISABLED=true

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Fine-tuning configuration for the Trainer below.
# NOTE(review): with load_best_model_at_end commented out, the weights kept after
# 15 epochs are presumably those of the last epoch, not the best-scoring one — confirm
# this is intended.
training_args = TrainingArguments(
    output_dir="hatespeech_detection_ft",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=15,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate once per epoch
    report_to="none",  # no external experiment tracker
    #save_strategy="epoch",
    #load_best_model_at_end=True
)

# The "test" split doubles as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    
)

# Train, then save the model (no path given, so the Trainer picks the location —
# presumably output_dir).
trainer.train()
trainer.save_model()

In [None]:
# Save the fine-tuned model to an explicit directory.
# NOTE(review): absolute, user-specific path; the following cell repeats this exact call.
trainer.save_model("/home/sslashinin/kovakimyan/diploma/hatespeech_detection_ft")

In [18]:
# NOTE(review): identical to the save_model call in the preceding cell — one of
# the two cells can probably be removed.
trainer.save_model("/home/sslashinin/kovakimyan/diploma/hatespeech_detection_ft")

In [19]:
import torch

In [20]:
def run_model(text):
    """Classify a single text with the fine-tuned model and print its label.

    Uses the module-level ``tokenizer`` and ``model``. As a side effect the
    model is switched to evaluation mode.

    Args:
        text: One input string.

    Returns:
        None. The predicted label name is printed.
    """
    # NOTE(review): training tokenized the `preprocessed_text` column (output of
    # clear_sentance) while this feeds raw text — confirm the mismatch is intended.
    # Truncation mirrors the setting used when tokenizing the training data.
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    # Follow the model's device instead of assuming a CUDA device is present.
    inputs = encoded.to(model.device)
    # Make sure dropout is off so predictions are deterministic.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    # Arg-max over the class dimension; .item() fails loudly if more than one
    # example was passed instead of silently indexing a flattened tensor.
    predicted_class_id = logits.argmax(dim=-1).item()
    print(model.config.id2label[predicted_class_id])

In [21]:
# Smoke-test the classifier on a few hand-written sentences.
for text in ("Fuck you", "You act like a fool", "Today is sunny"):
    run_model(text)

Hate
Hate
Not_hate
