In [1]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import emoji
import re
from peft import LoraConfig, TaskType, PeftModel, get_peft_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,
    AutoModel,
    AutoTokenizer,
    CLIPModel,
    CLIPProcessor,
    Trainer,
    TrainingArguments,
    pipeline,
)
import warnings

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
from huggingface_hub import login
login(token="...")

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
model_name = "microsoft/deberta-v3-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, output_attentions=False, output_hidden_states=False,
).to(device)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
class HateSpeechDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_length=512):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length


    def preprocess_text(self, text):
        text = emoji.replace_emoji(text, replace='')
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'^RT\s+', '', text, flags=re.IGNORECASE)
        text = re.sub(r'http[s]?://\S+', '', text)
        text = re.sub(r'www\.\S+', '', text)
        text = text.replace('\n', ' ').replace('\t', ' ')
        text = ' '.join(text.split())
        text = text.strip()
                
        return text
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = self.preprocess_text(str(self.sentences[idx]))
        encoding = self.tokenizer(
            sentence,
            truncation=True,           
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

In [6]:
df = pd.read_csv("hate_train.csv")
df.head()

Unnamed: 0,sentence,label
0,Dla mnie faworytem do tytułu będzie Cracovia. ...,0
1,@anonymized_account @anonymized_account Brawo ...,0
2,"@anonymized_account @anonymized_account Super,...",0
3,@anonymized_account @anonymized_account Musi. ...,0
4,"Odrzut natychmiastowy, kwaśna mina, mam problem",0


In [7]:
sentences = df['sentence'].tolist()
labels = df['label'].tolist()

In [8]:
print("Rozkład klas przed oversampling:")
print(df['label'].value_counts())
print(f"Procent klasy 1: {(df['label']==1).sum() / len(df) * 100:.2f}%")
print(f"Całkowita liczba próbek: {len(df)}")

multiplier = 5

positive_samples = df[df['label'] == 1]

duplicated_positives = pd.concat([positive_samples] * (multiplier - 1), ignore_index=True)

df_oversampled = pd.concat([df, duplicated_positives], ignore_index=True)

df_oversampled = df_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)

print("\nRozkład klas po oversampling:")
print(df_oversampled['label'].value_counts())
print(f"Procent klasy 1: {(df_oversampled['label']==1).sum() / len(df_oversampled) * 100:.2f}%")
print(f"Całkowita liczba próbek: {len(df_oversampled)}")

sentences = df_oversampled['sentence'].tolist()
labels = df_oversampled['label'].tolist()

X_train, X_val, y_train, y_val = train_test_split(
    sentences, 
    labels, 
    test_size=0.2,
    random_state=42,
    stratify=labels
)

print(f"\nPo train_test_split:")
print(f"Train: {len(X_train)} próbek")
print(f"Val: {len(X_val)} próbek")

Rozkład klas przed oversampling:
label
0    9190
1     851
Name: count, dtype: int64
Procent klasy 1: 8.48%
Całkowita liczba próbek: 10041

Rozkład klas po oversampling:
label
0    9190
1    4255
Name: count, dtype: int64
Procent klasy 1: 31.65%
Całkowita liczba próbek: 13445

Po train_test_split:
Train: 10756 próbek
Val: 2689 próbek


In [9]:
train_dataset = HateSpeechDataset(sentences=X_train, labels=y_train, tokenizer=tokenizer)
eval_dataset = HateSpeechDataset(sentences=X_val, labels=y_val, tokenizer=tokenizer)

In [10]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {trainable_params:,}")

print(model)

Number of trainable parameters: 184,423,682
DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Lay

In [11]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "query_proj", 
        "key_proj", 
        "value_proj", 
        "dense"
    ],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
)

# Prepare model for LoRA fine-tuning
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

print(model)

training_args = TrainingArguments(
    output_dir="./lora_results",
    num_train_epochs=20,
    per_device_train_batch_size=25,
    per_device_eval_batch_size=10,
    logging_dir="./lora_logs",
    logging_steps=100,
    learning_rate=5e-4,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DebertaV2ForSequenceClassification(
      (deberta): DebertaV2Model(
        (embeddings): DebertaV2Embeddings(
          (word_embeddings): Embedding(128100, 768, padding_idx=0)
          (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
          (dropout): StableDropout()
        )
        (encoder): DebertaV2Encoder(
          (layer): ModuleList(
            (0-11): 12 x DebertaV2Layer(
              (attention): DebertaV2Attention(
                (self): DisentangledSelfAttention(
                  (query_proj): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=16, bias=False)
                    )

In [12]:
# Print number of trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {trainable_params:,}")

Number of trainable parameters: 2,680,322


In [13]:
trainer.train()

Step,Training Loss
100,0.5774
200,0.4824
300,0.4153
400,0.3457
500,0.3047
600,0.2551
700,0.2425
800,0.2346
900,0.1847
1000,0.1485


Checkpoint destination directory ./lora_results/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./lora_results/checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./lora_results/checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./lora_results/checkpoint-2000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./lora_results/checkpoint-2500 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./lora_results/checkpoint-3000 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./lora_results/checkpoint-3500 already exists and is non-empty.Savin

TrainOutput(global_step=8620, training_loss=0.06875721449929174, metrics={'train_runtime': 3417.5301, 'train_samples_per_second': 62.946, 'train_steps_per_second': 2.522, 'total_flos': 5.837275244101632e+16, 'train_loss': 0.06875721449929174, 'epoch': 20.0})

In [27]:
model.save_pretrained("./lora_adapters")
tokenizer.save_pretrained("./lora_adapters")

# base_model = AutoModelForSequenceClassification.from_pretrained(
#     "microsoft/deberta-v3-base",
#     num_labels=2,
#     output_attentions=False,
#     output_hidden_states=False,
# )

# model = PeftModel.from_pretrained(base_model, "./lora_adapters")
# model = model.to(device)

# # Załaduj tokenizer
# tokenizer = AutoTokenizer.from_pretrained("./lora_adapters")

('./lora_adapters/tokenizer_config.json',
 './lora_adapters/special_tokens_map.json',
 './lora_adapters/spm.model',
 './lora_adapters/added_tokens.json',
 './lora_adapters/tokenizer.json')

In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=10, shuffle=False)

all_predictions = []
all_labels = []

model.eval()

# Evaluate without gradient computation
with torch.no_grad():
    for batch in eval_dataloader:
        # Move batch to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Get model predictions
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)

        # Store predictions and labels
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(all_labels, all_predictions)
precision = precision_score(all_labels, all_predictions)
recall = recall_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions)

print(f"Model Accuracy:  {accuracy:.4f}")
print(f"Precision:       {precision:.4f}")
print(f"Recall:          {recall:.4f}")
print(f"F1-Score:        {f1:.4f}")

Model Accuracy:  0.9803
Precision:       0.9414
Recall:          1.0000
F1-Score:        0.9698


In [17]:
class TestDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length=256):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def preprocess_text(self, text):
        text = emoji.replace_emoji(text, replace='')
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'^RT\s+', '', text, flags=re.IGNORECASE)
        text = re.sub(r'http[s]?://\S+', '', text)
        text = re.sub(r'www\.\S+', '', text)
        text = text.replace('\n', ' ').replace('\t', ' ')
        text = ' '.join(text.split())
        text = text.strip()
                
        return text
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = self.preprocess_text(str(self.sentences[idx]))
        encoding = self.tokenizer(
            sentence,
            truncation=True,           
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

In [29]:
test_sentences = []
with open('hate_test_data.txt', 'r', encoding='utf-8') as f:
    test_sentences = [line.strip() for line in f if line.strip()]
    
test_dataset = TestDataset(sentences=test_sentences, tokenizer=tokenizer)

test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

# Predykcje
all_predictions = []

model.eval()

with torch.no_grad():
    for batch in test_dataloader:
        # Przenieś na device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        
        # Predykcje
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)
        
        # Zapisz predykcje
        all_predictions.extend(predictions.cpu().numpy())

# Zapisz do CSV
df_predictions = pd.DataFrame({
    'prediction': all_predictions
})

df_predictions.to_csv('pred.csv', index=False, header=False)

# Wyświetl kilka przykładów
print("\nPrzykładowe predykcje:")
for i in range(100):
    print(f"{i+1}. '{test_sentences[i][:512]}...' -> {all_predictions[i]}")


Przykładowe predykcje:
1. '@anonymized_account Spoko, jak im Duda z Morawieckim zamówią po pięć piw to wszystko będzie ok....' -> 0
2. '@anonymized_account @anonymized_account Ale on tu nie miał szans jej zagrania, a ta 'proba' to czysta prowizorka....' -> 0
3. '@anonymized_account No czy Prezes nie miał racji, mówiąc,ze to są zdradzieckie mordy? No czy nie miał racji?😁😁...' -> 0
4. '@anonymized_account @anonymized_account Przecież to nawet nie jest przewrotka 😂...' -> 0
5. '@anonymized_account @anonymized_account Owszem podatki tak. Ale nie w takich okolicznościach. Czemu Małysza odpalili z teamu Orlen?...' -> 0
6. '@anonymized_account @anonymized_account skąd wiesz jaki Skendija ma budżet skoro mówisz że jest bogatsza ? Tylko dwóch zawodników ponoć dobrze zarabia....' -> 0
7. 'Z tego, co widzę, to kibice Widzewa mają szczęście, że trwa mundial. Dzięki temu ogólnopolska szydera jest tylko z Argentyny i Messiego....' -> 0
8. '@anonymized_account @anonymized_account @anonymized_account

In [22]:
# Wypisz zdania z przewidywaną klasą 1
print("\n" + "="*50)
print("ZDANIA ZAKLASYFIKOWANE JAKO KLASA 1:")
print("="*50 + "\n")

# Znajdź indeksy gdzie przewidywania = 1
predicted_class_1_indices = [i for i, pred in enumerate(all_predictions) if pred == 1]

# Wypisz zdania
for i, idx in enumerate(predicted_class_1_indices):
    print(f"{i+1}. Zdanie #{idx}:")
    print(f"   Tekst: {X_val[idx]}")
    print(f"   Rzeczywista etykieta: {all_labels[idx]}")
    print(f"   Przewidywana etykieta: {all_predictions[idx]}")
    print("-" * 40)

# Podsumowanie
print(f"\nZnaleziono {len(predicted_class_1_indices)} zdań zaklasyfikowanych jako klasa 1")
print(f"To stanowi {len(predicted_class_1_indices)/len(all_predictions)*100:.2f}% wszystkich zdań")

# Opcjonalnie: wypisz tylko pierwsze N zdań jeśli jest ich dużo
MAX_DISPLAY = 20
if len(predicted_class_1_indices) > MAX_DISPLAY:
    print(f"\n(Pokazuję tylko pierwsze {MAX_DISPLAY} zdań)")
    for i in range(MAX_DISPLAY):
        idx = predicted_class_1_indices[i]
        print(f"\n{i+1}. {X_val[idx]}")


ZDANIA ZAKLASYFIKOWANE JAKO KLASA 1:

1. Zdanie #24:
   Tekst: @anonymized_account bo wie już to kraina tą, kwa kwa agora zła i gęgę zły tvn
   Rzeczywista etykieta: 0
   Przewidywana etykieta: 1
----------------------------------------
2. Zdanie #31:
   Tekst: @anonymized_account Jak gram na Orliku to nikt mi wynagrodzenia za to niedaje a biegam i staram się na całego wiec uważam ze to słabe wytłumaczenie
   Rzeczywista etykieta: 0
   Przewidywana etykieta: 1
----------------------------------------
3. Zdanie #37:
   Tekst: @anonymized_account Miłego dnia Ci życzę 🌼
   Rzeczywista etykieta: 0
   Przewidywana etykieta: 1
----------------------------------------
4. Zdanie #51:
   Tekst: RT @anonymized_account @anonymized_account Kiedy ten patologiczny kłamca skończy wreszcie opluwać Polaków i Polskę czy niema paragrafu na tę hienę?
   Rzeczywista etykieta: 1
   Przewidywana etykieta: 1
----------------------------------------
5. Zdanie #60:
   Tekst: @anonymized_account Obronił wczoraj