In [14]:
from transformers import BertTokenizer, BertForMaskedLM, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
import random
import numpy as np
import pandas as pd
import spacy
import re
import stopwordsiso as stopwords
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset

In [2]:
file_path = '..//task_1//data//text_all.jsonl'  
data = pd.read_json(file_path, lines=True)


labels_map = {
    0: "pozytywny wydźwięk",
    1: "neutralny wydźwięk",
    2: "negatywny wydźwięk",
    3: "mowa nienawiści",
    '0': "pozytywny wydźwięk",
    '1': "neutralny wydźwięk",
    '2': "negatywny wydźwięk",
    '3': "mowa nienawiści"
}

data["label"] = data["label"].replace(labels_map)
data['label'] = data['label'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else x)

data['label'] = data['label'].astype(str)

In [3]:
nlp = spacy.load("pl_core_news_sm")

polish_stopwords = stopwords.stopwords("pl")

def preprocess_text(text):
    text = re.sub(r'@\w+', '', text)

    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.strip()

    doc = nlp(text)

    lemmatized_words = [token.lemma_ for token in doc if token.lemma_.lower() not in polish_stopwords]

    return ' '.join(lemmatized_words)

data['cleaned_text'] = data['text'].apply(preprocess_text)


In [4]:
data

Unnamed: 0,text,label,cleaned_text
0,"@USER Nic, nic,nic nieważne, jutro albo w najb...",neutralny wydźwięk,nicnica nieważny jutro bliski czas odezwę narka
1,@USER Kibic @USER odpowiada @USER i @USER na k...,neutralny wydźwięk,kibic odpowiadać krytyka Manuel Junconnh...
2,Mówi że stare rapsy są całkiem niezle,neutralny wydźwięk,mówić stary rapsy całkiem niezle
3,"@USER @USER Zaległości były, ale ważne czy był...",neutralny wydźwięk,Zaległość ważny wezwać zapłata klub wywiązać
4,@USER @USER Oby nie spierdolil na północ,negatywny wydźwięk,Oby spierdolil północ
...,...,...,...
4436,"@USER Noc? To wtedy, gdy jest ciemno? Bo żadne...",neutralny wydźwięk,noc ciemno różnica porównanie dzień
4437,"wszędzie dobrze, ale w grobie najlepiej",mowa nienawiści,wszędzie groba
4438,@USER a ile zagrał tam minut ?,neutralny wydźwięk,zagrać minuta
4439,@USER #FinalSix: Mamy to !!! Puchar Polski jes...,pozytywny wydźwięk,FinalSix mieć puchar Polska Wisła CanPack ...


In [18]:
texts = data['cleaned_text'].tolist()
labels = data['label'].tolist()

label_to_id = {label: idx for idx, label in enumerate(set(labels))}
num_labels = len(label_to_id)
numeric_labels = [label_to_id[label] for label in labels]

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

def mask_tokens(inputs, mask_prob=0.15):
    outputs = []
    for input_ids in inputs:
        output_ids = input_ids.clone()
        num_to_mask = int(len(input_ids) * mask_prob)  
        mask_indices = random.sample(range(len(input_ids)), num_to_mask) 

        for idx in mask_indices:
            output_ids[idx] = tokenizer.mask_token_id  
        
        outputs.append(output_ids.unsqueeze(0))  
    return torch.cat(outputs)  

inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)

augmented_inputs = mask_tokens(inputs['input_ids'])
augmented_labels = numeric_labels * (augmented_inputs.size(0) // inputs['input_ids'].size(0))  # Powiel etykiety

combined_inputs = torch.cat((inputs['input_ids'], augmented_inputs), dim=0)
combined_labels = numeric_labels + augmented_labels

train_inputs, val_inputs, train_labels, val_labels = train_test_split(combined_inputs, combined_labels, test_size=0.2)

class CustomDataset(Dataset):
    def __init__(self, input_ids, labels):
        self.input_ids = input_ids
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long) 
        }

train_dataset = CustomDataset(train_inputs, train_labels)
val_dataset = CustomDataset(val_inputs, val_labels)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch",  
    logging_dir='./logs',         
    logging_steps=500,
    logging_first_step=True,  
    report_to='tensorboard'  
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
results = trainer.evaluate()

results

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/168 [25:59<?, ?it/s]
  0%|          | 2/2667 [00:00<07:58,  5.57it/s]

{'loss': 1.4287, 'grad_norm': 6.49643087387085, 'learning_rate': 4.998125234345707e-05, 'epoch': 0.0}


 19%|█▉        | 502/2667 [00:37<02:42, 13.33it/s]

{'loss': 1.2961, 'grad_norm': 4.030407905578613, 'learning_rate': 4.0626171728533934e-05, 'epoch': 0.56}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 33%|███▎      | 890/2667 [01:09<18:58,  1.56it/s]

{'eval_loss': 1.296167016029358, 'eval_accuracy': 0.449634214969049, 'eval_f1': 0.2789268150312656, 'eval_precision': 0.20217092727083294, 'eval_recall': 0.449634214969049, 'eval_runtime': 3.7844, 'eval_samples_per_second': 469.559, 'eval_steps_per_second': 58.926, 'epoch': 1.0}


 38%|███▊      | 1002/2667 [01:18<02:11, 12.70it/s]

{'loss': 1.2994, 'grad_norm': 4.1368231773376465, 'learning_rate': 3.1252343457067865e-05, 'epoch': 1.12}


 56%|█████▋    | 1502/2667 [01:56<01:30, 12.82it/s]

{'loss': 1.2934, 'grad_norm': 3.506744861602783, 'learning_rate': 2.18785151856018e-05, 'epoch': 1.69}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

 67%|██████▋   | 1780/2667 [02:21<10:11,  1.45it/s]

{'eval_loss': 1.287301778793335, 'eval_accuracy': 0.449634214969049, 'eval_f1': 0.2789268150312656, 'eval_precision': 0.20217092727083294, 'eval_recall': 0.449634214969049, 'eval_runtime': 4.034, 'eval_samples_per_second': 440.501, 'eval_steps_per_second': 55.279, 'epoch': 2.0}


 75%|███████▌  | 2002/2667 [02:39<00:52, 12.67it/s]

{'loss': 1.2783, 'grad_norm': 4.280080318450928, 'learning_rate': 1.2504686914135733e-05, 'epoch': 2.25}


 94%|█████████▍| 2502/2667 [03:18<00:12, 13.00it/s]

{'loss': 1.2916, 'grad_norm': 5.238785743713379, 'learning_rate': 3.1308586426696664e-06, 'epoch': 2.81}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

100%|██████████| 2667/2667 [03:36<00:00, 12.29it/s]


{'eval_loss': 1.2880703210830688, 'eval_accuracy': 0.449634214969049, 'eval_f1': 0.2789268150312656, 'eval_precision': 0.20217092727083294, 'eval_recall': 0.449634214969049, 'eval_runtime': 4.1117, 'eval_samples_per_second': 432.183, 'eval_steps_per_second': 54.236, 'epoch': 3.0}
{'train_runtime': 216.9358, 'train_samples_per_second': 98.255, 'train_steps_per_second': 12.294, 'train_loss': 1.2899724801560222, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 223/223 [00:04<00:00, 51.82it/s]


{'eval_loss': 1.2880703210830688,
 'eval_accuracy': 0.449634214969049,
 'eval_f1': 0.2789268150312656,
 'eval_precision': 0.20217092727083294,
 'eval_recall': 0.449634214969049,
 'eval_runtime': 4.3228,
 'eval_samples_per_second': 411.081,
 'eval_steps_per_second': 51.588,
 'epoch': 3.0}