# Klasyfikacja tekstu za pomocą BERT i GPT2

In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

  from .autonotebook import tqdm as notebook_tqdm


## BertForSequenceClassification

In [2]:
dataset = load_dataset(path = 'json', data_files = '../task_1/data/full_text_classification.jsonl')
dataset['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 4441
})

In [3]:
def convert_labels(example):
    convert_dict = {'pozytywny wydźwięk': 0,
                    'neutralny wydźwięk': 1,
                    'negatywny wydźwięk': 2,
                    'mowa nienawiści': 3}
    example['label'] = convert_dict[example['label']]
    return example

In [4]:
dataset_to_split = dataset['train'].train_test_split(test_size = 0.2)
train_dataset = dataset_to_split['train'].map(convert_labels)
val_dataset = dataset_to_split['test'].map(convert_labels)

Map: 100%|██████████| 3552/3552 [00:00<00:00, 14198.13 examples/s]
Map: 100%|██████████| 889/889 [00:00<00:00, 12699.84 examples/s]


In [5]:
train_dataset[0]

{'text': '@USER Prezes się cieszy ze Wisla odpadła bo w razie finału Legia-Wisla znów by musiał finał odwoływać',
 'label': 1}

In [None]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch['text'], padding = 'max_length', truncation = True, max_length = 128)

train_dataset = train_dataset.map(tokenize, batched = True)
val_dataset = val_dataset.map(tokenize, batched = True)

train_dataset.set_format('torch', columns = ['input_ids', 'attention_mask', 'label'])
val_dataset.set_format('torch', columns = ['input_ids', 'attention_mask', 'label'])

Map: 100%|██████████| 3552/3552 [00:01<00:00, 2575.62 examples/s]
Map: 100%|██████████| 889/889 [00:00<00:00, 2544.32 examples/s]


In [7]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels = 4)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis = 1)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average = 'weighted')
    return {'accuracy': accuracy, 'f1': f1}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
training_args = TrainingArguments(
    output_dir = './results',
    eval_strategy = 'epoch',
    learning_rate = 2e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 1,
    weight_decay = 0.01,
    logging_dir = '/logs',
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    compute_metrics = compute_metrics,
)

trainer.train()

                                                 
100%|██████████| 444/444 [21:28<00:00,  2.90s/it]

{'eval_loss': 1.1393370628356934, 'eval_accuracy': 0.5365579302587177, 'eval_f1': 0.42207964617123633, 'eval_runtime': 73.9343, 'eval_samples_per_second': 12.024, 'eval_steps_per_second': 1.515, 'epoch': 1.0}
{'train_runtime': 1288.7082, 'train_samples_per_second': 2.756, 'train_steps_per_second': 0.345, 'train_loss': 1.1422541850322, 'epoch': 1.0}





TrainOutput(global_step=444, training_loss=1.1422541850322, metrics={'train_runtime': 1288.7082, 'train_samples_per_second': 2.756, 'train_steps_per_second': 0.345, 'total_flos': 233646812725248.0, 'train_loss': 1.1422541850322, 'epoch': 1.0})

In [9]:
eval_results = trainer.evaluate()
print(eval_results)

100%|██████████| 112/112 [01:15<00:00,  1.49it/s]

{'eval_loss': 1.1393370628356934, 'eval_accuracy': 0.5365579302587177, 'eval_f1': 0.42207964617123633, 'eval_runtime': 75.957, 'eval_samples_per_second': 11.704, 'eval_steps_per_second': 1.475, 'epoch': 1.0}





In [10]:
!mkdir models
model.save_pretrained('models')
tokenizer.save_pretrained('models')

A subdirectory or file models already exists.




('models\\tokenizer_config.json',
 'models\\special_tokens_map.json',
 'models\\vocab.txt',
 'models\\added_tokens.json')

In [12]:
model.config.id2label = {0: 'pozytywny wydźwięk', 
                         1: 'neutralny wydźwięk', 
                         2: 'negatywny wydźwięk', 
                         3: 'mowa nienawiści'}

model.config.id2label

{0: 'pozytywny wydźwięk',
 1: 'neutralny wydźwięk',
 2: 'negatywny wydźwięk',
 3: 'mowa nienawiści'}

In [13]:
from transformers import pipeline

clsf = pipeline('text-classification', model = model, tokenizer = tokenizer)
result = clsf('Naprawde bardzo ładny pies!')
result

[{'label': 'pozytywny wydźwięk', 'score': 0.3628515899181366}]

In [18]:
preds_dict = {text: clsf(text) for text in val_dataset['text']}