### 1. Wykorzystać model BERT do klasyfikacji tekstu, aby rozpoznać, z której powieści (Anna Karenina lub Jane Eyre) pochodzi dany fragment tekstu.


1. Przygotuj dane wejściowe:
   - Podziel teksty obu powieści na fragmenty o stałej długości (np. 100 słów lub 5 zdań).
   - Przypisz etykiety: `0` dla *Anna Karenina*, `1` dla *Jane Eyre*.
2. Skorzystaj z modelu `BertForSequenceClassification` do klasyfikacji tekstu.
3. Przeprowadź fine-tuning modelu na przygotowanym zbiorze danych.
4. Oceń skuteczność modelu na zbiorze testowym.

In [None]:
import os
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.metrics import classification_report

In [None]:
# Read both novels fully into memory as UTF-8 text.
with open('anna_karenina.txt', encoding='utf-8') as anna_file:
    anna_text = anna_file.read()

with open('jane_eyre.txt', encoding='utf-8') as jane_file:
    jane_text = jane_file.read()

In [None]:
# Helper that splits a text into fixed-size word chunks.
def split_text(text, chunk_size=100):
    """Split ``text`` into consecutive chunks of ``chunk_size`` words.

    Words are the whitespace-separated tokens produced by ``str.split()``;
    each chunk is re-joined with single spaces. The final chunk holds the
    remaining words and may therefore be shorter than ``chunk_size``.

    Args:
        text: The text to split.
        chunk_size: Number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would make range() empty and silently return [].
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size}')
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]


In [None]:
# Split both novels into fixed-size chunks (default chunk size of split_text).
anna_chunks, jane_chunks = (split_text(novel) for novel in (anna_text, jane_text))


In [None]:
# Assemble the labelled dataset: 0 = Anna Karenina chunks, 1 = Jane Eyre chunks.
data = pd.DataFrame({
    'text': [*anna_chunks, *jane_chunks],
    'label': [0 for _ in anna_chunks] + [1 for _ in jane_chunks],
})

# Hold out 20% of the chunks as the test set (fixed seed for repeatable splits).
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# 2. Model and tokenizer preparation

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_data(texts, labels):
    """Tokenize ``texts`` and turn ``labels`` into a tensor.

    The tokenizer is called with truncation enabled, ``padding=True`` and a
    maximum length of 512, returning PyTorch tensors. ``labels`` must offer
    ``.tolist()`` (e.g. a pandas Series).

    Returns:
        A ``(encodings, label_tensor)`` pair.
    """
    batch_encoding = tokenizer(
        list(texts),
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt',
    )
    label_tensor = torch.tensor(labels.tolist())
    return batch_encoding, label_tensor


train_encodings, train_labels = tokenize_data(train_texts, train_labels)
test_encodings, test_labels = tokenize_data(test_texts, test_labels)

class TextDataset(Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` must be indexable by 'input_ids' and 'attention_mask';
    those entries and ``labels`` are indexed by sample position.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Gather the model inputs for this sample, then attach its label.
        sample = {
            key: self.encodings[key][idx]
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = self.labels[idx]
        return sample

# Wrap the tokenized splits and batch them; only the training data is shuffled.
train_dataset, test_dataset = (
    TextDataset(train_encodings, train_labels),
    TextDataset(test_encodings, test_labels),
)

train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=8)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# 3. Fine-tuning the model

# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

# Use PyTorch's own AdamW: the AdamW class exported by `transformers` is
# deprecated in favour of torch.optim.AdamW.
# NOTE(review): eps and weight_decay are set explicitly to mirror what are
# believed to be the defaults of the deprecated transformers class (PyTorch's
# defaults differ) — confirm against the transformers version in use.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def train_epoch(model, loader, optimizer, device=None):
    """Run one training pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes ``.loss``.
        loader: Iterable of dict batches with the keys 'input_ids',
            'attention_mask' and 'labels' (tensors).
        optimizer: Optimizer that updates the model's parameters.
        device: Device each batch is moved to. When ``None``, the device of
            the model's first parameter is used (CPU if it has none), so the
            function no longer depends on a module-level ``device`` variable.

    Returns:
        The average of the per-batch loss values as a float.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError.
        raise ValueError('train_epoch received a loader that yields no batches')
    return total_loss / num_batches

def evaluate(model, loader, device=None):
    """Compute the accuracy of ``model`` over all batches in ``loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        loader: Iterable of dict batches with the keys 'input_ids',
            'attention_mask' and 'labels' (tensors).
        device: Device the inputs are moved to. When ``None``, the device of
            the model's first parameter is used (CPU if it has none), so the
            function no longer depends on a module-level ``device`` variable.

    Returns:
        The value of ``accuracy_score(true_labels, predictions)``, where a
        prediction is the argmax over the last logits dimension.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            # Labels are only compared on the host, so skip the device round trip.
            true_labels.extend(batch['labels'].cpu().numpy())
    return accuracy_score(true_labels, predictions)

In [None]:
# Training: run a fixed number of epochs and report the loss returned for each.
epochs = 2
for epoch in range(epochs):
    train_loss = train_epoch(model=model, loader=train_loader, optimizer=optimizer)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {train_loss:.4f}')


Epoch 1/2, Loss: 0.0841
Epoch 2/2, Loss: 0.0323


In [None]:
# 4. Effectiveness evaluation on the test loader
accuracy = evaluate(model=model, loader=test_loader)
print(f'Test Accuracy: {accuracy:.4f}')


Test Accuracy: 0.9778


### 2. Wykorzystać model BERT do analizy toksyczności komentarzy.


1. Załaduj zbiór danych o toksycznych komentarzach (dostępny na platformie).
2. Skorzystaj z modelu `BertForSequenceClassification` i przeprowadź fine-tuning na tym zbiorze danych.
3. Oceń model na zbiorze testowym i zinterpretuj wyniki.
4. Przeprowadź analizę – wskaż komentarze, które model zaklasyfikował jako toksyczne, oraz te, które zaklasyfikował jako neutralne.


In [None]:
# Load the toxic-comments sample and derive a binary label from the score.
data = pd.read_csv('sample.csv')

# .copy() makes the column subset an independent frame, so adding 'label'
# below does not raise pandas' SettingWithCopyWarning.
data = data[['comment_text', 'target']].copy()
data['label'] = (data['target'] >= 0.5).astype(int)  # 1 for toxic, 0 for neutral

# Train/test split, stratified on the label so that both splits keep the same
# share of the much rarer toxic class.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['comment_text'],
    data['label'],
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)


In [None]:
# 2. Model and tokenizer preparation

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_data(texts, labels):
    """Tokenize ``texts`` and turn ``labels`` into a tensor.

    The tokenizer is called with truncation enabled, ``padding=True`` and a
    maximum length of 512, returning PyTorch tensors. ``labels`` must offer
    ``.tolist()`` (e.g. a pandas Series).

    Returns:
        A ``(encodings, label_tensor)`` pair.
    """
    batch_encoding = tokenizer(
        list(texts),
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt',
    )
    label_tensor = torch.tensor(labels.tolist())
    return batch_encoding, label_tensor


train_encodings, train_labels = tokenize_data(train_texts, train_labels)
test_encodings, test_labels = tokenize_data(test_texts, test_labels)

class ToxicCommentsDataset(Dataset):
    """Dataset pairing tokenized comments with their labels.

    ``encodings`` must be indexable by 'input_ids' and 'attention_mask';
    those entries and ``labels`` are indexed by sample position.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Gather the model inputs for this sample, then attach its label.
        sample = {
            key: self.encodings[key][idx]
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = self.labels[idx]
        return sample

# Wrap the tokenized splits and batch them; only the training data is shuffled.
train_dataset, test_dataset = (
    ToxicCommentsDataset(train_encodings, train_labels),
    ToxicCommentsDataset(test_encodings, test_labels),
)

train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=8)

In [None]:
# 3. Fine-tuning the model

# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

# Use PyTorch's own AdamW: the AdamW class exported by `transformers` is
# deprecated in favour of torch.optim.AdamW.
# NOTE(review): eps and weight_decay are set explicitly to mirror what are
# believed to be the defaults of the deprecated transformers class (PyTorch's
# defaults differ) — confirm against the transformers version in use.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

def train_epoch(model, loader, optimizer, device=None):
    """Run one training pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes ``.loss``.
        loader: Iterable of dict batches with the keys 'input_ids',
            'attention_mask' and 'labels' (tensors).
        optimizer: Optimizer that updates the model's parameters.
        device: Device each batch is moved to. When ``None``, the device of
            the model's first parameter is used (CPU if it has none), so the
            function no longer depends on a module-level ``device`` variable.

    Returns:
        The average of the per-batch loss values as a float.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError.
        raise ValueError('train_epoch received a loader that yields no batches')
    return total_loss / num_batches

def evaluate(model, loader, device=None):
    """Collect the model's predictions and the true labels over ``loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        loader: Iterable of dict batches with the keys 'input_ids',
            'attention_mask' and 'labels' (tensors).
        device: Device the inputs are moved to. When ``None``, the device of
            the model's first parameter is used (CPU if it has none), so the
            function no longer depends on a module-level ``device`` variable.

    Returns:
        A ``(predictions, true_labels)`` pair of lists in loader order, where
        a prediction is the argmax over the last logits dimension.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            # Labels are only collected on the host, so skip the device round trip.
            true_labels.extend(batch['labels'].cpu().numpy())
    return predictions, true_labels

In [None]:
# Training: run a fixed number of epochs and report the loss returned for each.
epochs = 2
for epoch in range(epochs):
    train_loss = train_epoch(model=model, loader=train_loader, optimizer=optimizer)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {train_loss:.4f}')


Epoch 1/2, Loss: 0.2811
Epoch 2/2, Loss: 0.2802


In [None]:
# 4. Model evaluation
predictions, true_labels = evaluate(model, test_loader)
# zero_division=0 reports undefined precision/recall (a class that is never
# predicted) as 0 explicitly instead of emitting a warning for each metric.
print(classification_report(
    true_labels,
    predictions,
    target_names=['Neutral', 'Toxic'],
    zero_division=0,
))


              precision    recall  f1-score   support

     Neutral       0.93      1.00      0.96      1858
       Toxic       0.00      0.00      0.00       142

    accuracy                           0.93      2000
   macro avg       0.46      0.50      0.48      2000
weighted avg       0.86      0.93      0.89      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# 5. Results analysis: line up each test comment with its true and predicted label.
results = pd.DataFrame({'comment': test_texts})
results['true_label'] = true_labels
results['predicted_label'] = predictions

In [None]:
# Partition the results by the label the model predicted, then preview the toxic ones.
toxic_comments = results.loc[results['predicted_label'] == 1]
neutral_comments = results.loc[results['predicted_label'] == 0]
print("Toxic Comments:", toxic_comments.head(), sep="\n")


Toxic Comments:
Empty DataFrame
Columns: [comment, true_label, predicted_label]
Index: []


In [None]:
# Preview the first comments the model predicted as neutral.
print("Neutral Comments:", neutral_comments.head(), sep="\n")


Neutral Comments:
                                                comment  true_label  \
6252  Ya, its almost like we need to do something be...           0   
4684  Trump is under investigation for his Russian t...           0   
1731  That argument makes no sense, WM. Society move...           0   
4742  Well then I certainly hope you are going to go...           0   
4521  Key words: "mythical and mystical" and opening...           0   

      predicted_label  
6252                0  
4684                0  
1731                0  
4742                0  
4521                0  
