In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import string

In [2]:
def replace_polish_lowercase(text):
    """Swap lowercase Polish diacritic letters for their plain Latin look-alikes.

    Only the nine lowercase letters listed below are mapped; every other
    character (including uppercase Polish letters) is returned unchanged.
    """
    # The two strings are aligned position by position: diacritic -> replacement.
    diacritics = 'ąćęłńóśźż'
    replacements = 'acelnoszz'
    return text.translate(str.maketrans(diacritics, replacements))

def preprocess_text(text):
    """Normalise one sentence before it is handed to a tokenizer.

    Steps, in order: lowercase, drop every character found in
    ``string.punctuation``, replace lowercase Polish letters via
    ``replace_polish_lowercase``, and collapse any run of whitespace into a
    single space (leading/trailing whitespace is removed).
    """
    lowered = text.lower()
    without_punctuation = ''.join(
        char for char in lowered if char not in string.punctuation
    )
    latinised = replace_polish_lowercase(without_punctuation)
    # split() with no argument discards empty pieces, so re-joining
    # yields single-space-separated words.
    return ' '.join(latinised.split())

In [3]:
# Paths to the dev / train / test splits of the dataset on Kaggle.
# NOTE(review): FILENAME_DEV is defined here but is not loaded by any cell shown in this notebook.
FILENAME_DEV = '/kaggle/input/dataset-conll/all.sentence.dev.txt'
FILENAME_TRAIN = '/kaggle/input/dataset-conll/all.sentence.train.txt'
FILENAME_TEST = '/kaggle/input/dataset-conll/all.sentence.test.txt'

# Maps the raw label token found at the end of each line of the data files
# to the human-readable class name stored in the DataFrame 'labels' column.
LABELS = {
    "__label__z_minus_m": "Negative sentiment", 
    "__label__z_plus_m": "Positive sentiment",
    "__label__z_zero": "No sentiment",
    "__label__z_amb": "Unsure",
    }


In [4]:
def load_df(filename, label_map=None):
    """Load a labelled sentence file into a DataFrame.

    Every non-blank line is expected to be a sentence followed by a
    whitespace-separated label token (the last token on the line). Blank or
    whitespace-only lines are skipped instead of raising ``IndexError``.

    Args:
        filename: Path to a UTF-8 encoded text file.
        label_map: Optional mapping from raw label token to the value stored
            in the ``labels`` column. Defaults to the module-level ``LABELS``.

    Returns:
        pandas.DataFrame with two columns: ``text`` (the sentence with its
        words joined by single spaces) and ``labels`` (the mapped label).

    Raises:
        KeyError: If the last token of a line is not a key of ``label_map``.
    """
    if label_map is None:
        label_map = LABELS

    labeled_data = []
    with open(filename, 'r', encoding="utf8") as f:
        # Iterate the file lazily rather than materialising every line first.
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank line (e.g. a trailing newline at end of file): nothing to label.
                continue
            *words, label = tokens
            labeled_data.append((' '.join(words), label_map[label]))

    return pd.DataFrame(labeled_data, columns=['text', 'labels'])

# Read data

In [5]:
# Load the train and test splits into DataFrames with 'text' and 'labels' columns.
df_train = load_df(FILENAME_TRAIN)
df_test = load_df(FILENAME_TEST)

In [6]:
# Normalise the text of both splits with preprocess_text.
# This overwrites the 'text' column, so the raw sentences are no longer available afterwards.
df_train['text'] = df_train['text'].apply(preprocess_text)
df_test['text'] = df_test['text'].apply(preprocess_text)

In [7]:
# Sanity check: show five random training rows (no random_state is passed, so the rows vary between runs).
df_train.sample(5)

Unnamed: 0,text,labels
37274,do trzech razy sztuka ktos nareszcie potrafil ...,Positive sentiment
44200,nie zawarla z nami zadnego kontraktu 2,Unsure
2457,polecam zarowno na duze imprezy jak i kameraln...,Positive sentiment
499,polecam przy okazji pobytu odwiedzic piekne mi...,Positive sentiment
42417,obecnosc warto chodzic pani doktor dlugosz ma ...,Unsure


# BERT fine-tuning

In [8]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import torch
from torch.utils.data import Dataset
import numpy as np

In [9]:
# Load the pretrained Polish uncased BERT tokenizer and a sequence-classification
# model built on the same checkpoint. num_labels=4 matches the four classes in LABELS.
tokenizer = AutoTokenizer.from_pretrained("dkleczek/bert-base-polish-uncased-v1")
model = AutoModelForSequenceClassification.from_pretrained(
    "dkleczek/bert-base-polish-uncased-v1",
    num_labels=4
)

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/478 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/531M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dkleczek/bert-base-polish-uncased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
class TextClassificationDataset(Dataset):
    """Dataset that tokenises every text once at construction time.

    Each item is a dict holding one tensor per tokenizer output field plus a
    ``'labels'`` tensor with the example's class id.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # The full list of texts is encoded in a single tokenizer call.
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
        )
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

In [11]:
def prepare_data_for_training(df):
    """Turn a labelled DataFrame into parallel lists of texts and class ids.

    Args:
        df: DataFrame with a ``'text'`` column and a ``'labels'`` column
            holding the human-readable class names.

    Returns:
        ``(texts, labels)`` where ``texts`` is the list of sentences and
        ``labels`` the list of integer class ids in the same order.

    Raises:
        KeyError: If a label is not one of the four known class names.
    """
    # Position in this tuple is the integer class id.
    class_names = (
        'Negative sentiment',
        'No sentiment',
        'Unsure',
        'Positive sentiment',
    )
    label_map = {name: class_id for class_id, name in enumerate(class_names)}

    labels = []
    for class_name in df['labels']:
        labels.append(label_map[class_name])

    return df['text'].tolist(), labels

In [12]:
def train_classifier(train_df, test_df):
    """Fine-tune the module-level ``model`` on ``train_df`` and evaluate on ``test_df``.

    Both frames are converted with ``prepare_data_for_training`` and tokenised
    with the module-level ``tokenizer``. The model being trained is the global
    ``model`` object, not an argument of this function.

    Args:
        train_df: DataFrame with 'text' and 'labels' columns used for training.
        test_df: DataFrame with the same columns, passed to the Trainer as
            ``eval_dataset``.

    Returns:
        ``(trainer, model)``: the Trainer after ``train()`` has run and the
        global model object it trained.

    NOTE(review): ``test_df`` is used as the evaluation set while
    ``load_best_model_at_end=True`` is set, so the test split takes part in
    checkpoint selection. With a single epoch only one checkpoint is produced,
    but if the epoch count is raised a separate dev split should be used here.
    """
    train_texts, train_labels = prepare_data_for_training(train_df)
    test_texts, test_labels = prepare_data_for_training(test_df)
    
    # Tokenise both splits up front (default max_length of the dataset class applies).
    train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer)
    test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer)

    # Evaluate and checkpoint once per epoch; report_to=["none"] disables external loggers.
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to=["none"],
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    trainer.train()

    return trainer, model

In [13]:
def predict(texts, model, max_length=128):
    """Classify one or more texts with a fine-tuned sequence-classification model.

    The texts are tokenised with the module-level ``tokenizer``, moved to the
    device the model lives on, and run through the model without gradient
    tracking and with the model temporarily in eval mode.

    Args:
        texts: A string or list of strings to classify.
        model: Model whose output exposes ``.logits`` and which exposes a
            ``.device`` attribute (as the model used elsewhere in this
            notebook does).
        max_length: Maximum token length passed to the tokenizer. Defaults to
            128, the same default used by ``TextClassificationDataset``.

    Returns:
        ``(pred_labels, probabilities)``: a list with the predicted class name
        for each text, and a NumPy array of softmax probabilities with one row
        per text and one column per class id.
    """
    label_map_reverse = {
        0: 'Negative sentiment',
        1: 'No sentiment',
        2: 'Unsure',
        3: 'Positive sentiment'
    }

    # Follow the model's device instead of hard-coding 'cuda', so the function
    # also works when the model is on the CPU.
    inputs = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    ).to(model.device)

    # Inference only: switch off dropout and gradient tracking, then restore
    # whatever train/eval mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    # .cpu() must come before .numpy(): a tensor on a CUDA device cannot be
    # converted to NumPy directly.
    probabilities = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()

    pred_labels = [
        label_map_reverse[int(class_id)]
        for class_id in probabilities.argmax(axis=-1)
    ]
    return pred_labels, probabilities

In [14]:
# Fine-tune the model; df_test is passed as the evaluation split used during training.
trainer, model = train_classifier(df_train, df_test)



model.safetensors:   0%|          | 0.00/531M [00:00<?, ?B/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,0.598,0.567414


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [15]:
# Persist the fine-tuned model to the Kaggle working directory.
# Only the model is saved in this cell; the tokenizer is not written alongside it.
model.save_pretrained("/kaggle/working/results")

In [18]:
# torch and accuracy_score are already imported in the import cells above;
# only DataLoader is new to this cell.
from torch.utils.data import DataLoader

# Rebuild the tokenised test set and iterate it in order, in batches of 16.
test_texts, test_labels = prepare_data_for_training(df_test)
test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Inference mode: disable dropout here and gradient tracking below.
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        # Labels are only compared on the host, so they never need to go to the device.
        labels = batch['labels']
        inputs = {key: val.to(model.device) for key, val in batch.items() if key != "labels"}

        # Forward pass; the predicted class is the index of the largest logit.
        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=1)

        # Collect predictions and labels as plain Python ints.
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.tolist())

# Calculate accuracy over the whole test split.
accuracy = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7869
