In [22]:
import torch
import numpy as np
import random
from torch.nn import functional as F
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from dataclasses import dataclass
from pathlib import Path
from sklearn.linear_model import LogisticRegression

device = "cpu"

tokenizer_bert = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
model_bert = AutoModel.from_pretrained("allegro/herbert-base-cased").to(device)
model_bert.eval()

tokenizer_polka = AutoTokenizer.from_pretrained("eryk-mazus/polka-1.1b")
model_polka = AutoModelForCausalLM.from_pretrained("eryk-mazus/polka-1.1b").to(device)
model_polka.eval()

print("Models loaded")

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Models loaded


In [36]:
def get_bert_vect(L):
    txt = ' '.join(L)
    input_ids = tokenizer_bert(txt, return_tensors='pt')['input_ids'].to(device)
    output = model_bert(input_ids=input_ids)
    return output.last_hidden_state.detach().cpu().numpy()[0,0,:]

def log_probs_from_logits(logits, labels):
    logp = F.log_softmax(logits, dim=-1)
    logp_label = torch.gather(logp, 2, labels.unsqueeze(2)).squeeze(-1)
    return logp_label

def get_polka_scores(text_list):

    raw_text = ' '.join(text_list)
    prompts = [
        f"Opinia pozytywna: {raw_text}",
        f"Opinia negatywna: {raw_text}"
    ]

    scores = []
    for prompt in prompts:
        input_ids = tokenizer_polka(prompt, return_tensors="pt")["input_ids"].to(device)

        with torch.no_grad():
            output = model_polka(input_ids=input_ids)
            log_probs = log_probs_from_logits(output.logits[:, :-1, :], input_ids[:, 1:])
            seq_log_prob = torch.sum(log_probs)

            score = seq_log_prob / input_ids.shape[1]

        scores.append(score.item())

    return scores

def spoil(L):
    res = []
    for w in L:
        if random.random() < 0.85:
            res.append(w)
        else:
            res.append(w.upper())
    return res

In [37]:
@dataclass(frozen=True)
class Review:
    positive: bool
    content: str


def read_reviews(path: Path) -> list[Review]:
    reviews = []
    with path.open() as f:
        for line in f.readlines():
            r_type, content = line.split(maxsplit=1)

            if r_type == "GOOD":
                positive = True
            elif r_type == "BAD":
                positive = False
            else:
                raise RuntimeError("Unknown review type")

            reviews.append(Review(positive=positive, content=content))

    return reviews

repo_root = Path(".").resolve().parent.parent
reviews  = read_reviews(repo_root / "datasets" / "reviews_for_task3.txt")
random.shuffle(reviews)

N = len(reviews)
test_size = N // 4
train_size = N - test_size

train_reviews = reviews[:train_size]
test_reviews  = reviews[train_size:]

In [4]:
train_reviews[0], test_reviews[0]

(Review(positive=False, content='W basenie zimna woda, w jacuzzi te≈º.\n'),
 Review(positive=False, content='Szczerze nie polecam!!!\n'))

In [38]:
X_train = []
y_train = []
X_test = []
y_test = []

print("Processing Training Data...")
for i, review in enumerate(train_reviews):
    words = review.content.split()
    label = 1 if review.positive else 0

    polka_score = get_polka_scores(words)

    bert_vec = get_bert_vect(words)
    X_train.append(np.concatenate([bert_vec, polka_score]))
    y_train.append(label)

    for _ in range(3):
        spoiled_words = spoil(words)
        bert_vec_spoiled = get_bert_vect(spoiled_words)
        # polka_score_spoiled = get_polka_scores(spoiled_words)
        X_train.append(np.concatenate([bert_vec_spoiled, polka_score]))
        y_train.append(label)

    if (i + 1) % 50 == 0:
        print(f"Train: {i + 1}/{len(train_reviews)}")

print("Processing Test Data...")
for i, review in enumerate(test_reviews):
    words = review.content.split()
    label = 1 if review.positive else 0

    polka_score = get_polka_scores(words)
    bert_vec = get_bert_vect(words)

    X_test.append(np.concatenate([bert_vec, polka_score]))
    y_test.append(label)

    if (i + 1) % 50 == 0:
        print(f"Test: {i + 1}/{len(test_reviews)}")

Processing Training Data...
Train: 50/300
Train: 100/300
Train: 150/300
Train: 200/300
Train: 250/300
Train: 300/300
Processing Test Data...
Test: 50/100
Test: 100/100


In [39]:
len(X_train), len(X_test)

(1200, 100)

In [41]:
clf = LogisticRegression(max_iter=2000).fit(X_train, y_train)

print ('Train accuracy:', clf.score(X_train, y_train))
print ('Test accuracy:', clf.score(X_test, y_test))



Train accuracy: 0.9991666666666666
Test accuracy: 0.79
