In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import spacy
nlp = spacy.load("en_core_web_sm")
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words = stopwords.words("english")

import datasets
from datasets import Dataset
import transformers
from transformers import BertTokenizer, AutoConfig, AutoModel, modeling_outputs
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score
from torcheval.metrics import R2Score
import pickle

DATA_DIR = "../datasets"
MODEL_DIR = "../models"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package stopwords to /home/ktc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Загрузка данных

In [2]:
if not os.path.exists(MODEL_DIR):
    os.makedirs(MODEL_DIR)
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

!rm -rf ./checkpoints

source = os.path.join(DATA_DIR, "aclImdb_v1.tar.gz")
if not os.path.exists(source):
    !torify wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -P $DATA_DIR

!tar -xf $source -C /tmp
train_dir = "/tmp/aclImdb/train"
test_dir = "/tmp/aclImdb/test"

In [3]:
def extract_data(path):
    review = []
    for label in ["pos", "neg"]:
        files = os.listdir(os.path.join(path, label))
        for file in files:
            with open(os.path.join(path, label, file)) as f:
                text = f.readline()
                rating = file.split(".")[0].split("_")[1]
                review.append([text, label, int(rating)])
    df = pd.DataFrame(review, columns=["text", "label", "rating"])
    return df

In [4]:
train_df = extract_data(train_dir)
test_df = extract_data(test_dir)
train_df.shape, test_df.shape, \
train_df.label.value_counts(), test_df.label.value_counts(), \
train_df.rating.value_counts(), test_df.rating.value_counts()

((25000, 3),
 (25000, 3),
 label
 pos    12500
 neg    12500
 Name: count, dtype: int64,
 label
 pos    12500
 neg    12500
 Name: count, dtype: int64,
 rating
 1     5100
 10    4732
 8     3009
 4     2696
 7     2496
 3     2420
 2     2284
 9     2263
 Name: count, dtype: int64,
 rating
 1     5022
 10    4999
 8     2850
 4     2635
 3     2541
 9     2344
 7     2307
 2     2302
 Name: count, dtype: int64)

## Baseline

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [8]:
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range = (1,3))
X_train = vectorizer.fit_transform(train_df["text"])
X_test = vectorizer.transform(test_df["text"])
le = LabelEncoder()
y_train = le.fit_transform(train_df["label"])
y_test = le.transform(test_df["label"])
clf = LogisticRegression(max_iter = 10_000).fit(X_train, y_train)
clf.score(X_test, y_test)

0.8082

In [10]:
with open(os.path.join(MODEL_DIR, "vectorizer.pkl"), "wb") as f:
    pickle.dump(vectorizer, f)
with open(os.path.join(MODEL_DIR, "label_encoder.pkl"), "wb") as f:
    pickle.dump(le, f)
with open(os.path.join(MODEL_DIR, "clf.pkl"), "wb") as f:
    pickle.dump(clf, f)

In [16]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [14]:
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
catboost = CatBoostRegressor(iterations=200, eval_metric='R2', metric_period=10)
catboost.fit(X_train, y_train, eval_set=(X_val, y_val))

Learning rate set to 0.202815
0:	learn: 0.0812353	test: 0.0784610	best: 0.0784610 (0)	total: 483ms	remaining: 1m 36s
10:	learn: 0.3364505	test: 0.3143412	best: 0.3143412 (10)	total: 5.61s	remaining: 1m 36s
20:	learn: 0.4204975	test: 0.3793310	best: 0.3793310 (20)	total: 10.7s	remaining: 1m 31s
30:	learn: 0.4660014	test: 0.4137897	best: 0.4137897 (30)	total: 15.9s	remaining: 1m 26s
40:	learn: 0.4983486	test: 0.4343162	best: 0.4343162 (40)	total: 21s	remaining: 1m 21s
50:	learn: 0.5264498	test: 0.4490742	best: 0.4490742 (50)	total: 26.1s	remaining: 1m 16s
60:	learn: 0.5553902	test: 0.4619172	best: 0.4619172 (60)	total: 31.5s	remaining: 1m 11s
70:	learn: 0.5803161	test: 0.4747657	best: 0.4747657 (70)	total: 36.6s	remaining: 1m 6s
80:	learn: 0.6017811	test: 0.4785598	best: 0.4785598 (80)	total: 41.6s	remaining: 1m 1s
90:	learn: 0.6193026	test: 0.4832783	best: 0.4832783 (90)	total: 46.6s	remaining: 55.8s
100:	learn: 0.6370172	test: 0.4851170	best: 0.4851170 (100)	total: 52.2s	remaining: 51.

<catboost.core.CatBoostRegressor at 0x76a4cf85ead0>

In [17]:
r2_score(y_test, catboost.predict(X_test))

0.5221826991653495

In [24]:
with open(os.path.join(MODEL_DIR, "reg.pkl"), "wb") as f:
    pickle.dump(catboost, f)

## Передобработка данных

In [5]:
class ImdbDataset():
    def __init__(self):
        super().__init__()

    def _lemmatizer(self, doc):
        doc = " ".join(str(doc).split())
        doc = [token.lemma_ for token in nlp(doc) if token.lemma_ not in stop_words]
        return " ".join(doc)

    def _lemmatize(self, text):
        tqdm.pandas()
        text = text.progress_apply(self._lemmatizer)
        return text

    def _label_encoder(self, labels):
        return [1 if label == "pos" else 0 for label in labels]

    def _preprocess_function(self, batch, tokenizer, max_length):
        return tokenizer(batch['text'], truncation=True, max_length=max_length)

    def from_df(self, data):
        data = Dataset.from_dict({
            "text": self._lemmatize(data["text"]), 
            "labels": self._label_encoder(data["label"]),
            "rating": data["rating"]}
        )
        tokenized_data = data.map(
            self._preprocess_function, 
            batched=True, 
            fn_kwargs={"tokenizer": tokenizer, "max_length": 512}
        )
        tokenized_data = tokenized_data.class_encode_column("labels")
        return tokenized_data

In [6]:
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint, clean_up_tokenization_spaces=True)
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer)
dataset = ImdbDataset()

In [82]:
train_ds = dataset.from_df(train_df)
train_ds.save_to_disk(os.path.join(DATA_DIR, "train.hf"))

  0%|          | 0/25000 [00:00<?, ?it/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/25000 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

## Архитектура

In [7]:
class Classifier(nn.Module):
    def __init__(self, in_channels, out_channels, hid_dim=1024):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(in_channels, hid_dim),
            nn.LayerNorm(hid_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hid_dim, hid_dim),
            nn.LayerNorm(hid_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hid_dim, out_channels),
        )

    def forward(self, x):
        return self.fc(x)


class ImdbBertClassifier(nn.Module):
    def __init__(self, out_features):
        super(ImdbBertClassifier, self).__init__()
        self.out_features = out_features
        self.config = AutoConfig.from_pretrained(checkpoint, output_attentions=True, attn_implementation="eager")
        self.backbone = AutoModel.from_pretrained(checkpoint, config=self.config)
        in_features = self.backbone.pooler.dense.out_features
        self.dropout = nn.Dropout(0.1)
        self.classifier = Classifier(in_features, out_features)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        out = self.dropout(out.pooler_output)
        logits = self.classifier(out)
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        out = modeling_outputs.TokenClassifierOutput({"loss": loss, "logits": logits})
        return out

## Модель для предсказания статуса отзыва

In [8]:
train_ds = datasets.load_from_disk(os.path.join(DATA_DIR, "train.hf"))
small_train_ds = train_ds.train_test_split(
    train_size=12000, stratify_by_column="labels"
)
split_ds = small_train_ds["train"].train_test_split(
    test_size=0.2, stratify_by_column="labels"
)
print(pd.Series(split_ds["train"]["labels"]).value_counts(), pd.Series(split_ds["test"]["labels"]).value_counts(), sep="\n")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    acc = accuracy_score(y_true=labels, y_pred=preds)
    return {"Accuracy": acc}

clf = ImdbBertClassifier(2)

for param in clf.parameters():
    param.data = param.data.contiguous()
    param.requires_grad = False
for param in clf.backbone.encoder.layer[-1].parameters():
    param.requires_grad = True
for param in clf.backbone.pooler.parameters():
    param.requires_grad = True
for param in clf.classifier.parameters():
    param.requires_grad = True

bs = 16

training_args = transformers.TrainingArguments(
    output_dir="checkpoints",
    learning_rate=1e-3,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    num_train_epochs=15,
    weight_decay=1e-4,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
trainer = transformers.Trainer(
    model=clf,
    args=training_args,
    train_dataset=split_ds["train"],
    eval_dataset=split_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics, 
)
trainer.train()

0    4800
1    4800
Name: count, dtype: int64
0    1200
1    1200
Name: count, dtype: int64


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6579,0.457637,0.809583
2,0.5228,0.442038,0.832917
3,0.4555,0.421957,0.819583
4,0.4351,0.379953,0.830833
5,0.4022,0.410502,0.83625
6,0.3929,0.367281,0.85125
7,0.379,0.369013,0.831667
8,0.376,0.352784,0.856667
9,0.3579,0.347854,0.85
10,0.3477,0.353267,0.85375


TrainOutput(global_step=9000, training_loss=0.3923843824598524, metrics={'train_runtime': 7434.1657, 'train_samples_per_second': 19.37, 'train_steps_per_second': 1.211, 'total_flos': 0.0, 'train_loss': 0.3923843824598524, 'epoch': 15.0})

In [98]:
test_ds = dataset.from_df(test_df)
test_ds.save_to_disk(os.path.join(DATA_DIR, "test.hf"))

  0%|          | 0/25000 [00:00<?, ?it/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/25000 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

In [9]:
test_ds = datasets.load_from_disk(os.path.join(DATA_DIR, "test.hf"))
compute_metrics(trainer.predict(test_ds)[:2])

{'Accuracy': 0.86004}

In [15]:
torch.save(clf, os.path.join(MODEL_DIR, "bert_clf2.pt"))

In [17]:
def predict_label(text, path_to_model=os.path.join(MODEL_DIR, "bert_clf2.pt")):
    model = torch.load(path_to_model, weights_only=False)
    model.eval()
    model.cpu()
    rds = dataset.from_df(pd.DataFrame({"text": [text], "label": [None], "rating": [None]}))
    print(rds["text"])
    batch = next(iter(rds))
    input_ids = torch.tensor(batch["input_ids"]).unsqueeze(0)
    attention_mask = torch.tensor(batch["attention_mask"]).unsqueeze(0)
    preds = model(input_ids, attention_mask)[0].detach().numpy()
    return "pos" if np.argmax(preds, axis=1) else "neg"

In [21]:
review_pos = """
My boyfriend and I went to watch The Guardian.At first I didn't want to watch it, but I loved the movie- It was definitely 
the best movie I have seen in sometime.They portrayed the USCG very well, it really showed me what they do 
and I think they should really be appreciated more.Not only did it teach but it was a really good movie. The movie shows 
what the really do and how hard the job is.I think being a USCG would be challenging and very scary. It was a great movie all around. 
I would suggest this movie for anyone to see.The ending broke my heart but I know why he did it. The storyline was great 
I give it 2 thumbs up. I cried it was very emotional, I would give it a 20 if I could!
"""
review_neg = """
This is a pale imitation of 'Officer and a Gentleman.' There is NO chemistry between Kutcher and the unknown woman 
who plays his love interest. The dialog is wooden, the situations hackneyed. It's too long and the climax is anti-climactic(!). 
I love the USCG, its men and women are fearless and tough. The action scenes are awesome, but this movie doesn't do much 
for recruiting, I fear. The script is formulaic, but confusing. Kutcher's character is trying to redeem himself for an accident 
that wasn't his fault? Costner's is raging against the dying of the light, but why? His 'conflict' with his wife is about as deep 
as a mud puddle. I saw this sneak preview for free and certainly felt I got my money's worth.
"""
predict_label(review_pos), \
predict_label(review_neg)

  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1 [00:00<?, ? examples/s]

['boyfriend I go watch Guardian . first I want watch , I love movie- definitely good movie I see sometime . portray uscg well , really show I I think really appreciate . teach really good movie . movie show really hard job . I think uscg would challenge scary . great movie around . I would suggest movie anyone see . ending break heart I know . storyline great I give 2 thumb . I cry emotional , I would give 20 I could !']


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1 [00:00<?, ? examples/s]

["pale imitation ' Officer Gentleman . ' chemistry Kutcher unknown woman play love interest . dialog wooden , situation hackneye . long climax anti - climactic ( ! ) . I love uscg , man woman fearless tough . action scene awesome , movie much recruit , I fear . script formulaic , confusing . Kutcher 's character try redeem accident fault ? Costner 's rage dying light , ? ' conflict ' wife deep mud puddle . I see sneak preview free certainly feel I get money 's worth ."]


('pos', 'neg')

## Модель для присвоения рейтинга

In [8]:
def switch_labels(dataset):
    d = {k: dataset[k] for k in dataset.features.keys()}
    d["labels"] = dataset["rating"]
    d.pop("rating", None)
    dataset = Dataset.from_dict(d)
    dataset = dataset.class_encode_column("labels")
    return dataset

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=1)
    acc = accuracy_score(y_true=labels, y_pred=preds)
    f1 = f1_score(y_true=labels, y_pred=preds, average="macro")
    return {"Accuracy": acc, "F1": f1}

In [34]:
from imblearn.under_sampling import RandomUnderSampler

train_ds = datasets.load_from_disk(os.path.join(DATA_DIR, "train.hf"))
train_ds = train_ds.class_encode_column("rating")

rus = RandomUnderSampler(random_state=0)
X = np.array(range(train_ds.num_rows))[:,None]
y = np.array(train_ds["rating"])
X_res, _ = rus.fit_resample(X, y)

small_train_ds = train_ds.select(X_res).train_test_split(
    train_size=12000, stratify_by_column="rating"
)
small_train_ds = switch_labels(small_train_ds["train"])
split_ds = small_train_ds.train_test_split(
    test_size=0.2, stratify_by_column="labels"
)
print(pd.Series(split_ds["train"]["labels"]).value_counts(), pd.Series(split_ds["test"]["labels"]).value_counts(), sep="\n")

clf = ImdbBertClassifier(np.unique(y).size)

for param in clf.parameters():
    param.data = param.data.contiguous()
    param.requires_grad = False
for param in clf.backbone.encoder.layer[-1].output.parameters():
    param.requires_grad = True
for param in clf.backbone.pooler.parameters():
    param.requires_grad = True
for param in clf.classifier.parameters():
    param.requires_grad = True

bs = 16

training_args = transformers.TrainingArguments(
    output_dir="checkpoints",
    learning_rate=1e-3,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    num_train_epochs=15,
    weight_decay=1e-4,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
trainer = transformers.Trainer(
    model=clf,
    args=training_args,
    train_dataset=split_ds["train"],
    eval_dataset=split_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics, 
)
trainer.train()

Stringifying the column:   0%|          | 0/12000 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/12000 [00:00<?, ? examples/s]

3    1200
1    1200
0    1200
5    1200
4    1200
7    1200
6    1200
2    1200
Name: count, dtype: int64
6    300
1    300
5    300
2    300
7    300
3    300
0    300
4    300
Name: count, dtype: int64


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.03,1.869003,0.2325,0.167016
2,1.9177,1.819223,0.245833,0.18407
3,1.8896,1.843386,0.243333,0.168426
4,1.8458,1.812253,0.255417,0.19866
5,1.8126,1.761141,0.273333,0.239726
6,1.7961,1.776707,0.270833,0.25196
7,1.7961,1.769253,0.291667,0.263038
8,1.7731,1.735238,0.30625,0.274512
9,1.7682,1.733372,0.299583,0.269951
10,1.7507,1.7243,0.297917,0.264585


TrainOutput(global_step=9000, training_loss=1.7973870849609375, metrics={'train_runtime': 6982.8684, 'train_samples_per_second': 20.622, 'train_steps_per_second': 1.289, 'total_flos': 0.0, 'train_loss': 1.7973870849609375, 'epoch': 15.0})

In [35]:
test_ds = datasets.load_from_disk(os.path.join(DATA_DIR, "test.hf"))
compute_metrics(trainer.predict(test_ds)[:2])

{'Accuracy': 0.35936, 'F1': 0.12741014871512976}

In [None]:
torch.save(clf, os.path.join(MODEL_DIR, "bert_clf8.pt"))