In [11]:
import joblib
import polars as pl
from clearml import Dataset, Task

# Start the ClearML task that tracks this experiment.
task = Task.init(
    task_name="bert_solution",
    project_name="Amazon reviews",
    output_uri=True,
)

# Resolve the "Raw data" dataset and fetch a local copy of its files.
raw_dataset = Dataset.get(
    dataset_project="Amazon reviews", dataset_name="Raw data"
)
frame_path = raw_dataset.get_local_copy()
task.set_progress(0)

# The CSV has no header row, so the column names are supplied explicitly.
# Only the first 5000 rows are read.
data = pl.read_csv(
    f"{frame_path}/train.csv",
    n_rows=5000,
    has_header=False,
    new_columns=["Polarity", "Title", "Review"],
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

Event reporting sub-process lost, switching to thread based reporting


ClearML Task: created new task id=dc71d4102df14ecd8ad761a4ca38a59d
ClearML results page: http://localhost:8080/projects/0d9acbd12b6a4e83a3d19d4a43632763/experiments/dc71d4102df14ecd8ad761a4ca38a59d/output/log


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Single seed reused by the split below and by the classifier later on.
random_state = 42

# Shuffle the loaded rows and hold out 30% of them as the test set.
train, test = train_test_split(
    data, test_size=0.3, random_state=random_state, shuffle=True
)

Загружаем модель BERT и её токенизатор:

In [13]:
import torch
from transformers import AutoModel, AutoTokenizer

# Name of the pretrained checkpoint used for both the tokenizer and the model.
model_name = "bert-base-uncased"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_type = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_type)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the pretrained weights and move the model to the selected device.
bert_model = AutoModel.from_pretrained(model_name)
bert_model = bert_model.to(device)

In [14]:
# Display the selected torch device.
device

device(type='cuda')

Будем подавать данные в модель батчами, так как всё сразу не поместится в память:

In [15]:
from torch.utils.data import DataLoader

# Number of review texts passed to the model per batch.
fixed_batch_size = 128

# Review texts as plain Python lists. The loaders do not shuffle, so the
# embeddings computed from them keep the row order of `train` / `test`.
train_reviews = train["Review"].to_list()
test_reviews = test["Review"].to_list()

train_dataloader = DataLoader(
    train_reviews, shuffle=False, batch_size=fixed_batch_size
)
test_dataloader = DataLoader(
    test_reviews, shuffle=False, batch_size=fixed_batch_size
)

В качестве эмбеддинга отзыва берём выход последнего слоя BERT для первого токена ([CLS]):

In [16]:
def batch_inference(batch):
    """Embed a batch of texts with the BERT model and return the result on CPU.

    The texts are tokenized with padding and truncation, moved to ``device``
    and passed through ``bert_model`` with gradient tracking disabled. For
    each text only the last-layer hidden state at sequence position 0 is kept.

    Args:
        batch: Texts to embed, e.g. one batch yielded by a ``DataLoader``.

    Returns:
        A CPU tensor holding one embedding row per text in ``batch``.
    """
    encoded = tokenizer(
        batch, return_tensors="pt", truncation=True, padding=True
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        outputs = bert_model(**encoded)

    # Position 0 along the sequence axis is the first token of every text.
    first_token_states = outputs.last_hidden_state[:, 0, :]
    return first_token_states.detach().to("cpu")


# Embed every batch in loader order and join the per-batch tensors along the
# first dimension, giving one embedding row per review.
train_embeddings = torch.concat(list(map(batch_inference, train_dataloader)))
test_embeddings = torch.concat(list(map(batch_inference, test_dataloader)))

In [8]:
# Attach both embedding tensors to the ClearML task as named artifacts.
task.upload_artifact(name="train_embeddings", artifact_object=train_embeddings)
task.upload_artifact(name="test_embeddings", artifact_object=test_embeddings)

True

Попробуем обучить на этих эмбеддингах логистическую регрессию:

In [17]:
from sklearn.metrics import classification_report, confusion_matrix

from pathlib import Path

# Classifier hyperparameters, connected to the ClearML task so that they are
# tracked together with the experiment.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# confirm against the installed version before upgrading.
model_params = {
    "multi_class": "multinomial",
    "solver": "saga",
    "random_state": random_state,
}
task.connect(model_params)

# Give scikit-learn plain NumPy arrays instead of relying on implicit
# conversion of torch tensors and polars Series.
X_train = train_embeddings.numpy()
X_test = test_embeddings.numpy()
y_train = train["Polarity"].to_numpy()
y_test = test["Polarity"].to_numpy()

model_lr = LogisticRegression(**model_params)
model_lr.fit(X_train, y_train)

# Create the target directory first: writing the pickle fails with
# FileNotFoundError on a fresh checkout where `models/` does not exist yet.
model_path = "models/model.pkl"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model_lr, model_path, compress=True)

predicts = model_lr.predict(X_test)
report = classification_report(y_test, predicts, output_dict=True)
confusion = confusion_matrix(y_test, predicts)

logger = task.get_logger()
# "accuracy" is a single float in the report, unlike the other entries, which
# are dicts of metrics, so it is popped and reported on its own first.
logger.report_single_value("accuracy", report.pop("accuracy"))
for class_name, metrics in report.items():
    for metric, value in metrics.items():
        logger.report_single_value(f"{class_name}_{metric}", value)
logger.report_confusion_matrix("confusion matrix", "ignored", matrix=confusion)



In [18]:
# Close the ClearML task started at the top of the notebook.
task.close()