# BERT Transformer

In [1]:
import joblib
import polars as pl
from clearml import Dataset, Task
from hydra import compose, initialize
from hydra.core.global_hydra import GlobalHydra
from omegaconf import OmegaConf
import os
import polars as pl
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoModel, AutoTokenizer
from matplotlib import pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

import pickle

In [2]:
TASK_NAME = "Bert model"

In [3]:
GlobalHydra.instance().clear()

# hydra init
initialize(version_base=None, config_path=".", job_name="bert_model")
cfg = compose(config_name="config")
print(OmegaConf.to_yaml(cfg=cfg))

default:
- _self_
project:
  name: Toxic comments
dataset:
  project: Toxic comments
  name: Raw data
  file: toxic_comments.csv
params:
  nrows: 50000
  test_size: 0.2
  max_features: 10000
  random_state: 42
  shuffle: true



initialize tracking in clearML

In [4]:
# init task
task = Task.init(project_name=cfg.project.name, task_name=TASK_NAME, output_uri=True)

# get local copy of dataset
dataset = Dataset.get(
    dataset_project=cfg.dataset.project, dataset_name=cfg.dataset.name
).get_local_copy()

task.set_progress(0)

ClearML Task: created new task id=c808946ef99440afa7c6314ed0be9de9
2024-05-24 00:13:14,683 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/ad182336b7f24902a01fce478ef1d156/experiments/c808946ef99440afa7c6314ed0be9de9/output/log
2024-05-24 00:13:19,101 - clearml - INFO - Dataset.get() did not specify alias. Dataset information will not be automatically logged in ClearML Server.
ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring


Retrying (Retry(total=237, connect=237, read=240, redirect=240, status=240)) after connection broken by 'NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001E325F2DA90>: Failed to resolve 'api.clear.ml' ([Errno 11001] getaddrinfo failed)")': /v2.23/tasks.get_all




In [5]:
# loading dataset
data = pl.read_csv(
    os.path.join(dataset, cfg.dataset.file),
    has_header=False,
    new_columns=["id", "text", "toxic"],
    n_rows=cfg.params.nrows,
)

pl.Config.set_fmt_str_lengths(100)
data.head()

id,text,toxic
i64,str,str
,"""text""","""toxic"""
0.0,"""Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't…","""0"""
1.0,"""D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11…","""0"""
2.0,"""Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant …","""0"""
3.0,""""" More I can't make any real suggestions on improvement - I wondered if the section statistics sho…","""0"""


In [6]:
task.set_progress(10)

In [7]:
data["toxic"].value_counts()

toxic,count
str,u32
"""toxic""",1
"""0""",44852
"""1""",5147


In [8]:
train, test = train_test_split(
    data,
    test_size=cfg.params.test_size,
    random_state=cfg.params.random_state,
    shuffle=cfg.params.shuffle,
)

In [9]:
def get_device():
    # Если в системе есть GPU ...
    if torch.cuda.is_available():
        # Тогда говорим PyTorch использовать GPU.
        device = torch.device("cuda")
        print("There are %d GPU(s) available." % torch.cuda.device_count())
        print("We will use the GPU:", torch.cuda.get_device_name(0))
    # Если нет GPU, то считаем на обычном процессоре ...
    else:
        print("No GPU available, using the CPU instead.")
        device = torch.device("cpu")
    return device


device = get_device()

No GPU available, using the CPU instead.


In [10]:
# initialize BERT model
model_name = "bert-base-uncased"
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name).to(device)

In [11]:
device

device(type='cpu')

In [12]:
from torch.utils.data import DataLoader

batch_size = 128
train_dataloader = DataLoader(train["text"].to_list(), batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test["text"].to_list(), batch_size=batch_size, shuffle=False)

In [13]:
# Fix progress in clearml
task.set_progress(20)

In [14]:
def batch_inference(batch):
    tokenized_batch = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(
        device
    )
    with torch.no_grad():
        hidden_batch = bert_model(**tokenized_batch)
        batch_embeddings = hidden_batch.last_hidden_state[:, 0, :].detach().to("cpu")
        return batch_embeddings


train_embeddings = torch.concat([batch_inference(batch_data) for batch_data in train_dataloader])


test_embeddings = torch.concat([batch_inference(batch_data) for batch_data in test_dataloader])

ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


KeyboardInterrupt: 

: 

In [None]:
task.set_progress(85)

task.upload_artifact(name="train_embeddings", artifact_object=train_embeddings)


task.upload_artifact(name="test_embeddings", artifact_object=test_embeddings)

In [None]:
# Training model of logistic regression on embeddings taking from bert

In [None]:
model_params = {
    "random_state": cfg.params.random_state,
    "multi_class": "multinomial",
    "solver": "saga",
}

task.connect(model_params)

model_lr = LogisticRegression(**model_params)
model_lr.fit(train_embeddings, train["toxic"])

In [None]:
task.set_progress(80)

task.upload_artifacts(name="LogisticRegression(BERT)", artifact_object=pickle.dumps(model_lr))

In [None]:
predicts = model_lr.predict(test_embeddings)

report = classification_report(test["Polarity"], predicts, output_dict=True)


# building confusion matrix
conf_matrix = confusion_matrix(test["Polarity"], predicts)

In [None]:
# Fix progress in CLearML
task.set_progress(95)

# Fix parameters of model
logger = task.get_logger()

logger.report_single_value("Accuracy", report.pop("accuracy"))

for class_name, metrics in report.items():
    for metric, value in metrics.items():
        logger.report_single_value(f"{class_name}_{metric}", value)

logger.report_confusion_matrix("Confusion matrix", "ignored", matrix=conf_matrix)

In [None]:
task.close()

In [None]:
%matplotlib inline


fig, ax = plt.subplots(figsize=(5, 5))
ConfusionMatrixDisplay.from_predictions(test["Polarity"], predicts, ax=ax, colorbar=False)
ax.xaxis.set_tick_params(rotation=90)
_ = ax.set_title("Confusion Matrix")
plt.tight_layout()