In [None]:
# !pip -q install catalyst==20.10.1 transformers datasets nlpaug

In [None]:
import torch
from catalyst.utils import set_global_seed, get_device

# Fix the global random seed through Catalyst's helper so runs are repeatable.
set_global_seed(42)
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the later `.to(device)` calls do not fail on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
from datasets import load_dataset


# Load the "imdb" dataset; its "train" and "test" splits are used below.
imdb_dataset = load_dataset("imdb")

In [None]:
# Display the loaded dataset object as the cell output.
imdb_dataset

In [None]:
# Peek at the first training example.
imdb_dataset["train"][0]

In [None]:
# Keep the text of the first training review as a sample for the tokenizer demos below.
test = imdb_dataset["train"][0]["text"]

In [None]:
from transformers import BertTokenizer


# Tokenizer loaded from the same checkpoint name that the classifier is loaded from later.
tokenizer = BertTokenizer.from_pretrained("google/bert_uncased_L-6_H-256_A-4")

In [None]:
# Show the output of `tokenize` for the sample review.
print(tokenizer.tokenize(test))

In [None]:
# Show the output of `encode` for the sample review.
print(tokenizer.encode(test))

In [None]:
# Show the output of `encode_plus` with default settings (no length limit given).
print(tokenizer.encode_plus(test))

In [None]:
# Same call with max_length=64, truncation enabled and padding to the max length.
print(tokenizer.encode_plus(test, max_length=64, truncation=True, padding="max_length"))

In [None]:
# Same settings again, additionally requesting PyTorch tensors via return_tensors="pt".
print(tokenizer.encode_plus(test, max_length=64, truncation=True, padding="max_length", return_tensors="pt"))

In [None]:
import torch

from catalyst.utils import get_loader


def text_data_transforms(row):
    """Turn one dataset row into model-ready tensors.

    The row's "text" is encoded with the notebook-level ``tokenizer``
    (truncated / padded to 64 tokens, PyTorch tensors), element 0 of every
    returned tensor is kept, and the row's "label" is stored under "targets".

    Args:
        row: mapping with "text" and "label" entries.

    Returns:
        dict with the tokenizer's fields plus "targets".
    """
    encoded = tokenizer.encode_plus(
        row["text"],
        max_length=64,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    # Index 0 selects the single encoded sequence from each returned tensor.
    sample = {name: tensor[0] for name, tensor in encoded.items()}
    sample["targets"] = row["label"]
    return sample


# Training loader over the IMDB "train" split: reshuffled, and the incomplete
# last batch is dropped. `open_fn` is the identity because rows are passed
# straight to `text_data_transforms`.
train_dataloader = get_loader(
    imdb_dataset["train"],
    open_fn=lambda x: x,
    dict_transform=text_data_transforms,
    batch_size=256,
    num_workers=4,
    shuffle=True,
    drop_last=True,
)

# Validation loader over the IMDB "test" split. Evaluation keeps a fixed order
# and keeps the last partial batch, so metrics cover every test sample instead
# of silently skipping the remainder of len(dataset) / 256.
valid_dataloader = get_loader(
    imdb_dataset["test"],
    open_fn=lambda x: x,
    dict_transform=text_data_transforms,
    batch_size=256,
    num_workers=4,
    shuffle=False,
    drop_last=False,
)

In [None]:
# Loaders for the baseline run, keyed by the names "train" and "valid".
loaders = dict(train=train_dataloader, valid=valid_dataloader)

In [None]:
from transformers import BertForSequenceClassification


# Sequence classifier loaded from the same checkpoint name as the tokenizer.
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-6_H-256_A-4")

In [None]:
from catalyst.contrib.nn import RAdam
from torch.nn import CrossEntropyLoss


# RAdam over all model parameters with learning rate 2e-4, and a cross-entropy loss.
optimizer = RAdam(model.parameters(), lr=2e-4)
criterion = CrossEntropyLoss()

In [None]:
from datetime import datetime
from pathlib import Path


# Per-run log directory: logs/<YYYYmmdd-HHMMSS> taken from the current time.
logdir = Path("logs", datetime.now().strftime("%Y%m%d-%H%M%S"))

In [None]:
from catalyst.dl import SupervisedRunner


class BertRunner(SupervisedRunner):
    """SupervisedRunner variant that feeds the model by keyword.

    The fields named in ``self.input_key`` are picked out of the batch and
    passed to the model as keyword arguments together with ``return_dict=True``.
    """

    def _handle_batch(self, batch):
        """Store the batch as ``self.input`` and the model result as ``self.output``."""
        model_kwargs = {key: batch[key] for key in self.input_key}
        self.input = batch
        self.output = self.model(**model_kwargs, return_dict=True)


# Only these two batch fields are forwarded to the model by BertRunner._handle_batch.
runner = BertRunner(input_key=["input_ids", "attention_mask"])

In [None]:
from catalyst.dl import AccuracyCallback


# Baseline: train on the IMDB loaders for 3 epochs, writing to `logdir`
# and tracking accuracy through AccuracyCallback(num_classes=2).
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=loaders,
    logdir=logdir,
    num_epochs=3,
    verbose=True,
    callbacks=[AccuracyCallback(num_classes=2)],
)

### Text augmentation

In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

# Sample sentence reused by the augmentation demos below.
text = 'The quick brown fox jumps over the lazy dog .'
print(text)

In [None]:
# Character-level demo: run the sample sentence through nlpaug's KeyboardAug
# and print it next to the original.
aug = nac.KeyboardAug()
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

In [None]:
# Word-level demo: SynonymAug configured with the "substitute" action,
# printed next to the original sentence.
aug = naw.SynonymAug(action="substitute")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

In [None]:
# Word-level demo: ContextualWordEmbsAug backed by 'bert-base-uncased',
# using the "substitute" action; printed next to the original sentence.
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

In [None]:
# Augmenter used by `aug_text_data_transforms`; backed by a smaller checkpoint
# (L-2_H-128_A-2) than the 'bert-base-uncased' demo above.
aug = naw.ContextualWordEmbsAug(model_path='google/bert_uncased_L-2_H-128_A-2', action="substitute")

In [None]:
def aug_text_data_transforms(row):
    """Augment a row's text with the notebook-level ``aug`` and tokenize it.

    Args:
        row: mapping with "text" and "label" entries.

    Returns:
        dict with the tokenizer's fields (element 0 of each tensor) plus
        "targets" holding the row's label.
    """
    # Only the first 256 characters are augmented; the limit can be raised later.
    source_text = row["text"][:256]
    sentence = aug.augment(source_text)
    # Some nlpaug releases return a list of strings rather than a single
    # string. Passing a list to the tokenizer would make it treat the whole
    # sentence as pre-tokenized input, so unwrap it (falling back to the
    # un-augmented text if the list is empty).
    if isinstance(sentence, (list, tuple)):
        sentence = sentence[0] if sentence else source_text
    tokens = tokenizer.encode_plus(sentence,
                                   max_length=64,
                                   truncation=True,
                                   padding="max_length",
                                   return_tensors="pt")
    # Index 0 selects the single encoded sequence from each returned tensor.
    tokens = {k: v[0] for k, v in tokens.items()}
    tokens.update({"targets": row["label"]})
    return tokens


# Training loader whose rows go through `aug_text_data_transforms`, so the text
# is augmented each time a row is fetched.
# NOTE(review): num_workers is 32 here versus 4 for the other loaders - confirm
# the machine has enough cores/memory for that many workers.
aug_train_dataloader = get_loader(
    imdb_dataset["train"],
    open_fn=lambda x: x,
    dict_transform=aug_text_data_transforms,
    batch_size=256,
    num_workers=32,
    shuffle=True,
    drop_last=True,
)

In [None]:
# Augmented training data paired with the regular (non-augmented) validation loader.
aug_loaders = dict(train=aug_train_dataloader, valid=valid_dataloader)

In [None]:
# Start the augmentation experiment from a freshly loaded checkpoint with a new
# optimizer and loss, rather than continuing from the baseline run.
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-6_H-256_A-4")
optimizer = RAdam(model.parameters(), lr=2e-4)
criterion = CrossEntropyLoss()

In [None]:
# New timestamped log directory and a new runner for the augmentation experiment.
logdir=Path("logs") / datetime.now().strftime("%Y%m%d-%H%M%S")
runner = BertRunner(input_key=["input_ids", "attention_mask"])
# Same training settings as the baseline, but with the augmented loaders.
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=aug_loaders,
    logdir=logdir,
    num_epochs=3,
    verbose=True,
    callbacks=[AccuracyCallback(num_classes=2)],
)

### Domain adaptation

In [None]:
# Load the first 10% of the GLUE SST-2 train split.
sst_dataset = load_dataset("glue", "sst2", split="train[:10%]")

In [None]:
# Peek at the first SST-2 example to see its fields.
sst_dataset[0]

In [None]:
def sst_text_data_transforms(row):
    """Turn one SST-2 row into model-ready tensors.

    Args:
        row: mapping with a "label" entry and the text under "sentence"
            (a "text" entry is accepted as a fallback).

    Returns:
        dict with the tokenizer's fields (element 0 of each tensor) plus
        "targets" holding the row's label.
    """
    # GLUE SST-2 stores the review under "sentence", not "text"; reading
    # row["text"] unconditionally raises KeyError on that dataset.
    text = row["sentence"] if "sentence" in row else row["text"]
    tokens = tokenizer.encode_plus(text,
                                   max_length=64,
                                   truncation=True,
                                   padding="max_length",
                                   return_tensors="pt")
    # Index 0 selects the single encoded sequence from each returned tensor.
    tokens = {k: v[0] for k, v in tokens.items()}
    tokens.update({"targets": row["label"]})
    return tokens

# Training loader over the SST-2 subset loaded above.
sst_train_dataloader = get_loader(
    sst_dataset,
    open_fn=lambda x: x,
    dict_transform=sst_text_data_transforms,
    batch_size=256,
    num_workers=4,
    shuffle=True,
    drop_last=True,
)


# The runner is started without overriding its validation loader name, so a
# "valid" entry is supplied. Validating on the IMDB loader also shows how well
# SST-2 training transfers to the target domain.
sst_loaders = {
    "train": sst_train_dataloader,
    "valid": valid_dataloader,
}

In [None]:
# Fresh checkpoint, optimizer and loss for the SST-2 -> IMDB experiment.
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-6_H-256_A-4")
optimizer = RAdam(model.parameters(), lr=2e-4)
criterion = CrossEntropyLoss()

In [None]:
# New timestamped log directory and runner for this experiment.
logdir=Path("logs") / datetime.now().strftime("%Y%m%d-%H%M%S")
runner = BertRunner(input_key=["input_ids", "attention_mask"])
# Stage 1: train on the SST-2 loaders for 3 epochs.
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=sst_loaders,
    logdir=logdir,
    num_epochs=3,
    verbose=True,
    callbacks=[AccuracyCallback(num_classes=2)],
)

In [None]:
# Stage 2: continue with the same model and optimizer objects on the IMDB loaders.
# `logdir` is not reassigned, so this stage writes to the same directory as stage 1.
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=loaders,
    logdir=logdir,
    num_epochs=3,
    verbose=True,
    callbacks=[AccuracyCallback(num_classes=2)],
)

In [None]:
# Reload SST-2, this time the whole train split (replaces the 10% subset).
sst_dataset = load_dataset("glue", "sst2", split="train")

In [None]:
from transformers import BertModel


# Plain BertModel used as a sentence encoder and moved to `device`. Note the
# checkpoint (L-2_H-256_A-4) differs from the classifier's (L-6_H-256_A-4), and
# that this rebinds the name `model`.
model = BertModel.from_pretrained("google/bert_uncased_L-2_H-256_A-4").to(device)

In [None]:
from tqdm.notebook import tqdm


# numpy is needed at the end of this cell; importing it here keeps the cell
# runnable on a fresh kernel (the notebook's other numpy import comes later).
import numpy as np

# One embedding per IMDB training review.
imdb_vectors = []


# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for row in tqdm(imdb_dataset["train"]):
        row = text_data_transforms(row)
        # unsqueeze(0) builds a batch of one; [0][0, 0] takes the first model
        # output, first sequence, first token position.
        vector = model(
            input_ids=row["input_ids"].unsqueeze(0).to(device),
            attention_mask=row["attention_mask"].unsqueeze(0).to(device)
        )[0][0, 0].cpu().numpy()
        imdb_vectors.append(vector)

# Stack the per-review vectors into one 2-D array (reviews x features).
imdb_vectors = np.array(imdb_vectors)

In [None]:
import numpy as np

# Make sure the embeddings are a single array, then scale every row to unit
# L2 norm so a dot product with a unit vector is a cosine similarity.
imdb_vectors = np.array(imdb_vectors)
imdb_vectors_norm = imdb_vectors/np.linalg.norm(imdb_vectors,axis=1, keepdims=True) # normalize vectors

In [None]:
# One scalar score per SST-2 sentence: its average cosine similarity to the
# IMDB embeddings.
sst_scores = []

with torch.no_grad():
    for row in tqdm(sst_dataset):
        row = sst_text_data_transforms(row)
        vector = model(
            input_ids=row["input_ids"].unsqueeze(0).to(device),
            attention_mask=row["attention_mask"].unsqueeze(0).to(device)
        )[0][0, 0].cpu().numpy()
        # Cosine similarity against every (already unit-normalised) IMDB vector.
        similarities = imdb_vectors_norm @ vector / np.linalg.norm(vector)
        # Reduce to a single number. Storing the whole similarity vector makes
        # the later `score > thr` comparison ambiguous and keeps a
        # (#sst x #imdb) matrix in memory.
        sst_scores.append(float(similarities.mean()))

In [None]:
import matplotlib.pyplot as plt


# Histogram of the SST-2 similarity scores, used to choose the threshold below.
plt.hist(sst_scores)

In [None]:
# Similarity cut-off for keeping an SST-2 example.
thr = 0.66

# Positions in `sst_scores` whose score is strictly above the cut-off.
indeces = [position for position, score in enumerate(sst_scores) if score > thr]

In [None]:
# Training loader restricted to the SST-2 rows selected by `indeces`.
sst_train_dataloader = get_loader(
    sst_dataset.select(indeces),
    open_fn=lambda x: x,
    dict_transform=sst_text_data_transforms,
    batch_size=256,
    num_workers=4,
    shuffle=True,
    drop_last=True,
)


# The runner is started without overriding its validation loader name, so a
# "valid" entry is supplied. Validating on the IMDB loader also shows how well
# the filtered SST-2 data transfers to the target domain.
sst_loaders = {
    "train": sst_train_dataloader,
    "valid": valid_dataloader,
}

In [None]:
# Fresh classifier, optimizer and loss for the filtered-SST experiment; this
# rebinds `model`, which held the BertModel encoder in the cells above.
model = BertForSequenceClassification.from_pretrained("google/bert_uncased_L-6_H-256_A-4")
optimizer = RAdam(model.parameters(), lr=2e-4)
criterion = CrossEntropyLoss()

In [None]:
# New timestamped log directory and runner for this experiment.
logdir=Path("logs") / datetime.now().strftime("%Y%m%d-%H%M%S")
runner = BertRunner(input_key=["input_ids", "attention_mask"])
# Stage 1: train on the filtered SST-2 loaders for 3 epochs.
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=sst_loaders,
    logdir=logdir,
    num_epochs=3,
    verbose=True,
    callbacks=[AccuracyCallback(num_classes=2)],
)

In [None]:
# Stage 2: continue with the same model and optimizer objects on the IMDB loaders.
# `logdir` is not reassigned, so this stage writes to the same directory as stage 1.
runner.train(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    loaders=loaders,
    logdir=logdir,
    num_epochs=3,
    verbose=True,
    callbacks=[AccuracyCallback(num_classes=2)],
)