In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, BertModel, BertForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from tqdm.notebook import tqdm, trange
from torch.utils.data import Dataset
tqdm.pandas()

In [2]:
def convert_to_bert_line(text, tokenizer, max_seq_length=2048):
    """Encode a single text into a fixed-length tokenizer output.

    Args:
        text: Raw string to encode.
        tokenizer: Callable tokenizer, invoked as
            ``tokenizer(text, max_length=..., padding="max_length", truncation=True)``.
        max_seq_length: Length every encoding is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns for ``text``; the callers in this
        notebook read its ``"input_ids"`` entry.
    """
    # The text is passed through unchanged. Hugging Face tokenizers insert the
    # special tokens themselves by default (add_special_tokens=True), so
    # prepending a literal "[CLS] " produced a duplicated [CLS] at the start of
    # every sequence and wasted one position of the length budget.
    return tokenizer(
        text,
        max_length=max_seq_length,
        padding="max_length",
        truncation=True,
    )

def convert_to_bert_lines(texts, tokenizer, max_seq_length=2048, hide=True):
    """Encode every text and return the token ids as one NumPy array.

    Args:
        texts: Iterable of raw strings.
        tokenizer: Tokenizer forwarded to ``convert_to_bert_line``.
        max_seq_length: Target length forwarded to ``convert_to_bert_line``.
        hide: When true, the tqdm progress bar is disabled.

    Returns:
        ``np.array`` built from the ``"input_ids"`` entry of each encoding,
        in the same order as ``texts``.
    """
    progress = tqdm(texts, desc="Tokenizing data", disable=hide)
    encoded_ids = [
        convert_to_bert_line(sample, tokenizer, max_seq_length)["input_ids"]
        for sample in progress
    ]
    return np.array(encoded_ids)

In [26]:
# Read the semicolon-separated dataset, drop rows containing missing values,
# and keep the first row for each distinct "link" value.
df = (
    pd.read_csv("dataset.csv", sep=";")
    .dropna()
    .groupby("link")
    .first()
    .reset_index()
)
df.head()

Unnamed: 0,link,artist,title,text
0,https://pesnihi.com/lyrics/a/a-studio/fashion-...,А-Студио,Fashion Girl,Вступление(вокализ) Твоя аура затмит сто свече...
1,https://pesnihi.com/lyrics/a/a-studio/goddess-...,А-Студио,Goddess Of The Dance Floor,Put my best dress on and go This night my eyes...
2,https://pesnihi.com/lyrics/a/a-studio/ill-be-a...,А-Студио,I’ll be Around,"Пусть нам кружит голову месяц май Не пропадай,..."
3,https://pesnihi.com/lyrics/a/a-studio/my-world...,А-Студио,My World,Better go live me alone Don't you know I have ...
4,https://pesnihi.com/lyrics/a/a-studio/s-o-s.html,А-Студио,S.O.S.,Baby I feel that I`m falling in love Baby I kn...


In [4]:
# Prefer the first CUDA device, but fall back to the CPU so the notebook
# still runs on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

class MyDataset(Dataset):
    """Map-style dataset that yields token-id tensors for "title text" strings.

    Each item is the ``"input_ids"`` of ``convert_to_bert_line`` applied to
    the row's title and text joined by a single space, returned as a
    ``torch.long`` tensor.

    NOTE(review): only the ids are returned; no attention mask is produced
    alongside them. Confirm that is intended for the padded sequences.
    """

    def __init__(self, df, tokenizer):
        """Store the joined "title text" strings and the tokenizer."""
        combined = df["title"] + " " + df["text"]
        self.tokenizer = tokenizer
        self.data = combined.values

    def __len__(self):
        """Return the number of stored strings."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the string at ``idx`` and return its ids as a long tensor."""
        encoding = convert_to_bert_line(self.data[idx], self.tokenizer)
        input_ids = encoding["input_ids"]
        return torch.tensor(input_ids, dtype=torch.long)

In [5]:
# Load the tokenizer once and share it between the collator and the dataset.
# Previously it was instantiated twice with copy-pasted arguments, which
# doubled the load and let the two configurations drift apart silently.
mlm_tokenizer = AutoTokenizer.from_pretrained(
    "cointegrated/rubert-tiny2", cache_dir=None, do_lower_case=True
)

trainer = Trainer(
    # Masked-LM head on top of the pretrained checkpoint.
    model=BertForMaskedLM.from_pretrained(
        "cointegrated/rubert-tiny2",
        cache_dir=None,
    ),
    # The collator builds the masked-LM batches from the dataset's id tensors.
    data_collator=DataCollatorForLanguageModeling(tokenizer=mlm_tokenizer),
    args=TrainingArguments(
        output_dir="finetuned/",
        per_device_train_batch_size=4,
        num_train_epochs=1,
        save_strategy="steps",
        save_steps=5000,  # checkpoint every 5000 optimisation steps
        logging_steps=100,
    ),
    train_dataset=MyDataset(df, mlm_tokenizer),
)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Start fine-tuning with the Trainer configured above; the value returned by
# train() is the last expression, so it is shown as the cell output.
trainer.train()

***** Running training *****
  Num examples = 73916
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 18479


Step,Training Loss
100,4.1422
200,3.9544
300,3.7289
400,3.7537
500,3.7447
600,3.7748
700,3.6305
800,3.5587
900,3.5631
1000,3.6259


Saving model checkpoint to finetuned/checkpoint-5000
Configuration saved in finetuned/checkpoint-5000/config.json
Model weights saved in finetuned/checkpoint-5000/pytorch_model.bin
Saving model checkpoint to finetuned/checkpoint-10000
Configuration saved in finetuned/checkpoint-10000/config.json
Model weights saved in finetuned/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to finetuned/checkpoint-15000
Configuration saved in finetuned/checkpoint-15000/config.json
Model weights saved in finetuned/checkpoint-15000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=18479, training_loss=3.0485904548480285, metrics={'train_runtime': 3658.2031, 'train_samples_per_second': 20.206, 'train_steps_per_second': 5.051, 'total_flos': 2256424994537472.0, 'train_loss': 3.0485904548480285, 'epoch': 1.0})

In [9]:
# Persist the fine-tuned model into the local "pretrained" directory.
# Only the model object is saved by this call.
trainer.model.save_pretrained("pretrained")

Configuration saved in pretrained/config.json
Model weights saved in pretrained/pytorch_model.bin


In [4]:
# Tokenizer comes from the original hub checkpoint; the encoder weights come
# from the locally saved fine-tuned model in "pretrained".
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2", cache_dir=None, do_lower_case=True)
model = BertModel.from_pretrained("pretrained", cache_dir=None)
# Prefer the first CUDA device, but fall back to the CPU so that model.to()
# does not fail on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(device)
# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()


Some weights of the model checkpoint at pretrained were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at pretrained and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably T

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(83828, 312, padding_idx=0)
    (position_embeddings): Embedding(2048, 312)
    (token_type_embeddings): Embedding(2, 312)
    (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=312, out_features=312, bias=True)
            (key): Linear(in_features=312, out_features=312, bias=True)
            (value): Linear(in_features=312, out_features=312, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=312, out_features=312, bias=True)
            (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [35]:
tokenized_data = convert_to_bert_lines(df["text"], tokenizer, hide=False)

# Collect one vector per row and stack once at the end; growing an array with
# np.vstack inside the loop re-copies everything on every iteration.
cls_vectors = []

# Inference only: without no_grad() every forward pass records an autograd
# graph that is never used.
with torch.no_grad():
    for i in trange(len(tokenized_data), desc="Embedding data"):
        tokens = torch.tensor(tokenized_data[i].reshape(1, -1), dtype=torch.long)
        preds = model(tokens.to(device),
                      # Positions whose id is 0 are treated as padding and masked out.
                      attention_mask=(tokens > 0).to(device),
                      return_dict=False)
        # First model output, first (only) sequence in the batch, first position.
        cls_vectors.append(preds[0][0, 0].cpu().numpy())

# np.stack always yields a 2-D array, so a single-row frame works too;
# an empty frame yields an empty list instead of failing on a None value.
embeds = np.stack(cls_vectors) if cls_vectors else np.empty((0, 0), dtype=np.float32)
df["embeds"] = embeds.tolist()

Tokenizing data:   0%|          | 0/73916 [00:00<?, ?it/s]

Embedding data:   0%|          | 0/73916 [00:00<?, ?it/s]

In [36]:
# Write the frame, including the "embeds" column, to a semicolon-separated CSV
# without the index column.
df.to_csv("dataset_with_embeds.csv", index=False, sep=";")