In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.1-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, tokenizers, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.19.1
    Uninstalling huggingface-hub-0.19.1:
      Successfully uninstalled huggingface-hub-0.19.1
Successfully installed huggingface-hub-0.17.3 tokenizers-0.14.1 transformers-4.35.1

In [76]:
import pickle

import torch
from transformers import BertTokenizer, BertModel

In [95]:
class BertDataSet:
    """In-memory store of pooled BERT embeddings for a list of texts.

    Holds a pretrained tokenizer/encoder pair and a list with one embedding
    tensor per text. Supports ``len()`` and integer indexing over that list,
    and can save/restore the list with pickle.
    """

    def __init__(self, model_name):
        """Load the pretrained tokenizer and encoder.

        Args:
            model_name: Checkpoint name or path handed to ``from_pretrained``
                for both ``BertTokenizer`` and ``BertModel``.
        """
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.text_embeddings = []

    def __len__(self):
        """Return the number of stored embeddings."""
        return len(self.text_embeddings)

    def __getitem__(self, index):
        """Return the stored embedding at ``index``."""
        return self.text_embeddings[index]

    def save_embeddings(self, file_path):
        """Pickle the current list of embeddings to ``file_path``."""
        with open(file_path, 'wb') as file:
            pickle.dump(self.text_embeddings, file)

    def load_embeddings(self, file_path):
        """Replace the stored embeddings with the object unpickled from ``file_path``.

        SECURITY: ``pickle.load`` can execute arbitrary code while loading.
        Only call this on files produced by ``save_embeddings`` from a trusted
        source.
        """
        with open(file_path, 'rb') as file:
            self.text_embeddings = pickle.load(file)

    def load_text_corpus(self, texts):
        """Embed every text and replace the stored embeddings with the results.

        Each text is tokenized and encoded on its own; the encoder's
        ``pooler_output`` (batch dimension removed) is stored as that text's
        embedding.

        Args:
            texts: Iterable of strings. A single bare string is treated as a
                one-element corpus rather than being iterated per character.
        """
        if isinstance(texts, str):
            texts = [texts]

        embeddings = []
        # The embeddings are stored as fixed features, so run the encoder
        # without autograd: otherwise every stored tensor has
        # requires_grad=True and keeps its computation graph alive.
        with torch.no_grad():
            for text in texts:
                # truncation=True clips inputs that exceed the tokenizer's
                # maximum length instead of letting the encoder fail on them.
                encoded_input = self.tokenizer(
                    text, return_tensors='pt', truncation=True
                )
                output = self.model(**encoded_input)
                # Drop only the batch dimension (one text per forward pass).
                embeddings.append(output.pooler_output.squeeze(0))
        self.text_embeddings = embeddings


In [96]:
# Toy English corpus: four sentences to embed plus one held-out sentence.
train_text_corpus_en = ['sample text', 'some text', 'another very big text', 'last text']
test_text_corpus_en = ['test text']

# Russian-language toy corpus with the same layout (four sentences + one held-out).
train_text_corpus_ru = [
    'обычный текст', 'какой-то текст', 'ещё один текст, но побольше', 'последний текст',
]
test_text_corpus_ru = ['тестовый текст']

In [97]:
# Build the embedding store on top of the 'bert-base-uncased' checkpoint.
dataset = BertDataSet('bert-base-uncased')

In [98]:
# Embed the English training sentences; results are kept in dataset.text_embeddings.
dataset.load_text_corpus(train_text_corpus_en)

In [99]:
# Persist the computed embeddings to 'emb.pk' (pickle format).
dataset.save_embeddings('emb.pk')

In [100]:
# Read the embeddings back from 'emb.pk', replacing the in-memory list.
dataset.load_embeddings('emb.pk')

In [102]:
# Inspect the shape of the first stored embedding.
dataset[0].shape

torch.Size([768])

In [64]:
import torch
from torch import nn

class TextClassifier(nn.Module):
    """Binary classification head: one linear layer followed by a sigmoid.

    Maps feature vectors whose last dimension is ``in_features`` to a single
    value in (0, 1) per vector.

    Args:
        in_features: Size of the last dimension of the input. Defaults to 768,
            matching the 768-dimensional embeddings produced earlier in this
            notebook.
    """

    def __init__(self, in_features=768):
        super().__init__()
        # Attribute name and layer order are kept as-is so state_dict keys
        # ('classifier.0.weight', 'classifier.0.bias') do not change.
        self.classifier = nn.Sequential(
            nn.Linear(in_features, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return the sigmoid-activated score(s) for ``x``.

        Args:
            x: Tensor whose last dimension equals ``in_features``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of 1.
        """
        return self.classifier(x)

In [65]:
# Instantiate the classifier head defined above with its default settings.
model = TextClassifier()