In [1]:
%pip install -qU torch transformers polars

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m977.6 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.4/36.4 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from pathlib import Path
import polars as pl
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from tqdm import tqdm, trange
from transformers import AutoTokenizer, AutoModel

In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [4]:
# CSV file with the ad posts; the path sits under /content/drive
# (presumably a mounted Google Drive — TODO confirm the mount step happens outside this notebook).
ad_text_path = Path("/content/drive/MyDrive/Telegram Marketing/Embeddings/ad_text.csv")
# Load the file into a polars DataFrame.
ad_posts = pl.read_csv(ad_text_path)
# Display the frame; later cells read its "raw_text" and "id" columns.
ad_posts

id,raw_text
i64,str
7719933,"""👀 ICML 2024 глазами ML-лидов Я…"
7719924,"""🎉 Коворкинг Яндекса на крыше М…"
7719923,"""👀 Показываем зумеров глазами з…"
7719914,"""🔥 Яндекс Х Epic Growth Confere…"
7719559,"""💫 Стартует новый сезон трениро…"
…,…
7719936,"""💫 В гармонии с кодом и природо…"
7719964,"""📎 Роман Елизаров о 20-летнем о…"
7719955,"""📕 Виртуальный рассказчик Букме…"
7719946,"""👀 Личный опыт участия в Multit…"


In [5]:
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mean-pool token vectors over the sequence axis, ignoring masked positions.

    Args:
        last_hidden_states: Token-level vectors indexed as (batch, seq, hidden);
            dim 1 is the axis that gets reduced.
        attention_mask: (batch, seq) mask whose non-zero entries mark the
            positions that take part in the average.

    Returns:
        One pooled vector per row, shape (batch, hidden). A row whose mask is
        entirely zero yields a zero vector rather than NaN.
    """
    keep = attention_mask[..., None].bool()
    # Zero out masked positions so they contribute nothing to the sum.
    summed = last_hidden_states.masked_fill(~keep, 0.0).sum(dim=1)
    # Clamp the token count to at least 1: a fully masked row would otherwise
    # compute 0 / 0 and propagate NaN into every downstream normalisation.
    token_counts = attention_mask.sum(dim=1).clamp(min=1)[..., None]
    return summed / token_counts

def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the two-line prompt that pairs a task description with a text.

    Args:
        task_description: Instruction placed on the first line of the prompt.
        query: Text placed on the second line of the prompt.

    Returns:
        The prompt string: the labelled task line, a newline, then the
        labelled text line.
    """
    # NOTE(review): the line labels are in Russian; the upstream E5-instruct
    # examples use English "Instruct:" / "Query:" labels — confirm this
    # deviation is intentional before comparing against other embeddings.
    prompt = f'Задача: {task_description}\nДанные: {query}'
    return prompt

In [6]:
# Every text handed to the model is prefixed with a one-sentence description of the task.
task = 'По рекламному тексту сформируй профиль идеального Telegram-канала для размещения этой рекламы.'

# Wrap each raw ad text in the instruction prompt, keeping the original row order.
# The progress bar is transient (leave=False), so it vanishes once the list is built.
input_texts = [
    get_detailed_instruct(task, ad_text)
    for ad_text in tqdm(ad_posts["raw_text"].to_list(), leave=False)
]



In [7]:
# Hugging Face checkpoint that provides both the tokenizer and the encoder weights.
model_name = 'intfloat/multilingual-e5-large-instruct'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the encoder and place it directly on the device chosen earlier.
model = AutoModel.from_pretrained(model_name).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [8]:
def process_in_batches(model, tokenizer, input_texts, batch_size=16,
                       device=torch.device("cpu"), max_length=512):
    """Embed texts batch by batch and return L2-normalised vectors on the CPU.

    Args:
        model: Encoder invoked as ``model(**batch)``; its output must expose
            ``last_hidden_state``. The model is not moved by this function, so
            it should already be on ``device``.
        tokenizer: Callable that converts a list of strings into a mapping of
            tensors containing at least ``attention_mask``.
        input_texts: Sliceable sequence of strings to embed.
        batch_size: Number of texts per forward pass; must be at least 1.
        device: Device the tokenised tensors are moved to before the forward pass.
        max_length: Truncation limit forwarded to the tokenizer. Defaults to
            512, the value that used to be hard-coded.

    Returns:
        A CPU tensor with one unit-length row per input text, in input order.
        For empty input an empty tensor of shape ``(0, hidden)`` is returned,
        where ``hidden`` is ``model.config.hidden_size`` when the model exposes
        it and 0 otherwise.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if len(input_texts) == 0:
        # torch.cat() refuses an empty list, so answer directly instead of crashing.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return torch.empty((0, hidden_size))

    embeddings = []
    for start in trange(0, len(input_texts), batch_size):
        batch_texts = input_texts[start:start + batch_size]

        batch_dict = tokenizer(batch_texts, max_length=max_length, padding=True,
                               truncation=True, return_tensors='pt')
        batch_dict = {k: v.to(device) for k, v in batch_dict.items()}

        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            outputs = model(**batch_dict)
            batch_embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

        batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)
        # Keep only a CPU copy so accumulated results do not occupy device memory.
        embeddings.append(batch_embeddings.cpu())

        # Drop the device-side references before the next batch is allocated.
        del batch_dict, outputs, batch_embeddings

    # Release cached blocks once, after all batches. Flushing inside the loop
    # does not give this process more memory; it only forces the allocator to
    # re-request the same blocks on every iteration.
    torch.cuda.empty_cache()

    return torch.cat(embeddings, dim=0)


In [14]:
# Encode every instruction-wrapped ad text on the selected device, 16 texts per forward pass.
embeddings = process_in_batches(
    model=model,
    tokenizer=tokenizer,
    input_texts=input_texts,
    batch_size=16,
    device=device,
)
# Show the dimensions of the resulting embedding matrix.
embeddings.shape

100%|██████████| 11086/11086 [3:14:44<00:00,  1.05s/it]


torch.Size([177373, 1024])

In [17]:
# Persist the embedding matrix and the post ids as two parallel .npy files:
# row i of the embeddings file belongs to element i of the ids file.
# NOTE(review): the file names say "t5" while the checkpoint loaded in this
# notebook is an E5 model — possibly a typo. The names are kept unchanged
# because renaming would alter where the files are written.
EMB_FILE = Path("/content/drive/MyDrive/Telegram Marketing/Embeddings/ad_emb_t5_instruct.npy")
IDS_FILE = Path("/content/drive/MyDrive/Telegram Marketing/Embeddings/ids_ad_emb_t5_instruct.npy")

post_ids = ad_posts["id"].to_numpy()

# The two files are only meaningful together, so refuse to write them when the
# in-memory objects have drifted apart (e.g. a cell was re-run out of order).
if embeddings.shape[0] != len(post_ids):
    raise ValueError(
        f"Row mismatch: {embeddings.shape[0]} embeddings vs {len(post_ids)} post ids"
    )

# Save embeddings
np.save(EMB_FILE, embeddings.numpy())

# Also save indices of posts
np.save(IDS_FILE, post_ids)

In [18]:
# Report allocator statistics only when a CUDA device exists. The notebook
# also supports a CPU fallback, where there is no GPU memory to summarise.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary())
else:
    print("CUDA is not available; skipping the GPU memory summary.")

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   2659 MiB |   8291 MiB | 117567 GiB | 117565 GiB |
|       from large pool |   2654 MiB |   8286 MiB | 115948 GiB | 115945 GiB |
|       from small pool |      5 MiB |     10 MiB |   1619 GiB |   1619 GiB |
|---------------------------------------------------------------------------|
| Active memory         |   2659 MiB |   8291 MiB | 117567 GiB | 117565 GiB |
|       from large pool |   2654 MiB |   8286 MiB | 115948 GiB | 115945 GiB |
|       from small pool |      5 MiB |     10 MiB |   1619 GiB |   1619 GiB |
|---------------------------------------------------------------