### Подключение к базе и таблицы с юзерами и постами

In [None]:
# ! pip3 install psycopg2-binary

In [None]:
from sqlalchemy import create_engine


# NOTE(review): the database password is embedded in this URL and therefore in
# the notebook itself; consider loading it from the environment instead.
engine = create_engine("postgresql://robot-startml-ro:pheiph0hahj1Vaif@postgres.lab.karpov.courses:6432/startml")

# One shared connection, reused by the pd.read_sql calls below.
# stream_results=True is a SQLAlchemy execution option requesting streamed
# (not fully buffered) results where the driver supports it.
connection = engine.connect().execution_options(stream_results=True)

In [None]:
### Posts and topics
### Load the posts table; its texts are embedded further below with the models from lesson 10
import pandas as pd


# Pull the whole posts table into memory. Later cells read its `text` and
# `post_id` columns (and presumably `topic`, used as a categorical feature).
posts_query = """SELECT * FROM public.post_text_df"""
posts_info = pd.read_sql(posts_query, con=connection)

# Last expression of the cell: shows the frame as the notebook output.
posts_info

In [None]:
### Сделаем эмбеддинги постов с помощью моделей из 10 занятия

from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel


def get_model(model_name):
    """Load the pretrained tokenizer and encoder for a supported model family.

    Args:
        model_name: One of ``'bert'``, ``'roberta'`` or ``'distilbert'``.

    Returns:
        A ``(tokenizer, model)`` tuple, both created with ``from_pretrained``
        from the checkpoint mapped to ``model_name``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported keys.
    """
    checkpoint_names = {
        'bert': 'bert-base-cased',  # https://huggingface.co/bert-base-cased
        'roberta': 'roberta-base',  # https://huggingface.co/roberta-base
        'distilbert': 'distilbert-base-cased'  # https://huggingface.co/distilbert-base-cased
    }

    # Validate explicitly instead of with `assert`: asserts are stripped under
    # `python -O`, which would leave a bare KeyError from the lookups below.
    # Tuple membership also copes with unhashable arguments.
    supported = tuple(checkpoint_names)
    if model_name not in supported:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {list(supported)}"
        )

    model_classes = {
        'bert': BertModel,
        'roberta': RobertaModel,
        'distilbert': DistilBertModel
    }

    checkpoint = checkpoint_names[model_name]
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = model_classes[model_name].from_pretrained(checkpoint)
    return tokenizer, model

In [None]:
# Use the DistilBERT option ('distilbert-base-cased' in get_model) as the text encoder.
tokenizer, model = get_model('distilbert')

In [None]:
### Сделаем датасет для постов

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding


class PostDataset(Dataset):
    """Map-style dataset over post texts that are tokenized once, up front.

    Each item is a dict with the ``input_ids`` and ``attention_mask`` of a
    single post, sliced out of the batch encoding built in the constructor.
    """

    def __init__(self, texts, tokenizer):
        """Tokenize all ``texts`` eagerly and keep the resulting encoding.

        Args:
            texts: List of strings to encode.
            tokenizer: Callable tokenizer (e.g. the one returned by
                ``AutoTokenizer.from_pretrained``); it is also stored on the
                instance as ``self.tokenizer``.
        """
        super().__init__()

        # Call the tokenizer directly: `batch_encode_plus` is deprecated in
        # favour of `__call__`, which takes the same arguments for a list of
        # texts. Truncation and padding are requested so that the encoding
        # can be returned as rectangular PyTorch tensors.
        self.texts = tokenizer(
            texts,
            add_special_tokens=True,
            return_token_type_ids=False,
            return_tensors='pt',
            truncation=True,
            padding=True
        )
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the encoded fields of the post at position ``idx``."""
        return {'input_ids': self.texts['input_ids'][idx], 'attention_mask': self.texts['attention_mask'][idx]}

    def __len__(self):
        """Return the number of encoded posts."""
        return len(self.texts['input_ids'])
    
    
# Encode every post text up front.
dataset = PostDataset(posts_info['text'].tolist(), tokenizer)

# Collate function that pads each batch using the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# shuffle=False keeps batches in row order, so the resulting embeddings can be
# assigned back to posts_info positionally later on.
loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=data_collator,
    pin_memory=True,
)

In [None]:
import torch
from tqdm import tqdm


@torch.inference_mode()
def get_embeddings_labels(model, loader, device=None):
    """Embed every batch from ``loader`` and return the stacked embeddings.

    Despite the name, no labels are produced: only embeddings are returned.
    For each batch the embedding is position 0 along the sequence axis of the
    model's ``last_hidden_state``.

    Args:
        model: Encoder whose output exposes ``last_hidden_state``; it is put
            into eval mode.
        loader: Iterable of batches, each providing ``input_ids`` and
            ``attention_mask`` tensors.
        device: Device the batches are moved to. When omitted, the device of
            the model's parameters is used (CPU if the model has none), so the
            function no longer depends on a global ``device`` variable.

    Returns:
        A CPU tensor with the per-batch embeddings concatenated along dim 0,
        in loader order.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()

    total_embeddings = []

    for batch in tqdm(loader):
        # Forward only the two tensors the encoder needs, on the target device.
        batch = {key: batch[key].to(device) for key in ('attention_mask', 'input_ids')}

        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        # Move results off the device right away to keep device memory flat.
        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0)

In [None]:
# Prefer the first CUDA device and fall back to CPU when CUDA is unavailable.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)
# Only query the GPU name when a GPU was actually selected:
# torch.cuda.get_device_name() raises on CPU-only machines.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(device))

model = model.to(device)

In [None]:
# Embed all posts and convert the returned CPU tensor to a NumPy array.
embeddings = get_embeddings_labels(model, loader).numpy()

# Last expression of the cell: shows the array as the notebook output.
embeddings

In [None]:
### Пытаемся кластеризовать тексты

from sklearn.decomposition import PCA

# Center each embedding dimension on its own mean. axis=0 is required here:
# a bare `.mean()` is a single scalar over all entries and does not center
# the individual features. (scikit-learn's PCA also centers per feature
# internally, so the projection itself is essentially unchanged.)
centered = embeddings - embeddings.mean(axis=0)

# Fixed seed so the projection is reproducible if a randomized SVD solver is
# selected; consistent with the seeded KMeans that consumes this output.
pca = PCA(n_components=50, random_state=0)
pca_decomp = pca.fit_transform(centered)

In [None]:
from sklearn.cluster import KMeans

n_clusters = 15

# Cluster the PCA-reduced embeddings; the seed is fixed via random_state.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(pca_decomp)

# Cluster id of every post, assigned positionally.
posts_info['TextCluster'] = kmeans.labels_

# kmeans.transform yields one column per cluster; name them by cluster index.
dists_columns = ['DistanceToCluster_' + str(i) for i in range(n_clusters)]
dists_df = pd.DataFrame(kmeans.transform(pca_decomp), columns=dists_columns)

dists_df.head()

In [None]:
# Append the per-cluster distance columns side by side (assumes both frames
# share the same row index), then drop the raw text column.
posts_info = pd.concat([posts_info, dists_df], axis=1).drop(columns=["text"])

posts_info

In [None]:
### Free memory so that the remaining steps fit

# Move the weights back to the CPU before dropping the reference.
model.cpu()

# Drop the encoder, the data pipeline and every intermediate array.
del model, tokenizer
del dataset, loader
del embeddings, centered, pca, pca_decomp

In [None]:
import gc

# Reclaim the objects released by the `del` statements in the cleanup cell.
collected = gc.collect()

# Garbage collection alone does not hand PyTorch's cached CUDA memory back to
# the driver; release it so the GPU-backed CatBoost training can use it.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Keep the collected-object count as the cell output.
collected

In [None]:
# Persist the post features. Reuse the engine created at the top of the
# notebook instead of repeating the credential-bearing URL a second time.
# NOTE(review): the account name ends in "-ro"; confirm it is allowed to write.
posts_info.to_sql(
    "posts_info_features_dl",
    con=engine,
    schema="public",
    if_exists='replace'
)

## Теперь приступаем к обработке действий

In [None]:
### Try to fetch 9 million rows, filtering in SQL so that only `view` actions remain

# Feed rows joined with user attributes; hour and month are derived from the
# timestamp on the database side.
feed_query = """
    SELECT
        cast(extract(hour from timestamp) as int) as hour,
        cast(extract(month from timestamp) as int) as month,
        post_id,
        gender,
        age,
        country,
        city,
        exp_group,
        os,
        source,
        target
    FROM public.feed_data JOIN public.user_data ON public.feed_data.user_id = public.user_data.user_id
    WHERE action = 'view'
    LIMIT 9000000
    """
feed_data = pd.read_sql(feed_query, con=connection)

feed_data.head()

In [None]:
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm


# Attach the post features to every feed row; post_id is only the join key
# and is dropped before training.
feed_data = feed_data.merge(posts_info, how='left', on='post_id')
feed_data.drop(columns=['post_id'], inplace=True)

# Columns passed to CatBoost as categorical features.
object_cols = [
    'topic',
    'TextCluster',
    'gender',
    'country',
    'city',
    'exp_group',
    'hour',
    'month',
    'os',
    'source',
]

catboost = CatBoostClassifier(
    task_type="GPU",
    thread_count=-1,
    random_seed=12345612,
    iterations=200,
    learning_rate=1,
    depth=2,
)

# Train on every column except the target.
catboost.fit(
    X=feed_data.drop(columns=['target']),
    y=feed_data['target'],
    cat_features=object_cols,
)

# Save the trained model in CatBoost's "cbm" format.
catboost.save_model('catboost_model.cbm', format="cbm")