In [1]:
from sqlalchemy import create_engine


import os

# The connection URL (including the password) is read from the environment so
# that credentials never live in the notebook source or its saved outputs.
# Export it before starting the kernel, e.g.
#   export STARTML_DATABASE_URL="postgresql://<user>:<password>@<host>:<port>/<db>"
# NOTE(review): the password that used to be committed here should be rotated.
_database_url = os.environ.get("STARTML_DATABASE_URL")
if not _database_url:
    raise RuntimeError(
        "Set the STARTML_DATABASE_URL environment variable to the PostgreSQL connection URL"
    )

engine = create_engine(_database_url)

# stream_results=True is an execution option set on this connection; it is
# the connection later passed to pd.read_sql for the posts table.
connection = engine.connect().execution_options(stream_results=True)

In [2]:
### Posts and topics
### The posts need extra work: their embeddings are generated with the models from lesson 10
import pandas as pd


# Pull the whole post table through the connection opened in the first cell.
posts_info = pd.read_sql(sql="SELECT * FROM public.post_text_df", con=connection)

posts_info

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business
...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie
7019,7316,I give this movie 2 stars purely because of it...,movie
7020,7317,I cant believe this film was allowed to be mad...,movie
7021,7318,The version I saw of this film was the Blockbu...,movie


In [3]:
### Сделаем эмбеддинги постов с помощью моделей из 10 занятия

from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel


def get_model(model_name):
    """Load the pretrained tokenizer and encoder for one supported architecture.

    Args:
        model_name: one of ``'bert'``, ``'roberta'`` or ``'distilbert'``.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded with ``from_pretrained``
        from the same checkpoint name.

    Raises:
        AssertionError: if ``model_name`` is not one of the supported names.
    """
    supported_names = ('bert', 'roberta', 'distilbert')
    assert model_name in supported_names

    # Single registry: architecture name -> (checkpoint id, model class).
    registry = {
        'bert': ('bert-base-cased', BertModel),  # https://huggingface.co/bert-base-cased
        'roberta': ('roberta-base', RobertaModel),  # https://huggingface.co/roberta-base
        'distilbert': ('distilbert-base-cased', DistilBertModel),  # https://huggingface.co/distilbert-base-cased
    }
    checkpoint, model_class = registry[model_name]

    # Tokenizer first, then weights — same loading order as before.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = model_class.from_pretrained(checkpoint)
    return tokenizer, model

In [4]:
# Load the tokenizer and encoder registered under 'distilbert' in get_model.
tokenizer, model = get_model('distilbert')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

In [5]:
### Сделаем датасет для постов

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding


import torch  # needed for torch.tensor below; this cell otherwise imports only torch.utils.data names


class PostDataset(Dataset):
    """Tokenized post texts served one example at a time.

    The texts are tokenized once, up front, with truncation but WITHOUT
    padding. Padding the whole corpus here would stretch every example to the
    longest post, so each batch would run at that length and the
    ``DataCollatorWithPadding`` passed to the ``DataLoader`` would have
    nothing left to do. Leaving examples unpadded lets the collator pad each
    batch only to that batch's longest sequence.

    Args:
        texts: list of strings to encode.
        tokenizer: a Hugging Face tokenizer; it is called directly
            (``tokenizer(...)``) rather than through the deprecated
            ``batch_encode_plus``.
    """

    def __init__(self, texts, tokenizer):
        super().__init__()

        # No return_tensors: ragged (unpadded) sequences cannot form one tensor.
        self.texts = tokenizer(
            texts,
            add_special_tokens=True,
            return_token_type_ids=False,
            truncation=True,
            padding=False
        )
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the ``input_ids`` / ``attention_mask`` pair of example ``idx`` as 1-D long tensors."""
        return {
            'input_ids': torch.tensor(self.texts['input_ids'][idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.texts['attention_mask'][idx], dtype=torch.long)
        }

    def __len__(self):
        """Number of encoded texts."""
        return len(self.texts['input_ids'])


# The collator is what turns a list of examples into one padded batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

dataset = PostDataset(
    texts=posts_info['text'].values.tolist(),
    tokenizer=tokenizer,
)

# shuffle=False keeps batches in posts_info row order; the cluster labels and
# distances computed later are attached back to posts_info by position.
loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=False,
    pin_memory=True,
    collate_fn=data_collator,
)

In [6]:
import torch
from tqdm import tqdm


@torch.inference_mode()
def get_embeddings_labels(model, loader, device=None):
    """Encode every batch of ``loader`` and return the stacked first-token embeddings.

    Despite the name, no labels are returned — only the embeddings tensor.

    Args:
        model: encoder whose output is indexable by ``'last_hidden_state'``.
        loader: iterable of batches, each a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU if the model has no parameters),
            so the function no longer relies on a global ``device`` variable
            being defined before it is called.

    Returns:
        A CPU tensor made by concatenating, along dim 0, the hidden state at
        sequence position 0 of every batch, in loader order.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()

    total_embeddings = []

    for batch in tqdm(loader):
        # Forward only the two tensors the encoder is called with.
        batch = {key: batch[key].to(device) for key in ['attention_mask', 'input_ids']}

        # Position 0 of each sequence (presumably the [CLS] token for BERT-style tokenizers).
        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        # Move to CPU right away so results do not pile up on the accelerator.
        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0)

In [7]:
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda_available else 'cpu')

print(device)
if cuda_available:
    # Only query the GPU name when a GPU exists: torch.cuda.get_device_name()
    # raises on a CPU-only machine, which would break the CPU fallback above.
    print(torch.cuda.get_device_name(device))

model = model.to(device)

cuda:0
Tesla T4


In [8]:
# Encode every post; the helper returns a CPU tensor, which .numpy() exposes
# as an ndarray with one row per post, in loader order.
embeddings = get_embeddings_labels(model, loader).numpy()

embeddings

100%|██████████| 220/220 [01:50<00:00,  2.00it/s]


array([[ 3.6315086e-01,  4.8937496e-02, -2.6408118e-01, ...,
        -1.4159346e-01,  1.5918216e-02,  9.1982896e-05],
       [ 2.3641640e-01, -1.5950108e-01, -3.2779828e-01, ...,
        -2.8993604e-01,  1.1936528e-01, -1.6235473e-03],
       [ 3.7519148e-01, -1.1394388e-01, -2.4054705e-01, ...,
        -3.3891949e-01,  5.8694065e-02, -2.1265799e-02],
       ...,
       [ 3.4038273e-01,  6.6492192e-02, -1.6318429e-01, ...,
        -8.6562753e-02,  2.0340374e-01,  3.2090571e-02],
       [ 4.3209219e-01,  1.1091532e-02, -1.1730607e-01, ...,
         7.5401559e-02,  1.0273975e-01,  1.5274222e-02],
       [ 3.0427766e-01, -7.6215670e-02, -6.7758739e-02, ...,
        -5.4348916e-02,  2.4438348e-01, -1.4148588e-02]], dtype=float32)

In [9]:
### Пытаемся кластеризовать тексты

from sklearn.decomposition import PCA

# Centre each embedding dimension on its own mean. The previous
# `embeddings.mean()` (no axis) produced one scalar averaged over all
# elements, which shifts the data but does not centre any feature.
centered = embeddings - embeddings.mean(axis=0, keepdims=True)

# random_state pins the result when scikit-learn picks a randomized SVD
# solver, so cluster features are reproducible between runs.
pca = PCA(n_components=50, random_state=0)
pca_decomp = pca.fit_transform(centered)

In [10]:
from sklearn.cluster import KMeans

n_clusters = 15

# Build the estimator and fit it as two separate steps; fit() trains in place.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(pca_decomp)

# Cluster id of each post, attached by row position.
posts_info['TextCluster'] = kmeans.labels_

# One column per cluster, filled from kmeans.transform (per the column
# names, the distance from each post to each cluster centre).
dists_columns = []
for i in range(n_clusters):
    dists_columns.append(f'DistanceToCluster_{i}')

dists_df = pd.DataFrame(kmeans.transform(pca_decomp), columns=dists_columns)

dists_df.head()



Unnamed: 0,DistanceToCluster_0,DistanceToCluster_1,DistanceToCluster_2,DistanceToCluster_3,DistanceToCluster_4,DistanceToCluster_5,DistanceToCluster_6,DistanceToCluster_7,DistanceToCluster_8,DistanceToCluster_9,DistanceToCluster_10,DistanceToCluster_11,DistanceToCluster_12,DistanceToCluster_13,DistanceToCluster_14
0,3.468532,3.385353,3.370952,3.40149,2.839169,2.998907,1.820302,3.407521,2.319735,3.666843,3.438987,3.607736,3.464358,1.967131,2.21149
1,3.245615,3.367247,3.325195,3.206344,2.556945,2.846266,1.417452,3.326378,2.296977,3.471824,2.977058,3.349583,3.141739,2.199386,2.235819
2,3.393113,3.495816,3.26427,3.263666,2.886763,3.057023,1.704455,3.356817,2.373559,3.454646,2.965785,3.339587,3.137806,1.810363,3.033427
3,4.062727,3.746369,3.516744,3.677976,3.38147,3.262566,2.470073,3.739873,2.79604,3.150546,3.71387,3.79261,3.797389,2.432608,3.39137
4,3.239508,2.800996,3.036209,2.846339,2.144595,2.65301,2.131008,2.809505,2.004131,3.172822,2.635893,3.032179,2.781052,1.47419,2.930253


In [11]:
# Put the distance columns next to the post metadata and drop the raw text
# in one chained expression.
# NOTE: this cell rebinds posts_info, so it is not safe to run twice.
posts_info = pd.concat([posts_info, dists_df], axis=1).drop(columns=["text"])

posts_info

Unnamed: 0,post_id,topic,TextCluster,DistanceToCluster_0,DistanceToCluster_1,DistanceToCluster_2,DistanceToCluster_3,DistanceToCluster_4,DistanceToCluster_5,DistanceToCluster_6,DistanceToCluster_7,DistanceToCluster_8,DistanceToCluster_9,DistanceToCluster_10,DistanceToCluster_11,DistanceToCluster_12,DistanceToCluster_13,DistanceToCluster_14
0,1,business,6,3.468532,3.385353,3.370952,3.401490,2.839169,2.998907,1.820302,3.407521,2.319735,3.666843,3.438987,3.607736,3.464358,1.967131,2.211490
1,2,business,6,3.245615,3.367247,3.325195,3.206344,2.556945,2.846266,1.417452,3.326378,2.296977,3.471824,2.977058,3.349583,3.141739,2.199386,2.235819
2,3,business,6,3.393113,3.495816,3.264270,3.263666,2.886763,3.057023,1.704455,3.356817,2.373559,3.454646,2.965785,3.339587,3.137806,1.810363,3.033427
3,4,business,13,4.062727,3.746369,3.516744,3.677976,3.381470,3.262566,2.470073,3.739873,2.796040,3.150546,3.713870,3.792610,3.797389,2.432608,3.391370
4,5,business,13,3.239508,2.800996,3.036209,2.846339,2.144595,2.653010,2.131008,2.809505,2.004131,3.172822,2.635893,3.032179,2.781052,1.474190,2.930253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7315,movie,7,3.397617,1.802317,3.134332,2.071407,2.335500,2.816583,3.030622,1.281892,2.734563,2.144108,3.052176,2.946540,1.820922,3.009048,3.355394
7019,7316,movie,7,3.383263,1.446620,2.928633,1.831074,2.235439,2.500822,3.043121,0.927261,2.447862,1.958760,3.194599,2.593976,1.839164,2.974894,3.189076
7020,7317,movie,7,3.463894,2.013117,2.833793,2.233146,2.450910,2.535851,3.288661,1.501821,2.810105,2.397254,3.153662,2.373365,1.984437,3.188824,3.403875
7021,7318,movie,1,3.410543,1.049805,3.431536,1.934341,2.310634,3.106439,3.313971,1.490453,2.991921,1.789914,3.218118,3.298051,1.526987,3.198726,3.443346


In [12]:
### Free memory so that everything that follows fits

# Move the model to the CPU before dropping the last reference to it.
model.cpu()

# Drop the references grouped by stage: model, data pipeline, numeric intermediates.
del model, tokenizer
del dataset, loader
del embeddings, centered, pca, pca_decomp

In [13]:
import gc

import torch

# Reclaim the Python objects released by the `del` statements in the previous cell.
freed_objects = gc.collect()

# gc.collect() alone leaves freed GPU blocks inside PyTorch's caching
# allocator; hand them back to the driver so the GPU CatBoost training later
# in the notebook can use that memory.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Displayed as the cell output, as before.
freed_objects

53

In [14]:
# Persist the post features. The engine created in the first cell is reused
# instead of repeating the credential-bearing connection URL here.
# NOTE(review): the DataFrame index is written as an extra "index" column
# (it shows up when the table is read back); pass index=False if no consumer needs it.
posts_info.to_sql(
    "mfkky2",
    con=engine,
    schema="public",
    if_exists='replace'
)

23

In [15]:
# Read the table back to check the upload, reusing the shared engine rather
# than a second copy of the connection URL.
test = pd.read_sql("SELECT * FROM mfkky2", con=engine)
test

Unnamed: 0,index,post_id,topic,TextCluster,DistanceToCluster_0,DistanceToCluster_1,DistanceToCluster_2,DistanceToCluster_3,DistanceToCluster_4,DistanceToCluster_5,DistanceToCluster_6,DistanceToCluster_7,DistanceToCluster_8,DistanceToCluster_9,DistanceToCluster_10,DistanceToCluster_11,DistanceToCluster_12,DistanceToCluster_13,DistanceToCluster_14
0,0,1,business,6,3.468532,3.385353,3.370952,3.401490,2.839169,2.998907,1.820302,3.407520,2.319735,3.666843,3.438987,3.607736,3.464358,1.967131,2.211490
1,1,2,business,6,3.245614,3.367247,3.325195,3.206344,2.556945,2.846266,1.417452,3.326378,2.296977,3.471824,2.977058,3.349583,3.141739,2.199386,2.235819
2,2,3,business,6,3.393113,3.495816,3.264270,3.263666,2.886763,3.057023,1.704455,3.356818,2.373559,3.454646,2.965785,3.339587,3.137806,1.810363,3.033427
3,3,4,business,13,4.062728,3.746369,3.516744,3.677976,3.381470,3.262566,2.470073,3.739873,2.796040,3.150546,3.713870,3.792610,3.797389,2.432608,3.391370
4,4,5,business,13,3.239508,2.800996,3.036209,2.846338,2.144595,2.653010,2.131008,2.809506,2.004131,3.172821,2.635893,3.032179,2.781052,1.474190,2.930253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7018,7315,movie,7,3.397617,1.802317,3.134332,2.071407,2.335500,2.816583,3.030622,1.281892,2.734563,2.144108,3.052176,2.946540,1.820922,3.009048,3.355394
7019,7019,7316,movie,7,3.383263,1.446620,2.928633,1.831074,2.235439,2.500822,3.043121,0.927261,2.447862,1.958759,3.194599,2.593977,1.839164,2.974894,3.189076
7020,7020,7317,movie,7,3.463894,2.013117,2.833793,2.233146,2.450910,2.535851,3.288661,1.501821,2.810105,2.397254,3.153662,2.373365,1.984436,3.188824,3.403875
7021,7021,7318,movie,1,3.410543,1.049805,3.431536,1.934341,2.310634,3.106439,3.313971,1.490453,2.991921,1.789914,3.218118,3.298051,1.526987,3.198726,3.443345


In [16]:
### Fetch up to 1,000,000 'view' events (see LIMIT below) joined with the user attributes.
### NOTE(review): LIMIT without ORDER BY returns whatever rows the database yields first,
### presumably not a random sample of users — confirm this is acceptable for training.


feed_data = pd.read_sql(
    """
    SELECT
        cast(extract(hour from timestamp) as int) as hour,
        cast(extract(month from timestamp) as int) as month,
        post_id,
        gender,
        age,
        country,
        city,
        exp_group,
        os,
        source,
        target
    FROM public.feed_data JOIN public.user_data ON public.feed_data.user_id = public.user_data.user_id
    WHERE action = 'view'
    LIMIT 1000000
    """,
    # Reuse the engine from the first cell instead of an inline credential URL.
    con=engine
)

feed_data.head()

Unnamed: 0,hour,month,post_id,gender,age,country,city,exp_group,os,source,target
0,10,10,101,1,15,Russia,Novodvinsk,2,Android,organic,0
1,10,10,1238,1,15,Russia,Novodvinsk,2,Android,organic,0
2,10,10,1347,1,15,Russia,Novodvinsk,2,Android,organic,0
3,10,10,5940,1,15,Russia,Novodvinsk,2,Android,organic,1
4,10,10,6006,1,15,Russia,Novodvinsk,2,Android,organic,1


In [19]:
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm
import pandas as pd


# Columns passed to CatBoost as categorical features when the model is fitted.
object_cols = [
    'topic', 'TextCluster', 'gender', 'country',
    'city', 'exp_group', 'hour', 'month',
    'os', 'source'
]

# The CatBoostClassifier that used to be built here was dead code: it is
# created again, with identical parameters, in its own cell before fitting.

# Attach the per-post features (topic, cluster id, cluster distances) to each
# event; post_id is only the join key and is dropped so it is not a feature.
# NOTE: this cell rebinds feed_data and removes post_id, so it cannot be run twice.
feed_data = pd.merge(
    feed_data,
    posts_info,
    on='post_id',
    how='left'
).drop(columns=['post_id'])

feed_data





Unnamed: 0,hour,month,gender,age,country,city,exp_group,os,source,target,...,DistanceToCluster_5,DistanceToCluster_6,DistanceToCluster_7,DistanceToCluster_8,DistanceToCluster_9,DistanceToCluster_10,DistanceToCluster_11,DistanceToCluster_12,DistanceToCluster_13,DistanceToCluster_14
0,10,10,1,15,Russia,Novodvinsk,2,Android,organic,0,...,3.612790,2.505223,3.616951,3.062695,3.647959,3.583918,3.942473,3.331117,2.815611,3.308552
1,10,10,1,15,Russia,Novodvinsk,2,Android,organic,0,...,3.402462,2.530417,3.416208,2.782855,3.695047,3.250417,3.730466,3.273979,2.800575,1.031087
2,10,10,1,15,Russia,Novodvinsk,2,Android,organic,0,...,3.307303,2.597260,3.152012,2.650591,3.394868,3.453131,3.760763,3.109985,2.994552,1.462386
3,10,10,1,15,Russia,Novodvinsk,2,Android,organic,1,...,2.554979,3.187874,1.338334,2.673318,1.947870,3.473159,2.719651,2.155663,3.025272,3.457149
4,10,10,1,15,Russia,Novodvinsk,2,Android,organic,1,...,2.790670,2.800752,1.637603,2.349257,2.428610,3.103940,2.891082,1.217432,2.849182,2.898876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,12,11,1,34,Russia,Shakhty,1,Android,organic,0,...,2.526679,2.007642,2.522195,1.872030,2.921929,2.449395,2.846731,2.412287,2.032156,1.926002
999996,12,11,1,34,Russia,Shakhty,1,Android,organic,0,...,3.591667,2.696925,3.643361,2.982992,3.951999,3.602880,3.938025,3.508961,3.028809,1.195882
999997,12,11,1,34,Russia,Shakhty,1,Android,organic,0,...,3.038791,2.157437,3.259377,2.476523,3.572337,3.125969,3.411896,3.085254,2.367481,0.920997
999998,12,11,1,34,Russia,Shakhty,1,Android,organic,0,...,3.941620,3.318270,3.991980,3.704111,4.065576,2.905790,4.024712,3.683149,3.518441,3.671527


In [20]:
# Show the columns of the training frame next to those of the table read back from the database.
feed_data.columns, test.columns


(Index(['hour', 'month', 'gender', 'age', 'country', 'city', 'exp_group', 'os',
        'source', 'target', 'topic', 'TextCluster', 'DistanceToCluster_0',
        'DistanceToCluster_1', 'DistanceToCluster_2', 'DistanceToCluster_3',
        'DistanceToCluster_4', 'DistanceToCluster_5', 'DistanceToCluster_6',
        'DistanceToCluster_7', 'DistanceToCluster_8', 'DistanceToCluster_9',
        'DistanceToCluster_10', 'DistanceToCluster_11', 'DistanceToCluster_12',
        'DistanceToCluster_13', 'DistanceToCluster_14'],
       dtype='object'),
 Index(['index', 'post_id', 'topic', 'TextCluster', 'DistanceToCluster_0',
        'DistanceToCluster_1', 'DistanceToCluster_2', 'DistanceToCluster_3',
        'DistanceToCluster_4', 'DistanceToCluster_5', 'DistanceToCluster_6',
        'DistanceToCluster_7', 'DistanceToCluster_8', 'DistanceToCluster_9',
        'DistanceToCluster_10', 'DistanceToCluster_11', 'DistanceToCluster_12',
        'DistanceToCluster_13', 'DistanceToCluster_14'],
       dty

In [21]:
from sklearn.model_selection import train_test_split

# Separate the label from the feature columns.
y = feed_data['target']
X = feed_data.drop(columns=['target'])

# Seeded 80/20 random hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

X_train.shape, X_test.shape

((800000, 26), (200000, 26))

In [22]:

# Hyper-parameters are collected in one mapping and unpacked into the constructor.
catboost_params = {
    'iterations': 200,
    'learning_rate': 0.5,
    'depth': 2,
    'random_seed': 12345612,
    'thread_count': -1,
    'task_type': "GPU",
}

catboost = CatBoostClassifier(**catboost_params)

In [23]:
# Train on the 80% split; the columns listed in object_cols are passed as categorical features.
catboost.fit(X_train, y_train, cat_features=object_cols)

0:	learn: 0.4325844	total: 77.4ms	remaining: 15.4s
1:	learn: 0.3772517	total: 121ms	remaining: 12s
2:	learn: 0.3658193	total: 174ms	remaining: 11.4s
3:	learn: 0.3583927	total: 279ms	remaining: 13.7s
4:	learn: 0.3563144	total: 361ms	remaining: 14.1s
5:	learn: 0.3553374	total: 469ms	remaining: 15.2s
6:	learn: 0.3550448	total: 599ms	remaining: 16.5s
7:	learn: 0.3547890	total: 684ms	remaining: 16.4s
8:	learn: 0.3544856	total: 761ms	remaining: 16.1s
9:	learn: 0.3543159	total: 794ms	remaining: 15.1s
10:	learn: 0.3542713	total: 832ms	remaining: 14.3s
11:	learn: 0.3541981	total: 869ms	remaining: 13.6s
12:	learn: 0.3541151	total: 966ms	remaining: 13.9s
13:	learn: 0.3540491	total: 1.04s	remaining: 13.8s
14:	learn: 0.3536398	total: 1.12s	remaining: 13.8s
15:	learn: 0.3527772	total: 1.2s	remaining: 13.8s
16:	learn: 0.3527563	total: 1.3s	remaining: 14s
17:	learn: 0.3525473	total: 1.37s	remaining: 13.9s
18:	learn: 0.3522917	total: 1.43s	remaining: 13.6s
19:	learn: 0.3522619	total: 1.51s	remaining: 1

<catboost.core.CatBoostClassifier at 0x7c900d2cfa00>

In [24]:
from sklearn.metrics import roc_auc_score

# Column 1 of predict_proba is used as the score for the positive class.
train_scores = catboost.predict_proba(X_train)[:, 1]
test_scores = catboost.predict_proba(X_test)[:, 1]

train_auc = roc_auc_score(y_train, train_scores)
test_auc = roc_auc_score(y_test, test_scores)

print(f'Качество на трейне: {train_auc}')
print(f'Качество на тесте: {test_auc}')

Качество на трейне: 0.6778599736347853
Качество на тесте: 0.6700381406420192


In [46]:
# Write the trained model to disk in CatBoost's "cbm" format.
catboost.save_model('catboost_modelDL.cbm', format="cbm")