In [1]:
%load_ext autoreload
%autoreload 2

# import logging
import torch
import pytorch_lightning as pl

import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score
import sentence_transformers
# import warnings

# warnings.filterwarnings('ignore')
# logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

In [2]:
region_to_utc = {
    'Chelyabinsk': '+05:00',
    'Bashkortostan Republic': '+05:00',
    'St.-Petersburg': '+03:00',
    'Moscow': '+03:00',
    'Rostov': '+03:00',
    'Moscow Oblast': '+03:00',
    'Kursk Oblast': '+03:00',
    'Kemerovo Oblast': '+07:00',
    'Arkhangelskaya': '+03:00',
    'Tomsk Oblast': '+07:00',
    'Novosibirsk Oblast': '+07:00',
    'Sverdlovsk Oblast': '+05:00',
    'Leningradskaya Oblast': '+03:00',
    'Krasnodar Krai': '+03:00',
    'Tatarstan Republic': '+03:00',
    'Belgorod Oblast': '+03:00',
    'Kuzbass': '+07:00',
    'Udmurtiya Republic': '+04:00',
    'Chuvashia': '+03:00',
    'Ryazan Oblast': '+03:00',
    'Perm Krai': '+05:00',
    'Sakha': '+09:00',
    'Orenburg Oblast': '+05:00',
    'Primorye': '+10:00',
    'Zabaykalskiy Transbaikal Kray': '+09:00',
    'Bryansk Oblast': '+03:00',
    'Tver Oblast': '+03:00',
    'Stavropol Kray': '+03:00',
    'Khabarovsk': '+10:00',
    'Penza Oblast': '+03:00',
    'Mariy-El Republic': '+03:00',
    'Smolensk Oblast': '+03:00',
    'Tambov Oblast': '+03:00',
    'Novgorod Oblast': '+03:00',
    'Khakasiya Republic': '+07:00',
    'Ulyanovsk': '+04:00',
    'Volgograd Oblast': '+03:00',
    'Irkutsk Oblast': '+08:00',
    'Komi': '+03:00',
    'Nizhny Novgorod Oblast': '+03:00',
    'Krasnoyarsk Krai': '+07:00',
    'Kurgan Oblast': '+05:00',
    'Kirov Oblast': '+03:00',
    'Omsk Oblast': '+06:00',
    'Vladimir Oblast': '+03:00',
    'Yaroslavl Oblast': '+03:00',
    'Saratov Oblast': '+04:00',
    'Khanty-Mansia': '+05:00',
    'Tula Oblast': '+03:00',
    'Amur Oblast': '+09:00',
    'Altay Kray': '+07:00',
    'Buryatiya Republic': '+08:00',
    'Dagestan': '+03:00',
    'Kaluga Oblast': '+03:00',
    'Kaliningrad Oblast': '+02:00',
    'Murmansk': '+03:00',
    'Samara Oblast': '+04:00',
    'Stavropol Kray': '+03:00',
    'Voronezh Oblast': '+03:00',
    'Kursk': '+03:00',
    'Sverdlovsk': '+05:00',
    'Karelia': '+03:00',
    'Lipetsk Oblast': '+03:00',
    'Adygeya Republic': '+03:00',
    'Ivanovo Oblast': '+03:00',
    'Oryol Oblast': '+03:00',
    'Tula': '+03:00',
    'Kamchatka': '+12:00',
    'Tyumen Oblast': '+05:00',
    'Krasnodarskiy': '+03:00',
    'Krasnoyarskiy': '+07:00',
    'Pskov Oblast': '+03:00',
    'Crimea': '+03:00',
    'Chechnya': '+03:00',
    'Saratovskaya Oblast': '+04:00',
    'Kalmykiya Republic': '+03:00',
    'North Ossetia–Alania': '+03:00',
    'Vologda Oblast': '+03:00',
    'Karachayevo-Cherkesiya Republic': '+03:00',
    'Voronezh': '+03:00',
    'Chukotka': '+12:00',
    'Mordoviya Republic': '+03:00',
    'Kostroma Oblast': '+03:00',
    'Yamalo-Nenets': '+05:00',
    'Magadan Oblast': '+11:00',
    'Altai': '+07:00',
    'Vladimir': '+03:00',
    'Ivanovo': '+03:00',
    'Astrakhan Oblast': '+04:00',
    'Penza': '+03:00',
    'Kabardino-Balkariya Republic': '+03:00',
    'Jaroslavl': '+03:00',
    'Sakhalin Oblast': '+11:00',
    'Sebastopol City': '+03:00',
    'Tyumen Oblast': '+05:00',
    'Kirov': '+03:00',
    'Orel Oblast': '+03:00',
    'Omsk': '+06:00',
    'Smolenskaya Oblast': '+03:00',
    'Nenets': '+03:00',
    'Tver Oblast': '+03:00',
    'Jewish Autonomous Oblast': '+10:00',
    'Ingushetiya Republic': '+03:00',
    'Kaluga': '+03:00',
    'Kaliningrad': '+02:00',
    'North Ossetia': '+03:00',
    'Perm': '+05:00',
    'Smolensk': '+03:00',
    'Primorskiy Maritime Kray': '+10:00',
    'Vologda': '+03:00',
    'Stavropol Krai': '+03:00',
    'Astrakhan': '+04:00',
    'Transbaikal Territory': '+09:00',
    'Tambov': '+03:00',
    'Tyva Republic': '+07:00',
    'Arkhangelsk Oblast': '+03:00'
}

# Define a function to convert local time to UTC
def convert_to_utc(row):
    # Get the UTC offset from the mapping
    utc_offset = int(region_to_utc.get(row['region'], '+03:00').split(":")[0][1:]) # Default to UTC if region not found
    # Convert the local time to UTC
    local_time = row['event_timestamp']
    utc_time = local_time.tz_convert(f'Etc/GMT+{utc_offset}').value # Convert to the appropriate UTC
    return utc_time


In [3]:
# 2.1. Load Data
train_events = pd.read_csv('train_events.csv')
all_events = pd.read_csv('all_events.csv')
video_info = pd.read_csv('video_info_v2.csv')
train_targets = pd.read_csv('train_targets.csv')

In [4]:
events = pd.concat([train_events,all_events])

In [5]:
# 2.2. Merge DataFrames
data = events.merge(video_info, on='rutube_video_id', how='left')
#data = data.merge(train_targets, on='viewer_uid', how='left')

# 2.3. Convert event_timestamp to datetime
data['event_timestamp'] = pd.to_datetime(data['event_timestamp'])


In [6]:
data['event_timestamp'] = data.apply(convert_to_utc, axis=1)

In [6]:
#data = data.drop(columns  =['age','age_class','sex'])
#data = data.drop(columns = ['title'])

In [8]:
from ptls.preprocessing import PandasDataPreprocessor

preprocessor = PandasDataPreprocessor(
    col_id='viewer_uid',
    col_event_time='event_timestamp',
    event_time_transformation='none',
    cols_category=['title','category','author_id','ua_client_name','rutube_video_id','ua_client_type','ua_os','ua_device_type','region'],
    cols_numerical=['total_watchtime','duration'],
    return_records=True,
)

In [9]:
dataset = preprocessor.fit_transform(data)

In [17]:
import pickle

with open('preprocessor_new', 'wb') as f:
    pickle.dump(preprocessor, f)

In [None]:
dataset = sorted(dataset, key=lambda x: x['viewer_uid'])
train_targets = train_targets.sort_values(by=['viewer_uid'])

In [11]:
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'total_watchtime': 'identity','duration':'identity'},
    embeddings={
        'category':{'in': 128, 'out': 16},
        'author_id':{'in': 128, 'out': 64},
        'ua_client_name':{'in': 128, 'out': 16},
        'rutube_video_id':{'in': 128, 'out': 64},
        'ua_client_type':{'in': 16, 'out': 16},
        'ua_os':{'in': 16, 'out': 16},
        'ua_device_type':{'in': 16, 'out': 16},
        'title':{'in': 128, 'out': 64},
        'region':{'in': 16, 'out': 16},
    },
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type='gru',
)

In [None]:
from functools import partial
from ptls.nn import TrxEncoder, RnnSeqEncoder
from ptls.frames.coles import CoLESModule

trx_encoder_params = dict(
    embeddings_noise=0.003,
    numeric_values={'total_watchtime': 'identity','duration':'identity'},
    embeddings={
        'category':{'in': 128, 'out': 16},
        'author_id':{'in': 128, 'out': 64},
        'ua_client_name':{'in': 128, 'out': 16},
        'rutube_video_id':{'in': 128, 'out': 64},
        'ua_client_type':{'in': 16, 'out': 16},
        'ua_os':{'in': 16, 'out': 16},
        'ua_device_type':{'in': 16, 'out': 16},
        'title':{'in': 128, 'out': 64},
        'region':{'in': 16, 'out': 16},
    },
)

seq_encoder = RnnSeqEncoder(
    trx_encoder=TrxEncoder(**trx_encoder_params),
    hidden_size=256,
    type='gru',
)

model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

In [12]:
model = CoLESModule(
    seq_encoder=seq_encoder,
    optimizer_partial=partial(torch.optim.Adam, lr=0.001),
    lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=30, gamma=0.9),
)

In [13]:
from ptls.data_load.datasets import MemoryMapDataset
from ptls.data_load.iterable_processing import SeqLenFilter
from ptls.frames.coles import ColesDataset
from ptls.frames.coles.split_strategy import SampleSlices
from ptls.frames import PtlsDataModule

train_dl = PtlsDataModule(
    train_data=ColesDataset(
        MemoryMapDataset(
            data=dataset,
            i_filters=[
                SeqLenFilter(min_seq_len=10),
            ],
        ),
        splitter=SampleSlices(
            split_count=5,
            cnt_min=10,
            cnt_max=200,
        ),
    ),
    train_num_workers=4,
    train_batch_size=128,
)

In [14]:
import torch
import pytorch_lightning as pl

import logging

trainer = pl.Trainer(
    precision=16,
    max_epochs=120,
    accelerator="cuda" if torch.cuda.is_available() else "cpu",
    devices=1 if torch.cuda.is_available() else "auto",
    enable_progress_bar=True,
)

Using 16bit None Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [15]:
print(f'logger.version = {trainer.logger.version}')
torch.set_float32_matmul_precision('medium')
trainer.fit(model, train_dl)
print(trainer.logged_metrics)

logger.version = 29


  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name               | Type            | Params
-------------------------------------------------------
0 | _loss              | ContrastiveLoss | 0     
1 | _seq_encoder       | RnnSeqEncoder   | 450 K 
2 | _validation_metric | BatchRecallTopK | 0     
3 | _head              | Head            | 0     
-------------------------------------------------------
450 K     Trainable params
0         Non-trainable params
450 K     Total params
0.902     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

{'loss': tensor(22.1830, device='cuda:0'), 'seq_len': tensor(17.6359, device='cuda:0')}


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [16]:
torch.save(seq_encoder.state_dict(), "coles-title2.pt")

In [45]:
from ptls.data_load.datasets import inference_data_loader
all_dl = inference_data_loader(dataset, num_workers=0, batch_size=128)
train_embeds = torch.vstack(trainer.predict(model, all_dl,))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Predicting: 447it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


TypeError: vstack(): argument 'tensors' (position 1) must be tuple of Tensors, not NoneType

In [60]:
y_pred = model.predict(emb_test)
print(f'Weighted F1-score: {f1:.4f}')

Weighted F1-score: 0.4628
