In [434]:
#!g1.1
%pip install annoy
%pip install torchlars
%pip install welford

Defaulting to user installation because normal site-packages is not writeable
Collecting annoy
  Downloading annoy-1.17.1.tar.gz (647 kB)
     |████████████████████████████████| 647 kB 2.5 MB/s            
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25ldone
[?25h  Created wheel for annoy: filename=annoy-1.17.1-cp38-cp38-linux_x86_64.whl size=396909 sha256=9e0d2ba5ee664c66a0cea754beeb2602cc6a18c09e8dac15fe0d589816726e48
  Stored in directory: /tmp/xdg_cache/pip/wheels/f9/93/19/30511c4a9ae6b4937455a134c34a39e13943e2c6f46fcd2ed2
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.1
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting torchlars
  Downloading torchlars-0.1.2.tar.gz (6.5 kB)
  Preparing m

In [882]:
#!g1.1
%state_exclude [model train_loader val_loader test_loader model]

In [883]:
#!g1.1

# encoding=utf-8

import numpy as np
import pandas as pd
import os
from argparse import ArgumentParser
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.spatial.distance import cdist
from tqdm import tqdm
import random
import annoy
from torchlars import LARS
import time
from datetime import datetime
from collections import OrderedDict
from itertools import islice

#### Datasets 😊

In [884]:
#!g1.1

class SimCLR_TrainMusicDataset(torch.utils.data.Dataset):
    def __init__(self, features_dir_path, meta_info, device='cpu', crop_size = 60):
        self.features_dir_path = features_dir_path
        self.meta_info = meta_info
        self.trackid2path = meta_info.copy().set_index('trackid')['archive_features_path'].to_dict()
        self.artist_track_ids = meta_info.copy().groupby('artistid').agg(list)
        # drop where list has len <= 8
        
        # исходный код с которым почему то лушче результат!!!
        # self.artist_track_ids.drop(self.artist_track_ids[self.artist_track_ids.trackid.agg(len) < 8].index)        
        
        self.artist_track_ids = self.artist_track_ids.drop(self.artist_track_ids[self.artist_track_ids.trackid.agg(len) < 8].index)        
        self.crop_size = crop_size
        self.data = None # initialized via reshuffle
        self.reshuffle()

    def _generate_pairs(self, track_ids):
        np.random.shuffle(track_ids)
        pairs = [track_ids[i-2:i] for i in range(2, len(track_ids)+1, 2)]
        return pairs

    def reshuffle(self):
        # it must be called after each epoch
        artist_track_ids = self.artist_track_ids.copy()
        artist_track_pairs = artist_track_ids['trackid'].map(self._generate_pairs)
        self.data = artist_track_pairs.explode().dropna()
        
    def _load_item(self, track_id):
        track_features_file_path = self.trackid2path[track_id]
        track_features = np.load(os.path.join(self.features_dir_path, track_features_file_path))
        padding = (track_features.shape[1] - self.crop_size) // 2
        return track_features[:, padding:padding+self.crop_size]

    def _load_duplete(self, tracks_duplete):
        return torch.cat([torch.tensor(self._load_item(x)).unsqueeze(0) for x in tracks_duplete])

    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, random_idx):
        file_idx = self.data.iloc[random_idx]
        return self._load_duplete(file_idx), torch.tensor([self.data.index[random_idx]] * 2)
    
class Clustering_TrainMusicDataset(torch.utils.data.Dataset):
    def __init__(self, features_dir_path, meta_info, device='cpu', crop_size = 60):
        self.features_dir_path = features_dir_path
        self.meta_info = meta_info
        self.trackid2path = meta_info.copy().set_index('trackid')['archive_features_path'].to_dict()
        self.artist_track_ids = meta_info.copy().groupby('artistid').agg(list)
        self.crop_size = crop_size
        self.data = None # initialized via reshuffle
        self.reshuffle()

    def _generate_pairs(self, track_ids):
        np.random.shuffle(track_ids)
        pairs = [track_ids[i-2:i] for i in range(2, len(track_ids)+1, 2)]
        return pairs

    def reshuffle(self):
      # it must be called after each epoch
        artist_track_ids = self.artist_track_ids.copy()
        artist_track_pairs = artist_track_ids['trackid'].map(self._generate_pairs)
        self.data = artist_track_pairs.explode().dropna()
        
    def _load_item(self, track_id):
        track_features_file_path = self.trackid2path[track_id]
        track_features = np.load(os.path.join(self.features_dir_path, track_features_file_path))
        padding = (track_features.shape[1] - self.crop_size) // 2
        return track_features[:, padding:padding+self.crop_size]

    def _load_duplete(self, tracks_duplete):
        return torch.cat([torch.tensor(self._load_item(x)).unsqueeze(0) for x in tracks_duplete])

    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, random_idx):
        file_idx = self.data.iloc[random_idx]
        return self._load_duplete(file_idx), torch.tensor([self.data.index[random_idx]] * 2)

In [885]:
#!g1.1
class TrainMusicDataset(torch.utils.data.Dataset):
    def __init__(self, features_dir_path, meta_info, device='cpu', crop_size = 60):
        self.features_dir_path = features_dir_path
        self.data = meta_info
        self.crop_size = crop_size

    def _load_item(self, random_idx):
        track_features_file_path = self.data.archive_features_path.iloc[random_idx]
        track_features = np.load(os.path.join(self.features_dir_path, track_features_file_path))
        padding = (track_features.shape[1] - self.crop_size) // 2
        return track_features[:, padding:padding+self.crop_size]

    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, random_idx):
        return torch.tensor(self._load_item(random_idx)), torch.tensor([self.data.artistid.iloc[random_idx]])

In [886]:
#!g1.1
class TestMusicDataset(torch.utils.data.Dataset):
    def __init__(self, features_dir_path, meta_info, device='cpu', crop_size = 60):
        self.features_dir_path = features_dir_path
        self.data = meta_info
        self.crop_size = crop_size

    def _load_item(self, random_idx):
        track_features_file_path = self.data.archive_features_path.iloc[random_idx]
        track_features = np.load(os.path.join(self.features_dir_path, track_features_file_path))
        padding = (track_features.shape[1] - self.crop_size) // 2
        return track_features[:, padding:padding+self.crop_size]

    def __len__(self):
        return self.data.shape[0]
    
    def __getitem__(self, random_idx):
        return torch.tensor(self._load_item(random_idx)), torch.tensor([self.data.trackid.iloc[random_idx]])

In [887]:
#!g1.1
def train_val_split(dataset, val_size = 0.2): # Сплит по artistid
    artist_ids = dataset['artistid'].unique()
    train_artist_ids, val_artist_ids = train_test_split(artist_ids, test_size = val_size)
    trainset = dataset[dataset['artistid'].isin(train_artist_ids)].copy()
    valset = dataset[dataset['artistid'].isin(val_artist_ids)].copy()
    return trainset, valset

#### Losses

Лосс используемый для SIM_CLR фреймворка

In [888]:
#!g1.1
class NT_Xent(nn.Module):
    def __init__(self, temperature):
        super(NT_Xent, self).__init__()
        self.temperature = temperature
        self.criterion = nn.CrossEntropyLoss(reduction="sum")
        self.similarity_f = nn.CosineSimilarity(dim=2)

    def mask_correlated_samples(self, batch_size):
        N = 2 * batch_size
        mask = torch.ones((N, N), dtype=bool)
        mask = mask.fill_diagonal_(0)
        for i in range(batch_size):
            mask[i, batch_size + i] = 0
            mask[batch_size + i, i] = 0
        return mask
    
    def forward(self, z_i, z_j):
        batch_size = z_i.shape[0]
        N = 2 * batch_size
        z = torch.cat((z_i, z_j), dim=0)
 
        sim = self.similarity_f(z.unsqueeze(1), z.unsqueeze(0)) / self.temperature
        sim_i_j = torch.diag(sim, batch_size)
        sim_j_i = torch.diag(sim, -batch_size)

        mask = self.mask_correlated_samples(batch_size)
        positive_samples = torch.cat((sim_i_j, sim_j_i), dim=0).reshape(N, 1)
        negative_samples = sim[mask].reshape(N, -1)

        labels = torch.zeros(N).to(positive_samples.device).long()
        logits = torch.cat((positive_samples, negative_samples), dim=1)
        loss = self.criterion(logits, labels)
        loss /= N
        
        with torch.no_grad():
            top1_negative_samples, _ = negative_samples.topk(1)
            avg_rank = logits.argsort(descending=True).argmin(dim=1).float().mean().cpu().numpy()

        return loss, avg_rank

Лосс для representation кластеризации

In [889]:
#!g1.1
class StudentSimilarityLoss(nn.Module):
    def __init__(self, alpha):
        super().__init__()
        self.repr_alpha = 1
        self.latent_alpha = alpha
        self.metric = torch.cdist
        self.loss = nn.KLDivLoss(reduction='batchmean')
  
    def _student_prob(self, x, alpha):
        x_matrix = self.metric(x, x)
        x_matrix = torch.pow(x_matrix / alpha + 1, - (alpha + 1) / 2)
        x_matrix = x_matrix / x_matrix.sum()
        return x_matrix

    def forward(self, x1, x2):
        return self.loss(self._student_prob(x1, self.latent_alpha).log(), self._student_prob(x2, self.repr_alpha))

#### Usable functions ✅ (produced by yandex)

In [890]:
#!g1.1

def get_ranked_list(embeds, top_size, annoy_num_trees = 32):
    annoy_index = None
    annoy2id = []
    id2annoy = dict()
    for track_id, track_embed in embeds.items():
        id2annoy[track_id] = len(annoy2id)
        annoy2id.append(track_id)
        if annoy_index is None:
            annoy_index = annoy.AnnoyIndex(len(track_embed), 'angular')
        annoy_index.add_item(id2annoy[track_id], track_embed)
    annoy_index.build(annoy_num_trees)
    ranked_list = dict()
    for track_id in embeds.keys():
        candidates = annoy_index.get_nns_by_item(id2annoy[track_id], top_size+1)[1:] # exclude trackid itself
        candidates = list(filter(lambda x: x != id2annoy[track_id], candidates))
        ranked_list[track_id] = [annoy2id[candidate] for candidate in candidates]
    return ranked_list

def position_discounter(position):
    return 1.0 / np.log2(position+1)   

def get_ideal_dcg(relevant_items_count, top_size):
    dcg = 0.0
    for result_indx in range(min(top_size, relevant_items_count)):
        position = result_indx + 1
        dcg += position_discounter(position)
    return dcg

def compute_dcg(query_trackid, ranked_list, track2artist_map, top_size):
    query_artistid = track2artist_map[query_trackid]
    dcg = 0.0
    for result_indx, result_trackid in enumerate(ranked_list[:top_size]):
        assert result_trackid != query_trackid
        position = result_indx + 1
        discounted_position = position_discounter(position)
        result_artistid = track2artist_map[result_trackid]
        if result_artistid == query_artistid:
            dcg += discounted_position
    return dcg

def eval_submission(submission, gt_meta_info, top_size = 100):
    track2artist_map = gt_meta_info.set_index('trackid')['artistid'].to_dict()
    artist2tracks_map = gt_meta_info.groupby('artistid').agg(list)['trackid'].to_dict()
    ndcg_list = []
    for query_trackid in tqdm(submission.keys()):
        ranked_list = submission[query_trackid]
        query_artistid = track2artist_map[query_trackid]
        query_artist_tracks_count = len(artist2tracks_map[query_artistid])
        ideal_dcg = get_ideal_dcg(query_artist_tracks_count-1, top_size=top_size)
        dcg = compute_dcg(query_trackid, ranked_list, track2artist_map, top_size=top_size)
        try:
            ndcg_list.append(dcg/ideal_dcg)
        except ZeroDivisionError:
            continue
    return np.mean(ndcg_list)

#### Main Net ✅ (produced by yandex)
this place is needed to be changed

In [891]:
#!g1.1
class BasicNet(nn.Module):
    def __init__(self, output_features_size):
        super().__init__()
        self.model_type = 'conv'
        self.output_features_size = output_features_size
        self.conv_1 = nn.Conv1d(512, output_features_size, kernel_size=3, padding=1)
        self.conv_2 = nn.Conv1d(output_features_size, output_features_size, kernel_size=3, padding=1)
        self.mp_1 = nn.MaxPool1d(2, 2)
        self.conv_3 = nn.Conv1d(output_features_size, output_features_size, kernel_size=3, padding=1)
        self.conv_4 = nn.Conv1d(output_features_size, output_features_size, kernel_size=3, padding=1)

    def forward(self, x):
        x = F.relu(self.conv_1(x))
        x = F.relu(self.conv_2(x))
        x = self.mp_1(x)
        x = F.relu(self.conv_3(x))
        return self.conv_4(x).mean(axis = 2)

##### My Net 😊

In [892]:
#!g1.1

class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 60):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(-1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]
        """
        x = x + self.pe
        return self.dropout(x)
    

class TransformerEncoder(nn.Module):
    def __init__(self, d_model: int, input_dim: int = 512, nhead: int = 8, d_hid: int = 512,
                 nlayers: int = 8, dropout: float = 0.1):
        super().__init__()
        # transpose input from [BATCH, 512, 60] -> [BATCH, 60, 512]
        # реализовано в форварде

        # лучше использовать прожектор в конце, пушо предположим что исходные вектора
        # ембедингов получены натренированой моделью
        self.input_encoder = nn.Linear(input_dim, d_model)
        
        self.output_features_size = d_model
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        
        # to project tensor to [BATCH, 1, OUTPUTSIZE] need to use result.mean(axis = 1)

    def forward(self, x):
        # transpose input from [BATCH, 512, 60] -> [BATCH, 60, 512]
        x = torch.transpose(x, 1, 2)
        x = self.input_encoder(x)
        x = self.pos_encoder(x)
        
        # to project tensor to [BATCH, OUTPUTSIZE(d_model)] need to use result.mean(axis = 1)
        x = self.transformer_encoder(x).mean(dim=1)
        return x

##### Resnet  my  😊

In [893]:
#!g1.1
class BottleNeck(nn.Module):
    def __init__(self, in_planes, out_planes, kernel, features_size, downsample):
        super().__init__()
        self.stride = 2 if downsample else 1
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.kernel = kernel
        self.feature_size = features_size
        self.downsample = downsample
        
        self.residual = nn.Sequential(nn.Conv1d(in_planes, out_planes, kernel_size=1, padding=0, stride=2), nn.BatchNorm1d(out_planes)) if downsample else nn.Identity()
        self.net = nn.Sequential(
            nn.Conv1d(in_planes, out_planes // 4, kernel_size=1, padding=0, stride=self.stride),
            nn.BatchNorm1d(out_planes // 4),
            nn.ReLU(),
            nn.Conv1d(out_planes // 4, out_planes // 4, kernel_size=kernel, padding=kernel // 2, stride=1),
            nn.BatchNorm1d(out_planes // 4),
            nn.ReLU(),
            nn.Conv1d(out_planes // 4, out_planes, kernel_size=1, padding=0, stride=1),
            nn.BatchNorm1d(out_planes),
        )
        self.activation = nn.ReLU()

    def forward(self, x):
        res_connect = self.residual(x)
        x = self.net(x)
        x += res_connect
        return self.activation(x)
    
class SimpleBlock(nn.Module):
    def __init__(self, in_planes, out_planes, kernel, features_size, downsample):
        super().__init__()
        self.stride = 2 if downsample else 1
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.kernel = kernel
        self.feature_size = features_size
        self.downsample = downsample
        
        self.residual = nn.Sequential(nn.Conv1d(in_planes, out_planes, kernel_size=1, padding=0, stride=2), nn.BatchNorm1d(out_planes)) if downsample else nn.Identity()
        self.net = nn.Sequential(
            nn.Conv1d(in_planes, out_planes, kernel_size=1, padding=0, stride=self.stride),
            nn.BatchNorm1d(out_planes),
            nn.ReLU(),
            nn.Conv1d(out_planes, out_planes, kernel_size=kernel, padding=kernel // 2, stride=1),
            nn.BatchNorm1d(out_planes),
        )
        self.activation = nn.ReLU()

    def forward(self, x):
        res_connect = self.residual(x)
        x = self.net(x)
        x = x + res_connect
        return self.activation(x)
    
class CLRMBlock(nn.Module):
    def __init__(self, in_planes, out_planes, kernel):
        super().__init__()
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.kernel = kernel
        
#         self.residual = nn.Sequential(nn.MaxPool1d(3, 2, 1), nn.BatchNorm1d(out_planes))
        self.net = nn.Sequential(
            nn.Conv1d(in_planes, out_planes, kernel_size=3, padding=1),
            nn.BatchNorm1d(out_planes),
            nn.ReLU(),
            nn.MaxPool1d(3, 2, 1)
        )

    def forward(self, x):
#         res_connect = self.residual(x)
        x = self.net(x)
        return x
#         return x + res_connect
    
    
class ResNetEncoder(nn.Module):
    def __init__(self, in_planes, out_planes, features_size, blocks_size_list, blocks_kernel_list, building_block):
        super().__init__()
        self.features_size = features_size
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.output_features_size = out_planes      
        self.model_type = 'ResNet'
        self.BuildingBlock = building_block

        blocks_stack = []
        for i in range(blocks_size_list[0]):
            blocks_stack.append(self.BuildingBlock(in_planes, in_planes, blocks_kernel_list[0], features_size, False)) 
        for size, kernel in zip(blocks_size_list[1:], blocks_kernel_list[1:]):
            features_size = int(features_size // 2)
            blocks_stack.append(self.BuildingBlock(in_planes, in_planes * 2, kernel, features_size, True))
            in_planes = in_planes * 2
            for i in range(size - 1):
                blocks_stack.append(self.BuildingBlock(in_planes, in_planes, kernel, features_size, False))   
                
        blocks_stack.append(nn.Conv1d(in_planes, out_planes, kernel_size=1, stride=1))
        self.net = nn.Sequential(*blocks_stack)
        
    def forward(self, x):
        x = self.net(x)
        return x.mean(axis = 2)
    
    
class SampleCNNEncoder(nn.Module):
    def __init__(self, in_planes, out_planes, blocks_num):
        super().__init__()
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.output_features_size = out_planes
        self.blocks_num = blocks_num
        self.model_type = 'SampleCNN'
        
        blocks_stack = []
        blocks_stack.append(CLRMBlock(in_planes, out_planes, 3))
        for i in range(blocks_num - 1):
            blocks_stack.append(CLRMBlock(out_planes, out_planes, 3))
            
        self.net = nn.Sequential(*blocks_stack)
        
    def forward(self, x):
        x = self.net(x)
        return x.mean(axis = 2)

In [894]:
#!g1.1
class LSTMEncoder(nn.Module):
    def __init__(self, in_dim, hidden_dim, dropout):
        super().__init__()
        self.model_type = 'GRU'
        self.in_dim = in_dim
        self.dropout = dropout
        self.hidden_dim = hidden_dim
        self.output_features_size = hidden_dim 
        
#         self.downsampler = nn.Sequential(
#             nn.Conv1d(in_dim, in_dim, kernel_size=3, padding=1, stride=2),
#             nn.Dropout(p=self.dropout),
#             nn.Conv1d(in_dim, in_dim, kernel_size=3, padding=1, stride=2),
#             nn.Dropout(p=self.dropout)
#         )
        self.downsampler = nn.Sequential(
            nn.Conv1d(in_dim, in_dim, kernel_size=3, padding=1),
            nn.MaxPool1d(2, 2),
            nn.Conv1d(in_dim, in_dim, kernel_size=3, padding=1),
            nn.MaxPool1d(2, 2),
            nn.Conv1d(in_dim, in_dim, kernel_size=3, padding=1),
            nn.MaxPool1d(2, 2),
            nn.Dropout(p=self.dropout)
        )
        
#         self.bgru1 = nn.GRU(in_dim, hidden_dim // 2, batch_first=True, bidirectional=True)
#         self.bgru2 = nn.GRU(hidden_dim, hidden_dim // 2, batch_first=True, bidirectional=True)
#         self.out_gru = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        
#         self.bgru1 = nn.GRU(in_dim, hidden_dim, batch_first=True, bidirectional=True)
#         self.out_gru = nn.GRU(hidden_dim * 2, hidden_dim, batch_first=True)   
        
        self.bgru1 = nn.GRU(in_dim, hidden_dim, batch_first=True, num_layers=3)
        self.out_gru = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        
        
    def forward(self, x):
        x = self.downsampler(x)
        x = torch.transpose(x, 1, 2)
        out, _ = self.bgru1(x)
#         out, _ = self.bgru2(out)
        _, out = self.out_gru(out)
        return out.squeeze()

#### warmup optimizer my  😊

In [804]:
#!g1.1

class ScheduledOptim():
    '''A simple wrapper class for learning rate scheduling'''

    def __init__(self, optimizer, lr_mul, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.lr_mul = lr_mul
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0


    def step(self):
        "Step with the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()


    def zero_grad(self):
        "Zero out the gradients with the inner optimizer"
        self._optimizer.zero_grad()


    def _get_lr_scale(self):
        d_model = self.d_model
        n_steps, n_warmup_steps = self.n_steps, self.n_warmup_steps
        return (d_model ** -0.5) * min(n_steps ** (-0.5), n_steps * n_warmup_steps ** (-1.5))


    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''

        self.n_steps += 1
        lr = self.lr_mul * self._get_lr_scale()

        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr

#### SymCLR definition ✅ (produced by yandex)

In [895]:
#!g1.1

class SimCLR(nn.Module):
    def __init__(self, encoder, projection_dim):
        super().__init__()
        self.framework_type = "sim_clr"
        self.encoder = encoder
        self.n_features = encoder.output_features_size
        self.projection_dim = projection_dim
        self.projector = nn.Sequential(
            nn.Linear(self.n_features, self.n_features, bias=False),
            nn.ReLU(),
            nn.Linear(self.n_features, self.projection_dim, bias=False),
        )
        
        self.repr_net = nn.Sequential(
            nn.Linear(self.n_features, 2000, bias=False),
            nn.ReLU(),
            nn.Linear(2000, self.n_features, bias=False),
        )
        
    def forward(self, x_i, x_j):
        h_i = self.encoder(x_i)
        h_j = self.encoder(x_j)

        z_i = self.projector(h_i)
        z_j = self.projector(h_j)
        return h_i, h_j, z_i, z_j
    
    def get_repr(self, h):
        return self.repr_net(h)

def inference(model, loader, standartizer):
    embeds = dict()
    for tracks_features, tracks_ids in loader:
        tracks_ids = tracks_ids.cpu().numpy().reshape(-1).tolist()
        tracks_features = tracks_features.to('cuda')
        if standartizer is not None:
            tracks_features = standartizer.transform(tracks_features)
        with torch.no_grad():
            tracks_embeds = model(tracks_features)
            for track_id, track_embed in zip(tracks_ids, tracks_embeds):
                embeds[track_id] = track_embed.cpu().numpy()
    return embeds

In [896]:
#!g1.1
class EncoderClassificator(nn.Module):
    def __init__(self, encoder, projection_dim):
        super().__init__()
        self.framework_type = "classifier"
        self.encoder = encoder
        self.n_features = encoder.output_features_size
        self.projection_dim = projection_dim
        self.projector = nn.Sequential(
            nn.Linear(self.n_features, self.n_features, bias=False),
            nn.ReLU(),
            nn.Linear(self.n_features, self.projection_dim, bias=False),
        )
        
    def forward(self, x):
        x = self.encoder(x)
        return self.projector(x)

In [897]:
#!g1.1
class Standartizer():
    def __init__(self, device='cuda:0'):
        self.mean = torch.load("dataset_mean.pt").float().to(device)
        self.std = torch.load("dataset_std.pt").float().to(device)
    
    def transform(self, x):
        x = torch.transpose(x, -2, -1)
        x = (x - self.mean) / self.std
        return torch.transpose(x, -2, -1)

    def transform_back(self, x):
        x * (self.std / self.num_batches) + (self.mean / self.num_batches)

#### DataLoading ✅ (produced by yandex)

In [None]:
#!g1.1

base_dir = "/home/jupyter/mnt/datasets/yandex_cup_dataset/"
results_dir = "results_transformers/" + datetime.now().strftime('%Y-%m-%d %H:%M:%S')
os.makedirs(results_dir, exist_ok=True)

# Seed
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

TRAINSET_DIRNAME = 'train_features'
TESTSET_DIRNAME = 'test_features'
TRAINSET_META_FILENAME = 'train_meta.tsv'
TESTSET_META_FILENAME = 'test_meta.tsv'
SUBMISSION_FILENAME = 'submission.txt'
MODEL_FILENAME = 'model.pt'
CHECKPOINT_FILENAME = 'best.pt'
device = 'cuda'

BATCH_SIZE = 512
N_CHANNELS = 1024

PROJECTION_DIM = 256
# PROJECTION_DIM = 18468   # train_meta_info.artistid.max()
NUM_EPOCHS = 50
LR = 1e-4
TEMPERATURE = 0.1
WARMUP_STEPS = 2000

TRAINSET_PATH = os.path.join(base_dir, TRAINSET_DIRNAME)
TESTSET_PATH = os.path.join(base_dir, TESTSET_DIRNAME)
TRAINSET_META_PATH = os.path.join(base_dir, TRAINSET_META_FILENAME)
TESTSET_META_PATH = os.path.join('dataset/', TESTSET_META_FILENAME)   # не загрузил 1 файл в датасет, надо дозагрузить
SUBMISSION_PATH = os.path.join(results_dir, SUBMISSION_FILENAME)
MODEL_PATH = os.path.join(results_dir, MODEL_FILENAME)
CHECKPOINT_PATH = os.path.join(results_dir, CHECKPOINT_FILENAME)

In [None]:
#!g1.1

model = SimCLR(
#     encoder =  TransformerEncoder(d_model= N_CHANNELS, nlayers = 6, nhead=8, dropout=0.2),
    encoder = SampleCNNEncoder(512, N_CHANNELS, 4),
#     encoder = BasicNet(N_CHANNELS),
#     encoder = ResNetEncoder(512, N_CHANNELS, 60, [2, 2, 3], [5, 3, 3], BottleNeck),
#     encoder = LSTMEncoder(512, N_CHANNELS, dropout=0.2),
    projection_dim = PROJECTION_DIM
).to(device)
with open(os.path.join(results_dir, "model_architecture.txt"), "w") as model_file:
    model_file.write(repr(model))

train_meta_info = pd.read_csv(TRAINSET_META_PATH, sep='\t')
_, validation_train_meta_info = train_val_split(train_meta_info, val_size=0.1)
train_meta_info = pd.read_csv(TRAINSET_META_PATH, sep='\t')
test_meta_info = pd.read_csv(TESTSET_META_PATH, sep='\t')
train_meta_info, validation_meta_info = train_val_split(train_meta_info, val_size=0.1)

train_dataset = SimCLR_TrainMusicDataset(TRAINSET_PATH, train_meta_info, device)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(TestMusicDataset(TRAINSET_PATH, validation_meta_info, device), batch_size=BATCH_SIZE, num_workers=8)
val_train_loader = torch.utils.data.DataLoader(TestMusicDataset(TRAINSET_PATH, validation_train_meta_info, device), batch_size=BATCH_SIZE, num_workers=8)
test_loader = torch.utils.data.DataLoader(TestMusicDataset(TESTSET_PATH, test_meta_info, device), batch_size=BATCH_SIZE, num_workers=8)

print("Loaded data")
print("Train set size: {}".format(len(train_meta_info)))
print("Validation set size: {}".format(len(validation_meta_info)))
print("Test set size: {}".format(len(test_meta_info)))
print(model)

In [None]:
#!g1.1   
model.load_state_dict(torch.load("results_transformers/gru_best/best.pt"))
# model.encoder.load_state_dict(torch.load("results_transformers/verybig/best.pt"))
# model.encoder.load_state_dict(torch.load(CHECKPOINT_PATH))

In [879]:
#!g1.1
optimizer = None
scheduler = None
print(model.framework_type, "  ", model.encoder.model_type)
if model.framework_type == "sim_clr":
    if model.encoder.model_type == 'Transformer':
#         optimizer = torch.optim.Adam(model.parameters(), lr=LR)
        optimizer = ScheduledOptim(
            torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09), 
            1, 
            N_CHANNELS, 
            WARMUP_STEPS
        )
    elif model.encoder.model_type in ['ResNet', 'SampleCNN'] :
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=0.0001)
        optimizer = LARS(optimizer=optimizer, eps=1e-8, trust_coef=0.001)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=0.0001)
elif model.framework_type == "classifier":
    if model.encoder.model_type == 'ResNet':
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=0.0001)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
            factor=0.1, patience=10, threshold=0.001, threshold_mode='abs')
    elif model.encoder.model_type == 'Transformer':
        optimizer = ScheduledOptim(
            torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09), 
            1, 
            N_CHANNELS, 
            WARMUP_STEPS
        )

sim_clr    ResNet


In [822]:
#!g1.1

# sanity check
standartizer = Standartizer()
from itertools import islice
with torch.no_grad():
    for item, label in islice(train_loader, 1):
        print(item.shape)
        item = item.to(device)
        print(standartizer.transform(item[:,0,:,:]).shape)
        print("mean: ", torch.transpose(standartizer.transform(item[:,0,:,:]), -1, -2).reshape(-1, 512).mean(dim=0))
        print("std: ", torch.transpose(standartizer.transform(item[:,0,:,:]), -1, -2).reshape(-1, 512).std(dim=0))
        item = item.to(device)
        print(model.encoder(item[:,0,:,:]).shape)

torch.Size([512, 2, 512, 60])
torch.Size([512, 512, 60])
mean:  tensor([-2.3566e-02, -3.8010e-02, -2.1055e-02, -4.3218e-02,  1.5220e-03,
        -8.9258e-04, -3.0401e-02, -4.0252e-02,  7.6105e-03,  1.4333e-02,
        -2.7575e-02, -2.3669e-02, -1.5745e-02, -1.9528e-02, -3.9606e-02,
         3.2809e-02,  3.6028e-02, -2.1577e-02, -3.6904e-02, -4.7996e-02,
         8.2398e-03,  1.8565e-02, -1.9955e-02, -4.0573e-02, -4.0094e-02,
         1.5161e-02,  1.4935e-02, -5.6856e-03, -5.3570e-04,  1.0829e-02,
         2.2429e-03,  9.1977e-02,  1.1174e-02, -1.1680e-02, -1.3353e-02,
        -4.7472e-02, -7.0498e-03, -2.9419e-02, -6.2780e-03,  1.0440e-02,
        -6.0663e-03, -1.3389e-02, -2.3208e-02,  1.7112e-02,  1.6010e-02,
        -2.9152e-02, -9.8055e-03,  6.8727e-03, -1.0013e-02, -4.2652e-03,
        -1.5899e-02, -1.0839e-02, -9.5819e-03, -2.9699e-02, -1.6217e-02,
        -9.5440e-03, -6.5245e-02, -1.1891e-02, -2.9378e-02, -5.2754e-03,
        -3.0358e-02, -3.3414e-02,  6.2473e-03, -3.1717e-03, 

#### Train and save subbmission ✅ (produced by yandex)

In [855]:
#!g1.1
def train(module, standartizer, train_loader, train_dataset, val_loader, valset_meta, optimizer, scheduler, criterion, criterion_repr, num_epochs, checkpoint_path, device, top_size = 100):
    # need to get not deviced tensors because of multiprocessing
    max_ndcg = None
    counter_max_ndcg = 0
    saturate = False  

    for epoch in range(num_epochs):
        if counter_max_ndcg > 2:  # initiate clustering loss
            saturate = True
            
        pbar = tqdm(enumerate(train_loader))
        avg_rang_sum = 0
        loss_sum = 0
        train_dataset.reshuffle()
        for batch_num, batch in pbar:
            optimizer.zero_grad()
            module.train()
            inputs, labels = batch[0].to(device), batch[1].to(device)
            
            # if sim_clr
            
            x_i, x_j = inputs[:, 0, :, :], inputs[:, 1, :, :]
            if standartizer is not None:
                x_i, x_j = standartizer.transform(x_i), standartizer.transform(x_j)
                
            h_i, h_j, z_i, z_j = module(x_i, x_j)
            loss, avg_rank = criterion(z_i, z_j)
            if saturate:
                h = torch.cat((h_i, h_j))
                loss = loss + 0.2 * criterion_repr(h, module.get_repr(h))
            avg_rang_sum += avg_rank            
                
            loss_sum += loss.item()
            loss.backward()
            optimizer.step()
#             scheduler.step(loss)
            
            pbar.set_postfix({"Epoch": epoch+1, "mean loss": loss_sum / (batch_num + 1), "mean avg_rank" :avg_rang_sum / (batch_num + 1)})
#             pbar.set_postfix({"Epoch": epoch+1, "mean loss": loss_sum / (batch_num + 1)})

        with torch.no_grad():
            model_encoder = module.encoder
            embeds_encoder = inference(model_encoder, val_loader, standartizer)
            ranked_list_encoder = get_ranked_list(embeds_encoder, top_size)
            val_ndcg_encoder = eval_submission(ranked_list_encoder, valset_meta)
            
            model_projector = nn.Sequential(module.encoder, module.projector)
            embeds_projector = inference(model_projector, val_loader, standartizer)
            ranked_list_projector = get_ranked_list(embeds_projector, top_size)
            val_ndcg_projector = eval_submission(ranked_list_projector, valset_meta)
            
            print("Validation nDCG on epoch {}".format(epoch))
            print("Encoder - {}".format(val_ndcg_encoder))
            print("Projector - {}".format(val_ndcg_projector))
            if (max_ndcg is None) or (val_ndcg_encoder > max_ndcg):
                max_ndcg = val_ndcg_encoder
                counter_max_ndcg = 0
                torch.save(model_encoder.state_dict(), checkpoint_path)
            else:
                counter_max_ndcg += 1

def save_submission(submission, submission_path):
    with open(submission_path, 'w') as f:
        for query_trackid, result in submission.items():
            f.write("{}\t{}\n".format(query_trackid, " ".join(map(str, result))))

#### Main training-eval cycle ✅ (produced by yandex)

В клеточке ниже я считаю при помощи алгоритма вилфорда среднее и стандартное отклонение по всему датасету

In [693]:
#!g1.1
# calculate mean variance

# from welford import Welford
# normalizer = Welford()

# for data, _ in tqdm(train_loader):
#     data = data.reshape(-1, 512, 60)
#     normalizer.add_all(torch.transpose(data, 1, 2).reshape((-1, 512)).numpy())

100%|██████████| 146/146 [01:39<00:00,  1.47it/s]


Главный тренировочный цикл

In [880]:
#!g1.1

print("Train")
train(
    module = model,
    standartizer = Standartizer(), #None
    train_loader = train_loader,
    train_dataset = train_dataset,
    val_loader = val_loader,
    valset_meta = validation_meta_info,
    optimizer = optimizer,
    scheduler=scheduler,
    criterion = NT_Xent(temperature = TEMPERATURE),
#     criterion = nn.CrossEntropyLoss(),
    criterion_repr = StudentSimilarityLoss(alpha=2 * N_CHANNELS),
    num_epochs = NUM_EPOCHS,
    checkpoint_path = CHECKPOINT_PATH,
    device = device
)

Train
Validation nDCG on epoch 0
Encoder - 0.11931760187880033
Projector - 0.13193957451477736
Validation nDCG on epoch 1
Encoder - 0.15061952017905908
Projector - 0.16771937706158294
Validation nDCG on epoch 2
Encoder - 0.1705590639685832
Projector - 0.19123150562784486
Validation nDCG on epoch 3
Encoder - 0.18295250966770032
Projector - 0.2047412867890922
Validation nDCG on epoch 4
Encoder - 0.19192885484813096
Projector - 0.21228116820168416
Validation nDCG on epoch 5
Encoder - 0.19748388124764607
Projector - 0.2165337856771514
Validation nDCG on epoch 6
Encoder - 0.20002003092284718
Projector - 0.21630448630335758
Validation nDCG on epoch 7
Encoder - 0.20076328669412055
Projector - 0.21316432089122744
Validation nDCG on epoch 8
Encoder - 0.20152217422708524
Projector - 0.20688233401333048
Validation nDCG on epoch 9
Encoder - 0.19966772875338995
Projector - 0.20134903673692034
Validation nDCG on epoch 10
Encoder - 0.1989789799517578
Projector - 0.19406709183532758
Validation nDCG on

137it [00:54,  2.51it/s, Epoch=3, mean loss=4.81, mean avg_rank=76.6]
100%|██████████| 16680/16680 [00:02<00:00, 5853.53it/s]
100%|██████████| 16680/16680 [00:02<00:00, 5857.13it/s]
137it [00:54,  2.52it/s, Epoch=4, mean loss=4.6, mean avg_rank=66.5]
100%|██████████| 16680/16680 [00:02<00:00, 5880.71it/s]
100%|██████████| 16680/16680 [00:02<00:00, 5762.27it/s]
137it [00:54,  2.52it/s, Epoch=5, mean loss=4.41, mean avg_rank=57.8]
100%|██████████| 16680/16680 [00:02<00:00, 5893.93it/s]
100%|██████████| 16680/16680 [00:02<00:00, 5890.44it/s]
137it [00:54,  2.51it/s, Epoch=6, mean loss=4.27, mean avg_rank=51.4]
100%|██████████| 16680/16680 [00:02<00:00, 5899.56it/s]
100%|██████████| 16680/16680 [00:02<00:00, 5826.72it/s]
137it [00:54,  2.52it/s, Epoch=7, mean loss=4.11, mean avg_rank=44.8]
100%|██████████| 16680/16680 [00:02<00:00, 5889.09it/s]
100%|██████████| 16680/16680 [00:02<00:00, 5813.28it/s]
137it [00:54,  2.51it/s, Epoch=8, mean loss=3.96, mean avg_rank=38.8]
100%|██████████| 1668

KeyboardInterrupt: 

In [881]:
#!g1.1
torch.save(optimizer.state_dict(), results_dir + '/optimizer.pt')
torch.save(model.encoder.state_dict(), MODEL_PATH)

#### Generating submission

In [790]:
#!g1.1

# model.encoder = LSTMEncoder(512, 1024)
# model.encoder.load_state_dict(torch.load("results_transformers/2022-11-03 10:19:54/best.pt"))
# model.encoder.to(device)
print("Submission")
test_loader = test_loader
model_for_inference = model.encoder
embeds = inference(model_for_inference, test_loader, Standartizer())
submission = get_ranked_list(embeds, 100)
save_submission(submission, SUBMISSION_PATH)
# save_submission(submission, "results_transformers/2022-11-03 10:19:54/submission.txt")

Submission


In [None]:
#!g1.1
