In [1]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
from sklearn.model_selection import StratifiedGroupKFold
#%env TOKENIZERS_PARALLELISM=true
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Allocator / tokenizer behaviour is configured through environment variables.
# NOTE(review): both are set after `torch` and `tokenizers` were imported —
# confirm they are still picked up at this point.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:21"
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    """Static configuration shared by data preparation, model and training code."""
    # Print a progress line every `print_freq` steps in train_fn / valid_fn.
    print_freq = 1000
    # Worker processes used by both DataLoaders.
    num_workers = 24
    # Path of the checkpoint passed to AutoTokenizer / AutoConfig / AutoModel.
    model = 'model/stage-1-paraphrase-multilingual-mpnet-base-v2-epochs-40-seq-50-tuned'
    # Loaded once, when this class body is executed (i.e. at cell run time).
    tokenizer = AutoTokenizer.from_pretrained(model)
    # When True, custom_model enables gradient checkpointing on the backbone.
    gradient_checkpointing = False
    # Cosine schedule parameters (see get_cosine_schedule_with_warmup).
    num_cycles = 0.5
    warmup_ratio = 0.1
    epochs = 7
    # Learning rate for the backbone (`model.model`) parameters.
    encoder_lr = 1e-5
    # Learning rate for the remaining parameters (the linear head).
    decoder_lr = 1e-4
    # AdamW epsilon and betas.
    eps = 1e-6
    betas = (0.9, 0.999)
    batch_size = 32
    # Weight decay applied to backbone weights that are not bias / LayerNorm.
    weight_decay = 0.01
    # Max norm handed to clip_grad_norm_ in train_fn.
    # NOTE(review): the logged gradient norms are in the 1-14 range, so this
    # threshold clips every step very heavily — confirm this is intended.
    max_grad_norm = 0.012
    #max_len = 512
    # Sequences are padded / truncated to this many tokens in prepare_input.
    max_len = 128
    # Not read by any code shown here; presumably matches the number of
    # distinct values in the `fold` column — TODO confirm.
    n_folds = 5
    # Seed used by seed_everything and embedded in checkpoint file names.
    seed = 42
    
# =========================================================================================
# Seed everything for deterministic results
# =========================================================================================
def seed_everything(cfg):
    """Seed Python, NumPy and PyTorch RNGs from ``cfg.seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        cfg: Any object exposing an integer ``seed`` attribute.
    """
    seed = cfg.seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Every seeding entry point takes the same single integer argument.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
# =========================================================================================
# F2 score metric
# =========================================================================================
def f2_score(y_true, y_pred):
    """Return the mean row-wise F2 score, rounded to 4 decimals.

    Each element of both inputs is a whitespace-separated string of ids; the
    score of a row is computed on the two resulting id sets as
    ``tp / (tp + 0.2 * fp + 0.8 * fn)``.

    Args:
        y_true: pandas Series of ground-truth id strings.
        y_pred: pandas Series of predicted id strings, aligned by position
            with ``y_true``.

    Returns:
        The mean F2 over all rows, rounded to 4 decimals. A row where both
        sets are empty contributes 0.0 (instead of NaN), and an empty input
        yields 0.0, so the result is always comparable with ``>``.
    """
    true_sets = y_true.apply(lambda x: set(x.split()))
    pred_sets = y_pred.apply(lambda x: set(x.split()))
    pairs = list(zip(true_sets, pred_sets))
    if not pairs:
        return 0.0
    tp = np.array([len(truth & pred) for truth, pred in pairs], dtype=float)
    fp = np.array([len(pred - truth) for truth, pred in pairs], dtype=float)
    fn = np.array([len(truth - pred) for truth, pred in pairs], dtype=float)
    denominator = tp + 0.2 * fp + 0.8 * fn
    # The denominator is zero only when both sets are empty; leave those rows
    # at 0.0 rather than letting 0/0 turn the whole mean into NaN.
    f2 = np.divide(tp, denominator, out=np.zeros_like(denominator), where=denominator > 0)
    return round(f2.mean(), 4)

# =========================================================================================
# Data Loading
# =========================================================================================
def read_data(cfg):
    """Load the candidate pairs and the fold/correlation table from disk.

    Reads ``data/candidates_25.parquet`` and ``data/correlation_folds.csv``,
    replaces missing titles with ``"no title"`` and builds the ``text``
    column as ``title1 + '[SEP]' + title2``.

    Args:
        cfg: Configuration object; accepted for signature symmetry with the
            other helpers and not read here.

    Returns:
        Tuple ``(train, correlations)`` of pandas DataFrames.
    """
    train = pd.read_parquet('data/candidates_25.parquet')
    # Assign the filled column back instead of `fillna(..., inplace=True)` on
    # a column selection: the in-place form is chained assignment, which is
    # deprecated and has no effect under pandas copy-on-write.
    for column in ('title1', 'title2'):
        train[column] = train[column].fillna("no title")

    correlations = pd.read_csv('data/correlation_folds.csv')

    # Create feature column
    # NOTE(review): '[SEP]' is inserted as literal text; confirm it is the
    # separator the loaded tokenizer actually treats as a special token.
    train['text'] = train['title1'] + '[SEP]' + train['title2']
    print(' ')
    print('-' * 50)
    print(f"train.shape: {train.shape}")
    print(f"correlations.shape: {correlations.shape}")
    return train, correlations


# =========================================================================================
# Get max length
# =========================================================================================
def get_max_length(train, cfg):
    """Set ``cfg.max_len`` to the longest tokenized ``text`` plus two.

    Every value of ``train['text']`` (missing values treated as "") is
    tokenized without special tokens; the maximum token count plus 2 (for
    cls & sep) is stored on ``cfg`` and printed.

    Args:
        train: DataFrame with a ``text`` column.
        cfg: Configuration object providing ``tokenizer``; its ``max_len``
            attribute is overwritten.
    """
    texts = train['text'].fillna("").values
    token_counts = [
        len(cfg.tokenizer(sample, add_special_tokens = False)['input_ids'])
        for sample in tqdm(texts, total = len(train))
    ]
    cfg.max_len = max(token_counts) + 2 # cls & sep
    print(f"max_len: {cfg.max_len}")

# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_input(text, cfg):
    """Tokenize one text into fixed-length ``torch.long`` tensors.

    The text is truncated and padded to exactly ``cfg.max_len`` tokens
    (special tokens included), and every field returned by the tokenizer is
    converted to a 1-D ``torch.long`` tensor.

    Args:
        text: The string to encode.
        cfg: Configuration object providing ``tokenizer`` and ``max_len``.

    Returns:
        The tokenizer's output mapping with each value replaced by a tensor.
    """
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
    # and the tokenizer is called directly (as in get_max_length) instead of
    # through `encode_plus`.
    inputs = cfg.tokenizer(
        text,
        return_tensors = None,
        add_special_tokens = True,
        max_length = cfg.max_len,
        padding = 'max_length',
        truncation = True
    )
    # Values are replaced in place so the tokenizer's mapping type is kept.
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs

# =========================================================================================
# Custom dataset
# =========================================================================================
class custom_dataset(Dataset):
    """Dataset yielding ``(tokenized text, float label)`` pairs.

    Texts come from the ``text`` column and labels from the ``target``
    column of the given DataFrame; tokenization happens lazily per item.
    """
    def __init__(self, df, cfg):
        self.cfg = cfg
        self.texts, self.labels = df['text'].values, df['target'].values
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, item):
        # Tokenize on access so only the raw strings are held in memory.
        encoded = prepare_input(self.texts[item], self.cfg)
        return encoded, torch.tensor(self.labels[item], dtype = torch.float)
    
# =========================================================================================
# Collate function for training
# =========================================================================================
def collate(inputs):
    """Trim a padded batch to its longest real sequence.

    The largest per-row sum of ``attention_mask`` gives the number of columns
    to keep; every entry of ``inputs`` is sliced to that width in place.

    Args:
        inputs: Mapping of 2-D batch tensors that includes ``attention_mask``.

    Returns:
        The same mapping object, with every value truncated along dim 1.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

# =========================================================================================
# Mean pooling class
# =========================================================================================
class MeanPooling(nn.Module):
    """Average token embeddings over the positions kept by the attention mask."""
    def __init__(self):
        super().__init__()
    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so masked positions
        # contribute zero to the sum.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_total = (last_hidden_state * weights).sum(1)
        # Clamp the count so a fully masked row divides by 1e-9, not by zero.
        token_count = weights.sum(1).clamp(min=1e-9)
        return token_total / token_count
    
# =========================================================================================
# Model
# =========================================================================================
class custom_model(nn.Module):
    """Pretrained transformer backbone + mean pooling + single-logit linear head."""
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        config = AutoConfig.from_pretrained(cfg.model, output_hidden_states = True)
        # Switch off every dropout knob on the backbone configuration.
        for dropout_field in ('hidden_dropout',
                              'hidden_dropout_prob',
                              'attention_dropout',
                              'attention_probs_dropout_prob'):
            setattr(config, dropout_field, 0.0)
        self.config = config
        self.model = AutoModel.from_pretrained(cfg.model, config = self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
    def _init_weights(self, module):
        """Re-initialise a Linear, Embedding or LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    def feature(self, inputs):
        """Return the mask-aware mean of the backbone's last hidden state."""
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])
    def forward(self, inputs):
        """Return the head's output for the pooled features (one value per row)."""
        return self.fc(self.feature(inputs))
    
# =========================================================================================
# Helper functions
# =========================================================================================
class AverageMeter:
    """Track the latest value and the running weighted mean of a metric."""
    def __init__(self):
        self.reset()
    def reset(self):
        """Clear the latest value, the sum, the count and the average."""
        self.val = self.avg = self.sum = self.count = 0
    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running mean."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def timeSince(since, percent):
    """Describe elapsed and estimated remaining time.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work already done; must be non-zero.

    Returns:
        A string of the form ``'<elapsed> (remain <remaining>)'``.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration linearly from the completed fraction.
    remaining = elapsed / (percent) - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

# =========================================================================================
# Train function loop
# =========================================================================================
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg):
    """Run one training epoch with mixed precision and gradient clipping.

    Args:
        train_loader: Iterable of ``(inputs, target)`` batches with ``len()``.
        model: Module called as ``model(inputs)``.
        criterion: Loss applied to the flattened predictions and ``target``.
        optimizer: Optimizer stepped once per batch through the grad scaler.
        epoch: Zero-based epoch index, used only for logging.
        scheduler: LR scheduler stepped once per batch.
        device: Device the batch tensors are moved to.
        cfg: Configuration providing ``max_grad_norm`` and ``print_freq``.

    Returns:
        The sample-weighted average training loss over the epoch.
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled = True)
    losses = AverageMeter()
    start = time.time()
    num_steps = len(train_loader)
    for step, (inputs, target) in enumerate(train_loader):
        # Drop the padding columns no sequence in this batch uses.
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        target = target.to(device)
        batch_size = target.size(0)
        with torch.cuda.amp.autocast(enabled = True):
            y_preds = model(inputs)
            loss = criterion(y_preds.view(-1), target)
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        # Unscale first so the clipping threshold applies to the true gradients.
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()
        if step % cfg.print_freq == 0 or step == (num_steps - 1):
            # get_last_lr() is the supported accessor; calling get_lr()
            # outside of step() triggers a scheduler warning.
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, 
                          step, 
                          num_steps, 
                          remain = timeSince(start, float(step + 1) / num_steps),
                          loss = losses,
                          grad_norm = grad_norm,
                          lr = scheduler.get_last_lr()[0]))
    return losses.avg

# =========================================================================================
# Valid function loop
# =========================================================================================
def valid_fn(valid_loader, model, criterion, device, cfg):
    """Evaluate the model on a loader and collect sigmoid probabilities.

    Args:
        valid_loader: Iterable of ``(inputs, target)`` batches with ``len()``.
        model: Module called as ``model(inputs)``.
        criterion: Loss applied to the flattened predictions and ``target``.
        device: Device the batch tensors are moved to.
        cfg: Configuration providing ``print_freq``.

    Returns:
        Tuple ``(average loss, predictions)`` where ``predictions`` is a 1-D
        NumPy array of probabilities in loader order.
    """
    model.eval()
    meter = AverageMeter()
    collected = []
    t0 = time.time()
    for step, (inputs, target) in enumerate(valid_loader):
        inputs = collate(inputs)
        for key in list(inputs.keys()):
            inputs[key] = inputs[key].to(device)
        target = target.to(device)
        with torch.no_grad():
            logits = model(inputs)
        loss = criterion(logits.view(-1), target)
        meter.update(loss.item(), target.size(0))
        # reshape(-1) keeps a 1-D array even when the batch holds one sample.
        probs = logits.sigmoid().squeeze().to('cpu').numpy().reshape(-1)
        collected.append(probs)
        n_batches = len(valid_loader)
        is_last = step == (n_batches - 1)
        if step % cfg.print_freq == 0 or is_last:
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, 
                          n_batches,
                          loss = meter,
                          remain = timeSince(t0, float(step + 1) / n_batches)))
    return meter.avg, np.concatenate(collected, axis = 0)

# =========================================================================================
# Get best threshold
# =========================================================================================
def get_best_threshold(x_val, val_predictions, correlations):
    """Grid-search the probability threshold that maximises the F2 score.

    For each threshold in ``np.arange(0.0001, 0.99, 0.001)`` the candidate
    pairs whose probability exceeds it are grouped per topic, topics without
    any selected pair get an empty prediction, and the result is scored
    against ``correlations`` with ``f2_score``.

    Args:
        x_val: DataFrame of candidate pairs with ``topics_ids`` and
            ``content_ids`` columns. It is not modified.
        val_predictions: Probabilities, one per row of ``x_val`` in the same
            (positional) order.
        correlations: DataFrame with ``topic_id`` and ground-truth
            ``content_ids`` columns.

    Returns:
        Tuple ``(best_score, best_threshold)``. ``best_threshold`` is ``None``
        (and ``best_score`` 0) when no threshold scores above zero.
    """
    best_score = 0
    best_threshold = None
    # Loop-invariant pieces are built once; selecting rows with a boolean
    # mask avoids writing a scratch column into the caller's frame.
    probabilities = np.asarray(val_predictions)
    pairs = x_val[['topics_ids', 'content_ids']]
    all_topics = pd.Series(x_val['topics_ids'].unique())
    for thres in np.arange(0.0001, 0.99, 0.001):
        selected = pairs[probabilities > thres]
        predicted = selected.groupby(['topics_ids'])['content_ids'].unique().reset_index()
        predicted['content_ids'] = predicted['content_ids'].apply(lambda x: ' '.join(x))
        predicted.columns = ['topic_id', 'predictions']
        # Topics with no pair above the threshold still count, with an
        # empty prediction string.
        missing = all_topics[~all_topics.isin(predicted['topic_id'])]
        unpredicted = pd.DataFrame({'topic_id': missing.values, 'predictions': ""})
        scored = pd.concat([predicted, unpredicted], axis = 0, ignore_index = True)
        # NOTE(review): a topic absent from `correlations` would get a NaN
        # ground truth after this left merge — confirm every topic is covered.
        scored = scored.merge(correlations, how = 'left', on = 'topic_id')
        score = f2_score(scored['content_ids'], scored['predictions'])
        if score > best_score:
            best_score = score
            best_threshold = thres
    return best_score, best_threshold
    
# =========================================================================================
# Train & Evaluate
# =========================================================================================
def train_and_evaluate_one_fold(train, correlations, fold, cfg):
    """Fine-tune the model with `fold` held out and validate after each epoch.

    Rows whose ``fold`` column differs from ``fold`` are used for training,
    the remaining rows for validation. After every epoch the validation
    probabilities are thresholded for the best F2 score and a checkpoint
    (model weights plus those probabilities) is saved.

    Args:
        train: DataFrame with ``text``, ``target``, ``fold``, ``topics_ids``
            and ``content_ids`` columns.
        correlations: Ground-truth table passed to ``get_best_threshold``.
        fold: Value of the ``fold`` column to hold out for validation.
        cfg: Configuration object (see ``CFG``).

    Returns:
        None. Progress, scores and the final score/threshold are printed.
    """
    print(' ')
    print(f"========== fold: {fold} training ==========")
    # Split train & validation
    x_train = train[train['fold'] != fold]
    # Disabled experiments (negative down-sampling / topic sub-sampling):
    #pos = x_train[x_train.target==1]
    #neg = x_train[x_train.target==0].groupby('topics_ids').sample(frac=0.4)
    #x_train = pd.concat([pos,neg],axis=0).reset_index(drop=True)
    #x_train = x_train[x_train.topics_ids.isin(x_train.groupby(['topics_ids']).sample(1).sample(frac=0.5,random_state=1).topics_ids.unique())]
    x_val = train[train['fold'] == fold]
    train_dataset = custom_dataset(x_train, cfg)
    valid_dataset = custom_dataset(x_val, cfg)
    train_loader = DataLoader(
        train_dataset, 
        batch_size = cfg.batch_size, 
        shuffle = True, 
        num_workers = cfg.num_workers, 
        pin_memory = True, 
        drop_last = True
    )
    valid_loader = DataLoader(
        valid_dataset, 
        batch_size = cfg.batch_size, 
        shuffle = False, 
        num_workers = cfg.num_workers, 
        pin_memory = True, 
        drop_last = False
    )
    # Get model
    model = custom_model(cfg)
    model.to(device)
    # Optimizer
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay = 0.0):
        """Build AdamW parameter groups: decayed / non-decayed backbone, head."""
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': 0.0},
            # Everything outside the backbone (names without "model").
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
            'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters
    optimizer_parameters = get_optimizer_params(
        model, 
        encoder_lr = cfg.encoder_lr, 
        decoder_lr = cfg.decoder_lr,
        weight_decay = cfg.weight_decay
    )
    optimizer = AdamW(
        optimizer_parameters, 
        lr = cfg.encoder_lr, 
        eps = cfg.eps, 
        betas = cfg.betas
    )
    # The scheduler is stepped once per batch, and the loader drops the last
    # partial batch, so the horizon is counted from the loader itself.
    num_train_steps = len(train_loader) * cfg.epochs
    num_warmup_steps = int(num_train_steps * cfg.warmup_ratio)
    # Scheduler
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, 
        num_warmup_steps = num_warmup_steps, 
        num_training_steps = num_train_steps, 
        num_cycles = cfg.num_cycles
        )
    # Training & Validation loop
    criterion = nn.BCEWithLogitsLoss(reduction = "mean")
    last_score, last_threshold = None, None
    for epoch in range(cfg.epochs):
        start_time = time.time()
        # Train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg)
        # Validation
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device, cfg)
        # Compute f2_score
        score, threshold = get_best_threshold(x_val, predictions, correlations)
        elapsed = time.time() - start_time
        print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        # The threshold is None when no threshold scored above zero.
        threshold_text = 'n/a' if threshold is None else f'{threshold:.5f}'
        print(f'Epoch {epoch+1} - Score: {score:.4f} - Threshold: {threshold_text}')
        # A checkpoint is written after every epoch (the epoch index is part
        # of the file name), regardless of whether the score improved.
        print(f'Epoch {epoch+1} - Save: {score:.4f} Model')
        torch.save(
                {'model': model.state_dict(), 'predictions': predictions}, 
                f"{cfg.model.replace('/', '-')}_fold{fold}_{cfg.seed}_20_{epoch}.pth"
                )
        last_score, last_threshold = score, threshold
    torch.cuda.empty_cache()
    gc.collect()
    # Report the last epoch's score/threshold. The search is deterministic,
    # so the values computed inside the loop are reused instead of repeating
    # the whole threshold sweep on the same predictions.
    print(f'Our CV score is {last_score} using a threshold of {last_threshold}')
    

In [2]:
# Seed every RNG from CFG.seed before any data or model work.
seed_everything(CFG)
# Load the candidate pairs (`train`) and the ground-truth table (`correlations`).
train, correlations = read_data(CFG)

 
--------------------------------------------------
train.shape: (1537925, 7)
correlations.shape: (61517, 3)


In [3]:
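# Get max length (disabled: CFG.max_len keeps its configured value)
#get_max_length(train, CFG)
# NOTE(review): 2321 is presumably the max_len this call reported on an earlier run — confirm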

In [4]:
# Display the sequence length currently in effect.
CFG.max_len

128

In [5]:
# Train and evaluate fold 0 only; the captured output below ends in a
# KeyboardInterrupt raised during a validation pass.
# Stage 1 Max Recall: 8468
train_and_evaluate_one_fold(train, correlations, 0, CFG)

 
Epoch: [1][0/38447] Elapsed 0m 2s (remain 1877m 40s) Loss: 0.6818(0.6818) Grad: 1.0100  LR: 0.00000000  
Epoch: [1][1000/38447] Elapsed 2m 7s (remain 79m 15s) Loss: 0.4495(0.6082) Grad: 0.7898  LR: 0.00000037  
Epoch: [1][2000/38447] Elapsed 4m 12s (remain 76m 40s) Loss: 0.2687(0.5177) Grad: 0.9810  LR: 0.00000074  
Epoch: [1][3000/38447] Elapsed 6m 22s (remain 75m 19s) Loss: 0.3034(0.4859) Grad: 1.8262  LR: 0.00000112  
Epoch: [1][4000/38447] Elapsed 8m 40s (remain 74m 36s) Loss: 0.3209(0.4670) Grad: 1.5145  LR: 0.00000149  
Epoch: [1][5000/38447] Elapsed 10m 57s (remain 73m 16s) Loss: 0.3587(0.4566) Grad: 1.6685  LR: 0.00000186  
Epoch: [1][6000/38447] Elapsed 13m 20s (remain 72m 8s) Loss: 0.4350(0.4487) Grad: 1.9599  LR: 0.00000223  
Epoch: [1][7000/38447] Elapsed 15m 43s (remain 70m 37s) Loss: 0.2409(0.4406) Grad: 1.4282  LR: 0.00000260  
Epoch: [1][8000/38447] Elapsed 18m 6s (remain 68m 54s) Loss: 0.3646(0.4342) Grad: 3.0226  LR: 0.00000297  
Epoch: [1][9000/38447] Elapsed 20m 2

Epoch: [2][27000/38447] Elapsed 55m 42s (remain 23m 36s) Loss: 0.2144(0.2656) Grad: 3.8990  LR: 0.00000939  
Epoch: [2][28000/38447] Elapsed 57m 45s (remain 21m 32s) Loss: 0.2003(0.2650) Grad: 4.0993  LR: 0.00000936  
Epoch: [2][29000/38447] Elapsed 59m 49s (remain 19m 29s) Loss: 0.3444(0.2646) Grad: 4.5430  LR: 0.00000932  
Epoch: [2][30000/38447] Elapsed 61m 52s (remain 17m 25s) Loss: 0.0779(0.2640) Grad: 2.4535  LR: 0.00000929  
Epoch: [2][31000/38447] Elapsed 63m 55s (remain 15m 21s) Loss: 0.1139(0.2635) Grad: 2.4088  LR: 0.00000926  
Epoch: [2][32000/38447] Elapsed 65m 59s (remain 13m 17s) Loss: 0.3600(0.2630) Grad: 4.7007  LR: 0.00000922  
Epoch: [2][33000/38447] Elapsed 68m 2s (remain 11m 13s) Loss: 0.4316(0.2625) Grad: 6.1413  LR: 0.00000919  
Epoch: [2][34000/38447] Elapsed 70m 6s (remain 9m 10s) Loss: 0.2379(0.2620) Grad: 3.8547  LR: 0.00000915  
Epoch: [2][35000/38447] Elapsed 72m 9s (remain 7m 6s) Loss: 0.2124(0.2614) Grad: 5.0433  LR: 0.00000912  
Epoch: [2][36000/38447] E

Epoch: [4][6000/38447] Elapsed 16m 0s (remain 86m 31s) Loss: 0.0996(0.1616) Grad: 1.4623  LR: 0.00000670  
Epoch: [4][7000/38447] Elapsed 18m 39s (remain 83m 50s) Loss: 0.1591(0.1616) Grad: 5.0124  LR: 0.00000663  
Epoch: [4][8000/38447] Elapsed 21m 19s (remain 81m 10s) Loss: 0.1686(0.1621) Grad: 10.1068  LR: 0.00000657  
Epoch: [4][9000/38447] Elapsed 23m 59s (remain 78m 29s) Loss: 0.1427(0.1617) Grad: 7.6765  LR: 0.00000651  
Epoch: [4][10000/38447] Elapsed 26m 39s (remain 75m 49s) Loss: 0.1857(0.1621) Grad: 8.4563  LR: 0.00000645  
Epoch: [4][11000/38447] Elapsed 29m 19s (remain 73m 9s) Loss: 0.3753(0.1624) Grad: 14.0093  LR: 0.00000639  
Epoch: [4][12000/38447] Elapsed 31m 59s (remain 70m 29s) Loss: 0.2497(0.1625) Grad: 4.3934  LR: 0.00000633  
Epoch: [4][13000/38447] Elapsed 34m 39s (remain 67m 49s) Loss: 0.1068(0.1623) Grad: 6.2711  LR: 0.00000626  
Epoch: [4][14000/38447] Elapsed 37m 18s (remain 65m 9s) Loss: 0.0394(0.1625) Grad: 2.9870  LR: 0.00000620  
Epoch: [4][15000/38447] 

Epoch: [5][34000/38447] Elapsed 88m 36s (remain 11m 35s) Loss: 0.0844(0.1279) Grad: 5.0905  LR: 0.00000253  
Epoch: [5][35000/38447] Elapsed 90m 59s (remain 8m 57s) Loss: 0.3017(0.1279) Grad: 13.7292  LR: 0.00000248  
Epoch: [5][36000/38447] Elapsed 93m 14s (remain 6m 20s) Loss: 0.1122(0.1278) Grad: 4.6718  LR: 0.00000242  
Epoch: [5][37000/38447] Elapsed 95m 18s (remain 3m 43s) Loss: 0.0136(0.1277) Grad: 0.7361  LR: 0.00000237  
Epoch: [5][38000/38447] Elapsed 97m 38s (remain 1m 8s) Loss: 0.0185(0.1277) Grad: 1.2292  LR: 0.00000231  
Epoch: [5][38446/38447] Elapsed 98m 36s (remain 0m 0s) Loss: 0.0171(0.1276) Grad: 2.0235  LR: 0.00000229  
EVAL: [0/9613] Elapsed 0m 2s (remain 330m 31s) Loss: 2.5860(2.5860) 
EVAL: [1000/9613] Elapsed 0m 49s (remain 7m 5s) Loss: 0.2997(0.1786) 
EVAL: [2000/9613] Elapsed 1m 42s (remain 6m 30s) Loss: 0.2016(0.2212) 
EVAL: [3000/9613] Elapsed 2m 33s (remain 5m 38s) Loss: 0.0076(0.2298) 
EVAL: [4000/9613] Elapsed 3m 27s (remain 4m 51s) Loss: 0.1452(0.2446) 


KeyboardInterrupt: 