In [12]:
import torch
import torch.nn as nn

from transformers import AutoTokenizer, AutoModel

from transformers import get_cosine_schedule_with_warmup

from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from torch.optim import AdamW

import random

import numpy as np
import pandas as pd

import os

from sklearn.model_selection import KFold, StratifiedKFold

In [2]:
from train import train_epoch, eval_epoch
from model import MegaSiameseModel
from dataset import TopG_Dataset, topg_collate

In [3]:
def single_model(model, 
                     dataset, 
                     loss_function,
                     collate_fn,
                     device=torch.device("cuda"),
                     random_state: int=69, 
                     shuffle=True,
                     epochs: int=15, 
                     lr: float=1e-6,
                     batch_size: int=32,
                     start_epoch=0,
                     ):
    random.seed(random_state),
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)
    
    loss_function.to(device)
    model.to(device)
    optimizer = AdamW(
        model.parameters(),
#        lr = lr, # args.learning_rate - default is 5e-5, our notebook had 2e-5
    )
    data_loader = torch.utils.data.DataLoader(
                    dataset,
                    batch_size=batch_size,
                    shuffle=shuffle,
                     collate_fn=collate_fn
    )
    
    total_steps = len(data_loader) * epochs 

    scheduler = get_cosine_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

    for epoch_i in range(0, epochs):
        if epoch_i >= start_epoch:
            train_metrics = train_epoch(model, data_loader, loss_function, optimizer, scheduler, device)
            print("EPOCH", epoch_i)
            print(train_metrics)
            #eval_metrics = eval_epoch(model, eval_loader, loss_function, device)
            #print(eval_metrics)

In [4]:
DATA = "data/"
train = pd.read_csv(f"{DATA}train.csv")

In [5]:

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

#model = AutoModelForPreTraining.from_pretrained("cointegrated/rubert-tiny2")
bert1 = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
bert2 = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
model = MegaSiameseModel(bert1, bert2, 2)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.

In [6]:
dataset = TopG_Dataset(train, tokenizer, 200, 60)

In [15]:
def cross_validation(project_name,
                     model, 
                     dataset, 
                     loss_function,
                     collate_fn,
                     strat_array=None,
                     device=torch.device("cuda"),
                     random_state: int=69, 
                     shuffle: bool=True, 
                     n_folds: int=4, 
                     epochs: int=5, 
                     lr: float=1e-6,
                     start_fold: int=0, 
                     batch_size: int=32,
                     iters_to_accumulate=None,
                     n_accumulated_grads: int = 0):
    random.seed(random_state),
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)
    
    loss_function.to(device)
    if strat_array:
        kfold = StratifiedKFold(n_folds, shuffle=shuffle, random_state=random_state)
        split = kfold.split(dataset, strat_array)
    else: 
        kfold = KFold(n_folds, shuffle=shuffle, random_state=random_state)
        split = kfold.split(dataset)

    for fold, (train_ids, eval_ids) in enumerate(split):
        if fold >= start_fold:
            print(f'FOLD {fold}')
            print('--------------------------------')
            
            
            '''run = wandb.init(
                name=f"fold_{fold}",
                project=f"{project_name}_fold_{fold}",
                config={ 
                         "random_state": random_state, 
                         "shuffle": shuffle,
                         "epochs": epochs, 
                         "learning_rate": lr,
                         "batch_size": batch_size,
                         "iters_to_accumulate": iters_to_accumulate
                        }
            )'''

            optimizer = AdamW(
            model.parameters(),
            #lr = lr, # args.learning_rate - default is 5e-5, our notebook had 2e-5
        )

            train_subsampler = torch.utils.data.Subset(dataset,  train_ids)
            train_loader = torch.utils.data.DataLoader(
                          train_subsampler, 
                          batch_size=batch_size,
                          shuffle=shuffle,collate_fn=collate_fn,drop_last=True)

            eval_subsampler = torch.utils.data.Subset(dataset,  eval_ids)
            eval_loader = torch.utils.data.DataLoader(
                          eval_subsampler,
                          batch_size=batch_size,
                          shuffle=shuffle,collate_fn=collate_fn,drop_last=True)
            
            total_steps = len(train_loader) * epochs 

            scheduler = get_cosine_schedule_with_warmup(optimizer, 
                                                    num_warmup_steps = 0, # Default value in run_glue.py
                                                    num_training_steps = total_steps)

            mrrs = []

            for epoch_i in range(0, epochs):
                train_metrics = train_epoch(model, train_loader, loss_function, optimizer, scheduler, device)
                eval_metrics = eval_epoch(model, eval_loader, loss_function, device)
                
                print(f"EPOCH: {epoch_i}")
                print(train_metrics)
                print(eval_metrics)
                
                #run.log(train_metrics)
                #run.log(eval_metrics)
                            

In [None]:
cross_validation("A", model = model,
             dataset=dataset, 
             loss_function=nn.CrossEntropyLoss(), 
             collate_fn=topg_collate,
             device=torch.device("cuda"),
                     random_state=69,
                     shuffle=True)

FOLD 0
--------------------------------


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 0
{'Train Loss': 0.31681531455574763, 'Train Accuracy': 0.8834635615348816, 'Train F1*100': 88.03209271227992}
{'Eval Loss': 0.2474297343287617, 'Eval Accuracy': 0.9049479365348816, 'Eval F1': 90.44502617801048}


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 1
{'Train Loss': 0.24715655990358856, 'Train Accuracy': 0.9129774570465088, 'Train F1*100': 91.12242638919636}
{'Eval Loss': 0.23868200516638657, 'Eval Accuracy': 0.9264323115348816, 'Eval F1': 92.48170326014638}


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 2
{'Train Loss': 0.22286956508954367, 'Train Accuracy': 0.9201388955116272, 'Train F1*100': 91.77101967799642}
{'Eval Loss': 0.2725509109441191, 'Eval Accuracy': 0.912109375, 'Eval F1': 90.9090909090909}


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 3
{'Train Loss': 0.216647440334782, 'Train Accuracy': 0.9325087070465088, 'Train F1*100': 92.9970727313668}
{'Eval Loss': 0.24896533919187883, 'Eval Accuracy': 0.923828125, 'Eval F1': 91.88063844552393}


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 4
{'Train Loss': 0.19374608892636994, 'Train Accuracy': 0.9348958134651184, 'Train F1*100': 93.24019828751692}
{'Eval Loss': 0.25867491130096215, 'Eval Accuracy': 0.92578125, 'Eval F1': 92.22373806275579}
FOLD 1
--------------------------------


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 0
{'Train Loss': 0.2574085464908017, 'Train Accuracy': 0.913194477558136, 'Train F1*100': 90.87591240875913}
{'Eval Loss': 0.24366629340996346, 'Eval Accuracy': 0.9251302480697632, 'Eval F1': 92.8259513412352}


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 1
{'Train Loss': 0.2298929796088487, 'Train Accuracy': 0.9197048544883728, 'Train F1*100': 91.5832575068244}
{'Eval Loss': 0.2502700425684452, 'Eval Accuracy': 0.9264323115348816, 'Eval F1': 92.72376046361882}


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 2
{'Train Loss': 0.22139333496387634, 'Train Accuracy': 0.9279513955116272, 'Train F1*100': 92.53261358524516}
{'Eval Loss': 0.260721428009371, 'Eval Accuracy': 0.923828125, 'Eval F1': 92.52396166134186}


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 3
{'Train Loss': 0.19271298691940805, 'Train Accuracy': 0.9392361044883728, 'Train F1*100': 93.5513588208199}
{'Eval Loss': 0.24519921959533045, 'Eval Accuracy': 0.9244791865348816, 'Eval F1': 92.39842726081258}


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 4
{'Train Loss': 0.17086433510606489, 'Train Accuracy': 0.947265625, 'Train F1*100': 94.47852760736197}
{'Eval Loss': 0.2508231078196938, 'Eval Accuracy': 0.9231771230697632, 'Eval F1': 92.35751295336789}
FOLD 2
--------------------------------


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 0
{'Train Loss': 0.23052460146653983, 'Train Accuracy': 0.9214409589767456, 'Train F1*100': 91.91603394372488}
{'Eval Loss': 0.2434702329337597, 'Eval Accuracy': 0.9251302480697632, 'Eval F1': 92.67982176957352}


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 1
{'Train Loss': 0.2278642606559313, 'Train Accuracy': 0.9273003339767456, 'Train F1*100': 92.44304083013762}
{'Eval Loss': 0.2172356527298689, 'Eval Accuracy': 0.93359375, 'Eval F1': 93.22709163346613}


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 2
{'Train Loss': 0.22236991903951597, 'Train Accuracy': 0.9246962070465088, 'Train F1*100': 92.07219556774045}
{'Eval Loss': 0.2206818088501071, 'Eval Accuracy': 0.9296875, 'Eval F1': 92.79038718291055}


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 3
{'Train Loss': 0.19973498370705378, 'Train Accuracy': 0.9359809160232544, 'Train F1*100': 93.46622369878185}
{'Eval Loss': 0.2279243153752759, 'Eval Accuracy': 0.9348958730697632, 'Eval F1': 93.50649350649351}


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 4
{'Train Loss': 0.18445630942005664, 'Train Accuracy': 0.9388020634651184, 'Train F1*100': 93.69127516778522}
{'Eval Loss': 0.22688488433292756, 'Eval Accuracy': 0.9329427480697632, 'Eval F1': 93.29863370201691}
FOLD 3
--------------------------------


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 0
{'Train Loss': 0.2468160952363784, 'Train Accuracy': 0.9223090410232544, 'Train F1*100': 91.84881602914389}
{'Eval Loss': 0.23880745008743057, 'Eval Accuracy': 0.9270833730697632, 'Eval F1': 91.88405797101449}


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 1
{'Train Loss': 0.26514890921923023, 'Train Accuracy': 0.9157986044883728, 'Train F1*100': 91.38925876608965}
{'Eval Loss': 0.2422049029264599, 'Eval Accuracy': 0.9290364980697632, 'Eval F1': 92.67965077233042}


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

EPOCH: 2
{'Train Loss': 0.24141717875479823, 'Train Accuracy': 0.9203559160232544, 'Train F1*100': 91.85349611542729}
{'Eval Loss': 0.2231445969082415, 'Eval Accuracy': 0.9361979365348816, 'Eval F1': 93.14685314685313}


  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]