In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR
from torch.nn.functional import softmax
from sklearn.model_selection import KFold
import random
import os
from sklearn.metrics import f1_score
import torch.nn.functional as F

torch.manual_seed(0)
random.seed(0)
np.random.seed(0)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
PATH_TO_DATA_FOLDER = "vkcup2022-first-stage/"

data = pd.read_csv(os.path.join(PATH_TO_DATA_FOLDER,'train.csv'))
test = pd.read_csv(os.path.join(PATH_TO_DATA_FOLDER,'test.csv'))

In [3]:
# !pip install -U transformers==4.13

In [4]:
raw_model = "ai-forever/ruRoberta-large"
tokenizer = RobertaTokenizer.from_pretrained(raw_model)

In [5]:
map_scen = dict(zip(sorted(data['sm'].unique().tolist()),range(len(data.sm))))

In [6]:
# data[data['scenario_id'].map(dict(zip(sorted(data['scenario_id'].unique().tolist()),range(1,len(topics))))).isna()]

In [7]:
# map_scen = dict(**map_scen, **{'Сбермобайл. Wi Fi звонки': 28})

In [8]:
data['sm'].map(map_scen).isna().sum()

0

In [9]:
def transform_df(df):
    df['is_fake'] = df['sm'].map(map_scen)
    return df

In [10]:
class PairsDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __getitem__(self, idx):
        return (self.x[idx], self.y[idx])

    def __len__(self):
        return len(self.x)

def data_collator(batch):
    y = torch.Tensor([p[1] for p in batch]).to(model.device)
    x = tokenizer([p[0] for p in batch], return_tensors='pt', padding=True).to(model.device)
    return (x, y)

In [11]:
# data.sm.unique()

In [12]:
data = transform_df(data)
test = transform_df(test)

In [13]:
def get_dataloaders(train, valid):
    train_dataset = PairsDataset(train.text.values, train.is_fake.values)
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, drop_last=False, shuffle=True, collate_fn=data_collator)
    
    valid_dataset = PairsDataset(valid.text.values, valid.is_fake.values)
    valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, drop_last=False, shuffle=False, collate_fn=data_collator)
    
    return train_dataloader, valid_dataloader

In [14]:
def evaluate_model(model, test_dataloader):
    num = 0
    den = 0
    y_true = list()
    y_pred = list()
    y_pred_prob = list()
    f1_valid = .0
    for x, y in test_dataloader:
        with torch.no_grad():
            output = model(
                input_ids=x.input_ids,
                attention_mask=x.attention_mask,
                # labels=y,
                return_dict=True
            )
            loss = F.cross_entropy(output.logits, y.long())
            # loss = output.loss
            
            num += len(x) * loss.item()
            den += len(x)
            
            y_pred.extend(torch.argmax(output.logits, 1).tolist())
            y_pred_prob.extend(softmax(output.logits, dim = 1)[:, 1].tolist())
            y_true.extend(torch.argmax(y.unsqueeze(1), 1).tolist())
            
    val_loss = num / den
    f1_valid = f1_score(y_true, y_pred, average = 'micro')
    return val_loss, f1_valid, y_pred_prob

In [15]:
num_classes=len(map_scen)

In [16]:
def train_loop(
    model, train_dataloader, val_dataloader, 
    max_epochs=10, 
    lr=1e-5,
    eval_steps = 50
):
    optimizer = torch.optim.Adam(params = model.parameters(), lr=lr)
    scheduler = StepLR(optimizer, step_size = 3, gamma=0.5)
    best_f1 = float('-inf')
    
    for epoch in range(max_epochs):
        print('EPOCH', epoch)
        losses = list()
        for i, (x, y) in enumerate(train_dataloader):
            output = model(
                input_ids=x.input_ids,
                attention_mask=x.attention_mask,
                # labels=y,
                return_dict=True,
            )
            # class_indices = torch.argmax(y, dim=1)
            loss = F.cross_entropy(output.logits,  y.long())
            # loss = output.loss
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            losses.append(loss.item())
            
            if i % eval_steps == 0:
                model.eval()
                train_loss = np.mean(losses[-eval_steps:])
                eval_loss, eval_f1, _ = evaluate_model(model, val_dataloader)
                if eval_f1 > best_f1:
                    best_f1 = eval_f1
                    torch.save(model.state_dict(), SAVE_PATH)
                print(f'step {i} train_loss: {train_loss:.3} eval_loss: {eval_loss:.3} eval_f1: {eval_f1:.3}')
                model.train()
        scheduler.step()

In [None]:
N_SPLITS = 2
BATCH_SIZE = 16
EPOCHS = 1
SAVE_PATH = 'ruroberta_model'

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=0)
test_results = list()

test_dataset = PairsDataset(test.text.values, test.is_fake.values)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, drop_last=False, shuffle=False, collate_fn=data_collator)

for i, (train_index, test_index) in enumerate(kf.split(data)):
    print(f"=====  FOLD {i}  =====")
    train, valid = data.iloc[train_index], data.iloc[test_index]
    train_dataloader, valid_dataloader = get_dataloaders(train, valid)
    
    model = RobertaForSequenceClassification.from_pretrained(raw_model, num_labels = num_classes);
    model = model.to(device)
    model.train()
    
    train_loop(model, train_dataloader, valid_dataloader, max_epochs=EPOCHS, lr=2e-5, eval_steps = 50)
    
    model.load_state_dict(torch.load(SAVE_PATH))
    model.eval()
    
    _, _, test_probability = evaluate_model(model, test_dataloader)
    test_results.append(test_probability)

=====  FOLD 0  =====


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


EPOCH 0
step 0 train_loss: 3.67 eval_loss: 3.02 eval_f1: 0.397
step 50 train_loss: 1.61 eval_loss: 1.18 eval_f1: 0.398
step 100 train_loss: 1.14 eval_loss: 0.875 eval_f1: 0.382
step 150 train_loss: 0.855 eval_loss: 0.717 eval_f1: 0.385
step 200 train_loss: 0.718 eval_loss: 0.631 eval_f1: 0.381
step 250 train_loss: 0.641 eval_loss: 0.563 eval_f1: 0.384
step 300 train_loss: 0.538 eval_loss: 0.528 eval_f1: 0.383
step 350 train_loss: 0.512 eval_loss: 0.497 eval_f1: 0.379
step 400 train_loss: 0.511 eval_loss: 0.473 eval_f1: 0.384
step 450 train_loss: 0.486 eval_loss: 0.423 eval_f1: 0.384
step 500 train_loss: 0.448 eval_loss: 0.407 eval_f1: 0.383
step 550 train_loss: 0.336 eval_loss: 0.38 eval_f1: 0.385
step 600 train_loss: 0.486 eval_loss: 0.348 eval_f1: 0.381
step 650 train_loss: 0.341 eval_loss: 0.325 eval_f1: 0.382
step 700 train_loss: 0.389 eval_loss: 0.354 eval_f1: 0.383


In [16]:
assert all([len(x) == len(test) for x in test_results])

predictions = np.mean(test_results, axis = 0)
test.is_fake = [1 if x >= 0.5 else 0 for x in predictions]
test.to_csv('predictions.tsv', index = False, sep = '\t')

In [2]:
!pwd

/Users/aleksandr/Desktop/mobile
