In [1]:
import random
import numpy as np

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, Subset

from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score

from tqdm.auto import tqdm

In [2]:
SEED = 42


def seed_all(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        # torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False
        
seed_all(SEED)

# Data

In [3]:
def read_texts(path):
    with open(path, "r") as fin:
        return [word.strip() for word in fin.readlines()]

In [4]:
DATA = "data/"
train_path = DATA + "train_stresses_labels.txt"
test_path = DATA + "public_test_stresses.txt"
train_path

'data/train_stresses_labels.txt'

In [5]:
words_train = (read_texts(train_path))
words_test = (read_texts(test_path))

len(words_train), len(words_test)

(588490, 294253)

In [6]:
words_train

['аа^к',
 'аа^ка',
 'аа^ке',
 'аа^ки',
 'аа^ков',
 'аа^ком',
 'аа^м',
 'аа^му',
 'аа^нгича',
 'аа^нгичам',
 'ааро^не',
 'ааро^новец',
 'ааро^новские',
 'ааро^новский',
 'ааро^новца',
 'ааро^новцами',
 'ааро^новце',
 'ааро^новцы',
 'ааро^новщин',
 'ааро^новщинами',
 'ааро^новщинах',
 'ааро^новщины',
 'ааро^ну',
 'а^ахенец',
 'аа^хенский',
 'абаа^сами',
 'абаа^сов',
 'абаа^су',
 'абаа^сы',
 'абада^н',
 'абада^нец',
 'абада^нках',
 'абада^нки',
 'абада^нкою',
 'абада^нку',
 'абада^нские',
 'абада^нский',
 'абада^нца',
 'абада^нцами',
 'абада^нце',
 'абада^нцев',
 'абада^нцы',
 'абажу^рами',
 'абажу^рно',
 'абажу^рны',
 'абажу^ров',
 'абажу^ру',
 'абази^ею',
 'абази^на',
 'абази^нам',
 'абази^нки',
 'абази^нкою',
 'абази^нские',
 'абази^нско',
 'абази^нца',
 'абази^нцу',
 'абази^нцы',
 'абази^я',
 'аба^зов',
 'аба^зом',
 'аба^им',
 'аба^й',
 'аба^к',
 'аба^кам',
 'аба^ками',
 'абака^н',
 'абака^не',
 'абака^нский',
 'абако^вый',
 'аба^ком',
 'абако^ст',
 'абако^стам',
 'абако^сте',
 'абако

In [7]:
words_test

['аакам',
 'ааками',
 'ааленец',
 'аама',
 'аамами',
 'аамов',
 'аамом',
 'аамы',
 'аангичами',
 'аангичах',
 'аангиче',
 'аангичи',
 'аангичу',
 'аарон',
 'аарона',
 'ааронов',
 'аароновцам',
 'аароновцах',
 'аароновцев',
 'аароновцем',
 'аароновщиной',
 'аароновщиною',
 'аароны',
 'аахен',
 'абаас',
 'абаасам',
 'абаасах',
 'абаасе',
 'абаасом',
 'абаданка',
 'абаданке',
 'абаданкой',
 'абаданцем',
 'абаддон',
 'абадзех',
 'абажур',
 'абажура',
 'абажурах',
 'абажурна',
 'абажурный',
 'абажуродержатель',
 'абажуры',
 'абазин',
 'абазинкам',
 'абазинками',
 'абазинку',
 'абазином',
 'абазинска',
 'абазински',
 'абазину',
 'абазинце',
 'абазию',
 'абазой',
 'абайя',
 'абаканец',
 'абаках',
 'абакостами',
 'абакостах',
 'абактериально',
 'абактериальный',
 'абактинальный',
 'абакумычами',
 'абакумыче',
 'абакумычем',
 'абалона',
 'абалонами',
 'абалоне',
 'абалонов',
 'абами',
 'абанамат',
 'абант',
 'абарг',
 'абаргам',
 'абаргу',
 'абарогнозия',
 'абарогнозом',
 'абат',
 'абатиса',
 '

# Tokenization

In [8]:
unique_symbols = sorted(list(set("".join(words_train) + "".join(words_test))))
unique_symbols

['^',
 'а',
 'б',
 'в',
 'г',
 'д',
 'е',
 'ж',
 'з',
 'и',
 'й',
 'к',
 'л',
 'м',
 'н',
 'о',
 'п',
 'р',
 'с',
 'т',
 'у',
 'ф',
 'х',
 'ц',
 'ч',
 'ш',
 'щ',
 'ъ',
 'ы',
 'ь',
 'э',
 'ю',
 'я',
 'ё']

In [9]:
char_to_id = {
    'а': 0,
    'б': 1,
    'в': 2,
    'г': 3,
    'д': 4,
    'е': 5,
    'ё': 6,
    'ж': 7,
    'з': 8,
    'и': 9,
    'й': 10,
    'к': 11,
    'л': 12,
    'м': 13,
    'н': 14,
    'о': 15,
    'п': 16,
    'р': 17,
    'с': 18,
    'т': 19,
    'у': 20,
    'ф': 21,
    'х': 22,
    'ц': 23,
    'ч': 24,
    'ш': 25,
    'щ': 26,
    'ъ': 27,
    'ы': 28,
    'ь': 29,
    'э': 30,
    'ю': 31,
    'я': 32,
}

nonstressable = set("бвгджзйклмнпрстфхцчшщъь")
stressable = set("аеёиоуыэюя")

assert len(nonstressable) + len(stressable) == 33

VOCAB_SIZE = len(list(char_to_id.keys())) + 1 # 33 russian letters and [PAD]
VOCAB_SIZE

34

In [10]:
MAX_LEN = max([len(word) - 1 for word in words_train] + [len(word) for word in words_test])
MAX_LEN

36

In [11]:
def preprocess_train_word(word):
    chars = list(word)
    
    stress_ind = chars.index("^")
    chars.pop(stress_ind)

    ids = [char_to_id[char] for char in chars]

    stressability_mask = [1  if char in stressable else 0 for char in chars] + [0 for i in range(MAX_LEN - len(ids))]
    
    attn_mask = [1 for i in range(len(ids))] + [0 for i in range(MAX_LEN - len(ids))]
    pads = [33] * (MAX_LEN - len(ids))
    ids = ids + pads
    labels = [0] * MAX_LEN
    labels[stress_ind - 1] = 1

    return torch.LongTensor(ids).unsqueeze(0), torch.LongTensor(attn_mask).unsqueeze(0), torch.LongTensor(stressability_mask).unsqueeze(0), torch.LongTensor(labels).unsqueeze(0)
    

In [12]:
preprocess_train_word(words_train[0])

(tensor([[ 0,  0, 11, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
          33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33]]),
 tensor([[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 tensor([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

In [13]:
train_dataset_ids, train_dataset_attn_mask, train_stressability_mask, train_dataset_labels = [], [], [], []

for word in tqdm(words_train):
    sample_ids, sample_attn_mask, sample_stressability_mask, sample_labels = preprocess_train_word(word)
    train_dataset_ids.append(sample_ids)
    train_dataset_attn_mask.append(sample_attn_mask)
    train_stressability_mask.append(sample_stressability_mask)
    train_dataset_labels.append(sample_labels)

train_dataset_ids, train_dataset_attn_mask, train_stressability_mask, train_dataset_labels = torch.cat(train_dataset_ids, dim=0), torch.cat(train_dataset_attn_mask, dim=0), torch.cat(train_stressability_mask, dim=0), torch.cat(train_dataset_labels, dim=0)

  0%|          | 0/588490 [00:00<?, ?it/s]

In [14]:
train_dataset_ids

tensor([[ 0,  0, 11,  ..., 33, 33, 33],
        [ 0,  0, 11,  ..., 33, 33, 33],
        [ 0,  0, 11,  ..., 33, 33, 33],
        ...,
        [ 6, 21,  9,  ..., 33, 33, 33],
        [ 6, 21,  9,  ..., 33, 33, 33],
        [ 6, 21,  9,  ..., 33, 33, 33]])

In [15]:
dataset = TensorDataset(train_dataset_ids, train_dataset_attn_mask, train_stressability_mask, train_dataset_labels)
example_dataloader = DataLoader(dataset, batch_size=64, num_workers=16)

In [16]:
next(iter(example_dataloader))

[tensor([[ 0,  0, 11,  ..., 33, 33, 33],
         [ 0,  0, 11,  ..., 33, 33, 33],
         [ 0,  0, 11,  ..., 33, 33, 33],
         ...,
         [ 0,  1,  0,  ..., 33, 33, 33],
         [ 0,  1,  0,  ..., 33, 33, 33],
         [ 0,  1,  0,  ..., 33, 33, 33]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 0,  ..., 0, 0, 0],
         ...,
         [1, 0, 1,  ..., 0, 0, 0],
         [1, 0, 1,  ..., 0, 0, 0],
         [1, 0, 1,  ..., 0, 0, 0]]),
 tensor([[0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 1,  ..., 0, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0]])]

# Model

In [17]:
class StressTransformer(nn.Module):
    def __init__(self, embedding_dim, nhead, dim_feedforward, num_layers, norm_first, enable_nested_tensor=False):
        super().__init__()
        self.embedding = nn.Embedding(VOCAB_SIZE, embedding_dim)
        self.transformer = nn.TransformerEncoder(
            encoder_layer=nn.TransformerEncoderLayer(
                d_model=embedding_dim, 
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                norm_first=True,
                batch_first=True,
            ),
            num_layers=num_layers,
            # norm=norm,
            enable_nested_tensor=enable_nested_tensor
        )
        self.out = nn.Linear(embedding_dim, 1)
        
    def forward(self, input_ids, attention_masks, stress_masks):
        x = self.embedding(input_ids) # cfm multiply by attn mask??))?)??)?
        x = self.transformer(
            x,
            # src_key_padding_mask=attention_masks.bool()
        )
        x = self.out(x) * attention_masks.unsqueeze(dim=-1)
        x = x * stress_masks.unsqueeze(dim=-1) 
        return x.squeeze(dim=-1)
        
        

In [18]:
model = StressTransformer(
    embedding_dim=64, 
    nhead=4, 
    dim_feedforward=128, 
    num_layers=1, 
    norm_first=True, 
    enable_nested_tensor=True
)
model

StressTransformer(
  (embedding): Embedding(34, 64)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=128, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (out): Linear(in_features=64, out_features=1, bias=True)
)

In [19]:
example_ids, example_attm_mask, example_stress_mask, example_labels = next(iter(example_dataloader))
example_attm_mask

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [20]:
model(example_ids, example_attm_mask, example_stress_mask).shape, example_labels.shape

(torch.Size([64, 36]), torch.Size([64, 36]))

# Loops

In [21]:
def train_epoch_tr(model, data_loader, loss_function, optimizer, scheduler, device, n_acum_steps = 1):
    model.to(device)
    model.train()
    total_train_loss = 0

    dl_size = len(data_loader)
    
    preds = []
    targets = []

    batch_i = 0
    for batch in tqdm(data_loader):
        input_ids, attention_masks, stress_masks, target = batch
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        stress_masks = stress_masks.to(device)
        target = target.to(device)
        
        optimizer.zero_grad()        
        logits = model(input_ids, attention_masks, stress_masks)
                
        preds.append(logits.argmax(dim=1).detach().cpu())
        targets.append(target.argmax(dim=1).cpu())
                
        loss = loss_function(logits, target.argmax(dim=1)) / n_acum_steps
        total_train_loss += loss.item()
        
        loss.backward()
        if batch_i % n_acum_steps == 0 or batch_i == len(data_loader.dataset): 
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        batch_i += 1

    preds = torch.cat(preds, dim=0).numpy()
    targets = torch.cat(targets, dim=0).numpy()

    acc = ((targets == preds).sum() / preds.shape[0]).item()
    # f1 = f1_score(targets, preds)
    # precision = precision_score(targets, preds)
    # recall = recall_score(targets, preds)

    metrics = {
        "Train Loss": total_train_loss / dl_size,
        "Train Accuracy": acc,
        # "Train F1": f1,
        # "Train Precision": precision, 
        # "Train Recall": recall, 
    }
    
    return metrics
    
    
def eval_epoch_tr(model, data_loader, loss_function, device):
    model.to(device)
    model.eval()
    total_train_loss = 0
    
    preds = []
    targets = []

    dl_size = len(data_loader)

    
    for batch in tqdm(data_loader):
        input_ids, attention_masks, stress_masks, target = batch
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        stress_masks = stress_masks.to(device)
        target = target.to(device)
        
        with torch.no_grad():
            logits = model(input_ids, attention_masks, stress_masks)
            preds.append(logits.argmax(dim=1).cpu())
            targets.append(target.argmax(dim=1).cpu())
        
        loss = loss_function(logits, target.argmax(dim=1))
        total_train_loss += loss.item()
            
    preds = torch.cat(preds, dim=0).numpy()
    targets = torch.cat(targets, dim=0).numpy()
    acc = ((targets == preds).sum() / preds.shape[0]).item()
    # f1 = f1_score(targets, preds)
    # precision = precision_score(targets, preds)
    # recall = recall_score(targets, preds)
    
    metrics = {
        "Eval Loss": total_train_loss / dl_size,
        "Eval Accuracy": acc,
        # "Eval F1": f1,
        # "Eval Precision": precision, 
        # "Eval Recall": recall, 
    }
    
    return metrics


def single_model(model, 
                 train_dataset, 
                 eval_dataset,  
                 loss_function, 
                 optimizer,
                 get_scheduler,
                 device = torch.device("cuda"),
                 random_state: int = 69, 
                 shuffle: bool = True, 
                 epochs: int = 5, 
                 lr: float = 1e-6,
                 num_workers: int = 1,
                 batch_size: int = 32,
                 n_acum_steps: int = 0):
    random.seed(random_state),
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)
    
    loss_function.to(device)

    print('--------------------------------')

    train_loader = torch.utils.data.DataLoader(
                  train_dataset, 
                  batch_size=batch_size,
                  shuffle=shuffle,
                  num_workers=num_workers
    )

    eval_loader = torch.utils.data.DataLoader(
                  eval_dataset,
                  batch_size=batch_size,
                  shuffle=False,
                  num_workers=num_workers
    )
    
    total_steps = len(train_loader) * epochs 

    scheduler = get_scheduler(optimizer, 
                                        num_warmup_steps = 0, # Default value in run_glue.py
                                        num_training_steps = total_steps)

    for epoch_i in range(0, epochs):
        train_metrics = train_epoch_tr(model, train_loader, loss_function, optimizer, scheduler, device, n_acum_steps)
        eval_metrics = eval_epoch_tr(model, eval_loader, loss_function, device)
        
        print(f"EPOCH: {epoch_i}")
        print(train_metrics)
        print(eval_metrics)


In [22]:
train_inds, eval_inds = train_test_split([i for i in range(len(dataset))], test_size=0.2, random_state=42)
train_dataset = Subset(dataset, indices=train_inds)
eval_dataset = Subset(dataset, indices=eval_inds)

len(train_inds), len(eval_inds)

(470792, 117698)

In [23]:
model = StressTransformer(
    embedding_dim=128, 
    nhead=4, 
    dim_feedforward=256, 
    num_layers=3, 
    norm_first=True, 
    enable_nested_tensor=True
)

params_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(params_count)

optimizer = torch.optim.AdamW([
    {"params": model.embedding.parameters(), "lr" : 1e-4},
    {"params": model.transformer.parameters(), "lr" : 1e-4},
    {"params": model.out.parameters(), "lr" : 1e-4},
])

get_scheduler = get_cosine_schedule_with_warmup

401921


In [24]:
single_model(
    model=model, 
    train_dataset=train_dataset, 
    eval_dataset=eval_dataset,   
    loss_function=nn.CrossEntropyLoss(), 
    optimizer=optimizer,
    get_scheduler=get_scheduler,
    device=torch.device("cuda"),
    random_state=SEED, 
    shuffle=True, 
    epochs=10, 
    num_workers=0,
    batch_size=128,
    n_acum_steps=1   
)

--------------------------------


  0%|          | 0/3679 [00:00<?, ?it/s]

  0%|          | 0/920 [00:00<?, ?it/s]

EPOCH: 0
{'Train Loss': 1.357193148826315, 'Train Accuracy': 0.3777379394722085}
{'Eval Loss': 1.2923842294060666, 'Eval Accuracy': 0.4000152933779673}


  0%|          | 0/3679 [00:00<?, ?it/s]

  0%|          | 0/920 [00:00<?, ?it/s]

EPOCH: 1
{'Train Loss': 1.2916585510922178, 'Train Accuracy': 0.4063216877092219}
{'Eval Loss': 1.2555527136377667, 'Eval Accuracy': 0.41869020714030825}


  0%|          | 0/3679 [00:00<?, ?it/s]

  0%|          | 0/920 [00:00<?, ?it/s]

EPOCH: 2
{'Train Loss': 1.2629437404146788, 'Train Accuracy': 0.41781721014800594}
{'Eval Loss': 1.2323713208022324, 'Eval Accuracy': 0.42454417237336234}


  0%|          | 0/3679 [00:00<?, ?it/s]

  0%|          | 0/920 [00:00<?, ?it/s]

EPOCH: 3
{'Train Loss': 1.2438306770884624, 'Train Accuracy': 0.4263453924450713}
{'Eval Loss': 1.2134211739768153, 'Eval Accuracy': 0.43503712892317625}


  0%|          | 0/3679 [00:00<?, ?it/s]

  0%|          | 0/920 [00:00<?, ?it/s]

EPOCH: 4
{'Train Loss': 1.2282692548469798, 'Train Accuracy': 0.4327303777464358}
{'Eval Loss': 1.1980850988756055, 'Eval Accuracy': 0.44032184064300156}


  0%|          | 0/3679 [00:00<?, ?it/s]

  0%|          | 0/920 [00:00<?, ?it/s]

EPOCH: 5
{'Train Loss': 1.2152536949342798, 'Train Accuracy': 0.4385482336148448}
{'Eval Loss': 1.1898516395817633, 'Eval Accuracy': 0.44374585804346717}


  0%|          | 0/3679 [00:00<?, ?it/s]

  0%|          | 0/920 [00:00<?, ?it/s]

EPOCH: 6
{'Train Loss': 1.2066969966136687, 'Train Accuracy': 0.4424629135584292}
{'Eval Loss': 1.1827607640753621, 'Eval Accuracy': 0.4458869309588948}


  0%|          | 0/3679 [00:00<?, ?it/s]

  0%|          | 0/920 [00:00<?, ?it/s]

EPOCH: 7
{'Train Loss': 1.2004838328738678, 'Train Accuracy': 0.4452242179136434}
{'Eval Loss': 1.1767371925970782, 'Eval Accuracy': 0.44870770956175976}


  0%|          | 0/3679 [00:00<?, ?it/s]

  0%|          | 0/920 [00:00<?, ?it/s]

EPOCH: 8
{'Train Loss': 1.1971583925339475, 'Train Accuracy': 0.4460972149059457}
{'Eval Loss': 1.1743488219121228, 'Eval Accuracy': 0.4503305068905164}


  0%|          | 0/3679 [00:00<?, ?it/s]

  0%|          | 0/920 [00:00<?, ?it/s]

EPOCH: 9
{'Train Loss': 1.1952575839191197, 'Train Accuracy': 0.44657300888715185}
{'Eval Loss': 1.1740890733573748, 'Eval Accuracy': 0.4504069737803531}
