In [1]:
import sys
sys.path.append('../')

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
import torch.nn as nn
from tqdm.notebook import tqdm

from src.data import (
    TrainDataset,
    train_collate_fn,
    InferenceDataset,
    inference_collate_fn,
    get_item_list,
    get_item_labels,
    get_pair_list,
    read_words_from_file,
)
from src.models import ( 
    single_model,
    train_model_early_stopping,
    BERT52,
    AttnLSTM52,
    AttnBiLSTM52, 
    LSTMBERT52,
)
from settings import BATCH_SIZE

In [2]:
# Training words; entries carry a "^" marker inside the word (e.g. 'аа^к').
train_file_path = "../data/raw/train_stresses_labels.txt"
# Public test words that the trained model is later asked to annotate.
public_file_path = "../data/raw/public_test_stresses.txt"

In [3]:
# Read the whole training file and split it into one entry per line
# (line terminators are dropped by splitlines).
with open(train_file_path, encoding='utf-8') as train_handle:
    words = train_handle.read().splitlines()

In [4]:
# Peek at the first ten raw entries to check the file format.
words[:10]

['аа^к',
 'аа^ка',
 'аа^ке',
 'аа^ки',
 'аа^ков',
 'аа^ком',
 'аа^м',
 'аа^му',
 'аа^нгича',
 'аа^нгичам']

In [5]:
# Hold out every 100th entry (positions 0, 100, 200, ...) for validation and
# train on everything else.
val_words = words[::100]
train_words = [entry for position, entry in enumerate(words) if position % 100 != 0]

In [6]:
# Sizes of the training and validation splits.
len(train_words), len(val_words)

(582605, 5885)

In [7]:
# NOTE(review): scratch check that str.replace drops the "^" marker from a
# string; no later cell uses this result — consider removing the cell.
'sdfsd^f'.replace('^', '')

'sdfsdf'

In [8]:
# Both loaders share one batch size and the project's training collate function.
loader_batch_size = 2048

train_dataset = TrainDataset(train_words, tokenizer=get_item_list)
train_loader = DataLoader(
    train_dataset,
    batch_size=loader_batch_size,
    shuffle=True,  # reshuffle the training samples at every epoch
    collate_fn=train_collate_fn,
)

val_dataset = TrainDataset(val_words, tokenizer=get_item_list)
# Validation only evaluates the model, so the order is kept fixed: identical
# batches every epoch keep the validation metric comparable between epochs.
val_loader = DataLoader(
    val_dataset,
    batch_size=loader_batch_size,
    shuffle=False,
    collate_fn=train_collate_fn,
)

In [9]:
# Model to train; the architecture itself is defined in src.models.
model52 = AttnBiLSTM52()
# AdamW over all model parameters with an initial learning rate of 5e-3.
optimizer = AdamW(model52.parameters(), lr=5e-3)
# StepLR multiplies the learning rate by 0.2 every 10 scheduler steps.
scheduler = StepLR(optimizer, step_size=10, gamma=0.2)

In [10]:
# Total number of scalar weights that will receive gradients.
params_count = sum(
    weight.numel()
    for weight in model52.parameters()
    if weight.requires_grad
)

print(f'Number of trainable parameters: {params_count}')

Number of trainable parameters: 1160847


In [11]:
early_stopping = True

# Use the GPU when one is present and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA (same rule as the inference cell).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): the last recorded run of this cell stopped with
# "RuntimeError: The size of tensor a (128) must match the size of tensor b
# (256) at non-singleton dimension 2". The failing operation is not visible in
# this notebook — presumably it is inside the model/training code in
# src.models; confirm and fix there before re-running.

# Arguments shared by both training entry points.
training_kwargs = dict(
    model=model52,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    device=device,
    loss_function=nn.CrossEntropyLoss(),
)

if early_stopping:
    # The early-stopping trainer hands back a model, which replaces model52.
    model52 = train_model_early_stopping(**training_kwargs)
else:
    # Fixed-length training; the return value (if any) is not used here.
    single_model(**training_kwargs, epochs=12)

  0%|          | 0/285 [00:00<?, ?it/s]

RuntimeError: The size of tensor a (128) must match the size of tensor b (256) at non-singleton dimension 2

In [12]:
# Read the public test file and split it into one entry per line
# (line terminators are dropped by splitlines).
with open(public_file_path, encoding='utf-8') as public_handle:
    test_words = public_handle.read().splitlines()

In [13]:
# Batched loader over the public test words. `shuffle` is left at its default
# (False), so batches follow the order of `test_words`; the submission cell
# relies on predictions lining up with that order.
test_dataset = InferenceDataset(test_words, tokenizer=get_item_list)
test_loader = DataLoader(
    test_dataset,
    batch_size=4096,
    collate_fn=inference_collate_fn,
)

In [14]:
# Run inference on the GPU when available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model52.to(device)
model52.eval()  # switch the model to evaluation mode

preds = []

# Gradients are not needed for prediction.
with torch.no_grad():
    for batch in tqdm(test_loader):
        # Move every entry of the batch to the model's device.
        for key in batch:
            batch[key] = batch[key].to(device)

        logits = model52(batch)
        # Index of the highest-scoring entry along dim 1 for each sample
        # (assumes logits is (batch, classes) — TODO confirm).
        preds.append(logits.argmax(dim=1))

# One flat tensor of predictions, moved to the CPU once so later per-element
# reads do not each trigger a device-to-host copy.
preds = torch.cat(preds, dim=0).cpu()

  0%|          | 0/72 [00:00<?, ?it/s]

In [15]:
def insert_carot_after_vowel(w, k):
    """Return `w` with a "^" placed right after its k-th vowel (counting from 1).

    Vowels are the ten Russian vowel letters, matched case-insensitively.
    When `w` contains fewer than `k` vowels, or `k` is below 1, no caret is
    inserted and the characters of `w` come back unchanged.
    """
    russian_vowels = set("аеёиоуыэюя")
    pieces = []
    vowels_seen = 0

    for letter in w:
        pieces.append(letter)
        if letter.lower() not in russian_vowels:
            continue
        vowels_seen += 1
        if vowels_seen == k:
            pieces.append("^")

    return "".join(pieces)

In [16]:
# Convert all predictions to plain Python ints in one call instead of reading
# them one element at a time.
stress_indices = preds.tolist()

# tqdm wraps the list itself (not the enumerate object) so it knows the total
# and can show real progress.
sub = [
    insert_carot_after_vowel(word, stress_indices[idx])
    for idx, word in enumerate(tqdm(test_words))
]

0it [00:00, ?it/s]

In [17]:
# Spot-check one entry: the raw test word first, then its annotated form.
for word_list in (test_words, sub):
    print(word_list[52])

абазой
аба^зой


In [18]:
def write_to_file(sub, path):
    """Write every string in `sub` to `path` as UTF-8, one per line.

    The file is created or truncated, and each entry is followed by a newline.
    """
    with open(path, mode="w", encoding="utf-8") as out_file:
        out_file.writelines(entry + "\n" for entry in sub)

In [19]:
# Save the annotated test words, one per line, to the submission file
# (path is relative to the notebook's working directory).
write_to_file(sub, r"subm_ATTN_one_word_10layers_1.txt")