In [1]:
import sys
sys.path.append('../')

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import torch.nn as nn
from tqdm.notebook import tqdm

from src.data import (
    TrainDataset,
    train_collate_fn,
    InferenceDataset,
    inference_collate_fn,
    get_item_list,
    get_item_labels,
    read_words_from_file,
)
from src.models import single_model, Model52
from settings import BATCH_SIZE

In [2]:
words = read_words_from_file("../data/raw/train_stresses_labels.txt")

In [4]:
words

['аа^к',
 'аа^ка',
 'аа^ке',
 'аа^ки',
 'аа^ков',
 'аа^ком',
 'аа^м',
 'аа^му',
 'аа^нгича',
 'аа^нгичам',
 'ааро^не',
 'ааро^новец',
 'ааро^новские',
 'ааро^новский',
 'ааро^новца',
 'ааро^новцами',
 'ааро^новце',
 'ааро^новцы',
 'ааро^новщин',
 'ааро^новщинами',
 'ааро^новщинах',
 'ааро^новщины',
 'ааро^ну',
 'а^ахенец',
 'аа^хенский',
 'абаа^сами',
 'абаа^сов',
 'абаа^су',
 'абаа^сы',
 'абада^н',
 'абада^нец',
 'абада^нках',
 'абада^нки',
 'абада^нкою',
 'абада^нку',
 'абада^нские',
 'абада^нский',
 'абада^нца',
 'абада^нцами',
 'абада^нце',
 'абада^нцев',
 'абада^нцы',
 'абажу^рами',
 'абажу^рно',
 'абажу^рны',
 'абажу^ров',
 'абажу^ру',
 'абази^ею',
 'абази^на',
 'абази^нам',
 'абази^нки',
 'абази^нкою',
 'абази^нские',
 'абази^нско',
 'абази^нца',
 'абази^нцу',
 'абази^нцы',
 'абази^я',
 'аба^зов',
 'аба^зом',
 'аба^им',
 'аба^й',
 'аба^к',
 'аба^кам',
 'аба^ками',
 'абака^н',
 'абака^не',
 'абака^нский',
 'абако^вый',
 'аба^ком',
 'абако^ст',
 'абако^стам',
 'абако^сте',
 'абако

In [13]:
train_words, val_words = [words[i] for i in range(len(words)) if i % 5 != 0], [words[i] for i in range(len(words)) if i % 5 == 0]

train_dataset = TrainDataset(train_words)
train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True, collate_fn=train_collate_fn)

val_dataset = TrainDataset(val_words)
val_loader = DataLoader(val_dataset, batch_size=2048, shuffle=True, collate_fn=train_collate_fn)

In [14]:
model52 = Model52()
optimizer = AdamW(model52.parameters(), lr=1e-3)

single_model(
    model=model52,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    loss_function=nn.CrossEntropyLoss()
)

  0%|          | 0/230 [00:00<?, ?it/s]

  3%|▎         | 6/230 [01:15<43:34, 11.67s/it]  

In [None]:
test_words = words = read_words_from_file("public_test_stresses.txt")
test_dataset = InferenceDataset(test_words)
preds = []
test_loader = DataLoader(test_dataset, batch_size=4096, collate_fn=inference_collate_fn)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model52.to(device)
model52.eval()
total_train_loss = 0

preds = []
targets = []

dl_size = len(test_loader)


for batch in tqdm(test_loader):
    for key in batch:
        batch[key] = batch[key].to(device)

    with torch.no_grad():
        logits = model52(batch)
        preds.append(logits.argmax(dim=1))
preds = torch.cat(preds, dim=0)

In [None]:
def insert_carot_after_vowel(w, k):
    vowels = ['а', 'е', 'ё', 'и', 'о', 'у', 'ы', 'э', 'ю', 'я']
    count = 0
    w1 = ""

    for char in w:
        w1 += char

        if char.lower() in vowels:
            count += 1
            if count == k:
                w1 += "^"

    return w1

In [None]:
sub = []
for idx, word in tqdm(enumerate(test_words)):
    stress_idx = preds[idx].item()
    sub.append(insert_carot_after_vowel(word, stress_idx))

In [None]:
print(test_words[2])
print(sub[2])

In [None]:
def write_to_file(sub, path):
    with open(path, "w") as file:
        for word in sub:
            file.write(word + "\n")

In [None]:
write_to_file(sub, "/content/sample_submission.txt")