In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

In [None]:
def read_words_from_file(file_path, encoding='utf-8'):
    """Read a text file and return its lines as a list of strings.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that Cyrillic content is decoded the same way on every platform
            instead of depending on the locale's default encoding.

    Returns:
        A list with one entry per line (line terminators removed), or an
        empty list when the file does not exist.
    """
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            return file.read().splitlines()
    except FileNotFoundError:
        # Best-effort behaviour kept from the original: report and carry on.
        print("Файл не найден")
        return []

In [None]:
# Load the stress-annotated training words (one word per line of the file).
words = read_words_from_file("train_stresses_labels.txt")

In [None]:
# Display the loaded words to inspect the annotation format ('^' marks the stress).
words

['аа^к',
 'аа^ка',
 'аа^ке',
 'аа^ки',
 'аа^ков',
 'аа^ком',
 'аа^м',
 'аа^му',
 'аа^нгича',
 'аа^нгичам',
 'ааро^не',
 'ааро^новец',
 'ааро^новские',
 'ааро^новский',
 'ааро^новца',
 'ааро^новцами',
 'ааро^новце',
 'ааро^новцы',
 'ааро^новщин',
 'ааро^новщинами',
 'ааро^новщинах',
 'ааро^новщины',
 'ааро^ну',
 'а^ахенец',
 'аа^хенский',
 'абаа^сами',
 'абаа^сов',
 'абаа^су',
 'абаа^сы',
 'абада^н',
 'абада^нец',
 'абада^нках',
 'абада^нки',
 'абада^нкою',
 'абада^нку',
 'абада^нские',
 'абада^нский',
 'абада^нца',
 'абада^нцами',
 'абада^нце',
 'абада^нцев',
 'абада^нцы',
 'абажу^рами',
 'абажу^рно',
 'абажу^рны',
 'абажу^ров',
 'абажу^ру',
 'абази^ею',
 'абази^на',
 'абази^нам',
 'абази^нки',
 'абази^нкою',
 'абази^нские',
 'абази^нско',
 'абази^нца',
 'абази^нцу',
 'абази^нцы',
 'абази^я',
 'аба^зов',
 'аба^зом',
 'аба^им',
 'аба^й',
 'аба^к',
 'аба^кам',
 'аба^ками',
 'абака^н',
 'абака^не',
 'абака^нский',
 'абако^вый',
 'аба^ком',
 'абако^ст',
 'абако^стам',
 'абако^сте',
 'абако

In [None]:

# Lower-case Russian alphabet; a letter's position defines its token id.
chars = list('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')

# Vowel letters plus the '^' stress marker (get_item_labels looks the marker up).
vowels = list('аеёиоуыэюя^')

# Token ids start at 1; id 0 is used for padding by the datasets below.
char2id = dict(zip(chars, range(1, len(chars) + 1)))

MAX_ITEMS = 50                 # fixed length every id sequence is left-padded to
BATCH_SIZE = 128               # batch size of the exploratory DataLoader
VOCAB_SIZE = len(chars) + 1    # alphabet size plus the padding id
HIDDEN_SIZE = 64               # default value of Model52's hidden_dim argument
N_CLASSES = 15                 # number of output classes of the classifier
def get_item_list(word):
    """Encode a word as a list of token ids.

    Characters that have no entry in ``char2id`` (for example the '^' stress
    marker) are skipped silently.
    """
    encoded = []
    for symbol in word:
        token_id = char2id.get(symbol)
        if token_id is not None:
            encoded.append(token_id)
    return encoded

def get_item_labels(word):
    """Return the class label of a stress-annotated word.

    The word is reduced to the characters listed in ``vowels`` (vowel letters
    and the '^' marker); the index of the first marker in that reduced
    sequence is the label. When the marker directly follows the stressed
    vowel, this equals the 1-based ordinal of that vowel.

    Raises:
        ValueError: if the reduced sequence contains no '^' marker.
    """
    marked_vowels = list(filter(vowels.__contains__, word))
    return marked_vowels.index('^')


class TrainDataset(Dataset):
    """Dataset of ``(padded token ids, stress label)`` pairs built from annotated words."""

    def __init__(self, words):
        # Encode every word once up front; both lists are index-aligned.
        self.item_list = [get_item_list(word) for word in words]
        self.labels = [get_item_labels(word) for word in words]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` with the ids left-padded by zeros to MAX_ITEMS.

        Sequences already longer than MAX_ITEMS receive no padding and are
        returned at their full length.
        """
        token_ids = self.item_list[idx]
        padding = [0] * (MAX_ITEMS - len(token_ids))
        return padding + token_ids, self.labels[idx]

def train_collate_fn(x):
    """Collate ``(token_ids, label)`` samples into a batch dictionary.

    Args:
        x: Sequence of ``(token_ids, label)`` pairs.

    Returns:
        Dict with an ``'items'`` tensor and a ``'labels'`` tensor, both built
        with ``torch.tensor``.
    """
    token_rows, targets = zip(*x)
    return {
        'items': torch.tensor(token_rows),
        'labels': torch.tensor(targets),
    }


In [None]:
# Wrap the training words in a Dataset and serve them in shuffled mini-batches.
dataset = TrainDataset(words)
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=train_collate_fn,
)

In [None]:
# Show the DataLoader object (only its default repr is displayed).
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7aaba3edf040>

In [None]:
# Peek at a single collated batch to check its structure, then stop iterating.
for batch in dataloader:
    print(batch)
    break

{'items': tensor([[ 0,  0,  0,  ..., 19,  1, 33],
        [ 0,  0,  0,  ...,  1, 14, 10],
        [ 0,  0,  0,  ..., 14, 19, 33],
        ...,
        [ 0,  0,  0,  ...,  1,  6, 14],
        [ 0,  0,  0,  ...,  3,  6,  6],
        [ 0,  0,  0,  ...,  3,  1, 13]]), 'labels': tensor([1, 2, 2, 1, 4, 7, 5, 1, 2, 4, 2, 3, 3, 5, 1, 2, 4, 3, 3, 1, 3, 3, 1, 2,
        4, 1, 2, 2, 2, 2, 4, 4, 2, 2, 1, 4, 2, 1, 4, 1, 4, 4, 2, 2, 4, 1, 4, 1,
        3, 2, 3, 4, 2, 2, 3, 2, 4, 3, 1, 3, 2, 2, 2, 2, 2, 3, 2, 5, 3, 3, 2, 3,
        2, 4, 4, 3, 1, 3, 1, 3, 2, 3, 2, 4, 2, 4, 2, 3, 4, 2, 3, 3, 3, 2, 3, 1,
        1, 3, 5, 3, 2, 3, 2, 5, 3, 4, 4, 1, 2, 1, 2, 1, 2, 2, 4, 2, 4, 3, 4, 2,
        2, 2, 3, 4, 1, 2, 2, 2])}


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.8 MB/s[0m eta [36m0:00:0

In [None]:
from transformers import BertConfig, BertForSequenceClassification

In [None]:
class Model52(nn.Module):
    """Small BERT-style sequence classifier over character ids.

    The encoder is a ``BertForSequenceClassification`` built from a locally
    defined ``BertConfig``; its classification head is replaced by a linear
    layer with ``n_classes`` outputs.
    """

    def __init__(self,
                 hidden_dim=HIDDEN_SIZE,
                 n_classes=N_CLASSES):
        """
        Args:
            hidden_dim: Accepted but not read by this constructor; the encoder
                width is fixed at 128 below.
            n_classes: Number of logits produced by the classification head.
        """
        super().__init__()
        encoder_width = 128
        config_kwargs = dict(
            vocab_size=VOCAB_SIZE,
            hidden_size=encoder_width,
            num_hidden_layers=4,
            num_attention_heads=4,
            intermediate_size=128,
            hidden_act='gelu',
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=MAX_ITEMS,
            type_vocab_size=2,
            initializer_range=0.02,
            layer_norm_eps=1e-12,
            pad_token_id=0,
            position_embedding_type='absolute',
        )
        self.bert = BertForSequenceClassification(BertConfig(**config_kwargs))
        # Swap the stock head for one sized to this task's number of classes.
        self.bert.classifier = nn.Linear(encoder_width, n_classes)

    def forward(self, batch):
        """Return the class logits for ``batch['items']``.

        Positions holding id 0 are excluded through the mask passed as the
        second positional argument of the wrapped model.
        """
        token_ids = batch['items']
        non_padding = token_ids > 0
        return self.bert(token_ids, non_padding).logits

    def calculate_loss(self, batch):
        """Return the cross-entropy loss between the logits and ``batch['labels']``."""
        targets = batch["labels"]
        criterion = nn.CrossEntropyLoss()
        return criterion(self.forward(batch), targets)




In [None]:
# Smoke test: instantiate the model and compute the loss for the first batch only.
model52 = Model52()
for batch in dataloader:
    loss = model52.calculate_loss(batch)
    break

In [None]:
import torch
from tqdm.auto import tqdm
from sklearn.metrics import f1_score


def train_epoch(model, data_loader, loss_function, optimizer, device):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(batch)`` that returns class logits.
        data_loader: Sized iterable of dict batches whose values support
            ``.to(device)`` and which contain a ``'labels'`` entry.
        loss_function: Callable ``loss_function(logits, labels)`` returning a
            scalar tensor.
        optimizer: Optimizer that updates the model parameters.
        device: Device the model and every batch value are moved to.

    Returns:
        Dict with the mean per-batch loss (``"Train Loss"``) and the accuracy
        over all samples seen (``"Train Accuracy"``).
    """
    model.to(device)
    model.train()
    total_train_loss = 0.0
    preds = []
    targets = []

    for batch in tqdm(data_loader):
        for key in batch:
            batch[key] = batch[key].to(device)

        optimizer.zero_grad()
        logits = model(batch)
        loss = loss_function(logits, batch['labels'])

        loss.backward()
        # Clip the global gradient norm to 1.0 before applying the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_train_loss += loss.item()
        preds.append(logits.argmax(dim=1))
        targets.append(batch['labels'])

    preds = torch.cat(preds, dim=0)
    targets = torch.cat(targets, dim=0)
    acc = (targets == preds).sum() / preds.shape[0]

    return {
        "Train Loss": total_train_loss / len(data_loader),
        "Train Accuracy": acc.item(),
    }


def eval_epoch(model, data_loader, loss_function, device):
    """Evaluate ``model`` on ``data_loader`` without updating its parameters.

    Args:
        model: Module called as ``model(batch)`` that returns class logits.
        data_loader: Sized iterable of dict batches whose values support
            ``.to(device)`` and which contain a ``'labels'`` entry (the key
            emitted by ``train_collate_fn``).
        loss_function: Callable ``loss_function(logits, labels)`` returning a
            scalar tensor.
        device: Device the model and every batch value are moved to.

    Returns:
        Dict with the mean per-batch loss (``"Eval Loss"``) and the accuracy
        over all samples seen (``"Eval Accuracy"``).
    """
    model.to(device)
    model.eval()
    total_eval_loss = 0.0
    preds = []
    targets = []

    for batch in tqdm(data_loader):
        for key in batch:
            batch[key] = batch[key].to(device)

        # Neither the forward pass nor the loss needs gradients here.
        with torch.no_grad():
            logits = model(batch)
            # The batch key is 'labels'; the previous 'label' lookup raised KeyError.
            loss = loss_function(logits, batch['labels'])

        total_eval_loss += loss.item()
        preds.append(logits.argmax(dim=1))
        targets.append(batch['labels'])

    preds = torch.cat(preds, dim=0)
    targets = torch.cat(targets, dim=0)
    acc = (targets == preds).sum() / preds.shape[0]

    return {
        "Eval Loss": total_eval_loss / len(data_loader),
        "Eval Accuracy": acc.item(),
    }

In [None]:
import random
import numpy as np
def single_model(model,
                 dataset,
                 loss_function,
                 collate_fn,
                 device=torch.device("cuda"),
                 random_state: int=69,
                 shuffle=True,
                 epochs: int=8,
                 lr: float=1e-3,
                 batch_size: int=4096,
                 start_epoch=0,
                 ):
    """Seed the RNGs, build a DataLoader and train ``model`` for several epochs.

    Args:
        model: Model to train; it must expose a ``bert`` sub-module, whose
            parameters are the ones handed to the optimizer.
        dataset: Dataset wrapped in the training DataLoader.
        loss_function: Loss module; moved to ``device`` and passed to ``train_epoch``.
        collate_fn: Collate function used by the DataLoader.
        device: Device used for the model, the loss and the batches.
        random_state: Seed applied to ``random``, NumPy and PyTorch (CPU and CUDA).
        shuffle: Whether the DataLoader shuffles the dataset.
        epochs: Upper bound (exclusive) of the epoch index.
        lr: Learning rate of the AdamW optimizer.
        batch_size: DataLoader batch size.
        start_epoch: Epoch indices below this value are skipped.

    Returns:
        None. The metrics of every trained epoch are printed.
    """
    random.seed(random_state)
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)

    loss_function.to(device)
    model.to(device)
    # Honour the ``lr`` argument (it used to be shadowed by a hard-coded 1e-3).
    optimizer = torch.optim.AdamW(
        [
            {"params": model.bert.parameters(), "lr": lr},
        ]
    )

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn,
    )

    for epoch_i in range(0, epochs):
        if epoch_i < start_epoch:
            continue
        train_metrics = train_epoch(model, data_loader, loss_function, optimizer, device)
        print("EPOCH", epoch_i)
        print(train_metrics)

In [None]:
# NOTE(review): scratch value shaped like one (token_ids, label) sample; it does
# not appear to be referenced by any other visible cell — confirm before removing.
x = ([1, 1, 12, 1], 2)


In [None]:
# Train model52 on the training dataset, on the GPU when one is available and
# on the CPU otherwise; arguments not listed keep single_model's defaults.
single_model(
    model=model52,
    dataset=dataset,
    loss_function=nn.CrossEntropyLoss(),
    collate_fn=train_collate_fn,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

  0%|          | 0/144 [00:00<?, ?it/s]

EPOCH 0
{'Train Loss': 1.1908214046723313, 'Train Accuracy': 0.48407110571861267}


  0%|          | 0/144 [00:00<?, ?it/s]

EPOCH 1
{'Train Loss': 0.8916001787616147, 'Train Accuracy': 0.6230029463768005}


  0%|          | 0/144 [00:00<?, ?it/s]

EPOCH 2
{'Train Loss': 0.7915209014382627, 'Train Accuracy': 0.6741456985473633}


  0%|          | 0/144 [00:00<?, ?it/s]

EPOCH 3
{'Train Loss': 0.7131186135941081, 'Train Accuracy': 0.7122092247009277}


  0%|          | 0/144 [00:00<?, ?it/s]

EPOCH 4
{'Train Loss': 0.6319304630160332, 'Train Accuracy': 0.7493211627006531}


  0%|          | 0/144 [00:00<?, ?it/s]

EPOCH 5
{'Train Loss': 0.5908426915605863, 'Train Accuracy': 0.765056312084198}


  0%|          | 0/144 [00:00<?, ?it/s]

EPOCH 6
{'Train Loss': 0.567446656525135, 'Train Accuracy': 0.7736766934394836}


  0%|          | 0/144 [00:00<?, ?it/s]

EPOCH 7
{'Train Loss': 0.5456621717247698, 'Train Accuracy': 0.7837992310523987}


In [None]:

# Alphabet and sizes used by the inference section.
# NOTE(review): these repeat definitions that already appear earlier in the notebook.
chars = list('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')

# Token ids start at 1; id 0 is used for padding by the dataset below.
char2id = {letter: position for position, letter in enumerate(chars, start=1)}

MAX_ITEMS = 50
BATCH_SIZE = 128
VOCAB_SIZE = 1 + len(chars)
HIDDEN_SIZE = 64

def get_item_list(word):
    """Encode a word as a list of token ids, skipping characters absent from ``char2id``."""
    encoded = []
    for symbol in word:
        token_id = char2id.get(symbol)
        if token_id is not None:
            encoded.append(token_id)
    return encoded

class InferenceDataset(Dataset):
    """Dataset yielding left-padded token id lists for unlabeled words."""

    def __init__(self, words):
        # Encode every word once up front.
        self.item_list = [get_item_list(word) for word in words]

    def __len__(self):
        return len(self.item_list)

    def __getitem__(self, idx):
        """Return the token ids of item ``idx`` left-padded with zeros to MAX_ITEMS."""
        token_ids = self.item_list[idx]
        pad_count = MAX_ITEMS - len(token_ids)
        return [0] * pad_count + token_ids

def inference_collate_fn(x):
    """Turn a list of equally long token id lists into ``{'items': tensor}``."""
    return {'items': torch.tensor(x)}


In [None]:
# Load the unlabeled test words under their own name. The training `words`
# list is deliberately not rebound, so the earlier training cells stay re-runnable.
test_words = read_words_from_file("public_test_stresses.txt")
test_dataset = InferenceDataset(test_words)
# No shuffling: predictions must stay aligned with the order of test_words.
test_loader = DataLoader(test_dataset, batch_size=4096, collate_fn=inference_collate_fn)

In [None]:
# Run the trained model over the test set and keep the arg-max class per word.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model52.to(device)
model52.eval()

preds = []
# Inference only: no gradients are needed anywhere in this loop.
with torch.no_grad():
    for batch in tqdm(test_loader):
        for key in batch:
            batch[key] = batch[key].to(device)
        logits = model52(batch)
        preds.append(logits.argmax(dim=1))
# One 1-D tensor holding a predicted class index per test word, in loader order.
preds = torch.cat(preds, dim=0)

  0%|          | 0/72 [00:00<?, ?it/s]

In [None]:
# Display the predicted class indices.
preds

tensor([2, 2, 3,  ..., 4, 4, 3], device='cuda:0')

In [None]:
# First prediction converted to a plain Python number.
preds[0].item()

2

In [None]:
# Smallest predicted class index (a value of 0 would make
# insert_carot_after_vowel leave the word without a stress mark).
preds.min()

tensor(1, device='cuda:0')

In [None]:
def insert_carot_after_vowel(w, k):
    """Return ``w`` with a '^' inserted right after its ``k``-th vowel.

    Vowels are matched case-insensitively and counted from 1. When ``w`` has
    fewer than ``k`` vowels, or ``k`` is smaller than 1, the word is returned
    unchanged.
    """
    vowel_letters = frozenset('аеёиоуыэюя')
    pieces = []
    vowels_seen = 0

    for letter in w:
        pieces.append(letter)
        if letter.lower() not in vowel_letters:
            continue
        vowels_seen += 1
        if vowels_seen == k:
            pieces.append("^")

    return "".join(pieces)

In [None]:
# Convert the predictions to plain Python ints once instead of calling
# .item() on the tensor for every single word.
stress_indices = preds.tolist()
assert len(stress_indices) == len(test_words), "expected one prediction per test word"

# Passing total= lets tqdm render a real progress bar for the zip iterator.
sub = [
    insert_carot_after_vowel(word, stress_idx)
    for word, stress_idx in tqdm(zip(test_words, stress_indices), total=len(test_words))
]

0it [00:00, ?it/s]

In [None]:
# Show one test word next to its stress-annotated prediction, one per line.
print(test_words[2], sub[2], sep="\n")

ааленец
аале^нец


In [None]:
def write_to_file(sub, path, encoding="utf-8"):
    """Write every string of ``sub`` to ``path``, one per line.

    Args:
        sub: Iterable of strings to write.
        path: Destination file; it is created or overwritten.
        encoding: Text encoding of the output file. Defaults to UTF-8 so that
            Cyrillic words are written identically on every platform instead
            of depending on the locale's default encoding.
    """
    with open(path, "w", encoding=encoding) as file:
        file.writelines(word + "\n" for word in sub)

In [None]:
# NOTE(review): absolute path under /content — presumably the Colab working
# directory; adjust when running elsewhere.
write_to_file(sub, "/content/sample_submission.txt")

In [None]:
# Mount Google Drive at /content/drive (needs the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
