# Домашнее задание 3. Обработка текстов.

## О задании

В данном домашнем задании вам предстоит предсказывать пользовательскую оценку отеля по тексту отзыва. Нужно обучиться на данных с кэггла и заслать в [соревнование](https://www.kaggle.com/t/3e8fa6cec6d048bf8e93fb72e441d88c) предикт. По той же ссылке можете скачать данные.

Мы собрали для вас отзывы по 1500 отелям из совершенно разных уголков мира. Что это за отели - секрет. Вам дан текст отзыва и пользовательская оценка отеля. Ваша задача - научиться предсказывать оценку отеля по отзыву.

Главная метрика - Mean Absolute Error (MAE). Во всех частях домашней работы вам нужно получить значение MAE не превышающее 0.92 на публичном лидерборде

#### Использовать любые данные для обучения кроме предоставленных организаторами строго запрещено. В последней части можно использовать предобученные модели из библиотеки `transformers`.

In [2]:
import pandas as pd
import torch

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
PATH_TO_TRAIN_DATA = '/content/drive/MyDrive/data/train.csv'
df = pd.read_csv(PATH_TO_TRAIN_DATA)

In [7]:
df.head()

Unnamed: 0,review_id,negative,positive,score
0,00003c6036f30f590c0ac435efb8739b,There were issues with the wifi connection,No Positive,7.1
1,00004d18f186bf2489590dc415876f73,TV not working,No Positive,7.5
2,0000cf900cbb8667fad33a717e9b1cf4,More pillows,Beautiful room Great location Lovely staff,10.0
3,0000df16edf19e7ad9dd8c5cd6f6925e,Very business,Location,5.4
4,00025e1aa3ac32edb496db49e76bbd00,Rooms could do with a bit of a refurbishment ...,Nice breakfast handy for Victoria train stati...,6.7


In [None]:
df = df.drop('review_id', axis=1)

In [None]:
df.head()

Unnamed: 0,negative,positive,score
0,There were issues with the wifi connection,No Positive,7.1
1,TV not working,No Positive,7.5
2,More pillows,Beautiful room Great location Lovely staff,10.0
3,Very business,Location,5.4
4,Rooms could do with a bit of a refurbishment ...,Nice breakfast handy for Victoria train stati...,6.7


## Предобработка текста

Предобработка текста может сказываться на качестве вашей модели.
Сделаем небольшой препроцессинг текстов: удалим знаки препинания, приведем все слова к нижнему регистру .... Также мы добавили разбиение текстов на токены. Теперь каждая строка-ревью стала массивом токенов.

In [5]:
import string
import nltk
import re
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

In [6]:
nltk.download('punkt') # токенизатор предложений
nltk.download('stopwords') # Стоп-слова
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
lemmatizer = WordNetLemmatizer()
def process_text(text):
    if text.lower() == 'no positive' or text.lower() == 'no negative' or text == ' ' or text == '':
        text = 'empty'
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text.lower())
    text = [word for word in word_tokens if (word not in string.punctuation)]
    text = [lemmatizer.lemmatize(word) for word in text]
    text = [word for word in text if word not in stop_words]
    return text

In [None]:
df['negative'] = df['negative'].apply(process_text)
df['positive'] = df['positive'].apply(process_text)

In [None]:
# Разметка токенов, так как могут быть как в позитиве, так и в негативе находится
df['negative'] = df['negative'].apply(lambda text: [word + '-' for word in text])
df['positive'] = df['positive'].apply(lambda text: [word + '+' for word in text])

In [None]:
df.head()

Unnamed: 0,negative,positive,score
0,"[issue-, wifi-, connection-]",[empty+],7.1
1,"[tv-, working-]",[empty+],7.5
2,[pillow-],"[beautiful+, room+, great+, location+, lovely+...",10.0
3,[business-],[location+],5.4
4,"[room-, could-, bit-, refurbishment-, could-, ...","[nice+, breakfast+, handy+, victoria+, train+,...",6.7


### Часть 1. 1 балл

Обучите логистическую или линейную регрессию на TF-IDF векторах текстов.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from scipy import sparse

In [None]:
def Tokens2String(tokens):
    return re.sub(r'\d+', '', ' '.join(tokens))

In [None]:
def get_data(DataFrame):
    
    df_train, df_test = train_test_split(DataFrame, random_state=1412)
    
    X_train = df_train['negative'].apply(Tokens2String) + ' ' + df_train['positive'].apply(Tokens2String)
    X_test = df_test['negative'].apply(Tokens2String) + ' ' + df_test['positive'].apply(Tokens2String)
    
    y_train = df_train['score']
    y_test = df_test['score']

    return (X_train, X_test, y_train, y_test)

In [None]:
X_train, X_test, y_train, y_test = get_data(df)

In [None]:
X_train[111], y_train[111]

('nothing- everything+', 10.0)

In [None]:
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1,2))

train_tf_idf = tfidf.fit_transform(X_train)
test_tf_idf = tfidf.transform(X_test)

In [None]:
train_tf_idf.shape, test_tf_idf.shape

((75000, 728), (25000, 728))

In [None]:
linreg = LinearRegression(n_jobs=-1)
linreg.fit(train_tf_idf, y_train)
y_test_predict = linreg.predict(test_tf_idf)
print(f'LinearRegression MAE: {mean_absolute_error(y_test, y_test_predict):.4f}')

LinearRegression MAE: 0.9109


In [None]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(train_tf_idf, y_train.mul(10).astype(int))
y_test_predict = logreg.predict_proba(test_tf_idf)
y_test_predict = (y_test_predict * logreg.classes_).sum(axis=1) / 10
print(f'LogisticRegression MAE: {mean_absolute_error(y_test, y_test_predict):.4f}')

LogisticRegression MAE: 0.8923


Предскажите этой моделью тестовые данные из [соревнования](https://www.kaggle.com/t/3e8fa6cec6d048bf8e93fb72e441d88c) и сделайте сабмит. Какой у вас получился скор? Прикрепите скриншот из кэггла.

In [None]:
TestDF = pd.read_csv("data/test.csv")
TestDF.head()

Unnamed: 0,review_id,negative,positive
0,00026f564b258ad5159aab07c357c4ca,Other than the location everything else was h...,Just the location
1,000278c73da08f4fcb857fcfe4ac6417,No UK TV but this was a minor point as we wer...,Great location very comfortable clean breakfa...
2,000404f843e756fe3b2a477dbefa5bd4,A tiny noisy room VERY deceptively photographed,The breakfast booked the preceding night but ...
3,000a66d32bcf305148d789ac156dd512,Noisy various electrical devices kicking in r...,Great location Nice bathroom
4,000bf1d8c5110701f459ffbedbf0d546,No Negative,Great location and friendly staff


In [None]:
TestDF['negative'] = TestDF['negative'].apply(process_text)
TestDF['positive'] = TestDF['positive'].apply(process_text)

In [None]:
TestDF['negative'] = TestDF['negative'].apply(lambda text: [word + '-' for word in text])
TestDF['positive'] = TestDF['positive'].apply(lambda text: [word + '+' for word in text])

In [None]:
TestSF = TestDF['negative'].apply(Tokens2String) + ' ' + TestDF['positive'].apply(Tokens2String)

In [None]:
Test_tf_idf = tfidf.transform(TestSF)

In [None]:
y_test_predict_linreg = linreg.predict(Test_tf_idf)
ResTestDFLin = TestDF["review_id"].to_frame()
ResTestDFLin["score"] = y_test_predict_linreg.round(1)
ResTestDFLin.to_csv("data/submit_linreg.csv", index=False)
ResTestDFLin

Unnamed: 0,review_id,score
0,00026f564b258ad5159aab07c357c4ca,5.5
1,000278c73da08f4fcb857fcfe4ac6417,9.2
2,000404f843e756fe3b2a477dbefa5bd4,7.4
3,000a66d32bcf305148d789ac156dd512,6.8
4,000bf1d8c5110701f459ffbedbf0d546,9.6
...,...,...
19995,ffe8a7190aee6e3a53ee2e0145a91555,6.6
19996,ffea0e2b84788c9df755efe8e2bedb23,9.2
19997,fff3997a85a1eed7ae7a937bc945fcf0,9.5
19998,fff673fe95ab8f3a0910f112549862e2,7.9


In [None]:
y_test_predict_logreg = logreg.predict_proba(Test_tf_idf)
y_test_predict_logreg = (y_test_predict_logreg * logreg.classes_).sum(axis=1) / 10
ResTestDFLog = TestDF["review_id"].to_frame()
ResTestDFLog["score"] = y_test_predict_logreg.round(1)
ResTestDFLog.to_csv("data/submit_logreg.csv", index=False)
ResTestDFLog

Unnamed: 0,review_id,score
0,00026f564b258ad5159aab07c357c4ca,5.7
1,000278c73da08f4fcb857fcfe4ac6417,9.2
2,000404f843e756fe3b2a477dbefa5bd4,8.3
3,000a66d32bcf305148d789ac156dd512,6.8
4,000bf1d8c5110701f459ffbedbf0d546,9.4
...,...,...
19995,ffe8a7190aee6e3a53ee2e0145a91555,6.4
19996,ffea0e2b84788c9df755efe8e2bedb23,9.1
19997,fff3997a85a1eed7ae7a937bc945fcf0,9.3
19998,fff673fe95ab8f3a0910f112549862e2,7.9


### Kaggle Leaderboard

![image.png](attachment:image.png)

![image.png](attachment:image.png)

### Часть 2. 2 балла

Обучите логистическую или линейную регрессию на усредненных Word2Vec векторах. 

Усредняя w2v вектора, мы предполагаем, что каждое слово имеет равноценный вклад в смысл предложения, однако это может быть не совсем так. Теперь попробуйте воспользоваться другой концепцией и перевзвесить слова при получении итогового эмбеддинга текста. В качестве весов используйте IDF (Inverse document frequency)

In [None]:
def calc_idf(texts):
    pass

Проведите эксперименты с размерностью эмбеддинга. Для каждого из двух методов постройте график зависимости качества модели от размерности эмбеддинга. 

#### Сделайте выводы:

Теперь попробуйте обучить логистическую или линейную регрессию на любых других эмбеддингах размерности 300 и сравните качество с Word2Vec.
#### Выводы:
`<ВАШ ТЕКСТ ЗДЕСЬ>`

Предскажите вашей лучшей моделью из этого задания тестовые данные из [соревнования](https://www.kaggle.com/t/3e8fa6cec6d048bf8e93fb72e441d88c) и сделайте сабмит. Какой у вас получился скор? Прикрепите скриншот из кэггла.

### Часть 3. 4 балла

Теперь давайте воспользуемся более продвинутыми методами обработки текстовых данных, которые мы проходили в нашем курсе. Обучите RNN/Transformer для предсказания пользовательской оценки.

In [7]:
!pip install pytorch_transformers

Collecting pytorch_transformers
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
[?25l[K     |█▉                              | 10 kB 20.1 MB/s eta 0:00:01[K     |███▊                            | 20 kB 10.3 MB/s eta 0:00:01[K     |█████▋                          | 30 kB 8.8 MB/s eta 0:00:01[K     |███████▍                        | 40 kB 8.1 MB/s eta 0:00:01[K     |█████████▎                      | 51 kB 5.1 MB/s eta 0:00:01[K     |███████████▏                    | 61 kB 5.6 MB/s eta 0:00:01[K     |█████████████                   | 71 kB 5.6 MB/s eta 0:00:01[K     |██████████████▉                 | 81 kB 6.2 MB/s eta 0:00:01[K     |████████████████▊               | 92 kB 4.9 MB/s eta 0:00:01[K     |██████████████████▋             | 102 kB 5.3 MB/s eta 0:00:01[K     |████████████████████▍           | 112 kB 5.3 MB/s eta 0:00:01[K     |██████████████████████▎         | 122 kB 5.3 MB/s eta 0:00:01[K     |████████████████████████▏       | 133 kB

In [8]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence

from pytorch_transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [9]:
df_train = pd.read_csv(PATH_TO_TRAIN_DATA)
df_train = df_train.drop('review_id', axis=1)
df_train['review'] = df_train['positive'].str.cat(df_train['negative'], sep =" ")

In [10]:
df_test = pd.read_csv('/content/drive/MyDrive/data/test.csv')
df_test['review'] = df_test['positive'].str.cat(df_test['negative'], sep =" ")
df_test.head()

Unnamed: 0,negative,positive,review
0,Other than the location everything else was h...,Just the location,Just the location Other than the location e...
1,No UK TV but this was a minor point as we wer...,Great location very comfortable clean breakfa...,Great location very comfortable clean breakfa...
2,A tiny noisy room VERY deceptively photographed,The breakfast booked the preceding night but ...,The breakfast booked the preceding night but ...
3,Noisy various electrical devices kicking in r...,Great location Nice bathroom,Great location Nice bathroom Noisy various e...
4,No Negative,Great location and friendly staff,Great location and friendly staff No Negative


In [11]:
set_score = list(set(df_train.score.tolist())) # множество уникальных оценок
dict_score = {set_score[idx]: idx for idx in range(len(set_score))}

In [12]:
df_train['class_score'] = df_train.apply(lambda score: dict_score[score['score']], axis=1)

In [13]:
df_train.head()

Unnamed: 0,negative,positive,score,review,class_score
0,There were issues with the wifi connection,No Positive,7.1,No Positive There were issues with the wifi c...,7
1,TV not working,No Positive,7.5,No Positive TV not working,6
2,More pillows,Beautiful room Great location Lovely staff,10.0,Beautiful room Great location Lovely staff ...,9
3,Very business,Location,5.4,Location Very business,4
4,Rooms could do with a bit of a refurbishment ...,Nice breakfast handy for Victoria train stati...,6.7,Nice breakfast handy for Victoria train stati...,5


In [14]:
def get_data(DataFrame):
    
    df_train, df_test = train_test_split(DataFrame, random_state=1412)
    
    X_train = [text for text in df_train['review']]
    X_test = [text for text in df_test['review']]
    
    y_train = [text for text in df_train['class_score'].tolist()]
    y_test = [text for text in df_test['class_score'].tolist()]

    return (X_train, X_test, y_train, y_test)

In [15]:
X_train, X_valid, y_train, y_valid = get_data(df_train)

In [16]:
X_test = [text for text in df_test['review']]

In [17]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', add_special_tokens=True)

100%|██████████| 898823/898823 [00:00<00:00, 6283563.30B/s]
100%|██████████| 456318/456318 [00:00<00:00, 3688413.90B/s]


In [18]:
MAX_LEN = 150

In [19]:
# токенизируем предложения отзывов токенизатором Роберта
train_tokens = [tokenizer.encode(x, add_special_tokens=True) for x in X_train]
# паддинг предложений (токенов) до одной длины (дозаполняем нулями)
train_tokens_pad = pad_sequence([torch.as_tensor(seq[:MAX_LEN]) for seq in train_tokens], batch_first=True)
# маска из нулей и единиц
train_masks = [[float(i > 0) for i in seq] for seq in train_tokens_pad]

Token indices sequence length is longer than the specified maximum sequence length for this model (521 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (586 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (658 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (562 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

In [20]:
len(train_tokens_pad[0]) == len(train_masks[0])

True

In [21]:
valid_tokens = [tokenizer.encode(x, add_special_tokens=True) for x in X_valid]
valid_tokens_pad = pad_sequence([torch.as_tensor(seq[:MAX_LEN]) for seq in valid_tokens], batch_first=True)
valid_masks = [[float(i > 0) for i in seq] for seq in valid_tokens_pad]

Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors


In [22]:
test_tokens = [tokenizer.encode(x, add_special_tokens=True) for x in X_test]
test_tokens_pad = pad_sequence([torch.as_tensor(seq[:MAX_LEN]) for seq in test_tokens], batch_first=True)
test_masks = [[float(i > 0) for i in seq] for seq in test_tokens_pad]

Token indices sequence length is longer than the specified maximum sequence length for this model (758 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (615 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (684 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (558 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (633 > 512). Running this sequence through the model will result in indexing errors


In [23]:
train_inputs = train_tokens_pad.clone().detach()
train_labels = torch.tensor(y_train)
train_masks = torch.tensor(train_masks)

valid_inputs = valid_tokens_pad.clone().detach()
valid_labels = torch.tensor(y_valid)
valid_masks = torch.tensor(valid_masks)

test_inputs = torch.tensor(test_tokens_pad)
test_masks = torch.tensor(test_masks)

  if __name__ == '__main__':


In [24]:
bs = 32

TrainData = TensorDataset(train_inputs, train_masks, train_labels)
TrainDataLoader = DataLoader(TrainData, batch_size=bs, shuffle=True)

ValidData = TensorDataset(valid_inputs, valid_masks, valid_labels)
ValidDataLoader = DataLoader(ValidData, batch_size=bs)

TestData = TensorDataset(test_inputs, test_masks)
TestDataLoader = DataLoader(TestData, batch_size=bs)

In [25]:
dict_score_class = {idx: set_score[idx] for idx in range(len(set_score))}
def MAE(predicted, actual):
    predicted = [dict_score_class[idx] for idx in predicted]
    actual = [dict_score_class[idx] for idx in actual]
    return mean_absolute_error(predicted, actual)

In [26]:
def train_one_epoch(model, train_dataloader, criterion, optimizer, device="cuda:0"):
    
    model.to(device).train()
    
    with tqdm(total=len(TrainDataLoader)) as pbar:
        for batch in TrainDataLoader:
            ids, mask, labels = batch
            ids = ids.to(device)
            mask = mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            output = model.forward(ids, token_type_ids=None, attention_mask=mask)[0]
            _, predicted = torch.max(output, 1)
            
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            _, predicted = torch.max(output.detach(), 1)
            accuracy_mae = MAE(predicted.cpu().detach().numpy(), labels.cpu().detach().numpy())
            
            pbar.set_description('CrossEntropyLoss: {:.4f}; MAE: {:.4f}'.format(loss.detach().item(), accuracy_mae))    
            pbar.update(1)
            
def predict(model, val_dataloader, criterion, device="cuda:0"):
    
    model.to(device).eval()
    
    losses = []
    predicted_classes = []
    true_classes = []
    
    with tqdm(total=len(ValidDataLoader)) as pbar:
        with torch.no_grad():
            for batch in ValidDataLoader:
                ids, mask, labels = batch
                ids = ids.to(device)
                mask = mask.to(device)
                labels = labels.to(device)
                
                
                output = model.forward(ids, token_type_ids=None, attention_mask=mask)[0]
                _, predicted = torch.max(output, 1)
            
                loss = criterion(output, labels)
                losses.append(loss.item())
                _, predicted = torch.max(output.detach(), 1)
                predicted_classes.append(predicted)
                true_classes.append(labels)
                
                
                accuracy_mae = MAE(predicted.cpu().detach().numpy(), labels.cpu().detach().numpy())
                
                pbar.set_description('CrossEntropyLoss: {:.4f}; MAE: {:.4f}'.format(loss.detach().item(), accuracy_mae))    
                pbar.update(1)
                
    predicted_classes = torch.cat(predicted_classes).detach().to('cpu').numpy()
    true_classes = torch.cat(true_classes).detach().to('cpu').numpy()
    
    return losses, predicted_classes, true_classes

def predict_without_labels(model, test_dataloader, device="cuda:0"):
    
    model.to(device).eval()
    predicted_classes = []
    step = 0
    
    with tqdm(total=len(TestDataLoader)) as pbar:
        with torch.no_grad():
            for batch in TestDataLoader:
                ids, mask = batch
                ids = ids.to(device)
                mask = mask.to(device)
                
                
                output = model(ids, token_type_ids=None, attention_mask=mask)
                predicted = output[0].detach().cpu().numpy()
                batch_predicted = np.argmax(predicted, axis=1)
                predicted_classes.extend(batch_predicted)
                
                pbar.set_description('Step: {:.4f}'.format(step))    
                pbar.update(1)
                
                step += 1
                
    return predicted_classes

def train(model, TrainDataLoader, ValidDataLoader, TestDataLoader, criterion, optimizer, device="cuda:0", n_epochs=2, scheduler=None):
    
    model.to(device)
    lrs = []
    
    for epoch in range(n_epochs):

        print(f'--------------------EPOCH №{epoch}--------------------')
        print('-----------------------TRAIN-----------------------')

        train_one_epoch(model, TrainDataLoader, criterion, optimizer, device)
        torch.save(model.state_dict(), 'model')
        
        print('-----------------------VALIDATION-----------------------')
        losses, predicted_classes, true_classes = predict(model, ValidDataLoader, criterion, device)
        print(f'MAE: {MAE(true_classes, predicted_classes)}')

        print('-----------------------TEST-----------------------')
        predicted_classes = predict_without_labels(model, TestDataLoader, device)
        
        df_kaggle = df_test["review_id"].to_frame()
        df_kaggle["score"] = [dict_score_class[item] for item in predicted_classes]
        df_kaggle.to_csv('/content/drive/MyDrive/data/submission.csv', index=False)
        print('SUBMISSION SAVED')
 
        lrs.append(optimizer.param_groups[0]['lr'])
        
        if scheduler != None:
          scheduler.step()

In [27]:
RobConfig = RobertaConfig.from_pretrained("roberta-base", output_hidden_states=True, num_labels=len(df_train.score.unique()))
model = RobertaForSequenceClassification.from_pretrained("roberta-base", config=RobConfig)

100%|██████████| 481/481 [00:00<00:00, 365283.40B/s]
100%|██████████| 501200538/501200538 [00:12<00:00, 41410790.29B/s]


In [28]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
n_epochs = 3

In [29]:
torch.cuda.empty_cache()

In [30]:
train(model, TrainDataLoader, ValidDataLoader, TestDataLoader, criterion, optimizer, device=device, n_epochs=n_epochs, scheduler=scheduler)

--------------------EPOCH №0--------------------
-----------------------TRAIN-----------------------


CrossEntropyLoss: 2.9253; MAE: 2.2167: 100%|██████████| 2344/2344 [1:21:31<00:00,  2.09s/it]


-----------------------VALIDATION-----------------------


CrossEntropyLoss: 2.6087; MAE: 1.8000: 100%|██████████| 782/782 [10:00<00:00,  1.30it/s]


MAE: 1.6084240000000003
-----------------------TEST-----------------------


Step: 624.0000: 100%|██████████| 625/625 [07:58<00:00,  1.31it/s]


KeyError: ignored

### Контест (до 3 баллов)

По итогам всех ваших экспериментов выберите модель, которую считаете лучшей. Сделайте сабмит в контест. В зависимости от вашего скора на публичном лидерборде, мы начислим вам баллы:

 - <0.76 - 3 балла
 - [0.76; 0.78) - 2 балла
 - [0.78; 0.8) - 1 балл