In [1]:
import pathlib
import sys

root = pathlib.Path().cwd()
while not root.joinpath('.projectroot').exists():
    root = root.parent

sys.path.append(root.as_posix())

In [2]:
import numpy as np
import pandas as pd
from functools import partial
from tqdm import tqdm

import torch
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizerFast

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from hw5.common import TorchTrainable

In [3]:
# load pretrained model
tokenizer = BertTokenizerFast.from_pretrained('blanchefort/rubert-base-cased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('blanchefort/rubert-base-cased-sentiment', return_dict=True)

# load & preprocess
data = pd.read_excel('data/summer.xls')
data = data[~data['Content'].isna()]
data.head()

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


In [4]:
# prepare
features = data['Content'].astype(str).apply(partial(tokenizer, max_length=512, padding=True, truncation=True, return_tensors='pt'))

def markup(x):
    return 1 if x > 3 else 2 if x < 3 else 0
rating = data['Rating'].apply(markup).values

# split
# X_train, X_valid, y_train, y_valid = train_test_split(features, rating, stratify=rating, random_state=17)
train_index, valid_index = train_test_split(data.index, stratify=rating, random_state=17)
train = data.index.isin(train_index)
valid = data.index.isin(valid_index)

In [5]:
class SummerDataset(torch.utils.data.Dataset):
    def __init__(self, features, rating):
        self.features = features
        self.rating = np.eye(3)[rating]

    def __getitem__(self, index):
        # TODO concat with token_type_ids?
        return self.features[index]['input_ids'], torch.as_tensor(self.rating[index].reshape(1, -1))
    
    def __len__(self):
        return len(self.features)

train_dataset = SummerDataset(features[train].values, rating[train])
valid_dataset = SummerDataset(features[valid].values, rating[valid])

In [81]:
model.bert.embeddings

BertEmbeddings(
  (word_embeddings): Embedding(119547, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [6]:
class Net(torch.nn.Module, TorchTrainable):
    def __init__(self, mod=2, drop=0.1):
        super().__init__()
        self.embedding = model.bert.embeddings
        # self.embedding = model.bert.embeddings.word_embeddings
        self.conv = torch.nn.Conv1d(768, 768 * mod, kernel_size=2)

        self.linear = torch.nn.Linear(768 * mod, 3)
        self.dp = torch.nn.Dropout(drop)
        
    def forward(self, x):        
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = self.conv(x)
        x = torch.relu(x)
        x = torch.max_pool1d(x, 2)
        x = self.dp(x)
        x = x.permute(0, 2, 1)
        x = self.linear(x)
        x = torch.softmax(x, dim=1)
        return torch.mean(x, dim=1)

In [103]:
# обученные эмбеддинги + необученная сетка
net = Net().to('cuda')
setattr(net, 'trained', True)
setattr(net, 'dev', 'cuda')
predicts = net.predict(valid_dataset)

report = classification_report(rating[valid], predicts.argmax(axis=1))
print(report)

              precision    recall  f1-score   support

           0       0.04      0.67      0.07       228
           1       0.66      0.11      0.18      4180
           2       0.32      0.14      0.19       756

    accuracy                           0.14      5164
   macro avg       0.34      0.30      0.15      5164
weighted avg       0.59      0.14      0.18      5164



In [8]:
net = Net()
net.to('cuda')
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

net.fit(train_dataset, optimizer, criterion, epochs=3, device='cuda')

Epoch 1/3: 100%|██████████| 15492/15492 [12:54<00:00, 19.99it/s, cumulative loss per item=1.1]
Epoch 2/3: 100%|██████████| 15492/15492 [12:36<00:00, 20.47it/s, cumulative loss per item=1.1]
Epoch 3/3: 100%|██████████| 15492/15492 [12:46<00:00, 20.22it/s, cumulative loss per item=1.1]


Done.





In [9]:
# обученные эмбеддинги + обученная сетка
predicts = net.predict(valid_dataset)

report = classification_report(rating[valid], predicts.argmax(axis=1))
print(report)

              precision    recall  f1-score   support

           0       0.03      0.61      0.06       228
           1       0.70      0.12      0.20      4180
           2       0.31      0.13      0.19       756

    accuracy                           0.14      5164
   macro avg       0.35      0.29      0.15      5164
weighted avg       0.61      0.14      0.19      5164



По 1-му классу есть прирост, но не сказал бы, что стало принципиально лучше. Направления работы:
- очистка датасета (тут я его не чистил совсем)
- усложнение архитектуры сети

С tensorflow у меня по-прежнему нелюбовь

In [None]:
#