In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.nn import functional as F

from torch.utils.data import TensorDataset, DataLoader

from tqdm.auto import trange

In [2]:
from navec import Navec

path = '../../static/embeddings/navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)

In [3]:
train_data = pd.read_csv('../../static/datasets/original/validation_data_labeled.csv', sep='\t')
sentences = train_data.sentence
emb = np.zeros((len(sentences), 30000))
'''
for i in range(len(sentences)):
    sentence = sentences[i].split(" ")
    for j in range(len(sentence)):
        if sentence[j] in navec:
            emb[i][j] = navec[sentence[j]]
'''
a = [0]
for i in range(len(sentences)):
    sentence = sentences[i].split(" ")
    for j in range(len(sentence)):
        a.append(sentence[j])
        emb[i][j] = len(a) - 1

In [4]:
X = emb
y = pd.DataFrame(train_data['label'])
y.replace(1, 2, inplace=True)
y.replace(0, 1, inplace=True)
y.replace(-1, 0, inplace=True)
y = np.array(y)

In [5]:
X.shape

(2845, 30000)

In [6]:
y.shape

(2845, 1)

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

In [8]:
X_train

array([[28074., 28075., 28076., ...,     0.,     0.,     0.],
       [36072., 36073., 36074., ...,     0.,     0.,     0.],
       [ 3341.,  3342.,  3343., ...,     0.,     0.,     0.],
       ...,
       [22826., 22827., 22828., ...,     0.,     0.,     0.],
       [26162., 26163., 26164., ...,     0.,     0.,     0.],
       [17518., 17519., 17520., ...,     0.,     0.,     0.]])

In [9]:
X_val

array([[ 8481.,  8482.,  8483., ...,     0.,     0.,     0.],
       [45223., 45224., 45225., ...,     0.,     0.,     0.],
       [23190., 23191., 23192., ...,     0.,     0.,     0.],
       ...,
       [31963., 31964., 31965., ...,     0.,     0.,     0.],
       [ 8905.,  8906.,  8907., ...,     0.,     0.,     0.],
       [40331., 40332., 40333., ...,     0.,     0.,     0.]])

In [10]:
xt = np.zeros((len(X_train), 30000, 300))
for i in range(len(X_train)):
    for j in range(len(X_train[i])):
        if X_train[i][j] != 0:
            if a[int(X_train[i][j])] in navec:
                xt[i][j] = navec[a[int(X_train[i][j])]]
X_train = xt

In [11]:
xt = np.zeros((len(X_val), 30000, 300))
for i in range(len(X_val)):
    for j in range(len(X_val[i])):
        if X_val[i][j] != 0:
            if len(a) <= int(X_val[i][j]):
                print(int(X_val[i][j]))
            if a[int(X_val[i][j])] in navec:
                xt[i][j] = navec[a[int(X_val[i][j])]]
X_val = xt

In [12]:
def df_to_tensor(df):
    return torch.from_numpy(df.values).float()

In [13]:
X_train_t = torch.from_numpy(X_train)
y_train_t = torch.from_numpy(y_train)
X_val_t = torch.from_numpy(X_val)
y_val_t = torch.from_numpy(y_val)
X_train_t = torch.permute(X_train_t, (0, 2, 1))
X_val_t = torch.permute(X_val_t, (0, 2, 1))

In [14]:
X_val_t

tensor([[[ 0.0000,  0.3146,  0.0482,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000, -0.1777, -0.2004,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.2426,  0.0586,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000, -0.2754,  0.2750,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0252,  0.2647,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.3528,  0.4108,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0000,  0.2733,  0.0574,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000, -0.0049, -0.6995,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000, -0.0492,  0.5508,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0000, -0.5250, -0.4420,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.1350, -0.1768,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.2226,  0.5108,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0000, -0.4161, -0.5378,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000, -0.8234, -0.6289,  ...,  0

In [15]:
y_train_t

tensor([[1],
        [1],
        [1],
        ...,
        [1],
        [1],
        [1]])

In [16]:
print(X_train_t.shape)
print(X_val_t.shape)
print(y_train_t.shape)
print(y_val_t.shape)

torch.Size([2133, 300, 30000])
torch.Size([712, 300, 30000])
torch.Size([2133, 1])
torch.Size([712, 1])


In [17]:
train_dataset = TensorDataset(X_train_t, y_train_t)
val_dataset = TensorDataset(X_val_t, y_val_t)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=True)


In [18]:
def fit(epochs, model_, loss_func, opt, train_dl, valid_dl, lr_sched=None):
    print(1)
    model_ = model_.to(device)
    print(2)
    train_losses = []
    val_losses = []
    print(3)
    valid_accuracies = []
    print(4)
    pbar = trange(epochs, desc="Epoch:")
    print(5)
    for epoch in pbar:
        print(6)
        model_.train()
        print(7)
        loss_sum = 0

        for xb, yb in train_dl:
            print(8)
            xb, yb = xb.to(device), yb.to(device)
            print(9)
            probs = model_(xb)
            print(10)
            loss = loss_func(probs, yb)
            loss_sum += loss.item()
            loss.backward()

            opt.step()
            opt.zero_grad()

        train_losses.append(loss_sum / len(train_dl))


        model_.eval()
        loss_sum = 0
        correct = 0
        num = 0

        with torch.no_grad():
            for xb, yb in valid_dl:
                xb, yb = xb.to(device), yb.to(device)
                probs = model_(xb)

                loss = loss_func(probs, yb)
                loss_sum += loss.item()

                _, pred = torch.max(probs, axis=-1)
                correct += (pred == yb).sum().item()
                num += len(xb)


        val_loss = loss_sum / len(valid_dl)
        val_acc = correct / num
        val_losses.append(val_loss)
        valid_accuracies.append(val_acc)

        pbar.set_description('Epoch: {} Loss: {:.4f} Acc: {:.4f}'.format(epoch, val_loss, val_acc))

        #torch.save(model.state_dict(), PATH)   #сохранение весов модели на будущее

        if lr_sched is not None:
            lr_sched.step()

    return train_losses, val_losses, valid_accuracies

Load Model:
    model = TheModelClass(*args, **kwargs)
    model.load_state_dict(torch.load(PATH))
    model.eval()

In [19]:
class MultiLayerPerceptron(nn.Module):

    def __init__(self):
        super(MultiLayerPerceptron, self).__init__()
        # Embedding size, Hyperparameter, Output
        self.conv1 = nn.Conv1d(in_channels=300, out_channels=300, kernel_size=5) #96
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2) #48
        self.conv2 = nn.Conv1d(in_channels=300, out_channels=300, kernel_size=5) #44
        #22
        self.conv3 = nn.Conv1d(in_channels=300, out_channels=300, kernel_size=5) #18
        #9
        self.conv4 = nn.Conv1d(in_channels=300, out_channels=300, kernel_size=2) #8
        #4
        self.conv5 = nn.Conv1d(in_channels=300, out_channels=300, kernel_size=3) #2
        #1
        self.fc1 = nn.Linear(300, 4096)
        self.fc2 = nn.Linear(4096, 3)

    def forward(self, x):
        print(x.shape)
        x = self.conv1(x)
        x = F.leaky_relu(x)
        print(x.shape)
        x = self.pool(x)
        x = self.pool(F.leaky_relu(self.conv2(x)))
        x = self.pool(F.leaky_relu(self.conv3(x)))
        x = self.pool(F.leaky_relu(self.conv4(x)))
        x = self.pool(F.leaky_relu(self.conv5(x)))
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        return x

In [20]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
model = MultiLayerPerceptron().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.9)

info = fit(64, model, criterion, optimizer, train_dataloader, val_dataloader, scheduler)

1
2
3
4


Epoch::   0%|          | 0/64 [00:00<?, ?it/s]

5
6
7
8
9
torch.Size([32, 300, 30000])


[W NNPACK.cpp:51] Could not initialize NNPACK! Reason: Unsupported hardware.


In [None]:
train_losses = info[0]
plt.figure(figsize=(12, 8))
plt.plot(range(len(train_losses)), train_losses)
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.show()

In [None]:
import sys
sys.path.append('../..')

from src.metrics import multiclass_classification as multi_metrics

In [None]:
model.eval()
xb, yb = next(iter(val_dataloader))
probs = model(xb)
_, pred = torch.max(probs, axis=-1)

In [None]:
multi_metrics.f1_score(y_true=yb, y_pred=pred, average='macro')