In [3]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.nn import functional as F

from torch.utils.data import TensorDataset, DataLoader

from tqdm.auto import trange

In [4]:
from navec import Navec

path = '../../static/embeddings/navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)

In [5]:
train_data = pd.read_csv('../../static/datasets/original/validation_data_labeled.csv', sep='\t')
sentences = train_data.sentence
emb = np.zeros((len(sentences), 30000))
'''
for i in range(len(sentences)):
    sentence = sentences[i].split(" ")
    for j in range(len(sentence)):
        if sentence[j] in navec:
            emb[i][j] = navec[sentence[j]]
'''
a = [0]
for i in range(len(sentences)):
    sentence = sentences[i].split(" ")
    for j in range(len(sentence)):
        a.append(sentence[j])
        emb[i][j] = len(a)

In [6]:
X = emb
y = pd.DataFrame(train_data['label'])
y.replace(1, 2, inplace=True)
y.replace(0, 1, inplace=True)
y.replace(-1, 0, inplace=True)
y = np.array(y)

In [7]:
X.shape

(2845, 30000)

In [8]:
y.shape

(2845, 1)

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

In [10]:
X_train

array([[28075., 28076., 28077., ...,     0.,     0.,     0.],
       [36073., 36074., 36075., ...,     0.,     0.,     0.],
       [ 3342.,  3343.,  3344., ...,     0.,     0.,     0.],
       ...,
       [22827., 22828., 22829., ...,     0.,     0.,     0.],
       [26163., 26164., 26165., ...,     0.,     0.,     0.],
       [17519., 17520., 17521., ...,     0.,     0.,     0.]])

In [11]:
xt = np.zeros((len(X_train), 30000, 300))
for i in range(len(X_train)):
    for j in range(len(X_train[i])):
        if X_train[i][j] != 0:
            if a[int(X_train[i][j])] in navec:
                xt[i][j] = navec[a[int(X_train[i][j])]]
X_train = xt

In [12]:
yt = np.zeros((len(y_train), 30000, 300))
for i in range(len(y_train)):
    for j in range(len(y_train[i])):
        if y_train[i][j] != 0:
            if a[int(y_train[i][j])] in navec:
                yt[i][j] = navec[a[int(y_train[i][j])]]
y_train = yt

In [13]:
def df_to_tensor(df):
    return torch.from_numpy(df.values).float()

In [14]:
X_train_t = torch.from_numpy(X_train)
y_train_t = torch.from_numpy(y_train)
X_val_t = torch.from_numpy(X_val)
y_val_t = torch.from_numpy(y_val)


In [15]:
X_train_t.shape

torch.Size([2133, 30000, 300])

In [16]:
train_dataset = TensorDataset(X_train_t, y_train_t)
val_dataset = TensorDataset(X_val_t, y_val_t)
train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=512, shuffle=True)


In [17]:
def fit(epochs, model_, loss_func, opt, train_dl, valid_dl, lr_sched=None):
    model_ = model_.to(device)

    train_losses = []
    val_losses = []

    valid_accuracies = []

    pbar = trange(epochs, desc="Epoch:")
    for epoch in pbar:

        model_.train()
        loss_sum = 0

        for xb, yb in train_dl:
            xb, yb = xb.to(device), yb.to(device)
            probs = model_(xb)

            loss = loss_func(probs, yb)
            loss_sum += loss.item()
            loss.backward()

            opt.step()
            opt.zero_grad()

        train_losses.append(loss_sum / len(train_dl))


        model_.eval()
        loss_sum = 0
        correct = 0
        num = 0

        with torch.no_grad():
            for xb, yb in valid_dl:
                xb, yb = xb.to(device), yb.to(device)
                probs = model_(xb)

                loss = loss_func(probs, yb)
                loss_sum += loss.item()

                _, pred = torch.max(probs, axis=-1)
                correct += (pred == yb).sum().item()
                num += len(xb)


        val_loss = loss_sum / len(valid_dl)
        val_acc = correct / num
        val_losses.append(val_loss)
        valid_accuracies.append(val_acc)

        pbar.set_description('Epoch: {} Loss: {:.4f} Acc: {:.4f}'.format(epoch, val_loss, val_acc))

        #torch.save(model.state_dict(), PATH)   #сохранение весов модели на будущее

        if lr_sched is not None:
            lr_sched.step()

    return train_losses, val_losses, valid_accuracies

Load Model:
    model = TheModelClass(*args, **kwargs)
    model.load_state_dict(torch.load(PATH))
    model.eval()

In [18]:
class MultiLayerPerceptron(nn.Module):

    def __init__(self):
        super(MultiLayerPerceptron, self).__init__()
        # Embedding size, Hyperparameter, Output
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=5) #96
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2) #48
        self.conv2 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=5) #44
        #22
        self.conv3 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=5) #18
        #9
        self.conv4 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=2) #8
        #4
        self.conv5 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=3) #2
        #1
        self.fc1 = nn.Linear(300, 4096)
        self.fc2 = nn.Linear(4096, 3)

    def forward(self, x):
        print(x.shape)
        x = self.conv1(x)
        x = F.leaky_relu(x)
        print(x.shape)
        x = self.pool(x)
        x = self.pool(F.leaky_relu(self.conv2(x)))
        x = self.pool(F.leaky_relu(self.conv3(x)))
        x = self.pool(F.leaky_relu(self.conv4(x)))
        x = self.pool(F.leaky_relu(self.conv5(x)))
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        return x

In [19]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
model = MultiLayerPerceptron().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.9)

info = fit(64, model, criterion, optimizer, train_dataloader, val_dataloader, scheduler)

Epoch::   0%|          | 0/64 [00:00<?, ?it/s]

In [None]:
train_losses = info[0]
plt.figure(figsize=(12, 8))
plt.plot(range(len(train_losses)), train_losses)
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.show()

In [None]:
import sys
sys.path.append('../..')

from src.metrics import multiclass_classification as multi_metrics

In [None]:
model.eval()
xb, yb = next(iter(val_dataloader))
probs = model(xb)
_, pred = torch.max(probs, axis=-1)

In [None]:
multi_metrics.f1_score(y_true=yb, y_pred=pred, average='macro')