# AP-v9-5-Model training and testing

In [None]:
import re

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from pymystem3 import Mystem
from typing import List



Loading the dataset into a pandas.DataFrame

In [None]:
# Read the reviews CSV from a hard-coded absolute path.
df = pd.read_csv('/content/full_data_lemm.csv')
# Drop the first column of the file.
# NOTE(review): presumably an index column written by an earlier `to_csv` — confirm.
df = df.drop(df.columns[[0]], axis = 1)
# Subtract 1 from every value in the (new) first column.
# NOTE(review): presumably this shifts 1-based labels to 0-based class ids — confirm.
df.iloc[:,0]-=1
# Keep only the rows that have no missing values; later cells read `data`, not `df`.
data = df.dropna()

print(data)

In [None]:
# Upper bound on the vocabulary size handed to CountVectorizer.
max_words = 31887 # number of unique words in the dataset
cv = CountVectorizer(max_features=max_words)
# Fit on the 'text' column and convert the result to a dense array
# (despite its name, `sparse_matrix` is dense after `.toarray()`).
sparse_matrix = cv.fit_transform(data['text']).toarray()
feature_names = cv.get_feature_names_out()
# Vocabulary as a plain Python list; `my_vect` looks word positions up in it.
words = feature_names.tolist()
print(feature_names)
print(sparse_matrix)

Splitting the dataset into train, test and validation sets in the ratio 80/10/10

In [None]:
# First split: hold out 10% of all samples as the test set; labels come from the 'num' column.
# NOTE(review): no random_state is passed to either split, so the partitions differ between runs.
x_train, x_test, y_train, y_test = train_test_split(sparse_matrix, np.array(data['num']),test_size = 0.1, shuffle = True)
print(len(x_train))
# Second split: 0.1/0.9 of the remaining 90% (i.e. 10% of the full dataset) becomes the
# validation set, which leaves 80% of the full dataset for training.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.1/0.9, shuffle = True)
print(len(x_train))

Casting np.array to tensors and creating TensorDataset and DataLoader

In [None]:
# Hyperparameters used by the loaders, the optimizer and the training loop.
batch_size = 1000
epochs = 10
learning_rate = 0.001

# Convert the numpy splits to tensors. The targets are converted with torch.Tensor as
# well, which is why the training and validation cells call `.long()` on them before
# computing the loss.
train_features = torch.Tensor(x_train)
train_targets = torch.Tensor(y_train)
val_features = torch.Tensor(x_val)
val_targets = torch.Tensor(y_val)
test_features = torch.Tensor(x_test)
test_targets = torch.Tensor(y_test)

# Only train and test get a TensorDataset; the validation cell feeds
# `val_features` / `val_targets` to the model directly.
train = TensorDataset(train_features, train_targets)
test = TensorDataset(test_features, test_targets)

# Shuffled mini-batch loaders; `test_loader_one` yields one test sample per batch.
# NOTE(review): neither test loader is consumed in the cells visible here — confirm they are still needed.
train_loader = DataLoader(train, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test, batch_size=batch_size, shuffle=True)
test_loader_one = DataLoader(test, batch_size=1, shuffle=True)

Creating a LogisticRegression class

In [None]:
class LogisticRegression(nn.Module):
    """Feed-forward classifier: three linear layers with ReLU between them.

    Despite its name this is a small multi-layer perceptron rather than a
    plain logistic regression. ``forward`` returns the raw output of the last
    linear layer (no softmax is applied).

    The layer sizes are constructor arguments whose defaults reproduce the
    original fixed architecture, so ``LogisticRegression()`` builds the same
    network as before and the parameter names (``linear1``/``linear2``/
    ``linear3``) are unchanged.

    Args:
        in_features (int): length of each input vector. Defaults to 31887.
        hidden1 (int): width of the first hidden layer. Defaults to 1000.
        hidden2 (int): width of the second hidden layer. Defaults to 100.
        num_classes (int): number of output scores per sample. Defaults to 5.
    """

    def __init__(self, in_features: int = 31887, hidden1: int = 1000,
                 hidden2: int = 100, num_classes: int = 5):
        super().__init__()
        self.linear1 = nn.Linear(in_features, hidden1)
        self.linear2 = nn.Linear(hidden1, hidden2)
        self.linear3 = nn.Linear(hidden2, num_classes)

    def forward(self, x):
        """Map input vector(s) ``x`` to ``num_classes`` raw scores each.

        Args:
            x (torch.Tensor): tensor whose last dimension is ``in_features``

        Returns:
            torch.Tensor: unnormalized scores, last dimension ``num_classes``
        """
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        # No activation on the output layer: the caller gets raw scores.
        x = self.linear3(x)
        return x

In [None]:
# Instantiate the network defined above.
model = LogisticRegression()
# Loss applied to the model's raw outputs and the integer-cast targets in the
# training and validation cells.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters, using the learning rate from the hyperparameter cell.
optimizer = torch.optim.Adam(params=model.parameters() , lr=learning_rate)

Training loop

In [None]:

# Training loop: `epochs` passes over `train_loader`. The mean batch loss of
# every epoch is stored in `losses` (plotted in a later cell).
model.train()
losses = []
for epoch in range(epochs):
  loss_values = []
  # Running totals so the reported accuracy covers the whole epoch,
  # not just whichever batch happened to come last.
  correct = 0
  seen = 0
  for x_batch, y_batch in train_loader:
    optimizer.zero_grad()
    y_pred = model(x_batch)
    # Targets are stored as float tensors; the loss needs integer class ids.
    targets = y_batch.long()
    loss = criterion(y_pred, targets)
    loss_values.append(loss.item())
    # Predicted class = index of the largest score in each row.
    correct += (y_pred.argmax(dim=1) == targets).sum().item()
    seen += len(x_batch)
    loss.backward()
    optimizer.step()
  losses.append(np.array(loss_values).mean())
  # Accuracy is measured on the predictions made while the weights were still
  # being updated during the epoch; guard against an empty loader.
  acc = correct * 100.0 / seen if seen else float('nan')
  print('Epoch: {}, Loss: {}, Accuracy: {}%'.format(epoch+1, losses[-1], acc))

Plot of the training loss value vs. epochs

In [None]:
# Draw the per-epoch mean loss collected in `losses` on the current axes.
ax = plt.gca()
ax.plot(losses)
ax.set_title('Loss Value test vs Epochs')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend(['Loss'])
plt.show()


Checking the model on a validation set

In [None]:
# Evaluate the trained model on the validation split in a single forward pass.
model.eval()
with torch.no_grad():
    y_pred = model(val_features)
    # Targets are stored as floats; cast to integer class ids.
    val_labels = val_targets.long()
    # Validation loss is computed but not printed; it stays in `loss` for inspection.
    loss = criterion(y_pred, val_labels)
    # Number of rows whose highest-scoring class matches the target.
    pred = y_pred.argmax(dim=1).eq(val_labels).sum()
    # Divide by the size of the validation set itself (the original divided by
    # the test set size, which is only correct when both splits are equally large).
    print ("Accuracy : {}%".format(100*pred/len(val_features)))


Saving the model

In [None]:
# Persist only the learned parameters (the state_dict), not the module object itself.
# NOTE(review): the file name presumably encodes batch size 1000, lr 0.001 and 10 epochs — confirm.
torch.save(model.state_dict(), "model-b1000-lr0001-e10.pt")

Functions for building a feature vector from an arbitrary review text

In [None]:
def text_update(text: str) -> str:
    """Strip punctuation from ``text`` and collapse whitespace runs.

    Every character that is neither a word character nor whitespace is
    removed; the remaining words are then re-joined with single spaces, so
    leading/trailing whitespace and line breaks disappear as well.

    Args:
        text (str): text to clean

    Returns:
        str: cleaned text with words separated by single spaces
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text)
    # split() with no argument splits on any whitespace run and drops empty pieces.
    return " ".join(without_punctuation.split())

def preprocess_text(text: str) -> List[str]:
    """Lower-case and lemmatize ``text`` with Mystem, then drop Russian stopwords.

    Args:
        text (str): text to preprocess

    Returns:
        List[str]: the remaining lemmas, split on whitespace
    """
    lemmatizer = Mystem()
    stop_list = stopwords.words("russian")
    lemmas = lemmatizer.lemmatize(text.lower())
    kept = (lemma for lemma in lemmas if lemma not in stop_list)
    # Joining and re-splitting drops any empty or whitespace-only tokens.
    return " ".join(kept).split()

def my_vect(text:str):
  """Build a word-count vector for ``text`` over the notebook vocabulary ``words``.

  The text is cleaned with ``text_update`` and ``preprocess_text``; every
  resulting token that occurs in ``words`` adds 1 at that word's position.
  Tokens that are not in the vocabulary are ignored.

  Args:
      text (str): raw text to vectorize

  Returns:
      list: ``max_words`` integer counts, indexed like ``words``
  """
  # `max_words` is the vocabulary cap the vectorizer was built with.
  vector = max_words*[0]
  tokens = preprocess_text(text_update(text))
  # One lookup table instead of scanning the vocabulary list once per token.
  position = {word: idx for idx, word in enumerate(words)}
  for token in tokens:
    idx = position.get(token)
    if idx is not None:
      vector[idx] += 1
  return vector

Function for running the model on an arbitrary review text

In [None]:
def predict(text: str):
  """Run the global ``model`` on one piece of text and return its raw output.

  Args:
      text (str): raw text; it is vectorized with ``my_vect``

  Returns:
      torch.Tensor: the model output for the vectorized text
  """
  model.eval()
  features = torch.Tensor(my_vect(text))
  with torch.no_grad():
    scores = model(features)
  return scores

# Demo: score one sample review.
sample_review = 'Время от времени пользуемся услугами этого интернет магазина, особенно было актуально в период, лакдаун когда все магазины были закрыты. А одежда и обувь была нужна. Как раз на помощь пришёл магазин Wildberries. Но и до всяких корона вирусов мы ей пользовались.'
print(predict(sample_review))