**3. lielais mājasdarbs**

In [None]:
# Autore: Jekaterina Jevtejeva
# St.apl.numurs: jj19021

import pandas as pd

# Download the data from Dropbox
# NOTE(review): the URL below is a placeholder and has to be replaced with a real
# link before this cell can fetch reviews.csv
!wget '[your link goes here]' -O reviews.csv

# All data is read from the downloaded csv file
# (columns are selected by position 1-10; the file is decoded as latin-1)
all_data = pd.read_csv('reviews.csv', delimiter=',', encoding='latin-1', usecols=[1,2,3,4,5,6,7,8,9,10])

In [None]:
# Parameters
LOAD_MODEL = True # flag: True means the model is loaded from file; False means the model is trained on the training data
TEST_SET_FILE = 'reviews_test.csv' # file name of the test data set
TRAIN_SET_FILE = 'reviews_train.csv' # file name of the training data set
STATE_DICT_FILE = 'md3_model_jj19021.pt' # file name of the saved model

# Fetch the saved model and the training and test data sets
# The files are downloaded from Dropbox with wget
!wget 'https://www.dropbox.com/s/vd14ifhf1yb9alb/md3_model_jj19021.pt?dl=0' -O md3_model_jj19021.pt
!wget 'https://www.dropbox.com/s/zdy67wcriogc49q/reviews_train.csv?dl=0' -O reviews_train.csv
!wget 'https://www.dropbox.com/s/rj5fnz1dtuh7ss0/reviews_test.csv?dl=0' -O reviews_test.csv

# Read the training and test data from the csv files
# (columns are selected by position 1-10; the files are decoded as latin-1)
train_data = pd.read_csv(TRAIN_SET_FILE, delimiter=',', encoding='latin-1', usecols=[1,2,3,4,5,6,7,8,9,10])
test_data = pd.read_csv(TEST_SET_FILE, delimiter=',', encoding='latin-1', usecols=[1,2,3,4,5,6,7,8,9,10])

In [None]:
# ModelLoader: static helpers for loading a model from, and saving it to, a file
class ModelLoader():
  """Load and save a model's ``state_dict``.

  Both helpers default to the module-level ``STATE_DICT_FILE`` path; an explicit
  ``path`` can be passed to use a different checkpoint file.
  """

  @staticmethod
  def load_model(model, path=None):
    """Read a ``state_dict`` from file and load it into ``model`` in place.

    Args:
      model: module whose ``load_state_dict`` receives the stored weights.
      path: checkpoint file; ``None`` means ``STATE_DICT_FILE``.
    """
    checkpoint = STATE_DICT_FILE if path is None else path
    # map_location='cpu' lets a checkpoint written on a GPU be restored on a
    # CPU-only runtime; load_state_dict then copies the values into the
    # model's own parameters, wherever those live.
    state_dict = torch.load(checkpoint, map_location='cpu')
    model.load_state_dict(state_dict)

  @staticmethod
  def save_model(model, path=None):
    """Write the current ``state_dict`` of ``model`` to file.

    Args:
      model: module whose ``state_dict()`` is stored.
      path: checkpoint file; ``None`` means ``STATE_DICT_FILE``.
    """
    checkpoint = STATE_DICT_FILE if path is None else path
    torch.save(model.state_dict(), checkpoint)

In [None]:
import numpy as np

# DatasetSplitter: static helper that splits the data into separate data sets
class DatasetSplitter():
  """Split the original csv file into a training file and a test file."""

  @staticmethod
  def split_dataset(source_file='reviews.csv', train_file='reviews_train.csv',
                    test_file='reviews_test.csv', train_fraction=0.8, seed=None):
    """Randomly assign every row of ``source_file`` to the train or test file.

    Each row independently lands in the training set with probability
    ``train_fraction``, so the resulting sizes are only approximately
    proportional. The two row counts are printed (train first, then test).

    Args:
      source_file: csv file to split.
      train_file: output csv for the training rows (written without index).
      test_file: output csv for the test rows (written without index).
      train_fraction: probability of a row going to the training set.
      seed: optional integer making the split reproducible; ``None`` draws
        from NumPy's global random state.
    """
    df = pd.read_csv(source_file)
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Boolean mask only: no helper column is added to the frame, so the
    # output files contain exactly the columns of the source file.
    in_train = rng.rand(len(df)) <= train_fraction
    train = df[in_train]
    test = df[~in_train]
    train.to_csv(train_file, index=False)
    test.to_csv(test_file, index=False)

    print(len(train))
    print(len(test))

In [None]:
import re, torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# TextProcessor: handles the text data - cleaning it and building the vocabulary
class TextProcessor():
  """Tokenise the "Title" and "Review Text" columns and build a vocabulary.

  Attributes:
    data: the frame passed to the constructor.
    text_data: padded tensor of token ids, one row per row of ``data``.
    vocab: vocabulary built from the tokenised text.
  """

  def __init__(self, data):
    self.data = data
    # get_data() tokenises everything and builds the vocabulary, so it is
    # called once and both results are kept.
    self.text_data, self.vocab = self.get_data()

  # Static method for filtering the data
  # The columns used from the data file are Title and Review Text
  @staticmethod
  def clean_up(raw_data):
    """Return a cleaned copy of ``raw_data``; the argument is left untouched.

    In "Title" and "Review Text" every character other than an ASCII letter
    or a space is removed and the text is lower-cased. Rows whose cleaned
    title or cleaned review is 5 characters or shorter are dropped.
    """
    pattern = re.compile("[^a-zA-Z ]+") # strips punctuation, digits and any other non-letter
    cleaned = raw_data.copy() # work on a copy so the caller's frame is not modified
    for column in ("Title", "Review Text"):
      cleaned[column] = cleaned[column].astype(str).map(lambda text: pattern.sub('', text)).str.lower()
    cleaned = cleaned[cleaned["Title"].astype(str).map(len) > 5] # keep only rows longer than 5 characters
    return cleaned[cleaned["Review Text"].astype(str).map(len) > 5]

  # Tokenisation and vocabulary construction
  # Code fragment taken from the Google Colab example "Laboratorija Spam PyTorch"
  def get_data(self):
    """Tokenise the text of ``self.data`` and return ``(token_ids, vocab)``.

    ``token_ids`` is a batch-first tensor padded with the ``<pad>`` index;
    words outside the vocabulary map to ``<unk>``.
    """
    tokenizer = get_tokenizer('basic_english')
    titles = self.data["Title"].astype(str)
    reviews = self.data["Review Text"].astype(str)
    # NOTE(review): title and review are concatenated without a separator, so
    # the last title word and the first review word become a single token.
    # Left unchanged because it would alter the vocabulary that a previously
    # saved checkpoint presumably relies on - confirm before changing.
    input_text = [tokenizer(sentence) for sentence in titles + reviews]
    vocab = build_vocab_from_iterator(iter(input_text), specials=["<unk>","<pad>"], max_tokens=3000) # keep the 3000 most frequent tokens
    vocab.set_default_index(vocab["<unk>"])
    input_text = [torch.tensor(vocab(tokens)) for tokens in input_text]
    input_text = torch.nn.utils.rnn.pad_sequence(input_text, padding_value=vocab['<pad>'], batch_first=True)
    return input_text, vocab

# Filter the full data set, then tokenise it and build the vocabulary
clean_data = TextProcessor.clean_up(all_data)
text_processor = TextProcessor(clean_data)
# Token-id tensor and vocabulary produced by the processor
text_data, vocabulary = text_processor.text_data, text_processor.vocab
print(len(vocabulary))

In [None]:
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset

BATCH_SIZE = 64 # batch size used by the DataLoaders

# Dataset implementation
class ReviewsDataset(Dataset):
  """Dataset yielding ``(text, positive feedback count, recommended, rating)``.

  Args:
    dataset: frame with the columns "Positive Feedback Count",
      "Recommended IND" and "Rating".
    text_data: indexable collection of encoded texts. Item ``i`` is returned
      together with the values of row ``i`` of ``dataset``, so the two must be
      row-aligned; this class does not verify that.
  """

  def __init__(self, dataset, text_data):
    self.dataset = dataset # source frame
    self.text_data = text_data # encoded text
    self.num_data = self.get_num_data() # numeric feature
    self.answers1 = self.get_rec_labels() # targets for Recommended IND
    self.answers2 = self.get_rating_labels() # targets for Rating

  def __getitem__(self, key):
    return self.text_data[key], self.num_data[key], self.answers1[key], self.answers2[key]

  def __len__(self):
    return len(self.dataset)

  def get_num_data(self):
    """Return "Positive Feedback Count" as an int64 tensor."""
    counts = self.dataset["Positive Feedback Count"].astype('int').values
    # Built directly as int64: no float32 round trip, and no
    # torch.tensor(<tensor>) copy-construct warning.
    return torch.tensor(counts, dtype=torch.int64)

  def get_rec_labels(self):
    """Return "Recommended IND" as a float32 tensor."""
    recommended = self.dataset["Recommended IND"].values
    return torch.tensor(recommended, dtype=torch.float32)

  def get_rating_labels(self):
    """Return "Rating" (cast to int first) as a float32 tensor."""
    rating = self.dataset["Rating"].astype('int').values
    return torch.tensor(rating, dtype=torch.float32)

# Filtered data sets
clean_train_data = TextProcessor.clean_up(train_data)
clean_test_data = TextProcessor.clean_up(test_data)

def encode_reviews(frame, vocab, seq_len):
  """Encode the text of every row of ``frame`` as one row of token ids.

  Title and review are joined and tokenised the same way as in
  ``TextProcessor.get_data`` and looked up in ``vocab``. Every row is
  truncated or padded with ``<pad>`` to exactly ``seq_len`` ids.

  Returns:
    int64 tensor of shape ``(len(frame), seq_len)``.
  """
  tokenizer = get_tokenizer('basic_english')
  sentences = frame["Title"].astype(str) + frame["Review Text"].astype(str)
  encoded = torch.full((len(sentences), seq_len), vocab['<pad>'], dtype=torch.int64)
  for row, sentence in enumerate(sentences):
    ids = vocab(tokenizer(sentence))[:seq_len]
    if ids:
      encoded[row, :len(ids)] = torch.tensor(ids, dtype=torch.int64)
  return encoded

# Each split gets its own text tensor, row-aligned with its frame. Passing the
# tensor built from the full data set to both datasets paired every sample with
# the text of an unrelated review and made the test set reuse training text.
# The shared vocabulary and padded width keep the model's input shape as before.
# NOTE(review): a checkpoint trained on the previous wiring should be retrained
# (LOAD_MODEL = False).
train_text = encode_reviews(clean_train_data, vocabulary, text_data.size(1))
test_text = encode_reviews(clean_test_data, vocabulary, text_data.size(1))

# Training and test ReviewsDataset objects and their DataLoaders
train_dataset = ReviewsDataset(clean_train_data, train_text)
test_dataset = ReviewsDataset(clean_test_data, test_text)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Network parameters
# Per the author, chosen by repeatedly testing the model and looking at the resulting accuracy
EMBEDDING_INIT = len(vocabulary) # number of rows of the embedding table
EMBEDDING_SIZE = 256 # embedding dimension / input size of the first LSTM
HIDDEN_SIZE_FST = 128 # hidden size of the first LSTM
HIDDEN_SIZE_SND = 64 # hidden size of the second LSTM
NUM_OUT_FEATURES = 32 # width of the shared linear layer feeding both heads
REC_OUT_FEATURES = 1 # output width of the recommendation head
RATING_OUT_FEATURES = 6 # output width of the rating head; presumably 6 so rating values can serve directly as class indices - TODO confirm

# Network module
# Per the author, the layers were chosen by repeatedly testing the model and looking at the resulting accuracy,
# starting from the MNIST model implementation in the Google Colab example "Laboratorija Spam PyTorch"
class ReviewsModel(nn.Sequential):
  """Two-headed network over review text plus one numeric feature.

  The token ids go through an embedding and two stacked LSTMs; the flattened
  LSTM output is concatenated with the numeric feature and passed through a
  shared linear layer that feeds a recommendation head and a rating head.

  The base class is kept as nn.Sequential; ``forward`` is overridden, so the
  sub-networks are called explicitly rather than chained.
  """

  def __init__(self):
    super(ReviewsModel, self).__init__()
    self.text_layers = nn.Sequential(
        nn.Embedding(EMBEDDING_INIT, EMBEDDING_SIZE),
        # The embedding output is (batch, seq, features), so the LSTMs must be
        # batch_first; otherwise the recurrence runs across the samples of a
        # batch instead of across the tokens of a review.
        nn.LSTM(input_size=EMBEDDING_SIZE, hidden_size=HIDDEN_SIZE_FST, batch_first=True),
        extract_tensor(),
        nn.LSTM(input_size=HIDDEN_SIZE_FST, hidden_size=HIDDEN_SIZE_SND, batch_first=True),
        extract_tensor(),
        nn.Flatten(),
    )
    self.num_layers = nn.Sequential(
        # Lazy: the input width is fixed by the first batch (or a loaded state_dict)
        nn.LazyLinear(out_features=NUM_OUT_FEATURES),
        nn.ReLU(),
    )
    self.rec_layers = nn.Sequential(
        nn.Linear(in_features=NUM_OUT_FEATURES, out_features=REC_OUT_FEATURES),
        nn.Sigmoid()
    )
    self.rating_layers = nn.Sequential(
        nn.Linear(in_features=NUM_OUT_FEATURES, out_features=RATING_OUT_FEATURES),
        nn.LogSoftmax(dim=1)
    )

  def forward(self, x, y):
    """Run both heads.

    Args:
      x: token ids, one row per sample.
      y: numeric feature, one entry (or row) per sample.

    Returns:
      ``(recommendation, rating)``: sigmoid outputs with the size-1 feature
      dimension squeezed away, and log-softmax scores over
      ``RATING_OUT_FEATURES`` classes.
    """
    text_features = self.text_layers(x)
    # Cast the numeric feature to the text features' dtype so the
    # concatenation does not depend on implicit type promotion.
    numeric = y.view(y.size(0), -1).to(text_features.dtype)
    combined = torch.cat((text_features.view(text_features.size(0), -1), numeric), dim=1)
    shared = self.num_layers(combined)
    recommendation = torch.squeeze(self.rec_layers(shared), 1)
    rating = self.rating_layers(shared)
    return recommendation, rating

# LSTM() returns a tuple of (tensor, (recurrent state))
# Code fragment taken from the Google Colab example "Laboratorija Spam PyTorch"
class extract_tensor(nn.Module):
    """Pass on only the first element of a two-element input."""
    def forward(self, x):
        output, _state = x
        return output

# Instantiate the network; its weights are either loaded from file or trained further below
model = ReviewsModel()

In [None]:
epochs = 5
lr = 0.01

# Check whether a GPU is available
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model.to(device)
# The recommendation head ends in nn.Sigmoid, i.e. it already outputs
# probabilities, so plain BCELoss is the matching binary-classification loss
# (BCEWithLogitsLoss would apply a second sigmoid on top).
criterion1 = nn.BCELoss()
# The rating head ends in nn.LogSoftmax, i.e. it outputs log-probabilities, so
# NLLLoss is the matching multi-class loss (CrossEntropyLoss would apply
# log-softmax once more).
criterion2 = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
%%time
# If the load flag is set, restore the model's state_dict from file
if LOAD_MODEL:
  ModelLoader.load_model(model)
# Otherwise train the model
# Code fragment taken from the Google Colab example "Laboratorija Spam PyTorch"
else:
  # Training
  model.train()
  num_batches = len(train_loader)
  for epoch in range(epochs):
      # Per-epoch sums of the batch losses
      recommended_loss = 0.0
      rating_loss = 0.0
      running_loss = 0.0

      for times, rows in enumerate(train_loader):
          input_text, input_num, labels_rec, labels_rating = (t.to(device) for t in rows)

          # Clear the gradients
          optimizer.zero_grad()

          # Forward + backward + optimize
          out1, out2 = model(input_text, input_num)

          loss1 = criterion1(out1, labels_rec) # recommendation loss
          loss2 = criterion2(out2, labels_rating.long()) # rating loss
          loss = loss1 + loss2
          loss.backward()

          optimizer.step()

          # Accumulate and print statistics
          recommended_loss += loss1.item()
          rating_loss += loss2.item()
          running_loss += loss.item()
          if times % 1000 == 999 or times + 1 == num_batches:
              # Average over the batches seen so far in this epoch
              # (times is zero-based, hence times + 1 batches).
              seen = times + 1
              print('[%d/%d, %d/%d] Recommended loss: %.3f | Rating loss: %.3f | Running loss: %.3f' % (epoch+1, epochs, seen, num_batches, recommended_loss/seen, rating_loss/seen, running_loss/seen))

  print('Training Finished.')

In [None]:
# Save the model's state_dict to STATE_DICT_FILE
# (this runs unconditionally, i.e. also when the model was just loaded from that file)
ModelLoader.save_model(model)

In [None]:
# Testing and results
# The test data set is used
correct_rec = 0
correct_rating = 0
total_recs = 0
total_ratings = 0

model.eval()
with torch.no_grad():
    for data in test_loader:
        # Labels stay on the same device as the model outputs; converting them
        # with .type(torch.FloatTensor) would move them to the CPU and break
        # the comparisons below when running on a GPU.
        input_text, input_num, labels_rec, labels_rating = (t.to(device) for t in data)

        out1, out2 = model(input_text, input_num)
        out1_rounded = torch.round(out1) # probability -> 0/1 decision
        _, out2_max = torch.max(out2, 1) # index of the highest-scoring rating class

        total_recs += labels_rec.size(0)
        total_ratings += labels_rating.size(0)
        correct_rec += (out1_rounded == labels_rec.float()).sum().item() # correct Recommended answers
        correct_rating += (out2_max == labels_rating.long()).sum().item() # correct Rating answers

print('"Recommended IND" precizitāte uz testa datu kopas: %.2f %%' % (100*correct_rec / total_recs))
print('"Rating" precizitāte uz testa datu kopas: %.2f %%' % (100*correct_rating / total_ratings))

# Autore: Jekaterina Jevtejeva
# St.apl.numurs: jj19021

"Recommended IND" precizitāte uz testa datu kopas: 81.46 %
"Rating" precizitāte uz testa datu kopas: 52.05 %
