# Priprava okolja

In [None]:
!pip install transformers
!pip install sentencepiece

In [None]:
import csv
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from google.colab import drive

import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

# Seed constant; it is not referenced by any of the cells visible here.
RANDOM_SEED = 42

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Pomožni razredi in funkcije

In [None]:
class SentimentClassifier(nn.Module):
    """Classification head on top of the pretrained ``EMBEDDIA/sloberta`` encoder.

    The encoder output at the first token position of every sequence is passed
    through dropout, a linear layer with ReLU, dropout again, and a final
    linear layer that produces one score per class.

    Args:
        n_classes: Number of output classes (width of the final layer).
    """

    def __init__(self, n_classes):
        super().__init__()
        self.model = AutoModel.from_pretrained('EMBEDDIA/sloberta')
        # Size both linear layers from the encoder config rather than a literal
        # 768, so the two heads can never disagree about the hidden width.
        hidden_size = self.model.config.hidden_size
        self.pre_classifier = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the raw (unnormalised) class scores for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The output of the final linear layer, one row per sequence.
        """
        encoder_output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Element 0 of the encoder output is indexed as a rank-3 tensor;
        # position 0 along the second axis is the first token of each sequence.
        first_token = encoder_output[0][:, 0, :]
        hidden = self.dropout(first_token)
        # Functional ReLU avoids building a new nn.ReLU module on every call.
        hidden = torch.relu(self.pre_classifier(hidden))
        hidden = self.dropout(hidden)
        return self.classifier(hidden)


class ArticleTestDataset(torch.utils.data.Dataset):
    """Inference dataset that tokenizes the ``body`` texts of a dataframe.

    Args:
        dataframe: Object exposing the article texts as its ``body`` attribute
            (a pandas DataFrame in this notebook).
        tokenizer: Tokenizer object providing an ``encode_plus`` method.
        max_len: Length to which every encoded text is padded or truncated.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = dataframe
        self.text = dataframe.body
        self.max_len = max_len

    def __getitem__(self, idx):
        """Return the text at position ``idx`` with its encoded tensors.

        Returns:
            A dict with the raw ``text`` and ``input_ids`` / ``attention_mask``
            as ``torch.long`` tensors.
        """
        # DataLoader samplers yield positions 0..len-1, so look the row up by
        # position; label-based lookup fails when the index is not 0..n-1.
        texts = self.text
        raw_text = texts.iloc[idx] if hasattr(texts, 'iloc') else texts[idx]
        text = str(raw_text)

        # Use the tokenizer given to this dataset, not a module-level global.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_attention_mask=True,
            return_token_type_ids=True,
        )

        return {
            'text': text,
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        }

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.text)


def get_predictions(model, data_loader):
    """Run ``model`` over ``data_loader`` and return the predicted class ids.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments; its output is reduced with ``torch.max`` over dim 1.
        data_loader: Iterable of batches (dicts) holding ``"input_ids"`` and
            ``"attention_mask"`` tensors.

    Returns:
        A 1-D CPU tensor with one predicted class index per example, in
        iteration order. An empty ``torch.long`` tensor is returned when
        ``data_loader`` yields no batches.
    """
    model.eval()
    batch_predictions = []

    data_iterator = tqdm(data_loader, desc="Iteration")
    with torch.no_grad():
        for batch in data_iterator:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
            _, preds = torch.max(outputs, dim=1)
            # Keep one CPU tensor per batch rather than one 0-d tensor per
            # example still resident on the compute device.
            batch_predictions.append(preds.cpu())

    # torch.cat (like torch.stack) rejects an empty list, so handle it here.
    if not batch_predictions:
        return torch.empty(0, dtype=torch.long)

    return torch.cat(batch_predictions)


# MAIN

In [None]:
# Location of the fine-tuned classifier weights on Google Drive.
# NOTE(review): no drive.mount(...) call is visible in this notebook, so this
# presumably relies on Drive already being mounted at /content/drive — confirm.
model_path = '/content/drive/MyDrive/Diploma/best_model_state_latest.bin'

In [None]:
# Length every text is padded/truncated to, and the inference batch size.
MAX_LEN = 512
BATCH_SIZE = 8

# shuffle=False keeps the batches in row order, so the predictions can later
# be assigned back to the dataframe row by row.
test_params = {
    'batch_size': BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0,
}

tokenizer = AutoTokenizer.from_pretrained('EMBEDDIA/sloberta', use_fast=False)
model = SentimentClassifier(3)
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime,
# which the CPU fallback chosen for `device` is meant to support.
model.load_state_dict(torch.load(model_path, map_location=device))
model = model.to(device)

In [None]:
# Change the value assigned on the line below to choose the dataset.
# Possible values are:
# "2019_slovenija_sentiment",
# "2019_svet_sentiment",
# "2020_korona_sentiment",
# "2020_svet_sentiment",
# "2020_slovenska_politika_sentiment",
file_name = '2019_slovenija_sentiment'
filepath = f'/content/drive/MyDrive/Diploma/data/{file_name}.pkl'
data = pd.read_pickle(filepath)

# Wrap the dataframe in the tokenizing dataset and batch it for inference.
dataloader = DataLoader(
    dataset=ArticleTestDataset(dataframe=data, tokenizer=tokenizer, max_len=MAX_LEN),
    **test_params,
)

In [None]:
# Predict a class id for every row of the loaded dataframe.
preds = get_predictions(model, dataloader)
# Convert explicitly to a NumPy array so the column holds plain integers
# instead of depending on how pandas coerces a torch.Tensor.
data['sentiment'] = preds.numpy()

In [None]:
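# Uncomment the next line to write the dataframe, including the new
# 'sentiment' column, back to the pickle file it was loaded from.
# data.to_pickle(filepath)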