In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, BertModel
#
import torch
import torch.nn as nn
#
import torch
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset #ease batching/shuffleing of data, 
    # and for TensorDataset easy to use with Dataloader bc of easy pairing of input features with labels
from sklearn.metrics import accuracy_score, classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Scraping setup: review text is taken from the elements whose CSS class
# matches a "comment" regex on each of the pages listed below.
reviews = list()

# Only the landing page of each business is fetched, so a listing that shows
# e.g. 88 reviews on the site yields roughly 10 of them here at most.
urls = [
    "https://www.yelp.com/biz/pronto-pizza-san-francisco?hrid=MqOAQdGM98FDpHqArFRZFg",
    "https://www.yelp.com/biz/mejico-sydney-2",
    "https://www.yelp.com/biz/vans-daly-city",
    "https://www.yelp.com/biz/dhoom-indian-fashion-clothing-and-bridal-fremont-3",
    "https://www.yelp.com/biz/san-francisco-centre-san-francisco?osq=Shopping",
    "https://www.yelp.com/biz/sanraku-san-francisco-2",
]

def fetch_reviews(urls, timeout=10):
    """Scrape review paragraphs from every page in ``urls``.

    Each page is downloaded and parsed, and the text of every ``<p>`` element
    whose ``class`` attribute matches ``.*comment.*`` is collected.

    Args:
        urls: Iterable of page URLs to scrape.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A new list holding the text of every matching paragraph, in the order
        the pages were visited. The result is built locally (the module-level
        ``reviews`` name is not touched), so repeated calls do not accumulate
        duplicates. A URL that cannot be fetched (connection error, timeout or
        HTTP error status) is reported with ``print`` and skipped.
    """
    # Reviews live in elements whose class contains "comment"; compile once
    # instead of once per URL.
    comment_class = re.compile('.*comment.*')
    collected = []
    for url in urls:
        try:
            response = requests.get(url, timeout=timeout)
            # Without this, an error page (403/404/5xx) would be parsed as if
            # it were a normal listing and silently contribute nothing.
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Skipping {url}: {exc}")
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p', {'class': comment_class})
        # Keep only the visible text of each matching paragraph.
        collected.extend(paragraph.text for paragraph in paragraphs)
    return collected

In [5]:
# Load the labelled reviews from the local CSV file. The scraping call below is
# the alternative data source and is currently disabled.
reviews = pd.read_csv('data/imdb.csv')
#reviews = fetch_reviews(urls)

In [6]:
len(reviews) 
# Open question: is the number of instances too small for a good model if we only scrape? (71 so far)

50000

In [7]:
# Show the loaded frame; pandas elides the middle rows of a long frame.
reviews

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [8]:
# Load BERT model and tokenizer

# Tokenization turns sentences/words into tokens; the model consumes these
# tokens (rather than raw text) to capture context and process the input.

#tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
#model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')

In [9]:
# Encode the string labels as integers: 'positive' -> 1, anything else -> 0.
# The dtype guard makes this cell safe to re-run: once the column is already
# integer-encoded, comparing it with 'positive' again would turn every label
# into 0.
if not pd.api.types.is_integer_dtype(reviews['sentiment']):
    reviews['sentiment'] = (reviews['sentiment'] == 'positive').astype('int64')

In [10]:
# Preview the first rows to check the sentiment column after the label encoding.
reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [11]:
class Tokenize_dataset(torch.utils.data.Dataset):
  """Map-style dataset that tokenizes one text per item.

  Args:
    text: Sequence of raw texts (pandas Series, list, array, ...). Any
      existing index is discarded so items are addressed by position.
    targets: Sequence of integer class labels aligned with ``text``.
    tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=...,
      max_length=..., padding=..., truncation=...)`` that returns a mapping
      with ``input_ids`` and ``attention_mask`` (``token_type_ids`` optional).
    max_len: Length every encoded sequence is padded / truncated to.

  Raises:
    ValueError: If ``text`` and ``targets`` differ in length.
  """

  def __init__(self, text, targets, tokenizer, max_len):
    # pd.Series(...) lets plain lists/arrays through; reset_index gives a
    # contiguous 0..n-1 index even for shuffled train/test split slices.
    self.text = pd.Series(text).reset_index(drop=True)
    self.targets = pd.Series(targets).reset_index(drop=True)
    if len(self.text) != len(self.targets):
      raise ValueError(
          f"text and targets must have the same length, "
          f"got {len(self.text)} and {len(self.targets)}"
      )
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of (text, target) pairs."""
    return len(self.targets)

  def __getitem__(self, item):
    """Tokenize the ``item``-th text and return it as a dict of long tensors.

    Returns:
      Dict with keys ``ids``, ``mask``, ``token_type_ids`` and ``targets``.

    Raises:
      IndexError: If ``item`` is out of range.
    """
    # Positional access: supports negative indices and raises IndexError
    # (not KeyError) past the end, which is what plain iteration relies on.
    text = str(self.text.iloc[item])
    target = self.targets.iloc[item]

    # Calling the tokenizer directly replaces the deprecated encode_plus.
    encoded = self.tokenizer(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        padding='max_length',  # pad every sample to max_len
        truncation=True,  # cut texts longer than max_len
    )

    ids = encoded["input_ids"]
    mask = encoded["attention_mask"]
    # Some tokenizers do not emit segment ids; fall back to all zeros.
    token_type_ids = encoded.get("token_type_ids", [0] * len(ids))

    return {
        "ids": torch.tensor(ids, dtype=torch.long),
        "mask": torch.tensor(mask, dtype=torch.long),
        "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
        "targets": torch.tensor(target, dtype=torch.long)
    }

In [12]:
# Hyperparameters shared by the cells below.
train_maxlen = 140  # tokens per review after padding / truncation
batch_size = 32  # samples per DataLoader batch
# Full passes over the training data. Set to 3 (was 10) to agree with the
# number of epochs the training cell runs; its progress line prints
# "Epoch i/{epochs}", so the two must match.
epochs = 3
bert_model = 'bert-base-uncased'  # checkpoint name given to from_pretrained
learning_rate = 3e-5  # optimizer step size

In [13]:
class CompleteModel(nn.Module):
  """BERT encoder followed by dropout and a linear classification head.

  Args:
    bert: Name or path of the pretrained checkpoint handed to
      ``BertModel.from_pretrained``.
    num_classes: Number of output logits. Defaults to 2, matching the binary
      0/1 sentiment labels used in this notebook.
    dropout: Dropout probability applied to the pooled output. Defaults to
      0.25.
  """

  def __init__(self, bert, num_classes=2, dropout=0.25):
    super().__init__()
    self.bert = BertModel.from_pretrained(bert)
    self.drop = nn.Dropout(p=dropout)
    # One logit per class, computed from BERT's pooled representation.
    self.out = nn.Linear(self.bert.config.hidden_size, num_classes)

  def forward(self, ids, mask, token_type_ids):
    """Return the unnormalised class logits for a batch of token ids.

    Args:
      ids: Token id tensor.
      mask: Attention mask tensor passed to BERT as ``attention_mask``.
      token_type_ids: Segment id tensor.

    Returns:
      Output of the linear head; its last dimension has ``num_classes`` entries.
    """
    # return_dict=False makes BERT return a tuple; the second element is the
    # pooled output, the first (per-token states) is not needed here.
    _, pooled_output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
    return self.out(self.drop(pooled_output))

In [14]:
# Model inputs are the raw review texts; labels are the sentiment column.
X, y = reviews['review'], reviews['sentiment']

In [15]:
# Hold out 20% of the data as the test set; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Carve 10% of the training portion off as a validation set.
# NOTE: this overwrites X_train / y_train, so re-running only this cell shrinks
# the training set again; re-run the previous split cell first.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
)

In [17]:
# Load the tokenizer from the same checkpoint name as the model (`bert_model`)
# so the vocabulary cannot drift from the encoder. AutoTokenizer selects the
# tokenizer class that belongs to the checkpoint, so BertTokenizer does not
# have to be named explicitly.
tokenizer = AutoTokenizer.from_pretrained(bert_model)

In [18]:
# Wrap each split in the tokenizing dataset. The sequence length comes from
# `train_maxlen` in the hyperparameter cell instead of a repeated literal, so
# all three splits stay in sync with the configuration.
train_dataset = Tokenize_dataset(X_train, y_train, tokenizer, max_len=train_maxlen)
val_dataset = Tokenize_dataset(X_val, y_val, tokenizer, max_len=train_maxlen)
test_dataset = Tokenize_dataset(X_test, y_test, tokenizer, max_len=train_maxlen)

In [19]:
# Training batches are reshuffled each epoch and loaded with 4 workers into
# pinned memory; validation and test batches keep their original order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [20]:
# Build the classifier on top of the pretrained checkpoint named by `bert_model`.
model = CompleteModel(bert_model)

In [21]:
# Set up training components.
# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated in favour of torch.optim.AdamW. eps and weight_decay are pinned to
# the defaults of the transformers implementation (1e-6 and 0.0) so the update
# rule stays the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# Cross-entropy over the class logits returned by the model.
loss_fn = nn.CrossEntropyLoss()



In [None]:
def _batch_to_device(batch, device):
    """Unpack a loader batch dict and move its four tensors to ``device``."""
    return (
        batch['ids'].to(device),
        batch['mask'].to(device),
        batch['token_type_ids'].to(device),
        batch['targets'].to(device),
    )


# Feed batches on whatever device the model's parameters currently live on, so
# the loop keeps working if the model is moved to a GPU.
device = next(model.parameters()).device
# Print progress every `log_every` training batches instead of on every batch.
log_every = 100

# Train for `epochs` passes (from the hyperparameter cell). A hard-coded 3 here
# used to disagree with the "Epoch i/{epochs}" line printed below.
for epoch in range(epochs):
    model.train()
    for i, batch in enumerate(train_loader):
        ids, mask, token_type_ids, targets = _batch_to_device(batch, device)

        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)  # class logits
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        if i % log_every == 0:
            print(f"izvrsen {i}. batch")

    # Validation pass: no gradient tracking, dropout disabled by eval().
    model.eval()
    print("usao u evaluaciju!")
    val_loss = 0
    correct_preds = 0
    with torch.no_grad():
        for batch in val_loader:
            ids, mask, token_type_ids, targets = _batch_to_device(batch, device)

            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            val_loss += loss.item()
            # Predicted class = index of the largest logit.
            preds = torch.argmax(outputs, dim=1)
            correct_preds += (preds == targets).sum().item()

    # Loss is averaged over validation batches, accuracy over validation samples.
    print(f'Epoch {epoch+1}/{epochs}, Validation Loss: {val_loss/len(val_loader)}, Accuracy: {correct_preds/len(val_dataset)}')
