In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
import torch
import requests
from bs4 import BeautifulSoup
import re
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset #ease batching/shuffleing of data, 
    # and for TensorDataset easy to use with Dataloader bc of easy pairing of input features with labels
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Scraping setup: reviews are pulled from each page by regex-matching elements
# whose CSS class contains "comment".
# `reviews` holds the scraped review texts; it is reassigned from
# fetch_reviews(urls) in a later cell.
reviews = []

# Only the reviews rendered on each business's main page are extracted, so a
# business that lists e.g. 88 reviews on the site yields roughly 10 at most here.
urls = [
    'https://www.yelp.com/biz/pronto-pizza-san-francisco?hrid=MqOAQdGM98FDpHqArFRZFg',
    'https://www.yelp.com/biz/mejico-sydney-2',
    'https://www.yelp.com/biz/vans-daly-city',
    'https://www.yelp.com/biz/dhoom-indian-fashion-clothing-and-bridal-fremont-3',
    'https://www.yelp.com/biz/san-francisco-centre-san-francisco?osq=Shopping',
    'https://www.yelp.com/biz/sanraku-san-francisco-2'
]

def fetch_reviews(urls, timeout=10):
    """Scrape review paragraphs from every page in ``urls``.

    Each page is downloaded and parsed with BeautifulSoup, and the text of
    every ``<p>`` element whose class matches the pattern ``.*comment.*`` is
    collected.

    Args:
        urls: iterable of page URLs to scrape.
        timeout: seconds to wait on each HTTP request before giving up.

    Returns:
        A new list of review texts in the order the pages were visited.
        Pages that cannot be fetched (connection error, timeout, non-2xx
        status) are reported on stdout and skipped, so one bad URL does not
        discard the reviews already collected.
    """
    # Compiled once: the pattern does not depend on the URL being processed.
    comment_class = re.compile('.*comment.*')

    # Local accumulator instead of a module-level list, so calling the function
    # again (e.g. re-running the cell) never duplicates earlier results.
    collected = []
    for url in urls:
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # treat 4xx/5xx pages as failures, not as review-less pages
        except requests.RequestException as exc:
            print(f"Skipping {url}: {exc}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p', {'class': comment_class})
        # Keep only the visible text of each matching paragraph.
        collected.extend(paragraph.text for paragraph in paragraphs)
    return collected

In [None]:
# Scrape every configured page and keep the returned review texts.
reviews = fetch_reviews(urls)

In [None]:
len(reviews) # Is this too few instances for a good-quality model? (71 so far)

71

In [None]:
# Collect the scraped reviews into a single-column DataFrame.
import pandas as pd
import numpy as np

df = pd.DataFrame({'review': np.asarray(reviews)})

In [None]:
# Show the review DataFrame.
df

Unnamed: 0,review
0,Good neighborhood pizza joint. Not gourmet--n...
1,All I can say is Wow .... Go Pronto!From the m...
2,Recently placed an order from Pronto Pizzeria ...
3,Pizza is great but don't skip on this sandwich...
4,Don't be fooled by its location. As nasty as i...
...,...
66,"Hi Katherine, thank you very much for your rev..."
67,Not great. Undon was a little bland and chick...
68,"Hi Nikki, thank you very much for your honest ..."
69,Three months ago I was in The Bay area and wen...


In [None]:
# Load the pretrained BERT sentiment checkpoint together with its matching
# tokenizer; both come from the same hub identifier.
MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [None]:
def sentiment_score(review):
    """Return the model's predicted class for ``review`` as a 1-based integer.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and passed through the module-level ``model``; the index
    of the largest logit, plus one, is returned. For the nlptown checkpoint
    this is presumably a star rating - confirm against the model card.

    Args:
        review: the review text to score.

    Returns:
        int: argmax class index + 1.
    """
    tokens = tokenizer.encode(review, return_tensors='pt', truncation=True, max_length=512)
    # Pure inference: no_grad avoids building an autograd graph for every review.
    with torch.no_grad():
        # Send the ids to wherever the model lives, so this also works after
        # the model has been moved to the GPU.
        result = model(tokens.to(model.device))
    return int(torch.argmax(result.logits, dim=-1)) + 1

In [None]:
# The model accepts a limited number of tokens (512 at most), which is why
# sentiment_score truncates long reviews before scoring them.
df['sentiment'] = df['review'].map(sentiment_score)

In [None]:
# Shift the 1-based scores down to 0-based class ids for use as training labels.
# NOTE: this cell is not idempotent - running it again shifts the labels a
# second time; recompute the 'sentiment' column first if a re-run is needed.
df['sentiment'] = df['sentiment'].sub(1)

In [None]:
# Inspect the last rows to check the 'sentiment' column.
df.tail()

Unnamed: 0,review,sentiment
66,"Hi Katherine, thank you very much for your rev...",4
67,Not great. Undon was a little bland and chick...,2
68,"Hi Nikki, thank you very much for your honest ...",4
69,Three months ago I was in The Bay area and wen...,4
70,"Hi Ashley, thank you very much for your great ...",4


In [None]:
# Hold out 20% of the reviews for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [None]:
def encode_data(texts, tokenizer, max_length=128):
    """Tokenize a batch of texts into padded, truncated PyTorch tensors.

    Args:
        texts: the texts to encode. May be a pandas Series or NumPy array
            (anything exposing ``.tolist()``), any other iterable of strings
            such as a list or tuple, or a single string (treated as a batch
            of one).
        tokenizer: callable tokenizer; invoked with the list of texts and the
            keyword arguments ``max_length``, ``truncation=True``,
            ``padding=True`` and ``return_tensors='pt'``.
        max_length: maximum sequence length passed to the tokenizer.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be split into single characters.
        batch = [texts]
    elif hasattr(texts, 'tolist'):
        batch = texts.tolist()
    else:
        batch = list(texts)

    return tokenizer(
        batch,
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )

In [None]:
# Tokenize both splits into a format suitable as input to the BERT model.
# Each encoding holds:
#   input_ids      - the numerical representations of the tokens in the text
#   attention_mask - which tokens should be attended to and which should be
#                    ignored (padding tokens)
train_encodings, test_encodings = (
    encode_data(split, tokenizer) for split in (X_train, X_test)
)

In [None]:
# Convert the label Series of each split into int64 (long) tensors.
train_labels, test_labels = (
    torch.tensor(split.to_numpy(), dtype=torch.long) for split in (y_train, y_test)
)

In [None]:
def _make_dataset(encodings, labels):
    """Bundle token ids, attention masks and labels into one TensorDataset."""
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)


train_dataset = _make_dataset(train_encodings, train_labels)
test_dataset = _make_dataset(test_encodings, test_labels)

In [None]:
# Batch both datasets; only the training batches are shuffled, the test order
# stays fixed.
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Use PyTorch's own AdamW: the transformers.AdamW wrapper is deprecated (which
# is why the previous call had to pass no_deprecation_warning=True).
# eps and weight_decay are pinned to the values the transformers implementation
# used by default, so the optimisation closely matches the earlier setup
# instead of silently picking up torch's defaults (eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# Prefer the GPU when CUDA is available, otherwise stay on the CPU, then move
# the model's parameters onto that device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [None]:
# Fine-tuning loop: three passes over the training batches, logging the loss
# of every batch.
model.train()  # switch the model to training mode
for epoch in range(3):
    for batch in train_loader:
        # Each batch is an (input_ids, attention_mask, labels) triple; move all
        # three tensors onto the training device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()  # discard gradients left over from the previous step
        # Forward pass; passing labels makes the model return the loss as well.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()  # back-propagate to obtain the gradients
        optimizer.step()  # apply the parameter update
        print(f"epoch: {epoch + 1}, loss: {loss.item()}")

epoch: 1, loss: 0.3819330632686615
epoch: 1, loss: 0.7227057218551636
epoch: 1, loss: 0.264137864112854
epoch: 1, loss: 0.3776280879974365
epoch: 2, loss: 0.15438351035118103
epoch: 2, loss: 0.17540502548217773
epoch: 2, loss: 0.33494317531585693
epoch: 2, loss: 0.11054343730211258
epoch: 3, loss: 0.06401576846837997
epoch: 3, loss: 0.16015514731407166
epoch: 3, loss: 0.08508225530385971
epoch: 3, loss: 0.07548914849758148


In [None]:
# Evaluate the fine-tuned model on the held-out split: average loss, accuracy
# and a per-class precision/recall report.
model.eval()  # switch the model to evaluation mode
total_loss = 0.0
correct_pred = 0
all_labels = []
all_preds = []

with torch.no_grad():  # no gradients are needed for evaluation
    for batch in test_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # The returned loss is already averaged over the batch, so weight it by
        # the batch size; otherwise a smaller final batch would count as much
        # as a full one in the overall average.
        total_loss += loss.item() * labels.size(0)
        predictions = torch.argmax(logits, dim=1)
        correct_pred += (predictions == labels).sum().item()
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(predictions.cpu().numpy())

# Both metrics are per-sample averages over the whole test set.
avg_loss = total_loss / len(test_dataset)
accuracy = correct_pred / len(test_dataset)
print(f"Test Loss: {avg_loss}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(classification_report(all_labels, all_preds))

Test Loss: 0.22240297496318817
Test Accuracy: 93.33%
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       1.00      0.50      0.67         2
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00         8

    accuracy                           0.93        15
   macro avg       0.92      0.88      0.87        15
weighted avg       0.96      0.93      0.93        15



In [None]:
# Persist the fine-tuned model and the tokenizer files to local directories so
# they can be reloaded in a later session.
MODEL_DIR = 'model1'
TOKENIZER_DIR = 'tokenizer1'
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(TOKENIZER_DIR)

('tokenizer1\\tokenizer_config.json',
 'tokenizer1\\special_tokens_map.json',
 'tokenizer1\\vocab.txt',
 'tokenizer1\\added_tokens.json',
 'tokenizer1\\tokenizer.json')

In [None]:
#visualization