## 1. Import Statements

---



In [1]:
# %%capture
# !pip install transformers

In [2]:
import torch
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertModel, BertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Set up the GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## 2. Load the Data

---


The original code in this section is located in `bert-training.ipynb`. It is included here to make the `get_star_predictions()` function to work.

In [4]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
github_url = 'https://raw.githubusercontent.com/csbanon/bert-product-rating-predictor/master/data/reviews_comments_stars.csv'
df = pd.read_csv(github_url)
df = df[['comment', 'stars']]
df

Unnamed: 0,comment,stars
0,I could sit here and write all about the specs...,5
1,A very reasonably priced laptop for basic comp...,4
2,"This is the best laptop deal you can get, full...",5
3,A few months after the purchase....It is still...,5
4,BUYER BE AWARE: This computer has Microsoft 10...,1
...,...,...
195760,I have not tried this camera without the SD ca...,5
195761,"Hello, I bought this item months ago and I tho...",1
195762,This is an incredible camera for the money!! ...,5
195763,Great cameras. Purchased some for my mother af...,5


In [None]:
train_dataset, test_dataset = train_test_split(df, test_size=0.2, random_state=1)
test_dataset = test_dataset.reset_index(drop=True)

In [None]:
# Nir START 
df = pd.read_csv('../data/amazon_reviews_reviewText_ratings.csv')
df = df.sample(frac=0.1, random_state=34)

In [None]:
# Set up the GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## 3. Define the BERT Model

---



The original code in this section is located in `bert-training.ipynb`. It is included here to make the `get_star_predictions()` function to work. The output is suppressed to make the notebook easier to read.

In [None]:
%%capture
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 5, # Number of unique labels for our multi-class classification problem.
    output_attentions = False,
    output_hidden_states = False,
)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4. Load the Trained Model

---

Here, we load the `pytorch_model_2_epochs.bin` file, which contains the trained weights.

In [None]:
# Load the trained model.
model.load_state_dict(torch.load('/home/user/IdeaProjects/velotix_ex/models/Model_V3/3/bert_3.pth'))
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
from transformers import BertForSequenceClassification

loaded_model = BertForSequenceClassification.from_pretrained('/home/user/IdeaProjects/velotix_ex/models/Model_V3/3/md')

## 5. Define the Reviews Dataset

---



The original code in this section is located in `star_prediction.ipyn`. It is included here to make the `get_star_predictions()` function to work.

In [None]:
class ReviewsDataset(Dataset):
    def __init__(self, df, max_length=512):
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.max_length = max_length 
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        # input=review, label=stars
        review = self.df.loc[idx, 'reviewText']
        # labels are 0-indexed
        label = int(self.df.loc[idx, 'rating']) - 1
        
        encoded = self.tokenizer(
            review,                      # Review to encode.
            add_special_tokens=True,
            max_length=self.max_length,  # Truncate all segments to max_length.
            padding='max_length',        # Pad all reviews with the [PAD] token to the max_length.
            return_attention_mask=True,  # Construct attention masks.
            truncation=True
        )
        
        input_ids = encoded['input_ids']
        attn_mask = encoded['attention_mask']
        
        return {
            'input_ids': torch.tensor(input_ids),
            'attn_mask': torch.tensor(attn_mask), 
            'label': torch.tensor(label)
        }

## 6. Predict the Star Rating

---

The following code takes a string comment and returns a predicted star rating.

In [None]:
def get_single_prediction(comment, model):
  """
  Predict a star rating from a review comment.

  :comment: the string containing the review comment.
  :model: the model to be used for the prediction.
  """

  df = pd.DataFrame()
  df['reviewText'] = [comment]
  df['rating'] = ['0']

  dataset = ReviewsDataset(df)

  TEST_BATCH_SIZE = 1
  NUM_WORKERS = 1

  test_params = {'batch_size': TEST_BATCH_SIZE,
              'shuffle': True,
              'num_workers': NUM_WORKERS}

  data_loader = DataLoader(dataset, **test_params)

  total_examples = len(df)
  predictions = np.zeros([total_examples], dtype=object)

  for batch, data in enumerate(data_loader):

    # Get the tokenization values.
    input_ids = data['input_ids'].to(device)
    mask = data['attn_mask'].to(device)

    # Make the prediction with the trained model.
    outputs = model(input_ids, mask)

    # Get the star rating.
    big_val, big_idx = torch.max(outputs[0].data, dim=1)
    star_predictions = (big_idx + 1).cpu().numpy()

  return star_predictions[0]

In [None]:
from transformers import BertTokenizer

def get_single_prediction2(comment, model, tokenizer):
    # Tokenize the comment
    df = pd.DataFrame()
    df['comment'] = [comment]
    df['stars'] = ['0']

    dataset = ReviewsDataset(df)

    TEST_BATCH_SIZE = 1
    NUM_WORKERS = 1

    test_params = {'batch_size': TEST_BATCH_SIZE,
              'shuffle': True,
              'num_workers': NUM_WORKERS}
    
    data_loader = DataLoader(dataset, **test_params)

    total_examples = len(df)
    predictions = np.zeros([total_examples], dtype=object)
    
    for batch, data in enumerate(data_loader):

        # Get the tokenization values.
        input_ids = data['input_ids'].to(device)
        mask = data['attn_mask'].to(device)

        # Make the prediction with the trained model.
        outputs = model(input_ids, mask)

        # Get the star rating.
        big_val, big_idx = torch.max(outputs[0].data, dim=1)
        star_predictions = (big_idx + 1).cpu().numpy()

    return star_predictions[0]

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Get the star predictions.
model.to(device)
model.eval()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
prediction = get_single_prediction("Purchase this based on some Amazon recommendations, and I was a little disappointed. I found the product had, reasonable hold, but deteriorated to be very pasty after a short period of time.", model)
prediction

3

In [None]:
print(prediction)

In [None]:
df.head()

Unnamed: 0,rating,reviewText
215769,5,Very Cute
183386,5,I bought this for my Petit1 Pilot pen because ...
240678,5,Nice!
75630,5,"I purchased a home with a sidewalk, driveway a..."
420949,5,a great looking plant


In [None]:
# For validation
def validate(model, data_loader):
    model.eval()
    n_correct = 0 
    nb_test_steps = 0
    nb_test_examples = 0
    test_loss = 0
    y_pred = []
    y_true = []

    with torch.no_grad():
        for _, data in enumerate(data_loader, 0):
            input_ids = data['input_ids'].to(device)
            mask = data['attn_mask'].to(device)
            labels = data['label'].to(device)

            outputs = model(input_ids, mask)

            # gets labels with highest probabilities and their corresponding indices
            big_val, big_idx = torch.max(outputs[0].data, dim=1)
            n_correct += calculate_accuracy(big_idx, labels)

            preds = (big_idx + 1).cpu().tolist()
            gold = (labels + 1).cpu().tolist()
            y_pred.extend(preds)
            y_true.extend(gold)

            nb_test_steps += 1
            nb_test_examples += labels.size(0)
    
    return y_true, y_pred

In [None]:
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [None]:
test_params = {
                'batch_size': 1,
                'shuffle': False,
                'num_workers': 1
              }
test_loader = DataLoader(test_set, **test_params)