### RoBERTa Model 

In [2]:
pip install transformers datasets torch scikit-learn pandas tqdm


Note: you may need to restart the kernel to use updated packages.


In [43]:
import pandas as pd

### Loading Amazon Fine Food Reviews dataset. It has 568454 rows and 10 columns

In [46]:
df= pd.read_csv("Reviews.csv")
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [48]:
df.shape

(568454, 10)

In [50]:
# Convert ratings to sentiment labels
df['label'] = df['Score']. apply(lambda x: 'Positive' if x >= 4 else 'Negative')

In [52]:
df.head(2)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,label
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,Positive
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Negative


### Preprocessing

In [55]:
df.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
label                      0
dtype: int64

Columns : summary and ProfileName have NaN values. Drop rows with NaN values

In [58]:
# Drop NaN values (if any)
df.dropna(inplace=True)


In [60]:
df.isnull().sum()

Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
label                     0
dtype: int64

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 568401 entries, 0 to 568453
Data columns (total 11 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568401 non-null  int64 
 1   ProductId               568401 non-null  object
 2   UserId                  568401 non-null  object
 3   ProfileName             568401 non-null  object
 4   HelpfulnessNumerator    568401 non-null  int64 
 5   HelpfulnessDenominator  568401 non-null  int64 
 6   Score                   568401 non-null  int64 
 7   Time                    568401 non-null  int64 
 8   Summary                 568401 non-null  object
 9   Text                    568401 non-null  object
 10  label                   568401 non-null  object
dtypes: int64(5), object(6)
memory usage: 52.0+ MB


In [64]:
invalid_score = df[(df["Score"] < 1) | (df["Score"] > 5)]
invalid_score

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,label


Check Time is from Oct 1999 - Oct 2012:
- October 1, 1999 at 00:00:00 UTC → 938736000
- November 1, 2012 at 00:00:00 UTC → 1351728000

In [67]:
invalid_date = df[(df["Time"] < 938736000) | (df["Time"] > 1351728000)]
print(invalid_date)

Empty DataFrame
Columns: [Id, ProductId, UserId, ProfileName, HelpfulnessNumerator, HelpfulnessDenominator, Score, Time, Summary, Text, label]
Index: []


In [69]:
df.shape

(568401, 11)

### Splitting data for training and testing 
train_oversampled_df shape (50000, 11)\
test_df shape (12500, 11).

In [72]:
from sklearn.model_selection import train_test_split

# Split into train and test sets (Stratified to maintain class distribution)
train_df, test_df = train_test_split(df , test_size = 0.2 , stratify=df["Score"], random_state =42)

# Sample 12,500 test records
test_df = test_df.sample(n=12500, random_state=42)

# Sample 25,000 positive & 25,000 negative records for a balanced training set
train_oversampled_df = pd.concat([
    train_df[train_df["label"] == 'Positive'].sample(n=25000, random_state=42),
    train_df[train_df["label"] == 'Negative'].sample(n=25000, random_state=42)
], ignore_index=True)

In [74]:
print("Train Dataset Shape:", train_oversampled_df.shape)
print("Test Dataset Shape:", test_df.shape)

Train Dataset Shape: (50000, 11)
Test Dataset Shape: (12500, 11)


In [35]:
negative_labels=(train_oversampled_df['label']=='Negative').value_counts()
poitive_labels = (train_oversampled_df['label']=='Positive').value_counts()
print("Negative labels count in train_oversampled_df ",negative_labels)
print("Positive labels count in train_oversampled_df ", poitive_labels)


Negative labels count in train_oversampled_df  label
False    25000
True     25000
Name: count, dtype: int64
Positive labels count in train_oversampled_df  label
True     25000
False    25000
Name: count, dtype: int64


### Tokenization

In [24]:
import ipywidgets as widgets
widgets.IntSlider()

IntSlider(value=0)

In [25]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

train_encodings = tokenizer(
    train_oversampled_df["Text"].tolist(),
    truncation=True,  # Ensures that long reviews are truncated to max length
    padding=True,  # Ensures all tokenized inputs have the same length
    max_length=512  # Maximum token length for transformer models
)

test_encodings=tokenizer(
    test_df["Text"].tolist(),
    truncation=True,
    padding=True,
    max_length=512
)

In [26]:
train_encodings
# Print an example tokenized input
print("Example Tokenized Input:", train_encodings['input_ids'][0])

Example Tokenized Input: [0, 42949, 5872, 6, 644, 2266, 35, 1437, 152, 21, 10, 372, 432, 15, 3668, 5, 275, 160, 12, 627, 12, 8877, 13491, 33367, 655, 328, 1437, 38, 348, 2162, 42, 13, 107, 31, 10, 400, 8290, 1400, 6, 53, 12056, 51, 214, 747, 66, 9, 388, 6, 98, 38, 439, 546, 804, 8, 303, 42, 372, 432, 15, 1645, 4, 1437, 31476, 124, 7, 645, 55, 122, 6, 53, 1645, 161, 24, 18, 22, 38501, 19216, 845, 1437, 1034, 14, 1022, 1010, 49069, 3809, 1589, 49007, 3809, 48709, 46599, 35, 1437, 1405, 2794, 329, 67, 1523, 5, 276, 1152, 11, 10, 25413, 6, 53, 7421, 5, 24623, 1732, 17893, 3625, 357, 4, 1437, 1491, 686, 596, 49069, 3809, 1589, 49007, 3809, 48709, 34543, 6, 759, 1824, 35, 1437, 3655, 1714, 10, 150, 124, 6, 8, 122, 42, 16, 10, 6587, 432, 328, 1437, 30374, 7, 28, 359, 7984, 131, 1629, 134, 73, 7424, 19, 481, 6738, 4, 1437, 978, 24, 18, 59, 68, 176, 73, 7424, 4, 1437, 7461, 36597, 2326, 2324, 42, 13, 359, 7984, 131, 1629, 134, 73, 7424, 6, 53, 9574, 127, 400, 1400, 1302, 7, 912, 28960, 24, 2740

In [27]:
print(train_oversampled_df["label"].unique())  # Check what values exist
print(test_df["label"].unique())


['Positive' 'Negative']
['Positive' 'Negative']


In [28]:
# Convert labels from strings to integers if necessary
train_oversampled_df["label"] = train_oversampled_df["label"].map({"Positive": 1, "Negative": 0})
test_df["label"] = test_df["label"].map({"Positive": 1, "Negative": 0})


In [29]:
print(train_oversampled_df["label"].head(10))  # Print first 10 values
print(test_df["label"].head(10))


0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
9    1
Name: label, dtype: int64
22786     1
283111    1
358844    0
510964    1
355866    1
414363    1
438515    0
438683    0
492653    1
412713    1
Name: label, dtype: int64


In [30]:
import torch

class AmazonReviewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings  # Tokenized inputs (input_ids, attention_mask)
        self.labels = torch.tensor(labels, dtype=torch.long)  # Ensure labels are long integers (0 or 1)

    def __len__(self):
        return len(self.labels)  # Number of samples in dataset

    def __getitem__(self, idx):
        # Convert tokenized inputs into PyTorch tensors
        item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]  # Ensure labels are single-value integers (0 or 1)
        return item

In [31]:
# Convert labels to a list format
train_labels = train_oversampled_df["label"].tolist()
test_labels = test_df["label"].tolist()

# Wrap tokenized data into PyTorch dataset
train_dataset = AmazonReviewsDataset(train_encodings, train_labels)
test_dataset = AmazonReviewsDataset(test_encodings, test_labels)

# Print dataset size
print(f"Train Dataset Size: {len(train_dataset)}")
print(f"Test Dataset Size: {len(test_dataset)}")


Train Dataset Size: 50000
Test Dataset Size: 12500


In [32]:
import torch

#model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained("distilroberta-base", num_labels=2)
#print(model)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
pip install transformers[torch]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [34]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",      # Save model checkpoints here
    evaluation_strategy="epoch", # Evaluate at the end of each epoch
    save_strategy="epoch",       # Save model at the end of each epoch
    per_device_train_batch_size=8,  # Adjust batch size based on GPU memory
    per_device_eval_batch_size=8,   # Batch size for evaluation
    num_train_epochs=3,         # Number of epochs (adjust as needed)
    logging_dir="./logs",       # Log directory
    logging_steps=500,          # Log every 500 steps
    load_best_model_at_end=True # Save the best model automatically
)




In [35]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Use tokenized train dataset
    eval_dataset=test_dataset     # Use tokenized test dataset
)

# Start fine-tuning
trainer.train()


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Get predictions
predictions = trainer.predict(test_dataset)
preds = torch.argmax(torch.tensor(predictions.predictions), dim=1).numpy()

# Print accuracy & classification report
accuracy = accuracy_score(test_df["label"].tolist(), preds)
print(f"Test Accuracy: {accuracy:.4f}")
print(classification_report(test_df["label"].tolist(), preds))


## Alternative approach :


In [76]:
pip install transformers pandas torch tqdm


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [78]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the fine-tuned RoBERTa model for binary sentiment classification
model_name = "siebert/sentiment-roberta-large-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Force model to run on CPU
device = torch.device("cpu")
model.to(device)


tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [79]:
def predict_sentiment(review_text):
    # Tokenize input text
    inputs = tokenizer(review_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

    # Run the model on the input
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted class (0=Negative, 1=Positive)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()

    # Map to binary sentiment
    return "Positive 😊" if predicted_class == 1 else "Negative 😡"

# Example Usage
review = "The food was delicious and fresh!"
print(f"Review Sentiment: {predict_sentiment(review)}")

review = "Terrible service, I will never buy this again!"
print(f"Review Sentiment: {predict_sentiment(review)}")


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Review Sentiment: Positive 😊
Review Sentiment: Negative 😡


In [82]:
import pandas as pd
from tqdm import tqdm

def predict_sentiments_batch(reviews, batch_size=16):
    results = []
    for i in tqdm(range(0, len(reviews), batch_size), desc="Processing Batches on CPU"):
        batch = reviews[i : i + batch_size]  # Get batch
        
        # Tokenize batch
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

        # Run model on batch
        with torch.no_grad():
            outputs = model(**inputs)

        # Convert logits to predictions
        predicted_classes = torch.argmax(outputs.logits, dim=1).tolist()

        # Map predictions to labels
        results.extend(["Positive 😊" if label == 1 else "Negative 😡" for label in predicted_classes])

    return results


In [None]:
# Predict for Train Set (50,000 reviews)
train_oversampled_df["Predicted Sentiment"] = predict_sentiments_batch(train_oversampled_df["Text"].tolist())

# Predict for Test Set (12,500 reviews)
test_df["Predicted Sentiment"] = predict_sentiments_batch(test_df["Text"].tolist())

# Display results
print(train_oversampled_df.head())
print(test_df.head())


Processing Batches on CPU:  22%|████████▌                              | 686/3125 [1:21:26<6:22:28,  9.41s/it]