## **Install necessary libraries**
The required libraries are assumed to be installed already.

### **Step1: Import the necessary libraries**

In [None]:
import os
import torch
from torch import nn
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer , BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


## **Step2: Import the IMDB (movie review) dataset and process it**

In [None]:
def load_imdb_dataset(data_file):
    """Read the IMDB review CSV and return parallel text and label lists.

    Args:
        data_file: Path (or any other source accepted by ``pandas.read_csv``)
            of a CSV that has a ``review`` column and a ``sentiment`` column.

    Returns:
        A ``(texts, labels)`` tuple. ``texts`` is the list of values from the
        ``review`` column; ``labels`` is a list of ints where ``1`` marks a
        sentiment equal to ``'positive'`` and ``0`` marks any other value.
    """
    reviews = pd.read_csv(data_file)
    sentiments = reviews['sentiment'].tolist()
    # The comparison yields a bool; int() turns it into the 1/0 label.
    labels = [int(sentiment == 'positive') for sentiment in sentiments]
    texts = reviews['review'].tolist()
    return texts, labels

In [None]:
# Location of the IMDB CSV file. NOTE(review): the "/content/" prefix looks
# like a Colab upload directory — confirm the path for other environments.
data_file = "/content/IMDB Dataset.csv"
# Parallel lists: the review texts and their 0/1 sentiment labels.
texts, labels = load_imdb_dataset(data_file)


In [None]:
# Sanity check: the two lists are parallel, so both counts should match.
print(len(texts))
print(len(labels))

50000
50000


## **Create a custom dataset class for text classification**
- This class helps us organize the movie reviews and their sentiments for our BERT model. <br>
- It also takes care of tokenizing the input text, <br>
- handling the sequence length of the text (padding or truncating it to `max_length`),
- and providing a neat package of input IDs,
- attention masks, and labels for our model to learn from.

In [None]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        texts: Indexable collection of raw texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``; its
            result must support ``['input_ids']`` and ``['attention_mask']``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and bundle it with its label.

        Returns:
            A dict with the flattened ``'input_ids'`` and ``'attention_mask'``
            tensors from the tokenizer and the label wrapped in a tensor
            under ``'label'``.
        """
        text, label = self.texts[idx], self.labels[idx]
        tokenized = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        sample = {}
        for key in ('input_ids', 'attention_mask'):
            # Flatten so each item is 1-D and the DataLoader adds the batch
            # dimension itself.
            sample[key] = tokenized[key].flatten()
        sample['label'] = torch.tensor(label)
        return sample


## **Build our custom BERT model classifier**
- In this step we create a custom BERT classifier on top of the base BERT model, which is great at understanding text.
- Then we add a dropout layer ---> to keep things in check (it helps reduce overfitting).
- After the dropout layer, we add a linear layer ---> which helps us classify the text.

In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        bert_model_name: Identifier passed to ``BertModel.from_pretrained``.
        num_classes: Number of logits produced by the linear head.
        dropout_prob: Dropout probability applied to the pooled output before
            the linear head. Defaults to ``0.1``, the value that used to be
            hard-coded, so existing callers behave exactly as before.
    """

    def __init__(self, bert_model_name, num_classes, dropout_prob=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(dropout_prob)
        # The head's input width follows the encoder's configured hidden size.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        Args:
            input_ids: Token id tensor forwarded to the BERT encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            The output of the linear head applied to the dropped-out
            ``pooler_output`` of the encoder.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        return self.fc(pooled_output)





## **Train function**
The train() function takes the model, data loader, optimizer, scheduler, and device as its arguments, and runs one pass over the data loader.

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training pass over ``data_loader`` and return the mean loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output is fed to ``nn.CrossEntropyLoss`` as logits.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, right
            after the optimizer.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The mean cross-entropy loss over all batches as a ``float``, or
        ``0.0`` when ``data_loader`` yields no batches. Earlier versions
        returned ``None``; callers that ignore the result are unaffected.
    """
    model.train()
    # The loss module is stateless, so build it once instead of per batch.
    criterion = nn.CrossEntropyLoss()
    loss_sum = 0.0
    batch_count = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Accumulate a detached tensor so no autograd graph is kept alive and
        # the value is only converted to a Python float once, at the end.
        loss_sum = loss_sum + loss.detach()
        batch_count += 1
    if batch_count == 0:
        return 0.0
    return float(loss_sum) / batch_count

## **Build our model evaluation method**


In [None]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``;
            the predicted class is the index of the largest value along
            dimension 1 of its output.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        A ``(accuracy, report)`` tuple: the results of ``accuracy_score`` and
        ``classification_report`` computed over all batches.
    """
    model.eval()
    predicted, expected = [], []
    # Inference only: gradient tracking is switched off for the whole loop.
    with torch.no_grad():
        for batch in data_loader:
            model_inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
            }
            labels = batch['label'].to(device)
            logits = model(**model_inputs)
            batch_preds = logits.max(dim=1).indices
            predicted += batch_preds.cpu().tolist()
            expected += labels.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

## **Build our model prediction method**
- The predict_sentiment() function classifies a single piece of text with the trained model.
- It tokenizes the text, moves the input IDs and attention mask to the device, and feeds them to the model.
- The model then gives its best prediction, which is returned as "positive" (class 1) or "negative".

This differs from evaluate() above, which iterates over batches, compares the model's predictions to the actual labels, and calculates the accuracy score and a classification report to let us know how well the model did in understanding movie reviews' sentiments.

In [None]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Classify ``text`` and return ``"positive"`` or ``"negative"``.

    Args:
        text: Text handed to ``tokenizer``.
        model: Module called as ``model(input_ids=..., attention_mask=...)``;
            it is switched to eval mode before the forward pass.
        tokenizer: Callable invoked with ``return_tensors='pt'``,
            ``max_length``, ``padding='max_length'`` and ``truncation=True``.
        device: Device the encoded tensors are moved to.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        ``"positive"`` when the index of the largest output value along
        dimension 1 is ``1``; ``"negative"`` for any other index.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    model_inputs = {
        'input_ids': encoded['input_ids'].to(device),
        'attention_mask': encoded['attention_mask'].to(device),
    }

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        logits = model(**model_inputs)
        predicted_class = logits.max(dim=1).indices
    # .item() requires exactly one prediction, i.e. a single encoded text.
    if predicted_class.item() == 1:
        return "positive"
    return "negative"


## **Define our model parameters**
- Here we define the essential parameters used to fine-tune the BERTClassifier.
- These include the BERT model name, number of classes, maximum input sequence length, batch size, number of training epochs, and learning rate, which help the model effectively understand movie reviews and their sentiment.

In [None]:
# Set up parameters
# Identifier handed to both the tokenizer and the BERT model loaders.
bert_model_name = 'bert-base-uncased'
# Size of the classifier's output layer (the labels are 0 and 1).
num_classes = 2
# max_length forwarded to the tokenizer, which pads/truncates to it.
max_length = 128
# Batch size used by both the training and validation data loaders.
batch_size = 16
# Number of passes over the training data; also scales the scheduler's total steps.
num_epochs = 4
# Learning rate passed to the optimizer.
learning_rate = 2e-5

## **Load and split the dataset**

In [None]:
# Hold out 20% of the reviews for validation; random_state is fixed so the
# same split is produced on every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

## **Initialize tokenizer, dataset, and data loader**

In [None]:
# Tokenizer matching the pretrained checkpoint named by bert_model_name.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
# Datasets tokenize lazily, one review per __getitem__ call.
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
# Only the training loader shuffles; the validation loader keeps dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

## **Set up the device and model**


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Build the classifier and move its parameters to the selected device.
model = BERTClassifier(bert_model_name, num_classes).to(device)

## **Set up optimizer and learning rate scheduler**

In [None]:
# NOTE(review): this AdamW is the one imported from transformers; that
# implementation is reported as deprecated in favor of torch.optim.AdamW —
# confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=learning_rate)
# train() steps the scheduler once per batch, so the schedule length is
# batches-per-epoch times the number of epochs.
total_steps = len(train_dataloader) * num_epochs
# Linear schedule with zero warmup steps over total_steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



## **Training the model**


In [None]:
# One training pass followed by a full validation pass per epoch.
for epoch in range(num_epochs):
  print(f"Epoch {epoch + 1}/{num_epochs}")
  train(model, train_dataloader, optimizer, scheduler, device)
  # evaluate() returns the accuracy score and the text classification report.
  accuracy, report = evaluate(model, val_dataloader, device)
  print(f"Validation Accuracy: {accuracy:.4f}")
  print(report)

Epoch 1/4
Validation Accuracy: 0.8930
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      4961
           1       0.91      0.88      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Epoch 2/4
Validation Accuracy: 0.8922
              precision    recall  f1-score   support

           0       0.87      0.92      0.89      4961
           1       0.92      0.86      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Epoch 3/4
Validation Accuracy: 0.8960
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      4961
           1       0.90      0.90      0.90      5039

    accuracy                           0.90     10000
   macro avg  

## **Save the trained model**

In [None]:
# Persist only the weights (state_dict); loading them back requires building a
# BERTClassifier first and then calling load_state_dict on it.
torch.save(model.state_dict(), "my_bert_classifier.pth")

## **Evaluating our model performance**

In [None]:
# Test sentiment prediction
test_text = "The movie was great and I really enjoyed the performances of the actors."
sentiment = predict_sentiment(test_text, model, tokenizer, device)
# Echo the classified text through the variable rather than a second literal.
print(test_text)
print(f"Predicted sentiment: {sentiment}")

The movie was great and I really enjoyed the performances of the actors.
Predicted sentiment: positive


In [None]:
# Test sentiment prediction
test_text = "The movie was so bad and I would not recommend it to anyone."
sentiment = predict_sentiment(test_text, model, tokenizer, device)
# Echo the classified text through the variable rather than a second literal.
print(test_text)
print(f"Predicted sentiment: {sentiment}")

The movie was so bad and I would not recommend it to anyone.
Predicted sentiment: negative


In [None]:
# Test sentiment prediction
test_text = "Worst movie of the year."
sentiment = predict_sentiment(test_text, model, tokenizer, device)
# Echo the classified text through the variable rather than a second literal.
print(test_text)
print(f"Predicted sentiment: {sentiment}")

Worst movie of the year.
Predicted sentiment: negative


In [None]:
# Test sentiment prediction
test_text = "The movie was very intersting"
sentiment = predict_sentiment(test_text, model, tokenizer, device)
# Print the text that was actually classified. The previous version printed a
# hard-coded "Worst movie of the year." literal, which mislabeled the result.
print(test_text)
print(f"Predicted sentiment: {sentiment}")

Worst movie of the year.
Predicted sentiment: negative
