# ***Download and Install Required Libraries***

In [1]:
!pip install transformers torch pandas numpy scikit-learn



# ***Upload and Load the Dataset***

In [None]:
from google.colab import files
import pandas as pd

# Upload the dataset
uploaded = files.upload()

In [9]:
# Load the dataset
data_path = "IMDB_Dataset.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# ***Preprocess the Data***

In [10]:
import re

def preprocess_data(df):
    # Clean reviews
    def clean_text(text):
        text = re.sub(r"http\S+", "", text)  # Remove URLs
        text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove non-alphanumeric characters
        text = text.lower().strip()  # Convert to lowercase and strip spaces
        return text

    df['cleaned_review'] = df['review'].apply(clean_text)
    df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
    return df[['cleaned_review', 'label']]

df = preprocess_data(df)
df.head()

Unnamed: 0,cleaned_review,label
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production br br the filmin...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


# ***Split Data and Create Dataset Class***

In [11]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

# Split dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['cleaned_review'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42
)

class ReviewDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# ***Train the BERT Model***

In [15]:
# Necessary imports
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Custom dataset class
class ReviewDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize tokenizer and datasets
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = ReviewDataset(train_texts, train_labels, tokenizer, max_len=128)
val_dataset = ReviewDataset(val_texts, val_labels, tokenizer, max_len=128)

# Load BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    load_best_model_at_end=True,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train model
trainer.train()

# Save model and tokenizer
model.save_pretrained('./bert_model')
tokenizer.save_pretrained('./bert_model/tokenizer')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2956,0.27709
2,0.18,0.310952
3,0.0806,0.430847


('./bert_model/tokenizer/tokenizer_config.json',
 './bert_model/tokenizer/special_tokens_map.json',
 './bert_model/tokenizer/vocab.txt',
 './bert_model/tokenizer/added_tokens.json')

# ***Predict Sentiment with the Trained Model***

In [20]:
# Load saved model and tokenizer
from transformers import BertTokenizer, BertForSequenceClassification
import torch

model_dir = "./bert_model"
tokenizer = BertTokenizer.from_pretrained(f"{model_dir}/tokenizer")
model = BertForSequenceClassification.from_pretrained(model_dir)

# Prediction function
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True, padding=True)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    prediction = torch.argmax(probs, dim=1).item()
    return "Positive" if prediction == 1 else "Negative"

# Test with example reviews
example_review = "This movie was fantastic! The characters were well-developed and the plot was engaging."
print(predict_sentiment(example_review))

example_review = "The movie was terrible. I wasted my time watching it."
print(predict_sentiment(example_review))

Positive
Negative
