In [None]:
import csv

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification


In [None]:
# Load a sentiment dataset (IMDB reviews) stored as tab-separated "text<TAB>label" rows
url = "https://raw.githubusercontent.com/datasets/sentiment-analysis-imdb/master/data/imdb_labelled.txt"
# The text column is free-form prose. With pandas' default quote handling an
# unbalanced double quote inside a review starts a quoted field that swallows
# the following lines, silently merging several reviews into one row.
# Disabling quoting makes every physical line exactly one record.
df = pd.read_csv(url, delimiter="\t", names=["text", "label"], quoting=csv.QUOTE_NONE)

# Fail early on rows that did not parse into a 0/1 label (NaN counts as invalid);
# the classifier trained below is binary, so any other value would only surface
# later as a confusing tensor or loss error.
bad_labels = ~df["label"].isin([0, 1])
if bad_labels.any():
    raise ValueError(
        f"{int(bad_labels.sum())} row(s) have a label other than 0/1; check the dataset file format."
    )

# Display dataset sample
print("Dataset Sample:")
print(df.head())

# Check dataset balance
print("\nDataset Shape:", df.shape)
print("Label Distribution:\n", df['label'].value_counts())


In [None]:
# Fetch the tokenizer published with the pre-trained checkpoint named below
PRETRAINED_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

# Define a custom Dataset class
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenized texts with their integer labels.

    All texts are tokenized once, up front, in a single tokenizer call; items
    are then served by indexing into the resulting tensors.

    Args:
        texts: Sequence of raw strings to encode (list, tuple, pandas Series, ...).
            A single string is treated as a one-element batch.
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(texts, padding=True,
            truncation=True, max_length=max_length, return_tensors="pt")``
            whose result exposes ``.items()`` yielding ``(name, tensor)`` pairs.
        max_length: Value forwarded to the tokenizer's ``max_length`` argument.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Normalize to a plain list so any sequence type is accepted; a bare
        # string must not be split into characters.
        texts = [texts] if isinstance(texts, str) else list(texts)
        if len(texts) != len(labels):
            # Without this check a mismatch is either silently truncated
            # (more texts than labels) or fails later with an IndexError.
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
        # Class-index targets for a cross-entropy loss must be integer (long) tensors
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(features, label)`` where ``features`` maps each encoding name to row ``idx``."""
        return {key: val[idx] for key, val in self.encodings.items()}, self.labels[idx]

    def __len__(self):
        """Number of examples, i.e. the number of labels."""
        return len(self.labels)

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
all_texts = df["text"].tolist()
all_labels = df["label"].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

# Tokenize each split once and wrap it in a torch Dataset
train_dataset = SentimentDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
test_dataset = SentimentDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)

print("\nTraining Dataset Size:", len(train_dataset))
print("Test Dataset Size:", len(test_dataset))


In [None]:
# Seed torch's global RNG before building the model. The data split above is
# already pinned with random_state=42, but any weights that are randomly
# initialized on load and the shuffle order of the training DataLoader draw
# from torch's generator, so without a seed results differ from run to run.
SEED = 42
torch.manual_seed(SEED)

# Load the pre-trained BERT model for sentiment classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)  # Binary classification
# Prefer the GPU when one is available; everything else falls back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("\nBERT Model Loaded Successfully!")


In [None]:
from torch.optim import AdamW
from torch.utils.data import DataLoader

# Training hyper-parameters
BATCH_SIZE = 16
EPOCHS = 3

# Batch iterators: only the training split is shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE)

# AdamW over every model parameter, with cross-entropy on the logits as the objective
optimizer = AdamW(params=model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
def train(model, dataloader):
    """Run one optimization pass over ``dataloader`` and return the mean batch loss.

    Relies on the notebook-level ``optimizer``, ``loss_fn`` and ``device``.

    Args:
        model: Module called as ``model(**inputs)``; its output must expose ``.logits``.
        dataloader: Iterable yielding ``(inputs, labels)`` pairs, where ``inputs``
            is a dict of tensors.

    Returns:
        The sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []
    for inputs, targets in dataloader:
        # Move the whole batch onto the device the model runs on
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        targets = targets.to(device)

        # Standard step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        logits = model(**inputs).logits
        step_loss = loss_fn(logits, targets)
        step_loss.backward()
        optimizer.step()
        batch_losses.append(step_loss.item())

    return sum(batch_losses) / len(dataloader)

# Testing loop
def evaluate(model, dataloader):
    """Collect predicted and true class ids for every example in ``dataloader``.

    Relies on the notebook-level ``device``.

    Args:
        model: Module called as ``model(**inputs)``; its output must expose ``.logits``.
        dataloader: Iterable yielding ``(inputs, labels)`` pairs, where ``inputs``
            is a dict of tensors and ``labels`` is a CPU tensor.

    Returns:
        A ``(preds, truths)`` tuple of lists: the argmax class per example and
        the corresponding label, in iteration order.
    """
    model.eval()
    predicted, expected = [], []
    # No gradients are needed while scoring
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            logits = model(**inputs).logits
            batch_preds = logits.argmax(dim=1)
            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(targets.numpy())

    return predicted, expected


In [None]:
# Fine-tune for EPOCHS passes over the training loader, reporting the mean loss of each pass
for epoch in range(EPOCHS):
    avg_loss = train(model=model, dataloader=train_loader)
    print(f"Epoch {epoch + 1}/{EPOCHS} - Average Loss: {avg_loss:.4f}")

# Score the held-out split
preds, truths = evaluate(model=model, dataloader=test_loader)
accuracy = accuracy_score(y_true=truths, y_pred=preds)

print("\nTest Accuracy:", accuracy)
report = classification_report(y_true=truths, y_pred=preds)
print("\nClassification Report:\n", report)


In [None]:
# Function to predict sentiment for new text
def predict_sentiment(text, tokenizer, model):
    """Classify ``text`` as ``"Positive"`` or ``"Negative"``.

    Args:
        text: Input handed to ``tokenizer``; expected to encode to a single example.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            truncation=True, padding=True, max_length=128)`` whose result
            exposes ``.items()`` yielding ``(name, tensor)`` pairs.
        model: ``torch.nn.Module`` called as ``model(**inputs)``; its output
            must expose ``.logits`` with the class scores along dimension 1.

    Returns:
        ``"Positive"`` when the highest-scoring class index is 1, otherwise
        ``"Negative"``.
    """
    model.eval()

    # Send inputs to wherever the model's weights live instead of depending on
    # a notebook-level ``device`` variable that may be stale or undefined.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    # Inference only: skip autograd bookkeeping so no graph is kept in memory
    with torch.no_grad():
        outputs = model(**inputs)

    pred = torch.argmax(outputs.logits, dim=1).item()
    return "Positive" if pred == 1 else "Negative"

# Try the fine-tuned classifier on two hand-written reviews
sample_texts = [
    "I loved this movie. The acting was fantastic and the story was engaging!",
    "This film was a waste of time. Terrible plot and bad acting."
]

for text in sample_texts:
    sentiment = predict_sentiment(text, tokenizer=tokenizer, model=model)
    print(f"Review: {text}\nPredicted Sentiment: {sentiment}\n")
