In [None]:
!pip install transformers gradio

import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import gradio as gr
from sklearn.model_selection import train_test_split

# Checkpoint name shared by the classifier and its tokenizer
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Toy sentiment corpus. Labels: 1 = positive, 0 = negative.
# NOTE: the "neutral" sentence is labelled 1, as in the original data.
data = {
    "text": [
        "This is a positive review",
        "This is a negative review",
        "I love this product",
        "I hate this product",
        "This is a neutral review",
    ],
    "label": [1, 0, 1, 0, 1],
}

df = pd.DataFrame(data)

# Hold out 20% of the rows for validation. Each resulting Series gets a fresh
# 0..n-1 index so that ReviewDataset's integer lookups do not raise KeyError.
train_text, val_text, train_labels, val_labels = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df["text"], df["label"], test_size=0.2, random_state=42
    )
)

# Create a custom dataset class for our data
class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        texts: Sequence of review strings (list, tuple, pandas Series, ...).
        labels: Sequence of integer class ids, same length as ``texts``.
        tokenizer: Callable tokenizer; invoked as
            ``tokenizer(text, max_length=..., padding="max_length",
            truncation=True, return_attention_mask=True, return_tensors="pt")``
            and expected to return a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors of shape ``(1, max_length)``.
        max_length: Length every example is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Materialise as plain lists so __getitem__ is strictly positional.
        # Indexing a pandas Series with an int is label-based and raises
        # KeyError when the index is not 0..n-1 (e.g. after train_test_split).
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of 1-D tensors."""
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            # Drop the leading batch dimension added by return_tensors="pt";
            # the DataLoader re-adds it when collating.
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(int(self.labels[idx]), dtype=torch.long),
        }

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

# Wrap both splits as tokenizing torch datasets
train_dataset = ReviewDataset(train_text, train_labels, tokenizer)
val_dataset = ReviewDataset(val_text, val_labels, tokenizer)

# Batch the datasets; only the training split is shuffled
batch_size = 16
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False
)

# Training setup: use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Not used by the loop below, which reads the loss from the model output
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(5):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Move every tensor of the batch onto the training device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()

        # Passing labels makes the model return the loss alongside the logits
        outputs = model(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
        )
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

    # ---- validation pass (no gradients) ----
    model.eval()
    total_correct = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}

            logits = model(
                batch["input_ids"], attention_mask=batch["attention_mask"]
            ).logits
            # Predicted class = index of the largest logit in each row
            predicted = logits.argmax(dim=1)
            total_correct += (predicted == batch["labels"]).sum().item()

    accuracy = total_correct / len(val_labels)
    print(f"Epoch {epoch+1}, Val Accuracy: {accuracy:.4f}")

def predict(text: str) -> int:
    """Classify ``text`` with the fine-tuned model and return the class id.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Raw input string to classify.

    Returns:
        Index of the highest-scoring class as a Python int
        (per the training data: 1 = positive, 0 = negative).
    """
    # A single example needs no padding; truncation still caps the length
    # at the 512-token limit used during training.
    inputs = tokenizer(
        text,
        max_length=512,
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    # Move every returned tensor to the model's device, not just two of them.
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Make inference independent of whatever mode training left the model in
    # (disables dropout), and skip autograd bookkeeping for each request.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    return int(logits.argmax(dim=1).item())

# Gradio front-end: a text input is passed to predict() and the returned
# class id is shown in a label component.
demo = gr.Interface(
    title="Sentiment Analysis Demo",
    description="Enter a text to analyze its sentiment",
    fn=predict,
    inputs="text",
    outputs="label",
)

# Start the web app only when this file/cell runs as the main module
if __name__ == "__main__":
    demo.launch()

Collecting gradio
  Downloading gradio-5.25.2-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
