In [None]:
pip install transformers datasets torch


In [None]:
import inspect

import numpy as np
import torch
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer


In [None]:
# Fetch the IMDb dataset
dataset = load_dataset("imdb")

# Pull out the train and test splits
train_data, test_data = dataset['train'], dataset['test']

# Peek at the first training example
first_example = train_data[0]
print("Sample Text:", first_example['text'])
print("Sample Label:", first_example['label'])


In [None]:
# Tokenizer that matches the "distilbert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch of examples.

    Args:
        examples: Mapping with a ``'text'`` entry, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the texts, padded with
        ``padding="max_length"`` and truncated.
    """
    texts = examples['text']
    return tokenizer(texts, padding="max_length", truncation=True)


# Apply the tokenizer to both splits, one batch at a time
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_data, test_data)
)


In [None]:
from torch.utils.data import DataLoader

# Return PyTorch tensors and expose only the model inputs and the label
tensor_columns = ["input_ids", "attention_mask", "label"]
for split in (tokenized_train, tokenized_test):
    split.set_format("torch", columns=list(tensor_columns))

# Wrap both splits in DataLoaders; only the training batches are shuffled
train_dataloader = DataLoader(tokenized_train, batch_size=16, shuffle=True)
test_dataloader = DataLoader(tokenized_test, batch_size=16, shuffle=False)


In [None]:
# Instantiate the "distilbert-base-uncased" checkpoint with a
# two-label sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)


In [None]:
# Define evaluation metric
# Loads the "accuracy" metric object through `datasets.load_metric` and keeps
# it in the module-level name `metric`.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The class scores are expected
            on the last axis of ``logits``; ``labels`` holds the reference
            class ids.

    Returns:
        dict: ``{"accuracy": value}``, where ``value`` is the fraction of
        arg-max predictions that equal the reference labels.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Accuracy is computed directly with NumPy so this function does not
    # depend on an externally loaded metric object.
    matches = predictions == np.asarray(labels)
    return {"accuracy": float(np.mean(matches))}

# Define training arguments
# The per-epoch evaluation option is spelled `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Use whichever
# keyword the installed TrainingArguments accepts so this cell runs on both.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    **{_eval_strategy_kwarg: "epoch"},  # evaluate at the end of every epoch
)

# Set up Trainer
# Newer transformers releases take the tokenizer through `processing_class`;
# older ones only know `tokenizer`. Use whichever keyword the installed
# Trainer accepts so this cell runs on both.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)


In [None]:
# Train the model
# Runs training through the `trainer` configured above. Because this call is
# the last expression in the cell, its return value is shown as the cell output.
trainer.train()


In [None]:
# Run the trainer's evaluation pass and report the returned metrics
results = trainer.evaluate()
print(f"Evaluation Results: {results}")


In [None]:
# Define a custom text
text = "I absolutely loved this movie! The acting was fantastic."

# Tokenize the text and move the tensors to the device the model lives on.
# After training, the model may sit on an accelerator while freshly created
# tensors are on the CPU, which would raise a device-mismatch error.
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)

# Get predictions: switch off dropout and gradient tracking for inference
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Bring the logits back to the CPU before converting them to NumPy
predictions = np.argmax(outputs.logits.detach().cpu().numpy(), axis=1)

# Interpret predictions
label_map = {0: "negative", 1: "positive"}
print(f"Sentiment: {label_map[int(predictions[0])]}")
