In [1]:
# Install the notebook's third-party dependencies quietly (-q). Versions are not pinned,
# so results may differ between runs made at different times.
!pip install -q "transformers" "datasets" "evaluate" "gradio"

import os
# Set before the Hugging Face imports below so the flag is already in the environment.
os.environ["WANDB_DISABLED"] = "true"   # no wandb popup

import numpy as np
import pandas as pd
import torch

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    TextClassificationPipeline,
)
import evaluate
import gradio as gr

# Simple confirmation that every import above succeeded.
print("Libraries imported. Using Transformers, Datasets, Evaluate, Gradio.")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hLibraries imported. Using Transformers, Datasets, Evaluate, Gradio.


In [2]:
# Load every IMDb split through `load_dataset` and report which splits came back.
raw_datasets = load_dataset("imdb")
print("Loaded IMDb dataset splits:", raw_datasets.keys())

# Sizes of the small subsets used to keep the experiment fast.
train_size, test_size = 1000, 200


def _take_shuffled(split, count):
    """Shuffle `split` with the fixed seed 42 and keep its first `count` rows."""
    return split.shuffle(seed=42).select(range(count))


small_train = _take_shuffled(raw_datasets["train"], train_size)
small_test = _take_shuffled(raw_datasets["test"], test_size)

for caption, subset in (("Training", small_train), ("Test", small_test)):
    print(f"{caption} examples: {subset.num_rows}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Loaded IMDb dataset splits: dict_keys(['train', 'test', 'unsupervised'])
Training examples: 1000
Test examples: 200


In [3]:
# Checkpoint used as the starting point for both model variants in this notebook.
model_name = "prajjwal1/bert-tiny"
print(f"Base model for this experiment: {model_name}")

# Tokenizer matching the checkpoint above; shared by all later cells.
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(f"Tokenizer loaded for: {model_name}")


Base model for this experiment: prajjwal1/bert-tiny


config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Tokenizer loaded for: prajjwal1/bert-tiny


In [4]:
def tokenize_function(batch):
    """Tokenize the "text" entries of `batch` with the notebook-level `tokenizer`.

    Args:
        batch: Mapping with a "text" key holding the raw review strings.

    Returns:
        Whatever `tokenizer` returns when called with padding="max_length",
        truncation enabled and max_length=128.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(batch["text"], **encode_options)

# Each split is tokenized in batches, stripped of its raw "text" column and
# switched to torch-formatted output in a single chained expression.
print("Tokenizing training data...")
tokenized_train = (
    small_train
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .with_format("torch")
)

print("Tokenizing test data...")
tokenized_test = (
    small_test
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .with_format("torch")
)

print("Tokenization complete. Columns now:", tokenized_train.column_names)


Tokenizing training data...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing test data...


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenization complete. Columns now: ['label', 'input_ids', 'token_type_ids', 'attention_mask']


In [5]:
# Load the four classification metrics in order; these names are read by compute_metrics.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

print("Loaded metrics: accuracy, precision, recall, f1")

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: Pair `(logits, labels)`. The predicted class of each row is
            the argmax of `logits` over the last axis.

    Returns:
        A single dict merging the results of the notebook-level accuracy,
        precision, recall and F1 metric objects (binary averaging for the
        last three).
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    result = {}
    result.update(accuracy_metric.compute(predictions=preds, references=labels))
    # zero_division=0: when the model predicts no positives (or there are no
    # positive references) the score is reported as 0 without emitting the
    # "ill-defined" warning; the numeric value is the same as the default.
    result.update(
        precision_metric.compute(
            predictions=preds, references=labels, average="binary", zero_division=0
        )
    )
    result.update(
        recall_metric.compute(
            predictions=preds, references=labels, average="binary", zero_division=0
        )
    )
    result.update(f1_metric.compute(predictions=preds, references=labels, average="binary"))
    return result


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Loaded metrics: accuracy, precision, recall, f1


In [6]:
print("Loading pretrained classification model (Model 1: bert_tiny_pretrained)...")
# The checkpoint carries no classifier weights, so the 2-label head created here is
# randomly initialized. Seed torch first so this untrained baseline is reproducible
# across kernel restarts instead of changing on every run.
torch.manual_seed(42)
model_pre = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Evaluation-only configuration: training disabled, no external experiment reporting.
eval_args = TrainingArguments(
    output_dir="tmp_eval",
    per_device_eval_batch_size=8,
    do_train=False,
    do_eval=True,
    logging_steps=10,
    report_to="none"
)

# The Trainer is used purely as an evaluation harness for the baseline model.
trainer_pre = Trainer(
    model=model_pre,
    args=eval_args,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics
)

print("Evaluating Model 1 (bert_tiny_pretrained) on test subset...")
metrics_before = trainer_pre.evaluate()
print("Model 1 metrics:", metrics_before)


Loading pretrained classification model (Model 1: bert_tiny_pretrained)...


pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating Model 1 (bert_tiny_pretrained) on test subset...




model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Model 1 metrics: {'eval_loss': 0.7061797976493835, 'eval_model_preparation_time': 0.0073, 'eval_accuracy': 0.52, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 1.8774, 'eval_samples_per_second': 106.53, 'eval_steps_per_second': 13.316}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
print("Loading model for fine-tuning (Model 2: bert_tiny_finetuned)...")
# The classifier head is randomly initialized when the checkpoint is loaded; seed
# torch first so the starting point of fine-tuning is reproducible.
torch.manual_seed(42)
model_ft = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

train_args = TrainingArguments(
    output_dir="bert_tiny_imdb_ft",
    eval_strategy="no",          # or evaluation_strategy="no" on older transformers
    save_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    # A positive max_steps overrides num_train_epochs, so only max_steps is set:
    # the run length is exactly 40 optimizer steps (kept short for speed).
    max_steps=40,
    logging_steps=10,
    report_to="none"
)

trainer_ft = Trainer(
    model=model_ft,
    args=train_args,
    train_dataset=tokenized_train,
    compute_metrics=compute_metrics
)

print("Starting training for Model 2 (bert_tiny_finetuned)...")
trainer_ft.train()
print("Training finished. Evaluating Model 2 on test subset...")
# No eval dataset was given to the Trainer, so the test subset is passed explicitly.
metrics_after = trainer_ft.evaluate(eval_dataset=tokenized_test)
print("Model 2 metrics:", metrics_after)


Loading model for fine-tuning (Model 2: bert_tiny_finetuned)...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training for Model 2 (bert_tiny_finetuned)...




Step,Training Loss
10,0.743
20,0.7108
30,0.7014
40,0.7319


Training finished. Evaluating Model 2 on test subset...


Model 2 metrics: {'eval_loss': 0.7059720754623413, 'eval_accuracy': 0.48, 'eval_precision': 0.48, 'eval_recall': 1.0, 'eval_f1': 0.6486486486486487, 'eval_runtime': 1.2507, 'eval_samples_per_second': 159.905, 'eval_steps_per_second': 19.988, 'epoch': 0.32}


In [8]:
# One row per model: its label followed by every metric its evaluation returned.
labelled_metrics = (
    ("bert_tiny_pretrained", metrics_before),
    ("bert_tiny_finetuned", metrics_after),
)
df_before_after = pd.DataFrame(
    [{"model": label, **metrics} for label, metrics in labelled_metrics]
)

print("Comparison of models (pretrained vs finetuned):")
# Only the headline classification metrics are shown; the frame keeps all columns.
summary_columns = ["model", "eval_accuracy", "eval_precision", "eval_recall", "eval_f1"]
display(df_before_after[summary_columns])


Comparison of models (pretrained vs finetuned):


Unnamed: 0,model,eval_accuracy,eval_precision,eval_recall,eval_f1
0,bert_tiny_pretrained,0.52,0.0,0.0,0.0
1,bert_tiny_finetuned,0.48,0.48,1.0,0.648649


In [9]:
print("Building inference pipelines for custom sentence testing...")
# 0 selects the first CUDA device, -1 the CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1

# top_k=None requests the score of every label; it is the supported replacement
# for the deprecated return_all_scores=True.
pipe_pre = TextClassificationPipeline(
    model=model_pre,
    tokenizer=tokenizer,
    top_k=None,
    device=pipeline_device
)

pipe_ft = TextClassificationPipeline(
    model=model_ft,
    tokenizer=tokenizer,
    top_k=None,
    device=pipeline_device
)

custom_texts = [
    "This movie was absolutely fantastic, I loved every minute.",
    "The film was boring and way too long.",
    "Great acting but the story was weak.",
    "Terrible script and horrible direction.",
    "Not bad, but I expected more."
]


def _pos_neg_scores(pipe, text):
    """Return `(positive_score, negative_score)` for `text` as scored by `pipe`."""
    scores = pipe(text)
    # For a single string the pipeline may return the per-label dicts either
    # directly or wrapped in an outer list; accept both shapes.
    if scores and isinstance(scores[0], list):
        scores = scores[0]
    pos = next(s["score"] for s in scores if s["label"] in ("LABEL_1", "POSITIVE"))
    neg = next(s["score"] for s in scores if s["label"] in ("LABEL_0", "NEGATIVE"))
    return pos, neg


rows = []
for i, text in enumerate(custom_texts):
    for model_label, pipe in [("bert_tiny_pretrained", pipe_pre), ("bert_tiny_finetuned", pipe_ft)]:
        pos_score, neg_score = _pos_neg_scores(pipe, text)
        # Ties are resolved in favour of "positive".
        pred_label = "positive" if pos_score >= neg_score else "negative"
        rows.append({
            "sample_id": i,
            # Long sentences are shortened to 60 characters for display only.
            "text": text[:60] + ("..." if len(text) > 60 else ""),
            "model": model_label,
            "pred_label": pred_label,
            "pos_conf": round(float(pos_score), 4),
            "neg_conf": round(float(neg_score), 4),
        })

df_custom = pd.DataFrame(rows)
print("Custom sentence predictions for both models:")
display(df_custom)


Device set to use cpu
Device set to use cpu


Building inference pipelines for custom sentence testing...
Custom sentence predictions for both models:


Unnamed: 0,sample_id,text,model,pred_label,pos_conf,neg_conf
0,0,"This movie was absolutely fantastic, I loved e...",bert_tiny_pretrained,negative,0.3647,0.6353
1,0,"This movie was absolutely fantastic, I loved e...",bert_tiny_finetuned,positive,0.5895,0.4105
2,1,The film was boring and way too long.,bert_tiny_pretrained,negative,0.3865,0.6135
3,1,The film was boring and way too long.,bert_tiny_finetuned,positive,0.5878,0.4122
4,2,Great acting but the story was weak.,bert_tiny_pretrained,negative,0.3825,0.6175
5,2,Great acting but the story was weak.,bert_tiny_finetuned,positive,0.5738,0.4262
6,3,Terrible script and horrible direction.,bert_tiny_pretrained,negative,0.3847,0.6153
7,3,Terrible script and horrible direction.,bert_tiny_finetuned,positive,0.5555,0.4445
8,4,"Not bad, but I expected more.",bert_tiny_pretrained,negative,0.3956,0.6044
9,4,"Not bad, but I expected more.",bert_tiny_finetuned,positive,0.552,0.448


In [10]:
def classify_review(text, model_choice):
    """Classify one movie review with the selected model.

    Args:
        text: Review text typed by the user. `None`, empty or whitespace-only
            input is not sent to the model.
        model_choice: "bert_tiny_finetuned" selects the fine-tuned pipeline;
            any other value selects the pretrained one.

    Returns:
        Dict with keys "chosen_model", "label" ("positive"/"negative"),
        "positive_confidence" and "negative_confidence". For blank input the
        label and both confidences are `None`.
    """
    pipe = pipe_ft if model_choice == "bert_tiny_finetuned" else pipe_pre

    # Nothing to classify: return an explicit "no prediction" instead of scoring
    # an empty string (or crashing on None).
    if text is None or not text.strip():
        return {
            "chosen_model": model_choice,
            "label": None,
            "positive_confidence": None,
            "negative_confidence": None,
        }

    # The UI accepts arbitrary-length text. Truncate to the same 128-token window
    # used when tokenizing the training data so long reviews cannot exceed the
    # model's maximum sequence length.
    scores = pipe(text, truncation=True, max_length=128)
    # For a single string the pipeline may return the per-label dicts either
    # directly or wrapped in an outer list; accept both shapes.
    if scores and isinstance(scores[0], list):
        scores = scores[0]

    pos_score = next(s["score"] for s in scores if s["label"] in ("LABEL_1", "POSITIVE"))
    neg_score = next(s["score"] for s in scores if s["label"] in ("LABEL_0", "NEGATIVE"))
    # Ties are resolved in favour of "positive".
    pred_label = "positive" if pos_score >= neg_score else "negative"
    return {
        "chosen_model": model_choice,
        "label": pred_label,
        "positive_confidence": float(pos_score),
        "negative_confidence": float(neg_score)
    }

# The two model variants offered in the UI, in display order.
model_options = ["bert_tiny_pretrained", "bert_tiny_finetuned"]

print("Launching Gradio UI. You can choose between:")
for option in model_options:
    print(f" - {option}")

# Input widgets: free-text review plus a model selector defaulting to the fine-tuned model.
review_box = gr.Textbox(lines=3, label="Enter movie review")
model_picker = gr.Dropdown(
    choices=list(model_options),
    value="bert_tiny_finetuned",
    label="Model",
)

# classify_review receives (review text, selected model) and its dict is rendered as JSON.
demo = gr.Interface(
    fn=classify_review,
    inputs=[review_box, model_picker],
    outputs="json",
    title="IMDb Sentiment Demo (bert-tiny models)",
)

demo.launch()


Launching Gradio UI. You can choose between:
 - bert_tiny_pretrained
 - bert_tiny_finetuned
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://60600c04c4cd10d357.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


