<a href="https://colab.research.google.com/github/kawsarahmd/transformer-text-classifier/blob/main/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install torch transformers numpy pandas scikit-learn
!pip install datasets
!pip install tqdm



In [9]:
from datasets import load_dataset

imdb = load_dataset("imdb")

imdb["test"][1000]

{'text': 'This film is about a struggling actor trying to find satisfaction in life, especially love which he has not had a taste of for 5 years.<br /><br />It basically is a film featuring a man with very poor social skills, and he says wrong things all the time. The plot is hollow and contrived. The main character, James, is lonely, but this theme of loneliness is not adequately explored. It is more like an empty statement which other subplots stem from. Sadness and disappointment after being dumped are superficial. There is a serious lack of emotions in the film.<br /><br />It is not funny as a comedy either. There are some funny one liners but that is it. It lacks the happy and uplifting atmosphere to infect people with happy mood. I don\'t find "I Want Someone to Eat Cheese With" funny.',
 'label': 0}

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer
import torch
from tqdm import tqdm

# Convert the given dataset into a Pandas DataFrame
def convert_to_dataframe(imdb_data):
    text_data = []
    labels = []

    for item in imdb_data["test"]:
        text_data.append(item["text"])
        labels.append(item["label"])

    return pd.DataFrame({
        'review': text_data,
        'label': labels
    })

data = convert_to_dataframe(imdb)
data = data.sample(frac=1).reset_index(drop=True)  # Shuffle the dataset

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Tokenize the data using BERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
max_seq_len = 128

# Follow the tokenization function from the previous response
def tokenize_data(data, tokenizer, max_seq_len):
    input_ids, attention_masks, labels = [], [], []

    # Wrap the loop with tqdm to display a progress bar
    for index, row in tqdm(data.iterrows(), total=len(data)):
        encoded = tokenizer.encode_plus(
            row["review"],
            add_special_tokens=True,  # Add [CLS] and [SEP]
            max_length=max_seq_len,  # Set maximum sequence length
            padding="max_length",  # Pad shorter sequences
            truncation=True,  # Truncate longer sequences
            return_attention_mask=True,  # Return attention masks
        )

        input_ids.append(encoded["input_ids"])
        attention_masks.append(encoded["attention_mask"])
        labels.append(row["label"])

    return torch.tensor(input_ids), torch.tensor(attention_masks), torch.tensor(labels)

train_input_ids, train_attention_masks, train_labels = tokenize_data(train_data, tokenizer, max_seq_len)
val_input_ids, val_attention_masks, val_labels = tokenize_data(val_data, tokenizer, max_seq_len)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 20000/20000 [02:17<00:00, 145.50it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5000/5000 [00:32<00:00, 154.54it/s]


In [11]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 16

# Create DataLoader for the training set
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Create DataLoader for the validation set
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

In [14]:
from transformers import DistilBertForSequenceClassification, BertConfig
# import torch.optim.adamw as AdamW
from tqdm import tqdm
from torch.optim import AdamW

# Load the pre-trained BERT model
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,  # Use 2 labels for binary classification, adjust it for multi-class problems
    output_attentions=False,
    output_hidden_states=False,
)

# Move the model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [15]:
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report

num_epochs = 3
total_steps = len(train_dataloader) * num_epochs

optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    progress_bar = tqdm(dataloader, desc="Training", position=0, leave=True)
    for batch in progress_bar:
        input_ids, attention_masks, labels = [t.to(device) for t in batch]

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

        progress_bar.set_description(f"Training - Loss: {loss.item():.4f}")

    return total_loss / len(dataloader)

def evaluate(model, dataloader, device):
    model.eval()
    total_eval_accuracy = 0

    progress_bar = tqdm(dataloader, desc="Evaluation", position=0, leave=True)
    for batch in progress_bar:
        input_ids, attention_masks, labels = [t.to(device) for t in batch]
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks)

        logits = outputs[0].detach().cpu().numpy()
        label_ids = labels.cpu().numpy()
        batch_accuracy = accuracy_score(label_ids, logits.argmax(axis=-1))
        total_eval_accuracy += batch_accuracy

        progress_bar.set_description(f"Evaluation - Batch Accuracy: {batch_accuracy:.4f}")

    return total_eval_accuracy / len(dataloader)

for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, device)
    val_accuracy = evaluate(model, val_dataloader, device)

    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print(f"Loss: {train_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}")

Training - Loss: 0.5316: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1250/1250 [03:39<00:00,  5.70it/s]
Evaluation - Batch Accuracy: 0.8750: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 313/313 [00:17<00:00, 17.91it/s]



Epoch 1/3
Loss: 0.3567 - Validation Accuracy: 0.8732


Training - Loss: 0.5150: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1250/1250 [03:38<00:00,  5.73it/s]
Evaluation - Batch Accuracy: 1.0000: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 313/313 [00:17<00:00, 17.98it/s]



Epoch 2/3
Loss: 0.2037 - Validation Accuracy: 0.8806


Training - Loss: 0.1135: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1250/1250 [03:38<00:00,  5.73it/s]
Evaluation - Batch Accuracy: 1.0000: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 313/313 [00:17<00:00, 17.96it/s]


Epoch 3/3
Loss: 0.1118 - Validation Accuracy: 0.8836





In [16]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

def get_predictions(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluating"):
        input_ids, attention_masks, labels = [t.to(device) for t in batch]
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks)

        logits = outputs[0].detach().cpu().numpy()
        label_ids = labels.cpu().numpy()
        predictions.extend(logits.argmax(axis=-1))
        true_labels.extend(label_ids)

    return np.array(predictions), np.array(true_labels)

predictions, true_labels = get_predictions(model, val_dataloader, device)
accuracy = accuracy_score(true_labels, predictions)
report = classification_report(true_labels, predictions, digits=4)

print(f"Validation Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Evaluating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 313/313 [00:17<00:00, 17.56it/s]

Validation Accuracy: 0.8834
Classification Report:
              precision    recall  f1-score   support

           0     0.8927    0.8724    0.8824      2507
           1     0.8745    0.8945    0.8844      2493

    accuracy                         0.8834      5000
   macro avg     0.8836    0.8834    0.8834      5000
weighted avg     0.8836    0.8834    0.8834      5000






# ðŸ’¾ **Save Model**

In [17]:
output_dir = "./model/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json')

In [18]:
def predict_sentiment(review, model, tokenizer, device):
    model.eval()

    # Tokenize the input text
    encoded = tokenizer.encode_plus(
        review,
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length=128,  # Set maximum sequence length
        padding="max_length",  # Pad the sequence if it is shorter than max_seq_len
        truncation=True,  # Truncate the sequence if it is longer than max_seq_len
        return_attention_mask=True,  # Return the attention mask
    )

    input_id = torch.tensor([encoded["input_ids"]]).to(device)
    attention_mask = torch.tensor([encoded["attention_mask"]]).to(device)

    with torch.no_grad():
        outputs = model(input_id, attention_mask=attention_mask)

    logits = outputs[0].detach().cpu().numpy()
    sentiment = logits.argmax(axis=-1)[0]

    return sentiment

In [24]:
review = "good and bad  movie"
predicted_sentiment = predict_sentiment(review, model, tokenizer, device)

if predicted_sentiment == 0:
    print("Negative sentiment")
else:
    print("Positive sentiment")

Negative sentiment
