# Sentiment Classification of IMDB Reviews Using DistilBERT


[![Open in Layer](https://development.layer.co/assets/badge.svg)](https://app.layer.ai/douglas_mcilwraith/bert-text-classification/) [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/layerai/examples/blob/main/bert-text-classification/bert-text-classification.ipynb) [![Layer Examples Github](https://badgen.net/badge/icon/github?icon=github&label)](https://github.com/layerai/examples/tree/main/bert-text-classification)

We use the DistilBERT model to perform sentiment classification on the [IMDB sentiment dataset](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews?resource=download).


In [5]:
# Install or upgrade the Layer SDK (the leading `!` is notebook shell syntax).
!pip install layer -U

import layer
# Authenticate with Layer before any project calls are made.
layer.login()

from layer.decorators import model, dataset, fabric, pip_requirements
# Select the "distilbert-imdb" Layer project for the cells that follow.
layer.init("distilbert-imdb")



You should consider upgrading via the '/Users/douglas.mcilwraith/.pyenv/versions/3.8.10/bin/python3.8 -m pip install --upgrade pip' command.

Your Layer project is here: https://app.layer.ai/douglas_mcilwraith/distilbert-imdb

In [None]:
#Load the IMDB training split into a DataFrame and upload it to Layer
@dataset("imdb-train")
@pip_requirements(packages=["datasets"])
def build():
    """Return the ``train`` split of the ``imdb`` dataset as a pandas DataFrame."""
    import pandas as pd
    from datasets import load_dataset

    train_split = load_dataset("imdb")["train"]
    return pd.DataFrame(train_split)

layer.run([build])

In [None]:
#Load the IMDB test split into a DataFrame and upload it to Layer
@dataset("imdb-test")
@pip_requirements(packages=["datasets"])
def build():
    """Return the ``test`` split of the ``imdb`` dataset as a pandas DataFrame."""
    import pandas as pd
    from datasets import load_dataset

    test_split = load_dataset("imdb")["test"]
    return pd.DataFrame(test_split)

layer.run([build])

In [None]:
#Fine-tune DistilBERT on a random 10% sample of the training reviews
@fabric("f-gpu-small")
@model('bert-fine-tune')
def train():
    """Fine-tune DistilBERT as a two-label sequence classifier and return it.

    Reads the ``imdb-train`` Layer dataset, keeps a seeded 10% sample, holds
    out 20% of that sample for validation, and trains with a ``transformers``
    ``Trainer`` using an early-stopping callback. The validation metrics
    (accuracy, precision, recall, F1) are sent to ``layer.log`` each time the
    trainer evaluates.

    Returns:
        The fine-tuned ``DistilBertForSequenceClassification`` instance.
    """
    import numpy as np
    import pandas as pd
    import torch

    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
    from sklearn.model_selection import train_test_split
    from transformers import TrainingArguments, Trainer
    from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
    from transformers import EarlyStoppingCallback

    class ReviewDataset(torch.utils.data.Dataset):
        """Torch dataset over tokenizer encodings with optional labels."""

        def __init__(self, encodings, labels=None):
            self.encodings = encodings
            self.labels = labels

        def __len__(self):
            return len(self.encodings["input_ids"])

        def __getitem__(self, idx):
            sample = {}
            for field, values in self.encodings.items():
                sample[field] = torch.tensor(values[idx])
            if self.labels:
                sample["labels"] = torch.tensor(self.labels[idx])
            return sample

    def calc_metrics(eval_output):
        """Turn (predictions, label ids) into a metrics dict and log it."""
        scores = eval_output[0]
        references = eval_output[1]
        predicted = np.argmax(scores, axis=1)

        metrics = {
            "accuracy": accuracy_score(y_true=references, y_pred=predicted),
            "precision": precision_score(y_true=references, y_pred=predicted),
            "recall": recall_score(y_true=references, y_pred=predicted),
            "f1": f1_score(y_true=references, y_pred=predicted),
        }

        #log the metrics from the latest evaluation to the UI
        layer.log(metrics)
        return metrics

    checkpoint = "distilbert-base-uncased"
    tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
    classifier = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

    # Seeded 10% sample (without replacement) of the stored training set.
    reviews = layer.get_dataset("douglas_mcilwraith/distilbert-imdb/datasets/imdb-train:1.1").to_pandas()
    reviews = reviews.sample(frac=0.10, replace=False, random_state=2)

    texts = list(reviews["text"])
    targets = list(reviews["label"])

    # Hold out 20% of the sample for validation.
    train_texts, val_texts, train_targets, val_targets = train_test_split(
        texts, targets, test_size=0.2
    )

    train_encodings = tokenizer(train_texts, padding=True, truncation=True, max_length=512)
    val_encodings = tokenizer(val_texts, padding=True, truncation=True, max_length=512)

    training_args = TrainingArguments(
        output_dir="out",
        evaluation_strategy="steps",
        eval_steps=500,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=50,
        load_best_model_at_end=True,
    )
    trainer = Trainer(
        model=classifier,
        args=training_args,
        train_dataset=ReviewDataset(train_encodings, train_targets),
        eval_dataset=ReviewDataset(val_encodings, val_targets),
        compute_metrics=calc_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    trainer.train()

    return classifier

layer.run([train],debug=True)

In [None]:
#Evaluate every 10% of the test data against the fine tuned model.
@fabric("f-gpu-small")
@model("distilbert-evaluation")
def build():
    """Score the fine-tuned model on ten slices of the IMDB test dataset.

    Loads the ``imdb-test`` Layer dataset and the ``bert-fine-tune`` model,
    splits the test data into ten parts, predicts each part, and computes
    accuracy, precision, recall and F1 per part. The per-part scores are
    logged to Layer as a DataFrame under the key ``"results"``.

    Returns:
        The fine-tuned model as loaded from Layer.
    """
    import numpy as np
    import pandas as pd
    import torch
    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
    from transformers import Trainer
    from transformers import DistilBertTokenizer

    # Defined once, outside the loop: the class does not depend on the slice.
    class MyDataset(torch.utils.data.Dataset):
        """Torch dataset over tokenizer encodings with optional labels."""

        def __init__(self, encodings, labels=None):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            # Explicit None check so any label container type is accepted.
            if self.labels is not None:
                item["labels"] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.encodings["input_ids"])

    #We need to use the same tokenizer as we did during training
    model_name = "distilbert-base-uncased"
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)

    df = layer.get_dataset("douglas_mcilwraith/distilbert-imdb/datasets/imdb-test:1.1").to_pandas()
    my_model = layer.get_model("douglas_mcilwraith/distilbert-imdb/models/bert-fine-tune:5.1").get_train()
    trainer = Trainer(my_model)

    list_results = []

    for chunk in np.array_split(df, 10):
        texts = list(chunk["text"])
        labels = list(chunk["label"])

        encodings = tokenizer(texts, padding=True, truncation=True, max_length=512)
        logits, _, _ = trainer.predict(MyDataset(encodings, labels))
        pred = np.argmax(logits, axis=1)

        a = accuracy_score(y_pred=pred, y_true=labels)
        p = precision_score(y_pred=pred, y_true=labels)
        r = recall_score(y_pred=pred, y_true=labels)
        f = f1_score(y_pred=pred, y_true=labels)

        # Order must match the DataFrame columns below:
        # Accuracy, Precision, Recall, F1.
        list_results.append([a, p, r, f])

    results_df = pd.DataFrame(data=list_results, columns=['Accuracy', 'Precision', 'Recall',"F1"])
    layer.log({"results" : results_df})

    return my_model

layer.run([build],debug=True)

Output()

In [21]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
import torch
from transformers import TrainingArguments, Trainer
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import EarlyStoppingCallback

#We need to use the same tokenizer as we did during training
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

#Evaluate on a seeded 10% sample of the test dataset
df = layer.get_dataset("douglas_mcilwraith/distilbert-imdb/datasets/imdb-test:1.1").to_pandas()
df = df.sample(frac=0.10, replace=False, random_state=2)

my_model = layer.get_model("douglas_mcilwraith/distilbert-imdb/models/bert-fine-tune:5.1").get_train()
trainer = Trainer(my_model)
s = np.array_split(df,10)
list_results = []


#Defined once, outside the loop: the class does not depend on the slice
class MyDataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer encodings with optional labels."""

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None check so any label container type is accepted.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


for d in s:
    X = list(d["text"])
    y = list(d["label"])

    df_tokenized = tokenizer(X, padding=True, truncation=True, max_length=512)

    test_dataset = MyDataset(df_tokenized, y)
    logits, _, _ = trainer.predict(test_dataset)

    pred = np.argmax(logits, axis=1)
    labels = test_dataset.labels

    a = accuracy_score(y_pred=pred, y_true=labels)
    r = recall_score(y_pred=pred, y_true=labels)
    p = precision_score(y_pred=pred, y_true=labels)
    f = f1_score(y_pred=pred, y_true=labels)

    #Order must match the DataFrame columns below: Accuracy, Precision, Recall, F1
    results = [a, p, r, f]
    list_results.append(results)

results_df = pd.DataFrame(data=list_results, columns=['Accuracy', 'Precision', 'Recall',"F1"])
layer.log({"results" : results_df})

Output()