#### Load Data with Hugging Face Datasets Library

In [1]:
# Don't do in production. Doing now to keep output clean for understanding
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from datasets import Dataset

data = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/IMDB-Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.3)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

In [4]:
# input_ids, attention_mask, label -> numbers

In [5]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [6]:
label2id = {'negative': 0, 'positive': 1}
id2label = {0:'negative', 1:'positive'}

dataset = dataset.map(lambda x: {'label': label2id[x['sentiment']]})

Map: 100%|██████████| 35000/35000 [00:04<00:00, 7605.42 examples/s]
Map: 100%|██████████| 15000/15000 [00:01<00:00, 7869.64 examples/s]


In [7]:
dataset['train'][0]

{'review': 'This film was the worst film I have ever seen. It was a complete waste of money. If I had not been in the cinema was my two young cousins (who also thought it was disappointing, but not as terrible as I thought), I would have left the cinema. There were two points in the film that I almost laughed, but the rest of it was either boring, ridiculous or painful. I thought it would be a spoof on all superhero movies (which I love), but in fact it was mainly based on Spiderman, with a few oblique references to other superhero movies such as Fantastic Four and Batman. I really cannot think of one good thing to say about this film. Do not waste your money with this film-there are many other better films out there!',
 'sentiment': 'negative',
 'label': 0}

## Data Tokenization

In [8]:
from transformers import AutoTokenizer
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

In [9]:
tokenizer(dataset['train'][0]['review'])

def tokenize(batch):
    temp = tokenizer(batch['review'], padding=True, truncation=True, max_length=300)
    return temp

dataset = dataset.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map: 100%|██████████| 35000/35000 [01:34<00:00, 371.88 examples/s]
Map: 100%|██████████| 15000/15000 [00:11<00:00, 1333.75 examples/s]


In [10]:
dataset['train'][0].keys()

dict_keys(['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

### Building Model Evaluation Functions
https://huggingface.co/docs/transformers/v4.42.0/en/tasks/sequence_classification#evaluate

In [11]:
# !pip install evaluate

import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=len(label2id), label2id=label2id, id2label=id2label)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [14]:
trainer.train()

 15%|█▌        | 500/3282 [20:30<1:51:28,  2.40s/it]

{'loss': 0.4607, 'grad_norm': 15.69223690032959, 'learning_rate': 1.695307739183425e-05, 'epoch': 0.46}


 30%|███       | 1000/3282 [41:12<1:34:24,  2.48s/it]

{'loss': 0.3585, 'grad_norm': 7.450829982757568, 'learning_rate': 1.3906154783668494e-05, 'epoch': 0.91}


                                                     
 33%|███▎      | 1094/3282 [50:58<1:22:08,  2.25s/it]

{'eval_loss': 0.311557799577713, 'eval_accuracy': 0.8676, 'eval_runtime': 355.5419, 'eval_samples_per_second': 42.189, 'eval_steps_per_second': 1.319, 'epoch': 1.0}


 46%|████▌     | 1500/3282 [1:07:43<1:12:10,  2.43s/it]

{'loss': 0.3046, 'grad_norm': 5.375638008117676, 'learning_rate': 1.0859232175502743e-05, 'epoch': 1.37}


 61%|██████    | 2000/3282 [1:28:21<53:32,  2.51s/it]  

{'loss': 0.2949, 'grad_norm': 14.056318283081055, 'learning_rate': 7.81230956733699e-06, 'epoch': 1.83}


                                                       
 67%|██████▋   | 2188/3282 [1:42:06<41:54,  2.30s/it]

{'eval_loss': 0.29359063506126404, 'eval_accuracy': 0.8788, 'eval_runtime': 358.3306, 'eval_samples_per_second': 41.861, 'eval_steps_per_second': 1.309, 'epoch': 2.0}


 76%|███████▌  | 2500/3282 [1:55:02<32:57,  2.53s/it]    

{'loss': 0.2772, 'grad_norm': 9.637059211730957, 'learning_rate': 4.765386959171238e-06, 'epoch': 2.29}


 91%|█████████▏| 3000/3282 [2:15:55<11:48,  2.51s/it]

{'loss': 0.2523, 'grad_norm': 8.23198413848877, 'learning_rate': 1.7184643510054846e-06, 'epoch': 2.74}


                                                     
100%|██████████| 3282/3282 [2:33:41<00:00,  2.81s/it]

{'eval_loss': 0.29659125208854675, 'eval_accuracy': 0.8811333333333333, 'eval_runtime': 357.2551, 'eval_samples_per_second': 41.987, 'eval_steps_per_second': 1.313, 'epoch': 3.0}
{'train_runtime': 9221.9807, 'train_samples_per_second': 11.386, 'train_steps_per_second': 0.356, 'train_loss': 0.3185743300526844, 'epoch': 3.0}





TrainOutput(global_step=3282, training_loss=0.3185743300526844, metrics={'train_runtime': 9221.9807, 'train_samples_per_second': 11.386, 'train_steps_per_second': 0.356, 'total_flos': 882184338000000.0, 'train_loss': 0.3185743300526844, 'epoch': 3.0})

In [15]:
trainer.evaluate()

100%|██████████| 469/469 [05:57<00:00,  1.31it/s]


{'eval_loss': 0.29659125208854675,
 'eval_accuracy': 0.8811333333333333,
 'eval_runtime': 358.5812,
 'eval_samples_per_second': 41.832,
 'eval_steps_per_second': 1.308,
 'epoch': 3.0}