## Load Data with the Hugging Face Datasets Library

In [1]:
# Don't do in production. Doing now to keep output clean for understanding
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from datasets import Dataset

# Source CSV kept in a named constant so the data location is easy to find and swap.
DATA_URL = 'https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/IMDB-Dataset.csv'

data = pd.read_csv(DATA_URL)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Convert the DataFrame to a Hugging Face Dataset and hold out 30% of the rows for evaluation.
# A fixed seed makes the shuffle deterministic, so the same rows land in train/test on every
# Restart & Run All instead of a different split each time.
SPLIT_SEED = 42

dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

In [4]:
# Goal: turn every example into the numeric fields the model consumes — input_ids, attention_mask and label.

In [5]:
# Count the reviews per sentiment class to see how balanced the labels are.
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [6]:
# Sentiment string -> integer class id, and the inverse mapping derived from it
# so the two dictionaries cannot drift apart.
label2id = {'negative': 0, 'positive': 1}
id2label = {class_id: name for name, class_id in label2id.items()}


def add_label(example):
    """Return the integer ``label`` for one example, looked up from its ``sentiment`` string."""
    return {'label': label2id[example['sentiment']]}


dataset = dataset.map(add_label)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [7]:
# Look at one training example; after the mapping step it carries an integer 'label' field.
dataset['train'][0]

{'review': 'Every motion picture Bette Davis stars in is worth experiencing. Before Davis co-stars with Leslie Howard in "Of Human Bondage," she\'d been in over a score of movies. Legend has it that Davis was \'robbed\' of a 1935 Oscar for her performance as a cockney-speaking waitress, unwed mother & manipulative boyfriend-user, Mildred Rogers. The story goes that the AFI consoled Davis by awarding her 1st Oscar for playing Joyce Heath in "Dangerous." I imagine Davis\' fans of "Of Human Bondage" who agree with the Oscar-robbing legend are going to have at my critique\'s contrast of the 1934 film for which the AFI didn\'t award her performance & the 1936 film "Dangerous," performance for which she received her 1st Oscar in 1937.<br /><br />I\'ve tried to view all of Bette Davis\' motion pictures, TV interviews, videos, advertisements for WWII & TV performances in popular series. In hindsight, it is easy to recognize why this film, "Of Human Bondage," gave Davis the opportunity to be no

## Data Tokenization

In [8]:
from transformers import AutoTokenizer
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Checkpoint id shared by the tokenizer here and the model loaded later.
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [9]:
def tokenize(batch):
    """Tokenize the ``review`` texts of a batch.

    Args:
        batch: Mapping whose ``review`` entry holds the texts to encode.

    Returns:
        The tokenizer output for those texts, truncated to at most 300 tokens and
        padded to the longest sequence in the batch.
    """
    return tokenizer(batch['review'], padding=True, truncation=True, max_length=300)


# batch_size=None hands each split to `tokenize` as one single batch, so padding=True
# pads every example of a split to that split's longest (truncated) sequence.
dataset = dataset.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [10]:
# Confirm the tokenizer columns were added alongside the original fields.
dataset['train'][0].keys()

dict_keys(['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

## Building Model Evaluation Functions
https://huggingface.co/docs/transformers/v4.42.0/en/tasks/sequence_classification#evaluate

In [11]:
!pip install evaluate scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [12]:
# !pip install evaluate

import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair of ``(predictions, labels)``; the predictions are reduced
            to class ids by taking the argmax along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted ids against the labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a sequence-classification head sized to the label set,
# passing both label maps along with it.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    id2label=id2label,
    label2id=label2id,
    num_labels=len(label2id),
)

pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# One batch size shared by training and evaluation.
batch_size = 32

# Training configuration, grouped as: output/logging, schedule, optimisation.
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    logging_dir="logs",
    report_to="none",
    evaluation_strategy='epoch',
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

# The test split doubles as the evaluation set; accuracy comes from compute_metrics.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
)

In [15]:
import os

# Disable the Weights & Biases integration through its environment variable.
# The key was previously misspelled as "WANDB_DISAIBLED", which nothing reads,
# so the setting silently had no effect.
os.environ["WANDB_DISABLED"] = "true"

In [16]:
# Fine-tune with the settings in `args` (3 epochs, evaluation after each epoch).
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3585,0.306742,0.869333
2,0.2984,0.289391,0.879467
3,0.2571,0.292866,0.881533


TrainOutput(global_step=3282, training_loss=0.3188060502465508, metrics={'train_runtime': 443.8547, 'train_samples_per_second': 236.564, 'train_steps_per_second': 7.394, 'total_flos': 882184338000000.0, 'train_loss': 0.3188060502465508, 'epoch': 3.0})

In [17]:
# Evaluate the trained model on the Trainer's eval_dataset (the held-out test split).
trainer.evaluate()

{'eval_loss': 0.2928655743598938,
 'eval_accuracy': 0.8815333333333333,
 'eval_runtime': 18.9288,
 'eval_samples_per_second': 792.442,
 'eval_steps_per_second': 24.777,
 'epoch': 3.0}

In [18]:
import shutil

# Zip the Trainer's output directory (the `output_dir` given to TrainingArguments).
train_dir = "train_dir"
output_zip = f"{train_dir}.zip"

# make_archive appends ".zip" itself, so it receives the archive name without the extension.
shutil.make_archive(train_dir, 'zip', root_dir=train_dir)

output_zip

'train_dir.zip'

In [19]:
# Save the fine-tuned model to a local directory; the inference pipeline later loads it from this path.
trainer.save_model('tinybert-sentiment-analysis')

In [20]:
import shutil

# Zip the saved-model directory written by trainer.save_model.
train_dir = "tinybert-sentiment-analysis"
output_zip = train_dir + ".zip"

# The archive is named after the directory; make_archive adds the ".zip" extension.
shutil.make_archive(base_name=train_dir, format='zip', root_dir=train_dir)

output_zip

'tinybert-sentiment-analysis.zip'

In [21]:
# A few hand-written reviews for a quick sanity check of the fine-tuned model.
# NOTE(review): this rebinds `data`, which earlier held the reviews DataFrame; re-running
# earlier cells that index `data` by column after this one will fail.
data = ['this movie was horrible, the plot was really boring. acting was okay',
       'the movie is really sucked. there is not plot and acting was bad',
       'what a beautiful movie. great plot. acting was good. will see it again']

In [22]:
from transformers import pipeline

In [23]:
# Use the device selected earlier (GPU if available, else CPU) instead of hard-coding 'cuda',
# which raises on machines without a GPU.
classifier = pipeline('text-classification', model='tinybert-sentiment-analysis', device=device)

In [24]:
# Classify the sample reviews; each result holds the predicted label and its score.
classifier(data)

[{'label': 'negative', 'score': 0.9900258779525757},
 {'label': 'negative', 'score': 0.9894617199897766},
 {'label': 'positive', 'score': 0.9900806546211243}]