## TinyBERT

In [1]:
import pandas as pd
# Source of the IMDB review CSV (columns shown below: review text and sentiment label).
IMDB_CSV_URL = "https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/IMDB-Dataset.csv"

data = pd.read_csv(IMDB_CSV_URL)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [2]:
from datasets import Dataset
# Convert the pandas DataFrame into a Hugging Face Dataset.
dataset = Dataset.from_pandas(data)

# Hold out 30% of the rows for evaluation. The shuffle is seeded so that a
# fresh "Restart & Run All" reproduces the same train/test partition.
dataset = dataset.train_test_split(test_size=0.3, seed=42)
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

In [3]:
# Count reviews per sentiment value on the full DataFrame to check class balance.
data["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [4]:
# String sentiment -> integer class id.
label2id = dict(negative=0, positive=1)


def _add_label(example):
    """Return the integer ``label`` column value for one example's ``sentiment``."""
    return {"label": label2id[example["sentiment"]]}


# Adds a "label" column to every split.
dataset = dataset.map(_add_label)
dataset

Map: 100%|██████████| 35000/35000 [00:01<00:00, 30882.98 examples/s]
Map: 100%|██████████| 15000/15000 [00:00<00:00, 41542.49 examples/s]


DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment', 'label'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment', 'label'],
        num_rows: 15000
    })
})

In [5]:
# Peek at the first five training rows (returned as a dict of column -> list).
dataset["train"][:5]

{'review': ['Man, what a scam this turned out to be! Not because it wasn\'t any good (as I wasn\'t really expecting anything from it) but because I was misled by the DVD sleeve which ignorantly paraded its "stars" as being Stuart Whitman, Stella Stevens and Tony Bill. Sure enough, their names did not appear in the film\'s opening credits, much less themselves in the rest of it!! As it turned out, the only movie which connects those three actors together is the equally obscure LAS VEGAS LADY (1975) \x96 but what that one has to do with THE CRATER LAKE MONSTER is anybody\'s guess\x85 <br /><br />Even so, since I paid $1.50 for its rental and I was in a monster-movie mood anyhow, I elected to watch the movie regardless and, yup, it stunk! Apart from the fact that it had a no-name cast and an anonymous crew, an unmistakably amateurish air was visible from miles away and the most I could do with it is laugh at the JAWS-like pretensions and, intentionally so, at the resistible antics of two 

In [6]:
from transformers import AutoTokenizer
import torch

# Pick the best available accelerator: CUDA GPU first, then Apple-silicon MPS,
# and fall back to CPU when neither backend is available.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# NOTE: `model` holds the checkpoint identifier string, not a model object;
# it is passed to `from_pretrained` here and again when the classifier is built.
model = "huawei-noah/TinyBERT_General_4L_312D"
# Load the tokenizer that matches the checkpoint (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
tokenizer



BertTokenizerFast(name_or_path='huawei-noah/TinyBERT_General_4L_312D', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [7]:
# Tokenize a single training review with default settings to inspect the raw encoding.
tokenizer(dataset["train"][0]["review"])

{'input_ids': [101, 2158, 1010, 2054, 1037, 8040, 3286, 2023, 2357, 2041, 2000, 2022, 999, 2025, 2138, 2009, 2347, 1005, 1056, 2151, 2204, 1006, 2004, 1045, 2347, 1005, 1056, 2428, 8074, 2505, 2013, 2009, 1007, 2021, 2138, 1045, 2001, 28616, 3709, 2011, 1996, 4966, 10353, 2029, 21591, 2135, 7700, 2094, 2049, 1000, 3340, 1000, 2004, 2108, 6990, 21311, 1010, 11894, 8799, 1998, 4116, 3021, 1012, 2469, 2438, 1010, 2037, 3415, 2106, 2025, 3711, 1999, 1996, 2143, 1005, 1055, 3098, 6495, 1010, 2172, 2625, 3209, 1999, 1996, 2717, 1997, 2009, 999, 999, 2004, 2009, 2357, 2041, 1010, 1996, 2069, 3185, 2029, 8539, 2216, 2093, 5889, 2362, 2003, 1996, 8053, 14485, 5869, 7136, 3203, 1006, 3339, 1007, 2021, 2054, 2008, 2028, 2038, 2000, 2079, 2007, 1996, 11351, 2697, 6071, 2003, 10334, 1005, 1055, 3984, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2130, 2061, 1010, 2144, 1045, 3825, 1002, 1015, 1012, 2753, 2005, 2049, 12635, 1998, 1045, 2001, 1999, 1037, 6071, 1011, 3185, 6888, 2151, 14406, 1010, 1

In [8]:
def tokenize(batch):
    """Tokenize the ``review`` texts of a batch.

    Args:
        batch: Mapping with a ``"review"`` entry holding the texts to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        padding enabled, truncation enabled and ``max_length=300``.
    """
    return tokenizer(
        batch["review"],
        max_length=300,
        truncation=True,
        padding=True,
    )

In [9]:
# Apply `tokenize` to every split, feeding it 256 examples per call.
dataset = dataset.map(tokenize, batched=True, batch_size=256)
dataset

Map: 100%|██████████| 35000/35000 [00:07<00:00, 4380.13 examples/s]
Map: 100%|██████████| 15000/15000 [00:03<00:00, 4340.34 examples/s]


DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15000
    })
})

In [None]:
# Inspect the first training example after the label and tokenizer columns were added.
dataset["train"][0]

In [11]:
from transformers import AutoModelForSequenceClassification

# Inverse of `label2id`, so the model config can translate class ids back to names.
id2label = {class_id: name for name, class_id in label2id.items()}

# Sequence-classification model built from the checkpoint named by `model`,
# with one output per entry in `label2id`.
tr_model = AutoModelForSequenceClassification.from_pretrained(
    model,
    id2label=id2label,
    label2id=label2id,
    num_labels=len(label2id),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import evaluate
import numpy as np
# Load the "accuracy" metric object; `compute_metrics` calls its `.compute(...)`.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A pair ``(scores, labels)``. The index of the largest
            score along axis 1 is taken as the predicted class.

    Returns:
        The result of ``accuracy.compute`` for the predicted classes against
        ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [20]:
from transformers import Trainer, TrainingArguments

# Training configuration: checkpoints go to ./train_dir (overwritten if present).
args = TrainingArguments(
    output_dir="train_dir",
    overwrite_output_dir=True,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Run evaluation on `eval_dataset` once per epoch.
    eval_strategy="epoch",
)

# Wire the model, the tokenized splits and the accuracy callback together.
trainer = Trainer(
    model=tr_model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [21]:
train_result = trainer.train()

# Persist the fine-tuned model (and the tokenizer given to the Trainer) to the
# local directory that the inference pipeline loads from and that is later
# uploaded to S3. Without this step those cells only work when the directory
# happens to exist from an earlier session.
trainer.save_model("tinybert-sentiment-analysis")

train_result



KeyboardInterrupt: 

In [23]:
from transformers import pipeline

# Build a text-classification pipeline from the local "tinybert-sentiment-analysis"
# model directory and run it on the selected `device`.
classifier = pipeline("text-classification", model="tinybert-sentiment-analysis", device=device)
classifier

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x334198eb0>

In [24]:
# Ad-hoc sentences for a quick sanity check of the fine-tuned classifier.
# Kept under its own name so the `data` DataFrame loaded at the top of the
# notebook is not overwritten (earlier cells stay re-runnable).
sample_reviews = ["This movie is crazy...", "shouldn't have seen this one", "should have seen earlier", "should have brought my friends with me"]
classifier(sample_reviews)

[{'label': 'positive', 'score': 0.8108767867088318},
 {'label': 'negative', 'score': 0.9376280307769775},
 {'label': 'negative', 'score': 0.9141314029693604},
 {'label': 'negative', 'score': 0.8275437951087952}]

In [27]:
import boto3

s3 = boto3.client("s3")
bucket_name = "jihwan-mlops"

def create_bucket(bucket_name, region="ap-northeast-2"):
    """Create an S3 bucket, printing the outcome instead of raising.

    Args:
        bucket_name: Name of the bucket to create.
        region: Region for the bucket. Defaults to ``"ap-northeast-2"``,
            the region this notebook originally hard-coded.

    Returns:
        None. A success message is printed on success; on any failure the
        exception is printed and swallowed so the notebook keeps running.
    """
    request = {"Bucket": bucket_name}
    # us-east-1 is S3's default region and must be requested by omitting the
    # location constraint; every other region is passed explicitly.
    if region != "us-east-1":
        request["CreateBucketConfiguration"] = {"LocationConstraint": region}

    try:
        s3.create_bucket(**request)
        print(f"Success to create bucket ({bucket_name})")
    except Exception as e:
        print(e)

create_bucket(bucket_name)

Success to create bucket (jihwan-mlops)


In [28]:
# List the account's buckets to confirm the bucket exists.
# NOTE(review): the raw response shown below includes request ids and the
# account owner's ID — consider clearing this output before sharing the notebook.
s3.list_buckets()

{'ResponseMetadata': {'RequestId': '2Q0AJ4H5XSQ8DB24',
  'HostId': 'NWyRlw4yYX7BNf5Leu0fmkBt2lBBGzP32tJVVztSq44Z5h/yiLZf7+KnhLgkKntujhLfDXn96DI=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'NWyRlw4yYX7BNf5Leu0fmkBt2lBBGzP32tJVVztSq44Z5h/yiLZf7+KnhLgkKntujhLfDXn96DI=',
   'x-amz-request-id': '2Q0AJ4H5XSQ8DB24',
   'date': 'Mon, 23 Sep 2024 07:20:43 GMT',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'Buckets': [{'Name': 'jihwan-mlops',
   'CreationDate': datetime.datetime(2024, 9, 23, 7, 20, 13, tzinfo=tzutc())}],
 'Owner': {'ID': '130f68431ef941a9ebdb00664f815794a21d0b67da22a5b8a2af695f6cf4b191'}}

In [32]:
import os
# Upload every file under the saved model directory to the bucket, keeping the
# same relative layout as the object key.
for root, _dirs, files in os.walk("tinybert-sentiment-analysis"):
    for file_name in files:
        file_path = os.path.join(root, file_name)
        # S3 keys use "/" as the separator regardless of the local OS, so
        # normalise the path before using it as the object key.
        s3_key = file_path.replace(os.sep, "/")
        print(file_path)
        s3.upload_file(file_path, bucket_name, s3_key)

tinybert-sentiment-analysis/model.safetensors
tinybert-sentiment-analysis/tokenizer_config.json
tinybert-sentiment-analysis/special_tokens_map.json
tinybert-sentiment-analysis/config.json
tinybert-sentiment-analysis/tokenizer.json
tinybert-sentiment-analysis/training_args.bin
tinybert-sentiment-analysis/vocab.txt
