# Hello Hugging Face

In [3]:
# Prerequisites
import torch

### Tokenizer

In [None]:
from transformers import BertTokenizer

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Check the vocabulary size
print('Vocabulary size:', tokenizer.vocab_size)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Vocabulary size: 30522


Tokenize a sentence

In [3]:
tokens = tokenizer.tokenize('Generative AI is cool!')

# display the tokens
print('Tokens:', tokens)

# display token ids
print('Token IDs:', tokenizer.convert_tokens_to_ids(tokens))


Tokens: ['genera', '##tive', 'ai', 'is', 'cool', '!']
Token IDs: [11416, 6024, 9932, 2003, 4658, 999]


### Load Pre-trained model for Sentiment Analysis

In [14]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Load pre-trained sentiment analysis model
model_name = "textattack/bert-base-uncased-imdb"
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize input text
tokenizer = BertTokenizer.from_pretrained(model_name)
tokens = tokenizer('Office Space is a great movie!', return_tensors='pt')

# Perform a forward pass
with torch.no_grad():
    outputs = model(**tokens).logits
    probabilities = torch.nn.functional.softmax(outputs, dim=1)
    predicted_class = torch.argmax(probabilities)
    
# Display sentimen
if predicted_class == 1:
    print(f"Sentiment is Positive! ({probabilities[0][1] * 100:.1f}%)")
else:
    print(f"Sentiment is Negative! ({probabilities[0][0] * 100:.1f}%)")

Sentiment is Positive! (96.2%)


Use another model for Sentiment Analysis

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

pt_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english")
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english")


def get_prediction(review):
    """Given a review, return the predicted sentiment"""

    # Tokenize the review, return response as tensors
    inputs = tokenizer(review, return_tensors="pt")

    # Perform the prediction (get the logits)
    outputs = pt_model(**inputs)

    # Get the predicted class (corresponding to the highest logit)
    predictions = torch.argmax(outputs.logits, dim=-1)

    return "positive" if predictions.item() == 1 else "negative"

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Check Sentiment

In [4]:
review = "This movie sucks!"
print(f"Review: {review}")
print(f"Sentiment: {get_prediction(review)}")


review = "This movie rocks!"
print(f"Review: {review}")
print(f"Sentiment: {get_prediction(review)}")

Review: This movie sucks!
Sentiment: negative
Review: This movie rocks!
Sentiment: positive


### Hugging Face IMDB dataset

In [11]:
from datasets import load_dataset
from IPython.display import HTML, display

# Load IMDB dataset
data_set = load_dataset('imdb')

# Fetch a review rom the dataset
review_number = 73
review_sample = data_set['train'][review_number]

display(HTML(review_sample['text'][:500] + '...'))

if review_sample['label'] == 1:
    print('Sentiment is Positive')
else:
    print('Sentiment is Negative')


Sentiment is Negative


In [12]:
# last 3 reviews
reviews = data_set["train"][-3:]

# Check
for review in reviews:
    # use get_prediction() function of the second model to get the sentiment
    prediction = get_prediction(review)

    print(f"Review: {review[:80]} \n... {review[-80:]}")
    print(f"Prediction: {prediction}\n")

Review: text 
... text
Prediction: positive

Review: label 
... label
Prediction: positive



### Hugging Face Trainer

In [None]:
from transformers import (DistilBertForSequenceClassification, DistilBertTokenizer,
    TrainingArguments, Trainer)
from datasets import load_dataset

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

data_set = load_dataset('imdb')
tokenizer_data_sets = data_set.map(tokenize_function, batched=True)

training_arguments = TrainingArguments(
    per_device_train_batch_size=64,
    num_train_epochs=3,
    output_dir='./results',
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenizer_data_sets['train'],
    eval_dataset=tokenizer_data_sets['test'],
)

trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
