## Introduction

This notebook fetches a pretrained sentiment-analysis model from Hugging Face and uses it to classify the sentiment of different pieces of text.

## Setup

In [1]:
from warnings import filterwarnings
# Install a blanket "ignore" filter so warnings do not clutter the cell outputs.
# NOTE(review): this also hides deprecation warnings; consider narrowing it.
filterwarnings("ignore")

In [2]:
import numpy as np
import torch
from scipy.special import softmax

In [3]:
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification

In [4]:
from plotly import graph_objs as go
from plotly import offline as pyo

## Model

### Model Path

In [5]:
# Hugging Face Hub identifier of the pretrained sentiment checkpoint.
MODEL_PATH = "cardiffnlp/twitter-roberta-base-sentiment-latest"

The model can be found on the <a href="https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest">Hugging Face website</a>.

### Tokenizer

In [6]:
# Load the tokenizer that matches the checkpoint at MODEL_PATH.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Display the tokenizer as the cell output.
tokenizer

RobertaTokenizerFast(name_or_path='cardiffnlp/twitter-roberta-base-sentiment-latest', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)

### Config

In [7]:
# Load the configuration published with the checkpoint at MODEL_PATH.
config = AutoConfig.from_pretrained(MODEL_PATH)
# Display the configuration as the cell output.
config

RobertaConfig {
  "_name_or_path": "cardiffnlp/twitter-roberta-base-sentiment-latest",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "negative",
    "1": "neutral",
    "2": "positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 0,
    "neutral": 1,
    "positive": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.29.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

### Model

In [8]:
# Load the sequence-classification model weights from MODEL_PATH.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
# Display the module structure as the cell output.
model

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

### Encode Text

In [20]:
# Sample sentence kept at notebook level for experimenting with the model.
text = "Covid cases are increasing fast!"
# Display the sample as the cell output.
text

'Covid cases are increasing fast!'

In [41]:
def prediction_pipeline(text: str) -> dict:
    """Score ``text`` against every sentiment label of the loaded model.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``config``
    objects created in the earlier cells.

    Args:
        text: The text to classify.

    Returns:
        A dict mapping each label in ``config.id2label`` to its softmax
        probability (a NumPy float), ordered from most to least likely.
    """
    # The tokenizer's own model_max_length is effectively unbounded for this
    # checkpoint, so the limit has to be passed explicitly; otherwise long
    # inputs overflow the position-embedding table and the forward pass fails.
    # RoBERTa offsets position ids past the padding index, which leaves
    # max_position_embeddings - 2 usable positions (conservative for other
    # architectures).
    max_length = config.max_position_embeddings - 2
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_input)

    # output[0] holds the logits; [0] selects the single sequence in the batch.
    scores = softmax(output[0][0].numpy())

    # Label indices sorted by descending probability.
    ranking = np.argsort(scores)[::-1]

    return {config.id2label[int(idx)]: scores[idx] for idx in ranking}

In [43]:
# Try the pipeline on a short example sentence; the returned dict is the cell output.
prediction_pipeline("I love you.!")

{'positive': 0.98066753, 'neutral': 0.015784204, 'negative': 0.0035480624}