In [1]:
from utils.preprocessor import preprocess_data

import pandas as pd
from tqdm import tqdm

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline, AutoConfig

In [6]:
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
model_alternate_name = "roberta2022"

resources_dir = "resources"
    
model_dir = f"{resources_dir}/models"
model_bin_path = f"{model_dir}/bin"
tokenizer_path = f"{model_dir}/tokenizer"

other_dir = f"{resources_dir}/other" #for other constraints, instructions, blacklists, etc
path_blacklist_sentence = f"{other_dir}/blacklist_sentence.txt"

dataset_dir = f"dataset"
input_data = f"{dataset_dir}/input.csv"
output_cache = f"{dataset_dir}/output.csv"

text_column = "text"
label_column = "label"
max_length=256

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

config = AutoConfig.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, config=config, ignore_mismatched_sizes=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [10]:
tokenizer_kwargs = {'padding': True, 'truncation': True, 'max_length': max_length}
pipe = pipeline("text-classification",
                    model=model, tokenizer=tokenizer, device=device,
                    **tokenizer_kwargs)

input_request = "Commuting for me involves traveling between home and work, and I can choose between driving, taking public transportation, or biking"


preprocessed_input = preprocess_data(input_request)
result = pipe(preprocessed_input)
print(result)

[{'label': 'neutral', 'score': 0.8775570392608643}]
