In [1]:
# Making necessary imports
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import random
import re
import gc
from torch.utils.data import Dataset
import warnings
# NOTE(review): no category or module filter is given, so this silences every
# warning for the rest of the session (including deprecation notices).
warnings.filterwarnings("ignore")

In [2]:
# Fix the random seeds for reproducibility
def fix_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed unchanged to each generator's seeding function.

    Returns:
        None. The function only changes global RNG state.
    """
    # Host-side generators
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Explicitly seed every CUDA device too, but only when CUDA is usable
    cuda_usable = torch.cuda.is_available()
    if cuda_usable:
        torch.cuda.manual_seed_all(seed)

In [3]:
# Apply one fixed seed (42) to all random number generators for this session
fix_seed(seed=42)

In [4]:
# Short alias for the pandas DataFrame class.
# NOTE(review): not referenced by any of the cells visible here — confirm it is still needed.
dfr = pd.DataFrame

In [5]:
# Class ID -> sentiment label lookup (IDs are assigned in tuple order, starting at 0)
labels = dict(enumerate(("negative", "neutral", "positive")))

In [6]:
# Number of classes; the same expression is passed as num_labels when the model is built
len(labels)

3

In [7]:
# Pre-processing step: expanding English contractions
# Maps each contraction (matched case-sensitively) to its expanded form.
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not",
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

# Remember the built-in mapping so expand_contractions can tell when a caller
# supplied a different one (its parameter shadows the module-level name).
_default_contractions_dict = contractions_dict


def _build_contractions_re(mapping):
    """Compile a regex that matches any key of ``mapping``.

    Regex alternation takes the first alternative that matches, so keys are
    ordered longest-first; otherwise "she'd" would win over "she'd've" and
    leave a dangling "'ve" behind. Keys are escaped so they are always matched
    literally.
    """
    longest_first = sorted(mapping, key=len, reverse=True)
    return re.compile('(%s)' % '|'.join(re.escape(key) for key in longest_first))


# Regular expression for finding contractions
contractions_re = _build_contractions_re(contractions_dict)


# Function for expanding contractions
def expand_contractions(text, contractions_dict=contractions_dict):
    """Replace every contraction found in ``text`` with its expanded form.

    Args:
        text: String to process.
        contractions_dict: Mapping from contraction to expansion. Defaults to
            the module-level table; a custom mapping gets its own pattern so
            that matching and lookup always use the same keys.

    Returns:
        ``text`` with each matched key replaced by its mapped value. Matching
        is case-sensitive and not restricted to word boundaries.
    """
    # An empty mapping would compile to a pattern matching the empty string
    if not contractions_dict:
        return text

    if contractions_dict is _default_contractions_dict:
        pattern = contractions_re
    else:
        pattern = _build_contractions_re(contractions_dict)

    def replace(match):
        return contractions_dict[match.group(0)]

    return pattern.sub(replace, text)

In [8]:
# Overall data cleaning: removing links and special characters, collapsing extra spaces and lower-casing the string
def remove_links(text):
    """Return ``text`` with every http:// or https:// URL deleted.

    A URL runs from the scheme up to the next whitespace character; the
    whitespace around it is left as it was.
    """
    url_pattern = re.compile(r'http[s]?://\S+')
    return url_pattern.sub('', text)

def clean_text(text):
    """Keep only ASCII letters, digits and single spaces.

    Every character that is neither alphanumeric nor whitespace is deleted
    (not replaced by a space, so "good-looking" becomes "goodlooking"). Runs of
    whitespace are then collapsed to one space and the ends are trimmed.
    """
    # Step 1: drop everything except letters, digits and whitespace
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Step 2: squeeze whitespace runs, then trim leading/trailing blanks
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.strip()

def clean_dfs(dfs):
    """Clean the ``'text'`` column of each DataFrame in ``dfs`` in place.

    Every value goes through ``clean_inputs``, i.e. the same pipeline used for
    single strings (link removal, contraction expansion, lower-casing,
    character clean-up), so the two code paths cannot drift apart and the
    column is traversed once instead of four times.

    Args:
        dfs: Iterable of DataFrames, each with a ``'text'`` column of strings.

    Returns:
        None. The ``'text'`` column of every frame is overwritten.
    """
    for df in dfs:
        df['text'] = df['text'].apply(clean_inputs)

def clean_inputs(text):
    """Run the full cleaning pipeline on one string and return the result.

    Order: remove links, expand contractions, lower-case, then strip
    non-alphanumeric characters and normalise whitespace.
    """
    without_links = remove_links(text)
    # The contraction table is case-sensitive (it has keys such as "I'm"),
    # so expansion happens before lower-casing
    expanded = expand_contractions(without_links)
    return clean_text(expanded.lower())

In [9]:
# Clear CUDA cache
# NOTE(review): this runs before the garbage collection below, so memory held only by
# objects that have not been collected yet would presumably stay cached — confirm the order is intended.
torch.cuda.empty_cache()

# Perform garbage collection
# gc.collect() is the last expression of the cell, so its return value is what the cell displays
gc.collect()

0

In [10]:
# Importing the tokenizer and model class
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Tokenizer used by the inference cell
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Sequence classifier with one output per entry of `labels`
# NOTE(review): the label set looks mutually exclusive; confirm that the
# "multi_label_classification" problem type is intended.
model1 = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(labels),
    problem_type="multi_label_classification",
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Dataset Class for reference
class Data(Dataset):
    """Dataset that cleans and tokenizes one raw text per item.

    Args:
        text: pandas Series of raw strings (anything offering ``reset_index``
            and ``iloc`` works); it is re-indexed from 0.
        tokenizer: Optional tokenizer to reuse. When omitted, a ``roberta-base``
            ``RobertaTokenizer`` is loaded, as before.
    """

    def __init__(self, text, tokenizer=None):
        self.text = text.reset_index(drop=True)
        if tokenizer is None:
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of texts."""
        return len(self.text)

    def __getitem__(self, index):
        """Return ``input_ids`` and ``attention_mask`` tensors for item ``index``.

        Raises:
            IndexError: If ``index`` is outside the dataset.
        """
        # Positional lookup: accepts negative indices and raises IndexError past
        # the end, which is what terminates a plain `for item in dataset` loop
        # (a label lookup raises KeyError there instead).
        raw_text = self.text.iloc[index]
        txt = clean_inputs(raw_text)
        encoding = self.tokenizer(txt, return_tensors="pt", padding='max_length', truncation=True)

        # Drop only the leading dimension
        # (assumes the tokenizer returns a batch of size 1 for a single string — TODO confirm)
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

In [12]:
# Pick the device for inference: the GPU when CUDA is available, the CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [17]:
# Loading the trained weights, transferring the model to the device, and setting it for evaluation
# map_location="cpu" lets the checkpoint open whatever device it was saved from
# (for example a GPU checkpoint on a CPU-only machine); the model is moved to `device` right after.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
# NOTE(review): absolute, user-specific path; consider a configurable location.
model1.load_state_dict(
    torch.load('/home/shreshthsharma/fin-para/model_sentiment/best.pth', map_location="cpu")
)
model1.to(device)
model1.eval();

In [18]:
# Enter input: the raw sentence to classify (the inference cell passes it through clean_inputs)
text = "The airline service was very good"

In [22]:
# Inference code snippet
# truncation=True caps over-long inputs at the tokenizer's maximum length, as the Data class does
inputs = tokenizer(clean_inputs(text), return_tensors="pt", truncation=True).to(device)
with torch.no_grad():
    logits = model1(**inputs).logits
# Take the arg-max along the class dimension; .item() requires the single input used here
predicted_class_id = logits.argmax(dim=-1).item()
output = labels[predicted_class_id]
print(output)

positive
