In [6]:
# Libraries
import sys
import pandas as pd
import numpy as np 
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification     
from scipy.special import softmax    

In [7]:
# Hugging Face model identifier passed to both `from_pretrained` calls below.
# (Plain string: the original f-string had no placeholders.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# Loaded once at module level; get_sentiment_roberta reads these globals on
# every call instead of re-loading them.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)



In [8]:
# Order in which the classifier's three logits are mapped to labels.
_SENTIMENT_LABELS = ('negative', 'neutral', 'positive')


def _score_text(text):
    """Run the module-level tokenizer/model on ``text`` and return softmax scores keyed by label."""
    # `return_tensors` (plural) is the keyword that makes the tokenizer emit
    # tensors the model can consume.
    encoded_text = tokenizer(text, return_tensors='pt')
    output = model(**encoded_text)
    scores = softmax(output.logits[0].detach().numpy())
    return dict(zip(_SENTIMENT_LABELS, scores))


def get_sentiment_roberta(text):
    """Classify ``text`` as 'negative', 'neutral' or 'positive'.

    Parameters
    ----------
    text : str or NA-like
        The text to score. NA values, empty strings and whitespace-only
        strings are treated as "no text".

    Returns
    -------
    tuple[str, dict]
        ``(label, scores)`` where ``scores`` maps 'negative', 'neutral' and
        'positive' to a score and ``label`` is the key with the highest score.
        For "no text" the label is 'neutral' and all scores are 0.

    Notes
    -----
    If scoring the whole text raises (the original code treats this as the
    text being too long for the model), the text is split on '.' and each
    non-blank fragment is scored separately; the returned scores are the
    per-label means and the label is the highest mean, so the label always
    agrees with the scores that are returned.

    Raises
    ------
    Exception
        Whatever the tokenizer/model raised, if ``text`` is not a string (it
        cannot be split into sentences) or if a single fragment still fails.
    """
    if pd.isna(text) or (isinstance(text, str) and not text.strip()):
        print("It was an NA")
        return 'neutral', {label: 0 for label in _SENTIMENT_LABELS}

    try:
        sentiment_label = _score_text(text)
    except Exception:
        if not isinstance(text, str):
            # Nothing to split; surface the real error instead of masking it
            # with an AttributeError from `.split`.
            raise
        sentiment_label = None

    if sentiment_label is None:
        print("Too long, splot and calculate")
        # Skip blank fragments (e.g. after a trailing '.') so they do not
        # dilute the averages.
        sentence_scores = [
            _score_text(sentence)
            for sentence in text.split('.')
            if sentence.strip()
        ]
        if not sentence_scores:
            return 'neutral', {label: 0 for label in _SENTIMENT_LABELS}
        sentiment_label = {
            label: np.mean([scores[label] for scores in sentence_scores])
            for label in _SENTIMENT_LABELS
        }

    max_sentiment = max(sentiment_label, key=sentiment_label.get)
    return max_sentiment, sentiment_label

In [14]:
def main():
    """Score the 'text' column of a CSV file with ``get_sentiment_roberta``.

    The CSV path is read from ``sys.argv[1]``. Missing values in the 'text'
    column are replaced with a single space before scoring.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame with an added 'roberta' label column and
        'negative' / 'neutral' / 'positive' score columns, or ``None`` when no
        path argument was supplied. Returning the frame lets the caller keep
        working with it (``df = main()``); inside ``main`` it is only a local.
    """
    if len(sys.argv) < 2:
        print("Not enough command arguments porivded.")
        return None
    file_path = sys.argv[1]
    print("Loading file")
    reviews = pd.read_csv(file_path, encoding='latin1').fillna({'text': ' '})

    # Run Roberta model on text
    print("Running roberta model ")
    print("Head of the text column", reviews['text'].head(10))
    results = [get_sentiment_roberta(text) for text in reviews['text']]

    # Build the score columns on the same index as the input so the
    # column-wise concat lines rows up even if the index is not 0..n-1.
    score_columns = pd.DataFrame(
        [scores for _, scores in results],
        columns=['negative', 'neutral', 'positive'],
        index=reviews.index,
    )
    reviews = pd.concat(
        [reviews.assign(roberta=[label for label, _ in results]), score_columns],
        axis=1,
    )

    print(reviews)
    return reviews

NameError: name 'df' is not defined