In [19]:
import os
import spacy
import json
from tqdm import tqdm
import numpy as np
from scipy.special import softmax

In [20]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

In [21]:
MODEL="cardiffnlp/twitter-roberta-base-sentiment-latest"

In [29]:
nlp = spacy.load("en_core_web_sm")

In [36]:
def process_file(filename):
    refined_json = dict()
    #sentiment_task = pipeline("sentiment-analysis", model=MODEL, tokenizer=MODEL)
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    config = AutoConfig.from_pretrained(MODEL)
    with open(filename, "r") as f:
        content = json.load(f)

    for k, v in content.items():
        c = " ".join(v.keys())
        doc = nlp(c)
        for word in doc:
            if word.is_stop or word.is_punct or word.is_space or word.like_num:
                txt = word.text
                del v[txt]
        refined_json[k] = v
        
    for k, v in refined_json.items():
        for term, frequency in tqdm(v.items()):
            info = dict()
            v[term] = dict()
            v[term]["frequency"] = frequency
#             result = sentiment_task(term)[0]
#             v[term]["sentiment"] = result
            token = tokenizer(term, return_tensors='pt')
            #print(token)
            output = model(**token)
            scores = output[0][0].detach().numpy()
            scores = softmax(scores)
            ranking = np.argsort(scores)
            ranking = ranking[::-1]
            v[term]["sentiment"] = dict()
            for i in range(scores.shape[0]):
                l = config.id2label[ranking[i]]
                s = round(float(scores[ranking[i]]), 3)
                v[term]["sentiment"][l] = s
                
            

    new_name = "refined_" + filename
    with open(new_name, "w") as f:
        json.dump(refined_json, f, indent=4)
    

In [37]:
process_file("results_3.json")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 335/335 [00:06<00:00, 51.33it/s]
100%|██████████| 358/358 [00:06<00:00, 51.51it/s]
100%|██████████| 379/379 [00:07<00:00, 50.97it/s]


In [38]:
process_file("results_0.json")
process_file("results_1.json")
process_file("results_2.json")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 365/365 [00:07<00:00, 50.78it/s]
100%|██████████| 363/363 [00:07<00:00, 50.71it/s]
100%|██████████| 363/363 [00:07<00:00, 51.39it/s]
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequ