In [304]:
from collections import Counter
from pathlib import Path
from pprint import pprint

import pandas as pd
from colorama import init, Fore, Back, Style
# NOTE(review): private pandas module; kept only so existing references to `defaultdict` keep working.
from pandas._libs.internals import defaultdict
from transformers import BertTokenizer

from model import BertForMultiLabelClassification
from multilabel_pipeline import MultiLabelPipeline

# colorama: reset colors automatically after each print.
init(autoreset=True)

# Base name of the dataset; other cells build their CSV file names from it.
name = "US_foreign_policy_in_the_Middle_East1974-2024by3months"

# Tokenizer and classifier are loaded from the same pretrained checkpoint.
GOEMOTIONS_CHECKPOINT = "monologg/bert-base-cased-goemotions-original"
tokenizer = BertTokenizer.from_pretrained(GOEMOTIONS_CHECKPOINT)
model = BertForMultiLabelClassification.from_pretrained(GOEMOTIONS_CHECKPOINT)

# Classification pipeline shared by the scoring cells below.
go_emotions_pipe = MultiLabelPipeline(model=model, tokenizer=tokenizer, threshold=0.3)

In [305]:
# Load the dataset for this run from "<name>.csv" (path relative to the working directory).
table = pd.read_csv(f"{name}.csv")

In [306]:
def chunk_text(text, max_length=511):
    """Split ``text`` into consecutive pieces of at most ``max_length`` tokens.

    The text is tokenized with the module-level ``tokenizer``; every run of
    ``max_length`` tokens is turned back into a string with
    ``tokenizer.convert_tokens_to_string``. The final piece holds whatever
    tokens are left over and may therefore be shorter.

    Args:
        text: The text to split.
        max_length: Maximum number of tokens per piece; must be at least 1.

    Returns:
        A list of strings in document order; empty when ``text`` yields no
        tokens.

    Raises:
        ValueError: If ``max_length`` is smaller than 1. (Previously such a
            value silently returned the whole text as one oversized piece.)
    """
    if max_length < 1:
        raise ValueError(f"max_length must be >= 1, got {max_length!r}")

    # NOTE(review): the default of 511 leaves room for only one special token;
    # presumably the model adds two ([CLS] and [SEP]) -- confirm before relying
    # on the default. The visible caller always passes max_length explicitly.
    tokens = tokenizer.tokenize(text)

    # Token ids are not needed for chunking, so the tokens are sliced directly.
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + max_length])
        for start in range(0, len(tokens), max_length)
    ]


In [307]:
# Number of tokens per chunk handed to the classifier (existing name kept so
# other cells that reference it keep working).
chuck_length = 32


def getScores(text):
    """Return, per emotion, the fraction of chunks of ``text`` labelled with it.

    ``text`` is split by ``chunk_text`` into pieces of ``chuck_length`` tokens
    and every piece is classified on its own by ``go_emotions_pipe``. A label
    reported for a piece counts once for that piece, however often it is
    listed. The scores returned by the pipeline are not used.

    Args:
        text: Document to score. ``None`` or NaN (a missing CSV cell) is
            treated as a document without text.

    Returns:
        A dict mapping each emotion label that was reported at least once to
        ``chunks carrying the label / total chunks``, in order of first
        appearance. Emotions that were never reported are absent. The dict is
        empty when there is nothing to classify.
    """
    # NaN is the only value that is not equal to itself.
    if text is None or text != text:
        return {}

    chunks = chunk_text(text, max_length=chuck_length)

    chunk_hits = Counter()
    for chunk in chunks:
        # The pipeline takes a batch; each chunk is sent as a batch of one.
        for item in go_emotions_pipe([chunk]):
            # dict.fromkeys drops repeated labels but keeps their order; the
            # tuple makes Counter count keys instead of adding mapping values.
            chunk_hits.update(tuple(dict.fromkeys(item["labels"])))

    num_chunks = len(chunks)
    # With no chunks there are no hits, so the division is never reached.
    return {emotion: hits / num_chunks for emotion, hits in chunk_hits.items()}
    

In [308]:
# Score every document first, then attach the results as whole columns.
# Writing single cells with ``table.loc[i, emotion]`` inside ``iterrows()``
# modified the frame while it was being iterated and grew it cell by cell.
row_scores = [getScores(content) for content in table["content"]]

# One row per document and one column per emotion seen anywhere; a document
# that lacks an emotion gets NaN in that column.
score_frame = pd.DataFrame(row_scores, index=table.index)
for emotion in score_frame.columns:
    # Assign by position so the result does not depend on index alignment.
    table[emotion] = score_frame[emotion].to_numpy()

print(table.head(10))

   Unnamed: 0                                              title  \
0           0            America's Middle East Policy Has Failed   
1           1  WATCH: What's Next for U.S. Policy in the Midd...   
2           2  Timeline: Oil Dependence and U.S. Foreign Poli...   
3           3  Book Launch | American Diplomacy Toward Lebano...   
4           4  The End of De-Escalation in the Middle East? -...   
5           5  Washington's Looming Middle Eastern Quagmire |...   
6           6  Hearings | United States Senate Committee on F...   
7           7                         Foreign Policy | Brookings   
8           8              Brian Katulis | Middle East Institute   
9           9  Testing Assumptions About US Foreign Policy in...   

                                                link     date_range  \
0  https://foreignpolicy.com/2024/01/11/israel-ha...  202311-202311   
1  https://ncafp.org/watch-whats-next-for-u-s-pol...  202311-202311   
2  https://www.cfr.org/timeline/oil-de

In [309]:
# Only the emotion columns should default to 0.0. A frame-wide fillna would
# also overwrite missing values in the columns that came from the CSV (title,
# link, ...) with 0.0. Emotion columns are the ones not present in the CSV
# header; nrows=0 reads just that header.
source_columns = set(pd.read_csv(name + ".csv", nrows=0).columns)
emotion_columns = [column for column in table.columns if column not in source_columns]
if emotion_columns:
    table[emotion_columns] = table[emotion_columns].fillna(0.0)

In [310]:
# to_csv does not create missing directories, so make sure ./data exists first.
output_dir = Path("./data")
output_dir.mkdir(parents=True, exist_ok=True)

# The index is still written as the first column, so the output layout is unchanged.
table.to_csv(output_dir / f"{name}_scored.csv")