In [1]:
%load_ext dotenv
%dotenv
import os
from openai import OpenAI, AsyncOpenAI
from ratelimit import limits, sleep_and_retry
from hamison_datasets.UPV.load_upv import load_upv_dataset


# Load the UPV dataset once; every later cell reads its label columns and
# article text from this module-level object.
upv = load_upv_dataset()

# Async OpenAI client. The API key is read from the process environment
# (presumably populated by the %dotenv magic above -- confirm a .env file exists).
client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Alternative API-key environment variables (currently disabled):
# client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY_J"))
# client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY_ET"))

# NOTE(review): despite the name, this value is 125, not 60. It is passed as
# `period` to the rate limiter that decorates call_api, so the name is
# misleading -- consider renaming (together with its use on call_api).
ONE_MINUTE = 125
# Chat model used as call_api's default and interpolated into results file names.
model = "gpt-4o-mini"


@sleep_and_retry
@limits(calls=1, period=ONE_MINUTE)
async def call_api(client, prompt, text, model=model):
    """Ask the chat-completions endpoint to answer `text` under `prompt`.

    Args:
        client: Object exposing an awaitable `chat.completions.create(...)`
            (the notebook passes an `AsyncOpenAI` instance).
        prompt: Content of the system message.
        text: Content of the user message.
        model: Model name forwarded to the API; defaults to the module-level
            `model` as it was when this function was defined.

    Returns:
        The `content` of the message in the first returned choice.

    Calls are throttled by `limits(calls=1, period=ONE_MINUTE)` wrapped in
    `sleep_and_retry`.
    NOTE(review): those decorators wrap the *creation* of the coroutine and
    presumably wait with a blocking sleep -- confirm that is acceptable if this
    is ever awaited concurrently with other tasks.
    """
    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": text},
    ]
    response = await client.chat.completions.create(model=model, messages=messages)
    first_choice = response.choices[0]
    return first_choice.message.content


# Last expression of the cell: display the dataset's column labels.
upv.columns

Loaded UPV dataset from UPV_allURLs.csv
Loading html files...
Removing rows without html: [6668]


Extracting articles from HTML:   0%|          | 0/408 [00:00<?, ?it/s]

Index(['Medio', 'Título desmentido', 'URL', 'Titular enlazado',
       '¿Periodo de campaña?', '¿Qué campaña?', '¿Campaña?', 'Único',
       'Duplicado misma campaña', 'Corpus', 'Numeración duplicados',
       'ID Corpus', 'ID ALE', '1. Espacio', '2. Texto', '3. Multimedia',
       '4. Elemento desinformador', '5.  Tipo multimedia',
       '6. Alteración multimedia', '7. Cuenta emisora', '8. Fuente',
       '9. Protagonista', '10. Atributo', '11. Macro tema', '12. Populismo',
       '13. Ataque', '14. Tipo de ataque', '5.  Tipo multimedia reduced',
       '7. Cuenta emisora reduced', '9. Protagonista reduced',
       '14. Tipo de ataque reduced', 'html', 'article', 'text'],
      dtype='object')

In [2]:
# Build the classification prompt for variables 11 (theme) and 12 (populism).
# Results for this prompt are written to / resumed from `output_fname`.
output_fname = f'results_{model}_11-12.json'

# Class distribution of each label column. The normalized counts are rendered
# as plain "label  frequency" lines and embedded in the prompt as a hint.
# (The raw JSON counts are not used by the prompt itself.)
labels_11 = upv['11. Macro tema']
labels_11_count = labels_11.value_counts().to_json()
labels_11_count_norm = labels_11.value_counts(
    normalize=True).to_string(header=False)

labels_12 = upv['12. Populismo']
labels_12_count = labels_12.value_counts().to_json()
labels_12_count_norm = labels_12.value_counts(
    normalize=True).to_string(header=False)

# Fixes relative to the earlier version of this prompt:
#   * the class lists were built with raw f-strings, so "\n" reached the model
#     as a literal backslash + "n" instead of a line break;
#   * the closing tag was written "<\classes>" instead of "</classes>";
#   * both class lists are now delimited the same way.
# NOTE(review): records already stored in `output_fname` were produced with
# the earlier prompt text; regenerate them if strict comparability matters.
prompt = (
    """You are a journalist, expert in Spanish politics and media. You will be presented with an article from a fact-checking website. Your task is to classify the article along two variables: THEME and POPULIST STRATEGY. For each dimension, please choose only one of the given classes. To help you choose, next to each class I provide its normalized count.

For the THEME variable, the classes are:
"""
    + f"<classes>\n{labels_11_count_norm}\n</classes>"
    + """

For the POPULIST STRATEGY variable, the classes are:
"""
    + f"<classes>\n{labels_12_count_norm}\n</classes>"
    + '''

Output the chosen classes and your reasoning for each choice in JSON format:
{
"theme": chosen class for the THEME variable (string), 
"theme_reasoning": explanation for your choice of class for the THEME variable (string), 
"populist_strategy": chosen class for the POPULIST STRATEGY variable (string),
"populist_strategy_reasoning": explanation for your choice of class for the POPULIST STRATEGY variable (string)
}.'''
)

# print(prompt)

In [3]:
import json

# try-except:
with open(output_fname, 'r+') as output_sofar:
    preds = json.load(output_sofar)
    seen_indices = [record['ID_Medio_Crono'] for record in preds]

for idx, article in upv.iterrows():
    if idx in seen_indices:
        continue
    else:
        print(f'Prompting with article {idx}:', end=' ')

    text = article.text
    # print(idx, text[:90])
    try:
        req = await call_api(client, prompt, text)
        req = req.replace("```", "").removeprefix("json\n")
        record = {upv.index.name: idx}
        record.update(json.loads(req))
        preds.append(record)
        with open(output_fname, 'w+') as output_f:
            json.dump(preds, output_f, indent=4)
        print('OK')
    except Exception as e:
        print(f'Error')
        print(e)
        print('Last record:', record)
        break

Prompting with article 5890: OK
Prompting with article 5891: OK
Prompting with article 6323: OK
Prompting with article 6326: OK
Prompting with article 6327: OK
Prompting with article 6334: OK
Prompting with article 6335: OK
Prompting with article 6336: OK
Prompting with article 6337: OK
Prompting with article 6338: OK
Prompting with article 6345: OK
Prompting with article 6354: OK
Prompting with article 6355: OK
Prompting with article 6358: OK
Prompting with article 6360: OK
Prompting with article 6364: OK
Prompting with article 6367: OK


In [None]:
# Two-step classification prompt for variables 13 (attack) and 14 (type of
# attack). Results for this prompt go to `output_fname`.
output_fname = f'results_{model}_13-14.json'

# Class distribution of each label column; the normalized counts are embedded
# in the prompt as plain "label  frequency" lines.
# (The raw JSON counts are not used by the prompt itself.)
labels_13 = upv['13. Ataque']
labels_13_count = labels_13.value_counts().to_json()
labels_13_count_norm = labels_13.value_counts(
    normalize=True).to_string(header=False)

labels_14 = upv['14. Tipo de ataque']
labels_14_count = labels_14.value_counts().to_json()
labels_14_count_norm = labels_14.value_counts(
    normalize=True).to_string(header=False)

# The class lists use ordinary f-strings: with the former raw f-strings "\n"
# was sent as a literal backslash + "n", and the closing tag was written
# "<\classes>" instead of "</classes>". Both lists are delimited identically.
prompt = (
    """You are a journalist, expert in Spanish politics and media. You will be presented with an article from a fact-checking website. Your task is to classify the article along two variables, ATTACK and TYPE OF ATTACK. The variables are related, so you will do this in two steps.

First, decide whether the article describes a verbal ATTACK. The possible answers are:
"""
    + f"<classes>\n{labels_13_count_norm}\n</classes>"
    + """

If you decided that there was an attack, please pick the TYPE OF ATTACK from one of the classes below. If you decided that there was no attack, please choose the class "Sin ataque":
"""
    + f"<classes>\n{labels_14_count_norm}\n</classes>"
    + '''

Output the chosen classes and your reasoning for each choice in JSON format:
{
"attack": chosen class for the ATTACK variable (string), 
"attack_reasoning": explanation for your choice of class for the ATTACK variable (string), 
"type_of_attack": chosen class for the TYPE OF ATTACK variable (string),
"type_of_attack_reasoning": explanation for your choice of class for the TYPE OF ATTACK variable (string)
}.'''
)

print(prompt)

In [2]:
# Single-variable prompt for variable 9 (protagonist). Results for this prompt
# go to `output_fname`.
output_fname = f'results_{model}_9.json'
labels = upv['9. Protagonista']
# Raw counts (not used by the prompt itself) and normalized counts, both as
# JSON objects mapping class -> value.
labels_count = labels.value_counts().to_json()
labels_count_norm = labels.value_counts(normalize=True).to_json()

prompt = """You are a journalist, expert in Spanish politics and media. You will be presented with an article from a fact-checking website and your job is to choose to which class its PROTAGONIST belongs to. Please choose one class ONLY from the list below and output the chosen class and your reasoning in json format. To help you choose, the list of classes also contains the normalized count of each class. The resulting JSON object should have this format: {"prediction": chosen class (string), "reasoning": explanation (string)}. The list of classes you can choose from:"""
# Ordinary f-string: with the former raw f-string every "\n" was sent as a
# literal backslash + "n", and the closing tag was "<\classes>" rather than
# "</classes>".
prompt = prompt + f"\n<classes>\n{labels_count_norm}\n</classes>"

In [None]:
# Debugging aid: list the attributes of a chat-completion response and of the
# message in its first choice (only the last expression is displayed).
# NOTE(review): in the visible cells `completion` is bound only inside
# call_api, so on a fresh kernel this cell raises NameError unless the name is
# defined somewhere not shown here -- consider deleting the cell.
dir(completion)
dir(completion.choices[0].message)

In [None]:
# Break one article's text into chunks and order them longest-first.
# NOTE(review): `article` is not defined in this cell; in the visible notebook
# it is whatever row the classification loop last bound, so the result depends
# on prior execution state.
import re

# Three or more consecutive newlines mark a chunk boundary.
pat = r'\n\n\n+'
article_items = sorted(re.split(pat, article.text), key=len, reverse=True)