<a href="https://colab.research.google.com/github/joshcova/LLMs-for-social-scientists/blob/main/code/03_ChatGPT_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook we will go over how to classify the same texts that we have classified in our classifiers.ipynb notebook, but this time using OpenAI's API.

Note that to run this notebook, you will first have to set up an [API key](https://platform.openai.com/docs/api-reference/introduction) and save it locally.

It is important that you do not share the API key with others.

In [None]:
! pip install retry

In [None]:
# Load necessary libraries

import os
from openai import OpenAI
import pandas as pd
import requests
import re
from typing import List, Dict
from retry.api import retry_call
from tqdm.auto import tqdm
import time
tqdm.pandas()

## Setting up an API key

In [None]:
# WARNING: Use this method only for testing and personal projects
# Set up your API key (https://platform.openai.com/docs/quickstart)

# The key is read from the OPENAI_API_KEY environment variable so that it never
# appears in the notebook itself. Set that variable before running this cell.
# (The previous os.environ.get("") looked up an empty variable name and therefore
# always returned None.)
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

## Loading our classification sample

Let us first load our dataset of UK newspaper headlines. Note that, as interacting with OpenAI's API is not free, we will only conduct this analysis on a sample of the corpus of newspaper headlines.

In [None]:
# Read the headline corpus, keep the topic code (renamed to "label") and the
# headline text, then draw a fixed-seed sample of 50 headlines per label.
df = (
    pd.read_csv("https://raw.githubusercontent.com/joshcova/LLMs-for-social-scientists/main/data/uk_media_2.csv")
    .loc[:, ["majortopic", "text"]]
    .rename(columns={"majortopic": "label"})
    .groupby("label")
    .sample(n=50, random_state=1)
)

## Prompting the LLM

In [None]:
# Importing the Open AI's libraries

import openai
from openai import OpenAIError

# Here we create a function to automate our API requests

def send_prompt_with_context(model: str,
                             messages: List[Dict],
                             max_tokens: int = 0) -> str:
    """Send a chat conversation to the OpenAI API and return the reply text.

    Args:
        model: Name of the chat model to query.
        messages: Chat messages, each a dict with "role" and "content" keys.
        max_tokens: Optional cap on the length of the reply. The default of 0
            means "no cap": the parameter is then left out of the request.

    Returns:
        The text content of the first choice in the API response.
    """
    request_kwargs = dict(
        model=model,
        messages=messages,
        temperature=0.0,  # Low temperatures result in more deterministic outputs, high temperatures in more stochastic outputs
        seed=42,  # Setting the same seed is important for reproducibility
        frequency_penalty=0,
        presence_penalty=0,
    )
    # max_tokens used to be accepted but silently ignored; forward it only when
    # the caller actually set a limit so that the default behaviour is unchanged.
    if max_tokens:
        request_kwargs["max_tokens"] = max_tokens
    response = client.chat.completions.create(**request_kwargs)
    return response.choices[0].message.content

# Category names and definitions; both are interpolated into the system prompt
# built by predict_sentiment.
# NOTE(review): the two do not agree. `categories` names class 0 "Law & Crime" and
# class 2 "Unclear", whereas `definitions` describes class 0 as a topic *other than*
# Macroeconomics or Law & Crime and never defines a Law & Crime class. Confirm how
# `label` is coded in the dataset and align both before trusting the metrics.
categories = ["0: Law & Crime", "1: Macroeconomics", "2: Unclear"]

# NOTE: this is a regular (non-raw) string, so the doubled backslash at the end of
# the second definition ends up as one literal backslash in the prompt text.
definitions = """
0: The newspaper headline concerns a topic other than Macroeconomics or Law & Crime.
1: The newspaper headline concerns the topic Macroeconomics. \\
2: It is unclear whether the newspaper headline concerns the topics Law & Crime or Macroeconomics.
"""

def predict_sentiment(review: str, model: str) -> str:
    """Ask a chat model to assign one newspaper headline to a category.

    Despite the name, this performs category classification rather than
    sentiment analysis. The module-level `categories` and `definitions` are read
    when the function is called, so their current values end up in the prompt.

    Args:
        review: The headline text to classify (sent as the user message).
        model: Name of the chat model to query.

    Returns:
        The model's raw text reply; the prompt asks for the category number only.
    """
    # The prompt text is deliberately left exactly as it was (including the
    # literal backslashes at the line ends and the unmatched closing quote), so
    # that previously obtained annotations remain comparable.
    system_msg = f"""
                    You are a skilled research assistant who will help to classify newspaper headlines. \\
                    Classify the following text into one of the given categories: {categories}\n{definitions} \\
                    Only include the number of the selected category in your response and no further text."
                    """
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": review},
    ]
    return send_prompt_with_context(model, messages)

## Sending out the texts

In [None]:
# This is where we interact with the API. Note that you can choose from different GPT models.
# predict_sentiment is called once per headline, i.e. one API request per row,
# and the raw text replies are stored in the new "GPT_4o" column.

df['GPT_4o'] = df['text'].progress_apply(lambda x: predict_sentiment(x, model='gpt-4o-2024-11-20'))

## Reviewing a subset of annotated sentences

In [None]:
# Show a small fixed-seed random subset of the annotated headlines. Ending the
# cell with the frame (instead of print) uses the notebook's table display.
df[['text', 'GPT_4o']].sample(10, random_state=1)

In [None]:
# To run our performance metrics, we first need to ensure that the data is saved in the same type

# The API returns text. Pull out the category digit so that replies such as "1."
# or "Category 1" still parse, and fail with a clear message if a reply contains
# no category number at all. The astype(str) keeps the cell safe to re-run.
gpt_labels = df["GPT_4o"].astype(str).str.extract(r"([0-2])", expand=False)
if gpt_labels.isna().any():
    raise ValueError(
        f"{int(gpt_labels.isna().sum())} GPT reply/replies contain no category number (0, 1 or 2)"
    )
df["GPT_4o"] = gpt_labels.astype(int)

## Validation

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score

# Overall agreement between the reference labels and the GPT annotations
macro_f1 = f1_score(df["label"], df["GPT_4o"], average='macro')
micro_f1 = f1_score(df["label"], df["GPT_4o"], average='micro')
balanced_acc = balanced_accuracy_score(df["label"], df["GPT_4o"])

metrics = {
    "Metric": ["F1 Score (macro)", "F1 Score (micro)", "Balanced Accuracy"],
    "Value": [macro_f1, micro_f1, balanced_acc],
}

# One row per metric, for a tidy tabular display
results_df = pd.DataFrame(metrics)

# Show the table as the cell output
results_df

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Metric</th>
      <th>Value</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>F1 Score (macro)</td>
      <td>0.727384</td>
    </tr>
    <tr>
      <th>1</th>
      <td>F1 Score (micro)</td>
      <td>0.746667</td>
    </tr>
    <tr>
      <th>2</th>
      <td>Balanced Accuracy</td>
      <td>0.746667</td>
    </tr>
  </tbody>
</table>
</div>

In [None]:
# Per-class precision, recall and F1 for the GPT annotations
# (swap df["GPT_4o"] for another prediction column to evaluate a different model)
class_ids = [0, 1, 2]
precision_per_class = precision_score(df["label"], df["GPT_4o"], average=None, labels=class_ids)
recall_per_class = recall_score(df["label"], df["GPT_4o"], average=None, labels=class_ids)
f1_per_class = f1_score(df["label"], df["GPT_4o"], average=None, labels=class_ids)

# Accuracy is a single global figure rather than a per-class one, so it is not repeated here.

# Collect the three score arrays into one table, one row per class
metrics_per_class_df = pd.DataFrame({
    "Class": class_ids,
    "Precision": precision_per_class,
    "Recall": recall_per_class,
    "F1 Score": f1_per_class,
})

# Show the table as the cell output
metrics_per_class_df

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Class</th>
      <th>Precision</th>
      <th>Recall</th>
      <th>F1 Score</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>0</td>
      <td>0.863636</td>
      <td>0.38</td>
      <td>0.527778</td>
    </tr>
    <tr>
      <th>1</th>
      <td>1</td>
      <td>0.920000</td>
      <td>0.92</td>
      <td>0.920000</td>
    </tr>
    <tr>
      <th>2</th>
      <td>2</td>
      <td>0.602564</td>
      <td>0.94</td>
      <td>0.734375</td>
    </tr>
  </tbody>
</table>
</div>

In [None]:
# For future reference, it is advisable to save the resulting dataframe locally
# (the annotations come from paid API calls). The file name is a relative path,
# so the file is written to the current working directory.

df.to_csv("uk_media_2_results.csv")

## Central bank independence corpus

Let us see how well ChatGPT performs on our other dataset, which contains parliamentary interventions on central bank independence.

In [None]:
# Load the sample of parliamentary interventions on central bank independence
cbi_sample_url = "https://raw.githubusercontent.com/joshcova/LLMs-for-social-scientists/main/data/uk_cbi_sample.csv"
df_cbi = pd.read_csv(cbi_sample_url)

In [None]:
import openai
from openai import OpenAIError

def send_prompt_with_context(model: str,
                             messages: List[Dict],
                             max_tokens: int = 0) -> str:
    """Send a chat conversation to the OpenAI API and return the reply text.

    NOTE(review): a function with this name is also defined earlier in the
    notebook; this later definition replaces it once the cell is run.

    Args:
        model: Name of the chat model to query.
        messages: Chat messages, each a dict with "role" and "content" keys.
        max_tokens: Optional cap on the length of the reply. The default of 0
            means "no cap": the parameter is then left out of the request.

    Returns:
        The text content of the first choice in the API response.
    """
    request_kwargs = dict(
        model=model,
        messages=messages,
        temperature=0.0,  # low temperature for more deterministic outputs
        seed=42,  # fixed seed for reproducibility
        frequency_penalty=0,
        presence_penalty=0,
    )
    # max_tokens used to be accepted but silently ignored; forward it only when
    # the caller actually set a limit so that the default behaviour is unchanged.
    if max_tokens:
        request_kwargs["max_tokens"] = max_tokens
    response = client.chat.completions.create(**request_kwargs)
    return response.choices[0].message.content

# Category names and definitions for the central bank independence task; both are
# interpolated into the system prompt built by predict_sentiment.
# NOTE: these assignments rebind the module-level `categories` and `definitions`
# used earlier for the newspaper headlines, so the earlier values are lost once
# this cell has run.
categories = ["0: anti-independence", "1: pro-independence", "2: unrelated"]

# NOTE: this is a regular (non-raw) string, so each doubled backslash at a line
# end becomes one literal backslash in the prompt text.
definitions = """
0: The statement expresses opposition for central bank independence for the Bank of England. \\
1: The statement expresses support for central bank independence for the Bank of England. \\
2: The statement does not contain a clear expression in support or opposition to Bank of England central bank independence or is on an unrelated topic
(e.g. on the European Central Bank).
"""

def predict_sentiment(review: str, model: str) -> str:
    """Ask a chat model to classify one parliamentary intervention on central bank independence.

    Despite the name, this performs stance/category classification rather than
    sentiment analysis. The module-level `categories` and `definitions` are read
    when the function is called, so their current values end up in the prompt.

    NOTE(review): a function with this name is also defined earlier in the
    notebook; this later definition replaces it once the cell is run.

    Args:
        review: The statement text to classify (sent as the user message).
        model: Name of the chat model to query.

    Returns:
        The model's raw text reply; the prompt asks for the category number only.
    """
    # The prompt text is deliberately left exactly as it was (including the
    # literal backslashes at the line ends and the unmatched closing quote), so
    # that previously obtained annotations remain comparable.
    system_msg = f"""
                    You are a skilled research assistant who will help to classify parliamentary interventions on central bank independence. \\
                    Central bank independence can relate to formal independence (the legal provisions that guarantee the central bank's autonomy, such as its mandate, its organizational structure, and the procedures for appointing its leaders), and actual independence (taking into account factors such as its political and institutional environment, its relationship with the government, and the level of transparency and accountability in its operations). \\
                    Classify the following text into one of the given categories: {categories}\n{definitions} \\
                    Only include the number of the selected category in your response and no further text."
                    """
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": review},
    ]
    return send_prompt_with_context(model, messages)


In [None]:
# predict_sentiment is called once per statement in the "sents" column, i.e. one
# API request per row; the raw text replies are stored in the new "GPT_4o" column.
df_cbi['GPT_4o'] = df_cbi['sents'].progress_apply(lambda x: predict_sentiment(x, model='gpt-4o-2024-11-20'))

In [None]:
# The API returns text. Pull out the category digit so that replies such as "1."
# or "Category 1" still parse, and fail with a clear message if a reply contains
# no category number at all. The astype(str) keeps the cell safe to re-run.
gpt_labels_cbi = df_cbi["GPT_4o"].astype(str).str.extract(r"([0-2])", expand=False)
if gpt_labels_cbi.isna().any():
    raise ValueError(
        f"{int(gpt_labels_cbi.isna().sum())} GPT reply/replies contain no category number (0, 1 or 2)"
    )
df_cbi["GPT_4o"] = gpt_labels_cbi.astype(int)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score

# Overall agreement between the reference labels ("results_number") and the GPT annotations
macro_f1_cbi = f1_score(df_cbi["results_number"], df_cbi["GPT_4o"], average='macro')
micro_f1_cbi = f1_score(df_cbi["results_number"], df_cbi["GPT_4o"], average='micro')
balanced_acc_cbi = balanced_accuracy_score(df_cbi["results_number"], df_cbi["GPT_4o"])

metrics = {
    "Metric": ["F1 Score (macro)", "F1 Score (micro)", "Balanced Accuracy"],
    "Value": [macro_f1_cbi, micro_f1_cbi, balanced_acc_cbi],
}

# One row per metric, for a tidy tabular display
results_df_cbi = pd.DataFrame(metrics)

# Show the table as the cell output
results_df_cbi

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Metric</th>
      <th>Value</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>F1 Score (macro)</td>
      <td>0.622342</td>
    </tr>
    <tr>
      <th>1</th>
      <td>F1 Score (micro)</td>
      <td>0.726667</td>
    </tr>
    <tr>
      <th>2</th>
      <td>Balanced Accuracy</td>
      <td>0.701783</td>
    </tr>
  </tbody>
</table>
</div>

In [None]:
# Per-class precision, recall and F1 for the GPT annotations
# (swap df_cbi["GPT_4o"] for another prediction column to evaluate a different model)
class_ids_cbi = [0, 1, 2]
precision_per_class_cbi = precision_score(df_cbi["results_number"], df_cbi["GPT_4o"], average=None, labels=class_ids_cbi)
recall_per_class_cbi = recall_score(df_cbi["results_number"], df_cbi["GPT_4o"], average=None, labels=class_ids_cbi)
f1_per_class_cbi = f1_score(df_cbi["results_number"], df_cbi["GPT_4o"], average=None, labels=class_ids_cbi)

# Accuracy is a single global figure rather than a per-class one, so it is not repeated here.

# Collect the three score arrays into one table, one row per class
metrics_per_class_df_cbi = pd.DataFrame({
    "Class": class_ids_cbi,
    "Precision": precision_per_class_cbi,
    "Recall": recall_per_class_cbi,
    "F1 Score": f1_per_class_cbi,
})

# Show the table as the cell output
metrics_per_class_df_cbi

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Class</th>
      <th>Precision</th>
      <th>Recall</th>
      <th>F1 Score</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>0</td>
      <td>0.225806</td>
      <td>0.700000</td>
      <td>0.341463</td>
    </tr>
    <tr>
      <th>1</th>
      <td>1</td>
      <td>0.897436</td>
      <td>0.823529</td>
      <td>0.858896</td>
    </tr>
    <tr>
      <th>2</th>
      <td>2</td>
      <td>0.780488</td>
      <td>0.581818</td>
      <td>0.666667</td>
    </tr>
  </tbody>
</table>
</div>