<a href="https://colab.research.google.com/github/joshcova/LLMs-for-social-scientists/blob/main/code/ChatGPT_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install retry

In [3]:
import os
from openai import OpenAI
import pandas as pd
import requests
import re
from typing import List, Dict
from retry.api import retry_call
from tqdm.auto import tqdm
import time
tqdm.pandas()

## Setting up an API key

In [None]:
# Set up your API key (https://platform.openai.com/docs/quickstart).
# The key is read from the OPENAI_API_KEY environment variable so that it is
# never written into the notebook itself. The previous lookup used an empty
# variable name (""), which cannot match a real variable, so no key was ever
# passed explicitly.

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

## Loading our classification sample

In [None]:
# Load the CSV file, keep only the topic code (renamed to "label") and the
# text, then draw a reproducible sample of 50 rows per label.
df = (
    pd.read_csv("https://raw.githubusercontent.com/joshcova/LLMs-for-social-scientists/main/data/uk_media_2.csv")[["majortopic", "text"]]
    .rename(columns={"majortopic": "label"})
    .groupby("label")
    .sample(n=50, random_state=1)
)

## Prompting the LLM

In [None]:
import openai
from openai import OpenAIError

def send_prompt_with_context(model: str,
                             messages: List[Dict],
                             max_tokens: int = 0) -> str:
    """Send a chat conversation to the OpenAI API and return the reply text.

    Uses the module-level ``client``. Sampling is pinned (temperature 0, fixed
    seed, no frequency/presence penalty) to reduce run-to-run variation.

    Args:
        model: Name of the chat model to query.
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        max_tokens: Upper bound on the reply length in tokens. The default of
            0 means "no explicit limit", which matches the earlier behaviour
            where this argument was accepted but never used.

    Returns:
        The message content of the first choice in the response.
    """
    request_options = {
        "model": model,
        "messages": messages,
        "temperature": 0.0,
        "seed": 42,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }
    # Forward the limit only when the caller actually asked for one, so the
    # default call sends exactly the same request as before.
    if max_tokens and max_tokens > 0:
        request_options["max_tokens"] = max_tokens

    response = client.chat.completions.create(**request_options)
    return response.choices[0].message.content

# Category labels inserted into the system prompt; the model is asked to
# answer with the leading number only.
categories = ["0: Law & Crime", "1: Macroeconomics", "2: Unclear"]

# One definition per category, inserted into the system prompt after the labels.
# NOTE(review): this is an ordinary (non-raw) string, so each `\\` below is an
# escape for one literal backslash that ends up in the prompt — confirm this is
# intended rather than a single-backslash line continuation.
definitions = """
0: The newspaper headline concerns the topic Law & Crime. \\
1: The newspaper headline concerns the topic Macroeconomics. \\
2: It is unclear whether the newspaper headline concerns the topics Law & Crime or Macroeconomics.
"""

def predict_sentiment(review: str, model: str) -> str:
    """Classify one newspaper headline and return the model's raw reply.

    Despite its name, this function performs topic classification: it builds
    a system prompt from the module-level ``categories`` and ``definitions``,
    sends ``review`` as the user message, and returns the reply text.

    Args:
        review: The text to classify.
        model: Name of the chat model to query.

    Returns:
        The reply text, which the prompt asks to be the category number only.
    """
    # The prompt text is kept exactly as before (including its indentation and
    # the literal backslashes produced by `\\`), except that a stray,
    # unmatched double quote at the end of the last instruction was removed.
    # NOTE(review): the indentation and backslashes are also sent to the model;
    # confirm whether they are intended.
    system_msg = f"""
                    You are a skilled research assistant who will help to classify newspaper headlines. \\
                    Classify the following text into one of the given categories: {categories}\n{definitions} \\
                    Only include the number of the selected category in your response and no further text.
                    """
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": review},
    ]
    return send_prompt_with_context(model, messages)

## Sending out the texts

In [None]:
# Classify every text with the pinned GPT-4o snapshot (one predict_sentiment
# call per row) and store the raw replies in a new "GPT_4o" column.
df['GPT_4o'] = df['text'].progress_apply(lambda x: predict_sentiment(x, model='gpt-4o-2024-11-20'))

## Reviewing a subset of annotated sentences

In [None]:
# Show each text next to the category returned by the model.
print(df[['text', 'GPT_4o']])

In [None]:
# Convert the model replies to integers so they can be scored against the labels.
# NOTE(review): this raises if any reply is not a bare number (e.g. the model
# added extra text) — inspect the replies before casting.
df["GPT_4o"] = df["GPT_4o"].astype(int)

## Validation

In [None]:
# To evaluate a different model, replace df["GPT_4o"] below with that model's prediction column
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score

# Overall agreement between the labels and the GPT-4o predictions.
gold, predicted = df["label"], df["GPT_4o"]
scores = [
    ("F1 Score (macro)", f1_score(gold, predicted, average='macro')),
    ("F1 Score (micro)", f1_score(gold, predicted, average='micro')),
    ("Balanced Accuracy", balanced_accuracy_score(gold, predicted)),
]

# Split the (name, value) pairs into the two columns of the results table.
metrics = {
    "Metric": [name for name, _ in scores],
    "Value": [value for _, value in scores],
}
results_df = pd.DataFrame(metrics)

# Display the results table
results_df

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Metric</th>
      <th>Value</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>F1 Score (macro)</td>
      <td>0.601442</td>
    </tr>
    <tr>
      <th>1</th>
      <td>F1 Score (micro)</td>
      <td>0.830000</td>
    </tr>
    <tr>
      <th>2</th>
      <td>Balanced Accuracy</td>
      <td>0.830000</td>
    </tr>
  </tbody>
</table>
</div>

In [None]:
# Calculating metrics per class
# To evaluate a different model, replace df["GPT_4o"] below with that model's prediction column
# Precision, recall and F1 for each category code, computed in that order.
class_ids = [0, 1, 2]
precision_per_class, recall_per_class, f1_per_class = (
    score(df["label"], df["GPT_4o"], average=None, labels=class_ids)
    for score in (precision_score, recall_score, f1_score)
)

# Accuracy is a global (not class-specific) metric, so it is not recomputed here.

# One row per class, one column per metric.
metrics_per_class_df = pd.DataFrame({
    "Class": class_ids,
    "Precision": precision_per_class,
    "Recall": recall_per_class,
    "F1 Score": f1_per_class,
})

# Display the results table
metrics_per_class_df

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Class</th>
      <th>Precision</th>
      <th>Recall</th>
      <th>F1 Score</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>0</td>
      <td>0.976744</td>
      <td>0.84</td>
      <td>0.903226</td>
    </tr>
    <tr>
      <th>1</th>
      <td>1</td>
      <td>1.000000</td>
      <td>0.82</td>
      <td>0.901099</td>
    </tr>
    <tr>
      <th>2</th>
      <td>2</td>
      <td>0.000000</td>
      <td>0.00</td>
      <td>0.000000</td>
    </tr>
  </tbody>
</table>
</div>

In [None]:
# For future reference, it is advisable to save the resulting dataframe locally:
# the file keeps the texts, labels and GPT-4o predictions, so the API calls
# do not have to be repeated.

df.to_csv("uk_media_2_results.csv")

## Central bank independence corpus

Let us see how well ChatGPT performs on our other dataset on parliamentary interventions on central bank independence.  

In [None]:
# Load the central bank independence sample. This rebinds `df`, so the media
# sample used above is no longer available under that name.
df = pd.read_csv("https://raw.githubusercontent.com/joshcova/LLMs-for-social-scientists/main/data/uk_cbi_sample.csv")

In [None]:
# Sanity check: the texts to classify are expected in the 'sents' column.
if 'sents' not in df.columns:
    print("The 'sents' column was not found in the CSV file.")
else:
    # Pull the column out as a plain list of texts.
    texts = df['sents'].tolist()

    # Print the first five texts to verify the load (adjust the slice as needed).
    for text in texts[:5]:
        print(text)

In [None]:
import openai
from openai import OpenAIError

def send_prompt_with_context(model: str,
                             messages: List[Dict],
                             max_tokens: int = 0) -> str:
    """Send a chat conversation to the OpenAI API and return the reply text.

    Uses the module-level ``client``. Sampling is pinned (temperature 0, fixed
    seed, no frequency/presence penalty) to reduce run-to-run variation.

    Args:
        model: Name of the chat model to query.
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
        max_tokens: Upper bound on the reply length in tokens. The default of
            0 means "no explicit limit", which matches the earlier behaviour
            where this argument was accepted but never used.

    Returns:
        The message content of the first choice in the response.
    """
    request_options = {
        "model": model,
        "messages": messages,
        "temperature": 0.0,
        "seed": 42,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }
    # Forward the limit only when the caller actually asked for one, so the
    # default call sends exactly the same request as before.
    if max_tokens and max_tokens > 0:
        request_options["max_tokens"] = max_tokens

    response = client.chat.completions.create(**request_options)
    return response.choices[0].message.content

# Category labels inserted into the system prompt; the model is asked to
# answer with the leading number only.
categories = ["0: anti-independence", "1: pro-independence", "2: unrelated"]

# One definition per category, inserted into the system prompt after the labels.
# Fixed wording errors that were being sent to the model ("statemnent",
# "opposition for").
# NOTE(review): each `\\` is an escape for one literal backslash that ends up
# in the prompt — confirm this is intended rather than a line continuation.
definitions = """
0: The statement expresses opposition to central bank independence. \\
1: The statement expresses support for central bank independence. \\
2: The statement does not contain a clear expression in support or opposition to central bank independence.
"""

def predict_sentiment(review: str, model: str) -> str:
    """Classify one parliamentary statement and return the model's raw reply.

    Despite its name, this function classifies the stance towards central
    bank independence: it builds a system prompt from the module-level
    ``categories`` and ``definitions``, sends ``review`` as the user message,
    and returns the reply text.

    Args:
        review: The text to classify.
        model: Name of the chat model to query.

    Returns:
        The reply text, which the prompt asks to be the category number only.
    """
    # The prompt text is kept exactly as before (including its indentation and
    # the literal backslashes produced by `\\`), except that a stray,
    # unmatched double quote at the end of the last instruction was removed.
    # NOTE(review): the indentation and backslashes are also sent to the model;
    # confirm whether they are intended.
    system_msg = f"""
                    You are a skilled research assistant who will help to classify parliamentary interventions on central bank independence. \\
                    Central bank independence can relate to formal independence (the legal provisions that guarantee the central bank's autonomy, such as its mandate, its organizational structure, and the procedures for appointing its leaders), and actual independence (taking into account factors such as its political and institutional environment, its relationship with the government, and the level of transparency and accountability in its operations). \\
                    Classify the following text into one of the given categories: {categories}\n{definitions} \\
                    Only include the number of the selected category in your response and no further text.
                    """
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": review},
    ]
    return send_prompt_with_context(model, messages)


In [None]:
# Classify every statement with the pinned GPT-4o snapshot (one
# predict_sentiment call per row) and store the raw replies in "GPT_4o".
df['GPT_4o'] = df['sents'].progress_apply(lambda x: predict_sentiment(x, model='gpt-4o-2024-11-20'))

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Work on an independent copy for validation, so that columns added to
# df_validate cannot leak into (or share data with) the original df.
# pd.DataFrame(df) does not copy the data of an existing DataFrame by default.

df_validate = df.copy()

In [None]:
# Convert the model replies ("0", "1", "2") to integers so that their dtype
# matches the reference codes (this is important for the subsequent step).
# A LabelEncoder is not suitable for this: it numbers whichever distinct
# replies happen to occur, in sorted order, so a category the model never
# returned (or an unexpected reply) would silently shift the codes away from
# the category numbers. A plain integer cast keeps "2" as 2 and fails loudly
# on anything that is not a number.

df_validate["GPT_4o_encoded"] = df_validate["GPT_4o"].astype(int)

In [None]:
# Reference codes the predictions are scored against.
# NOTE(review): presumably "results_number" holds the hand-coded category (0/1/2) — confirm against the CSV.
category_encoded = df_validate["results_number"]


In [None]:
# Overall agreement between the reference codes and the GPT-4o predictions.
predicted = df_validate["GPT_4o_encoded"]
scores = [
    ("F1 Score (macro)", f1_score(category_encoded, predicted, average='macro')),
    ("F1 Score (micro)", f1_score(category_encoded, predicted, average='micro')),
    ("Balanced Accuracy", balanced_accuracy_score(category_encoded, predicted)),
]

# Split the (name, value) pairs into the two columns of the results table.
metrics = {
    "Metric": [name for name, _ in scores],
    "Value": [value for _, value in scores],
}

# Tabular representation of the scores
results_df = pd.DataFrame(metrics)
print(results_df)

<div>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Metric</th>
      <th>Value</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>F1 Score (macro)</td>
      <td>0.610818</td>
    </tr>
    <tr>
      <th>1</th>
      <td>F1 Score (micro)</td>
      <td>0.713333</td>
    </tr>
    <tr>
      <th>2</th>
      <td>Balanced Accuracy</td>
      <td>0.710517</td>
    </tr>
  </tbody>
</table>
</div>


In [None]:
# Calculating metrics per class
# To evaluate a different model, replace df_validate["GPT_4o_encoded"] below with that model's prediction column
# Precision, recall and F1 for each category code, computed in that order.
class_ids = [0, 1, 2]
precision_per_class, recall_per_class, f1_per_class = (
    score(category_encoded, df_validate["GPT_4o_encoded"], average=None, labels=class_ids)
    for score in (precision_score, recall_score, f1_score)
)

# Accuracy is a global (not class-specific) metric, so it is not recomputed here.

# One row per class, one column per metric.
metrics_per_class_df = pd.DataFrame({
    "Class": class_ids,
    "Precision": precision_per_class,
    "Recall": recall_per_class,
    "F1 Score": f1_per_class,
})

# Display the results table
metrics_per_class_df

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Class</th>
      <th>Precision</th>
      <th>Recall</th>
      <th>F1 Score</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>0</td>
      <td>0.250000</td>
      <td>0.800000</td>
      <td>0.380952</td>
    </tr>
    <tr>
      <th>1</th>
      <td>1</td>
      <td>0.848837</td>
      <td>0.858824</td>
      <td>0.853801</td>
    </tr>
    <tr>
      <th>2</th>
      <td>2</td>
      <td>0.812500</td>
      <td>0.472727</td>
      <td>0.597701</td>
    </tr>
  </tbody>
</table>
</div>