In [1]:
import pandas as pd
import os
import re
import requests
import json
import time
import yfinance as yf
from time import sleep
from pathlib import Path
from tqdm import tqdm
from datetime import datetime as dt
from openai import OpenAI

Analyze Sentiment Scores with GPT-5 Nano

In [3]:
# Read the earnings-call table; the first CSV column becomes the row index.
all_calls = pd.read_csv("all_calls.csv", index_col=0)

In [4]:
# Load the universe file and keep just its Ticker column as a plain list.
custom_sp_universe = pd.read_csv("spx2015_top3sectors_50.csv")["Ticker"].tolist()

In [5]:
# Restrict the calls to rows whose ticker appears in the universe list.
all_calls = all_calls.loc[all_calls["ticker"].isin(custom_sp_universe)]

In [6]:
# API client used by call_gpt_nano; constructed with default arguments, so no
# credentials are passed explicitly here.
client = OpenAI()

# Model identifier sent with every request.
MODEL = "gpt-5-nano"
# Passed as max_output_tokens on every request.
# NOTE(review): presumably reasoning tokens also count toward this cap for
# reasoning models — confirm 500 leaves room for the JSON answer.
MAX_OUTPUT_TOKENS = 500
# Transcripts are truncated to this many characters before being appended to
# the prompt (see build_prompt).
CHAR_CAP = 80_000
# CSV file that progress is restored from at start-up and written to while
# the rows are processed.
CHECKPOINT_PATH = "all_calls_progress.csv"
# A checkpoint is written after this many newly processed rows.
SAVE_EVERY = 50

# Instruction text placed in front of every transcript. The seven keys in the
# output template below are the same names listed in result_cols, which is how
# the parsed reply is mapped onto dataframe columns.
PROMPT_HEADER = """I will provide the transcript of an earnings call. Your job is to analyze the text only based on what is actually present in the transcript. For each of the following categories, assign a score between -1 and 1:

forward_looking_sentiment: How positive or negative is the company’s outlook or projections for the future?
management_confidence: How confident does management appear about business performance and strategy?
risk_and_uncertainty: How much concern, risk, or uncertainty is conveyed (higher = more risk)?
qa_sentiment: How positive or negative is the tone during the Q&A section with analysts?
opening_sentiment: How positive or negative is the opening section or prepared remarks?
financial_performance_sentiment: Based solely on what is said in the transcript, how positively is past financial performance portrayed?
macroeconomic_reference_sentiment: If there are references to broader macroeconomic conditions, how optimistic or pessimistic are those?

If a category is not addressed clearly in the transcript, return exactly 0 for that category.

Use the following format for your output:
{
  "forward_looking_sentiment": ___,
  "management_confidence": ___,
  "risk_and_uncertainty": ___,
  "qa_sentiment": ___,
  "opening_sentiment": ___,
  "financial_performance_sentiment": ___,
  "macroeconomic_reference_sentiment": ___
}
Do not include any text or explanation—only return the JSON object. Do not guess or infer information that is not directly stated in the transcript.

Transcript:"""

def build_prompt(transcript: str) -> str:
    """Return the full prompt: PROMPT_HEADER, a newline, then the transcript.

    The transcript is truncated to CHAR_CAP characters. Anything that is not a
    string is treated as an empty transcript. This matters for missing values
    read by pandas, which arrive as float NaN: NaN is truthy, so the previous
    ``transcript or ''`` guard let it through and the slice raised TypeError.
    """
    text = transcript if isinstance(transcript, str) else ""
    return f"{PROMPT_HEADER}\n{text[:CHAR_CAP]}"

def call_gpt_nano(prompt: str, max_retries: int = 5):
    """Send ``prompt`` to the model and return the stripped reply text.

    The request is attempted up to ``max_retries`` times. After a failed
    attempt the function sleeps for the matching entry of a fixed back-off
    schedule (the last entry is reused once the schedule is exhausted) and
    tries again.

    Returns:
        The reply text with surrounding whitespace removed, or ``None`` when
        every attempt failed (or ``max_retries`` is not positive). The last
        error is printed before giving up so failures are visible in the
        notebook output instead of vanishing silently.
    """
    delays = [1, 2, 5, 10, 20]
    last_error = None
    for attempt in range(max_retries):
        try:
            resp = client.responses.create(
                model=MODEL,
                input=prompt,
                max_output_tokens=MAX_OUTPUT_TOKENS,
                reasoning={"effort": "low"},
                text={"format": {"type": "json_object"}, "verbosity": "low"},
            )
            return resp.output_text.strip()
        except Exception as e:
            # Deliberately broad: any failure for one row (network, rate
            # limit, malformed response) must not abort the whole batch.
            last_error = e
            if attempt < max_retries - 1:
                time.sleep(delays[min(attempt, len(delays) - 1)])

    if last_error is not None:
        print(f"call_gpt_nano: giving up after {max_retries} attempts: {last_error!r}")
    return None

def safe_json_load(s: str):
    """Parse a model reply into a dict, returning ``{}`` when that is impossible.

    The reply is first parsed as-is. If that fails, the text between the first
    ``{`` and the last ``}`` is parsed instead, which copes with replies
    wrapped in Markdown code fences (with or without a ``json`` language tag)
    or surrounded by stray text.

    Args:
        s: Raw reply text. ``None``, empty/blank strings and non-string values
            (for example a float NaN read back from a CSV) yield ``{}``.

    Returns:
        The decoded JSON object as a dict, or ``{}`` if the text cannot be
        decoded or decodes to something other than an object. Callers use
        ``.get`` on the result, so a dict is always returned.
    """
    if not isinstance(s, str):
        return {}
    text = s.strip()
    if not text:
        return {}

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span; this drops code fences and
        # any language tag in one step.
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end <= start:
            return {}
        try:
            parsed = json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            return {}

    return parsed if isinstance(parsed, dict) else {}

# ---- load dataframe ----
# assume you already loaded all_calls with column earnings_call_raw_text
# all_calls = pd.read_csv("all_calls.csv")

# Resume support: copy columns that exist only in the checkpoint back onto
# all_calls.
#
# The checkpoint is written with index=False, so when it is read back its rows
# carry a fresh 0..n-1 index instead of all_calls' own labels. A plain
# `all_calls[col] = saved[col]` aligns on index labels and would attach saved
# results to the wrong rows (or drop them as NaN). The values are therefore
# copied by position, which is only meaningful when both frames have the same
# number of rows in the same order.
if os.path.exists(CHECKPOINT_PATH):
    saved = pd.read_csv(CHECKPOINT_PATH)
    if len(saved) != len(all_calls):
        # Refuse to continue: the run would otherwise end by overwriting the
        # checkpoint and discarding the results already stored in it.
        raise ValueError(
            f"Checkpoint {CHECKPOINT_PATH!r} has {len(saved)} rows but all_calls "
            f"has {len(all_calls)}; move or delete the checkpoint, or rebuild "
            "all_calls with the same filter that produced it."
        )
    for col in saved.columns:
        if col not in all_calls.columns:
            all_calls[col] = saved[col].to_numpy()

if "analysis_json" not in all_calls.columns:
    all_calls["analysis_json"] = None
# A column that was entirely empty in the checkpoint is read back as float;
# keep it object-typed so reply strings can be stored in it.
all_calls["analysis_json"] = all_calls["analysis_json"].astype(object)

# One output column per score requested in the prompt.
result_cols = [
    "forward_looking_sentiment",
    "management_confidence",
    "risk_and_uncertainty",
    "qa_sentiment",
    "opening_sentiment",
    "financial_performance_sentiment",
    "macroeconomic_reference_sentiment",
]
for c in result_cols:
    if c not in all_calls.columns:
        all_calls[c] = None

In [None]:
# Submit API calls and log progress.
# Rows that already hold a non-blank analysis_json are skipped, so re-running
# this cell continues where an earlier run stopped.
processed_since_save = 0
try:
    for i, row in all_calls.iterrows():
        existing = row.get("analysis_json")
        if pd.notna(existing) and str(existing).strip():
            continue

        prompt = build_prompt(row["earnings_call_raw_text"])
        txt = call_gpt_nano(prompt)
        all_calls.at[i, "analysis_json"] = txt

        parsed = safe_json_load(txt)
        if not isinstance(parsed, dict):
            # Valid JSON that is not an object (e.g. a list) has no scores.
            parsed = {}
        for c in result_cols:
            all_calls.at[i, c] = parsed.get(c)

        processed_since_save += 1
        if processed_since_save >= SAVE_EVERY:
            all_calls.to_csv(CHECKPOINT_PATH, index=False)
            processed_since_save = 0
finally:
    # Final save. Running it in `finally` also persists rows completed since
    # the last periodic checkpoint when the loop is interrupted or fails.
    all_calls.to_csv(CHECKPOINT_PATH, index=False)

print("Done. Checkpoint saved to", CHECKPOINT_PATH)