# Opinion Analysis

## Initialize and import

In [107]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import dotenv
import pandas as pd

from src.apis.common import is_invalid_json, run_inference_parallel_with_retry
from src.apis.google import gemini_infer
from src.apis.openai import openai_infer
from src.filter_references import process_raw_results
from src.load_court_data import load_sc_data
from src.templates import render_template
from src.token_count import analyze_corpus_tokens

# Load variables from a local .env file into the process environment
# (presumably the API keys used by the inference helpers — confirm).
dotenv.load_dotenv()

# Prompt template rendered for every court opinion sent to the model.
TEMPLATE_NAME = "extract_acts_03.jinja2"

# Input data locations, relative to the notebook's working directory.
SC_PATH = Path("data", "sc_opinions.json")
CC_PATH = Path("data", "NALUS.json")

# Supreme court opinions go through the project loader; the constitutional
# court dump is read directly with pandas.
sc_df = load_sc_data(SC_PATH)
cc_df = pd.read_json(CC_PATH)

In [None]:
# Send a single throwaway prompt through the Fireworks helper and keep the
# response object so the next cell can inspect it.
# NOTE(review): looks like a manual connectivity check — confirm it is still needed.
from src.apis.fireworks import fireworks_infer

response = fireworks_infer("heollo")

In [15]:
# Show the message text of the first choice in the decoded JSON payload.
response.json()["choices"][0]["message"]["content"]

## Compute tokens

In [None]:
# Token analysis over the text of every Supreme court case.
print(f"Supreme court ({len(sc_df)} cases):")
analyze_corpus_tokens(sc_df.text.tolist())

# Same analysis over the Constitutional court corpus. This call is the last
# expression in the cell, so any value it returns is rendered by the notebook.
print(f"\nConstitutional court ({len(cc_df)} cases):")
analyze_corpus_tokens(cc_df.text.tolist())

## Check extraction

### Sample and get results

In [2]:
# Only opinions with a non-empty `numbers` entry are eligible for the check.
has_numbers = sc_df.numbers.apply(len) > 0

# Fixed seed so the same 100 opinions are drawn on every run.
sample_100 = sc_df[has_numbers].sample(100, random_state=42)
texts_100 = sample_100.text.tolist()
numbers_100 = sample_100.numbers.tolist()

# One rendered prompt per sampled opinion, in sample order.
templatize_100 = [
    render_template(TEMPLATE_NAME, court_opinion=opinion)
    for opinion in texts_100
]

# Run all prompts through Gemini using the parallel, retrying runner.
results_raw = run_inference_parallel_with_retry(templatize_100, gemini_infer)

### Check wrong JSON parsing

After multiple checks, Gemini Flash performed better than GPT-4o mini. GPT-4o mini frequently omitted paragraph numbers, and the issue remained even after prompt engineering.

Gemini Flash 2.0 initially had an issue with infinite repetition. This occurred frequently with temperature = 0.1 or 0.0, or with top_k = 1. With temperature = 1.0, the issue happened only occasionally. Lowering top_p did not help either.

I could not see improved accuracy with lower temperatures.

In [3]:
# Print the position of every raw model output that is_invalid_json flags.
for idx, raw in enumerate(results_raw):
    if not is_invalid_json(raw):
        continue
    print(idx)

### Check parsed paragraph numbers

In [5]:
processed = process_raw_results(results_raw)  # type: ignore

# Compare the reference numbers of each sampled opinion with what was
# extracted, and report the opinions where something is missing.
for idx, (expected, extracted) in enumerate(
    zip(numbers_100, processed, strict=True)
):
    missing = set(expected) - set(extracted)
    if not missing:
        # Every reference number was extracted.
        continue

    row = sample_100.iloc[idx]
    opinion_text = row.text
    # Skip the opinion when none of the missing numbers occurs anywhere in its
    # text (plain substring match on the stringified number).
    if not any(str(num) in opinion_text for num in missing):
        continue

    print()
    print(idx, expected, extracted, missing)
    print(row.permanent_link)

## Just do it

## Review

In [17]:
# Display the data loaded from NALUS.json for manual review.
cc_df