In [1]:
import os
import typing as t
from collections import Counter

from openai import OpenAI
from pydantic import BaseModel

from bsky_net import Post, jsonl

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

In [37]:
MODEL = "gpt-4o-mini"
BSKY_TEST_PATH = "../data/experiments/test-sets/moderation-v1-sanitized.jsonl"

Stance = t.Literal["favor", "against", "none", "unrelated"]


class BskySample(Post):
    classification: t.Literal[0, 1]
    stance: Stance


class StanceReasoning(BaseModel):
    evidence: t.Any
    confidence: t.Literal["high", "medium", "low"]
    final_stance: Stance


class Evidence(BaseModel):
    phrase: str
    reasoning: str
    supports_stance: Stance


class StanceReasoningV1(StanceReasoning):
    evidence: list[Evidence]


class Result(t.TypedDict):
    topic: t.Literal["tp", "fp", "tn", "fn"]
    stance: t.Optional[t.Literal["tp", "fp", "tn", "fn"]]


def evaluate(pred: StanceReasoning, gold: BskySample) -> Result:
    def pred_on_topic() -> bool:
        return pred.final_stance in ["favor", "against"]

    def gold_on_topic() -> bool:
        return gold["classification"] == 1 and gold["stance"] != "none"

    def print_case(_type: str, gold: BskySample, pred: StanceReasoning):
        print(_type)
        print("TEXT: ", gold["text"])
        if "TOPIC" in _type:
            print("GOLD TOPIC: ", gold_on_topic())
            print("PRED TOPIC: ", pred_on_topic())
            print(
                "GOLD STANCE: ",
                gold["stance"] if "stance" in gold else "<nan>",
            )
            print("PRED STANCE: ", pred.final_stance)
            print("CONFIDENCE: ", pred.confidence)
            print("REASONING: ", pred.evidence)
            print("-" * 20)
        if "STANCE" in _type:
            print("GOLD STANCE: ", gold["stance"])
            print("PRED STANCE: ", pred.final_stance)
            print("CONFIDENCE: ", pred.confidence)
            print("REASONING: ", pred.evidence)
            print("-" * 20)

    if pred_on_topic():
        if not gold_on_topic():
            print_case("TOPIC - FALSE POSITIVE", gold, pred)
            return {"topic": "fp", "stance": None}

        if pred.final_stance == "favor":
            if gold["stance"] == "favor":
                # Full true positive, do nothing
                return {"topic": "tp", "stance": "tp"}
            if gold["stance"] == "against":
                print_case("STANCE - FALSE POSITIVE", gold, pred)
                return {"topic": "tp", "stance": "fp"}

        if pred.final_stance == "against":
            if gold["stance"] == "against":
                # Full true negative, do nothing
                return {"topic": "tp", "stance": "tn"}
            if gold["stance"] == "favor":
                print_case("STANCE - FALSE NEGATIVE", gold, pred)
                return {"topic": "tp", "stance": "fn"}

    elif not pred_on_topic():
        if not gold_on_topic():
            return {"topic": "tn", "stance": None}

        print_case("TOPIC - FALSE NEGATIVE", gold, pred)
        return {"topic": "fn", "stance": None}

    raise ValueError("Unknown case")


def confusion_mat(title: str, results: Counter) -> None:
    tp = results["tp"]
    tn = results["tn"]
    fp = results["fp"]
    fn = results["fn"]
    total = tp + tn + fp + fn

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = (
        2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    )
    accuracy = (tp + tn) / total if total > 0 else 0

    print(
        f"{title}"
        f"TP: {tp:<2} ({tp/total:.2%}) | FP: {fp:<2} ({fp/total:.2%})\n"
        f"FN: {fn:<2} ({fn/total:.2%}) | TN: {tn:<2} ({tn/total:.2%})\n\n"
        f"Precision: {precision:.4f}\n"
        f"Recall:    {recall:.4f}\n"
        f"F1 Score:  {f1_score:.4f}\n"
        f"Accuracy:  {accuracy:.4f}\n"
        f"{'=' * 20}"
    )


def experiment(system_prompt: str, response_format: type[StanceReasoning]):
    topic_eval = Counter()
    stance_eval = Counter()

    for sample in jsonl[BskySample].iter(BSKY_TEST_PATH):
        completion = client.beta.chat.completions.parse(
            model=MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": sample["text"]},
            ],
            temperature=0.0,
            response_format=response_format,
        )

        pred = completion.choices[0].message.parsed

        if not pred:
            continue

        result = evaluate(pred, sample)

        topic_eval[result["topic"]] += 1

        if result["stance"]:
            stance_eval[result["stance"]] += 1

    confusion_mat("TOPIC RESULTS\n\n", topic_eval)
    confusion_mat("STANCE RESULTS\n\n", stance_eval)


In [35]:
# CoT v1


system_prompt_v1 = """You are an NLP expert, tasked with annotating posts from the social network Bluesky to determine the post's stance on Bluesky's moderation policies, trust and safety, and user removal, especially in relation to the Bluesky development team. If the post is not CLEARLY referencing the BLUESKY PLATFORM ITSELF and ITS moderation policies, classify it's stance as 'unrelated'. Otherwise, classify the stance of the post as 'favor', 'against', or 'none'.

Since Bluesky is a social network, users are commonly using sarcasm, irony, and humor to playfully express their stance on the platform.

You are annotating the post's stance on the Bluesky platform/team's moderation policies/decisions, not the post's stance on moderation generally.

Ignore any posts that are not discussing the Bluesky platform/team specifically.
"""

experiment(system_prompt_v1, StanceReasoningV1)

# TOPIC RESULTS

# TP: 40 (35.71%) | FP: 14 (12.50%)
# FN: 9  (8.04%) | TN: 49 (43.75%)

# Precision: 0.7407
# Recall:    0.8163
# F1 Score:  0.7767
# Accuracy:  0.7946
# ====================
# STANCE RESULTS

# TP: 3  (7.50%) | FP: 2  (5.00%)
# FN: 3  (7.50%) | TN: 32 (80.00%)

# Precision: 0.6000
# Recall:    0.5000
# F1 Score:  0.5455
# Accuracy:  0.8750
# ====================

# TOPIC RESULTS

# TP: 42 (37.50%) | FP: 17 (15.18%)
# FN: 7  (6.25%) | TN: 46 (41.07%)

# Precision: 0.7119
# Recall:    0.8571
# F1 Score:  0.7778
# Accuracy:  0.7857
# ====================
# STANCE RESULTS

# TP: 3  (7.14%) | FP: 3  (7.14%)
# FN: 4  (9.52%) | TN: 32 (76.19%)

# Precision: 0.5000
# Recall:    0.4286
# F1 Score:  0.4615
# Accuracy:  0.8333
# ====================

TOPIC - FALSE POSITIVE
TEXT:  We should let Donald J Trump on here until he is imprisoned if he double dog promises to only skeet trolls of Pudding Rob
GOLD TOPIC:  False
PRED TOPIC:  True
GOLD STANCE:  <nan>
PRED STANCE:  against
CONFIDENCE:  high
REASONING:  [Evidence(phrase='We should let Donald J Trump on here until he is imprisoned', reasoning='The post suggests allowing a controversial figure on the platform, which implies a stance on moderation policies regarding user removal.', supports_stance='against')]
--------------------
STANCE - FALSE NEGATIVE
TEXT:  we've taken down the account. I need yall to understand this doesn't happen fast. I noticed the mention about it about 30 minutes after I got tagged. I escalated to moderation and we gathered evidence

on the first pass, because we were trying to move fast, we missed some posts. 1/n
GOLD STANCE:  favor
PRED STANCE:  against
CONFIDENCE:  high
REASONING:  [Evidence(phrase="we've taken down the account", reasoning="The post disc

In [16]:
# CoT v2

system_prompt_v2 = """You are an NLP expert, tasked with annotating posts from the social network Bluesky to determine the post's stance on Bluesky's moderation policies, trust and safety, and user removal, especially in relation to the Bluesky development team. If the post is not CLEARLY referencing the BLUESKY PLATFORM ITSELF and ITS moderation decisions/policies, classify it's stance as 'unrelated'. Otherwise, classify the stance of the post as 'favor', 'against', or 'none'."""

experiment(system_prompt_v2, StanceReasoningV1)

# TOPIC RESULTS

# TP: 41 (36.61%) | FP: 10 (8.93%)
# FN: 8  (7.14%) | TN: 53 (47.32%)

# Precision: 0.8039
# Recall:    0.8367
# F1 Score:  0.8200
# Accuracy:  0.8393
# ====================
# STANCE RESULTS

# TP: 5  (12.20%) | FP: 6  (14.63%)
# FN: 1  (2.44%) | TN: 29 (70.73%)

# Precision: 0.4545
# Recall:    0.8333
# F1 Score:  0.5882
# Accuracy:  0.8293
# ====================

TOPIC - FALSE POSITIVE
TEXT:  We should let Donald J Trump on here until he is imprisoned if he double dog promises to only skeet trolls of Pudding Rob
GOLD TOPIC:  False
PRED TOPIC:  True
--------------------
STANCE - FALSE POSITIVE
TEXT:  Unsurprisingly, BlueSky is also going to need a trust and safety team if it doesn’t want to be overrun with fascists. Content moderation is hard and not something that can be done by a handful of devs 🤷🏻‍♀️
GOLD STANCE:  against
PRED STANCE:  favor
REASONING:  [Evidence(phrase='BlueSky is also going to need a trust and safety team', reasoning='The post suggests that Bluesky requires a dedicated trust and safety team to effectively manage content moderation, indicating a need for improved moderation policies.', supports_stance='favor'), Evidence(phrase='Content moderation is hard and not something that can be done by a handful of devs', reasoning='This statement implies that the current moderation efforts by the development team are insufficient, adv

In [17]:
# CoT v3

system_prompt_v3 = """You are an NLP expert, tasked with annotating posts from the social network Bluesky to determine the post's stance on the Bluesky team's approach to and handling of moderation policies, trust and safety (T&S), and user removal. 

If the post is not CLEARLY referencing the BLUESKY PLATFORM ITSELF and ITS moderation approach/decisions/policies, classify it's stance as 'unrelated'. Otherwise, classify the stance of the post as 'favor', 'against', or 'none'."""

experiment(system_prompt_v3, StanceReasoningV1)

# --------------------
# TOPIC RESULTS

# TP: 39 (34.82%) | FP: 9  (8.04%)
# FN: 10 (8.93%) | TN: 54 (48.21%)

# Precision: 0.8125
# Recall:    0.7959
# F1 Score:  0.8041
# Accuracy:  0.8304
# ====================
# STANCE RESULTS

# TP: 5  (12.82%) | FP: 2  (5.13%)
# FN: 1  (2.56%) | TN: 31 (79.49%)

# Precision: 0.7143
# Recall:    0.8333
# F1 Score:  0.7692
# Accuracy:  0.9231
# ====================


TOPIC - FALSE POSITIVE
TEXT:  We should let Donald J Trump on here until he is imprisoned if he double dog promises to only skeet trolls of Pudding Rob
GOLD TOPIC:  False
PRED TOPIC:  True
--------------------
TOPIC - FALSE NEGATIVE
TEXT:  Pretty sure the org I manage is bigger than the whole bsky team.

Just a gentle expectation-setting reminder.
GOLD TOPIC:  True
PRED TOPIC:  False
--------------------
TOPIC - FALSE NEGATIVE
TEXT:  I'm curious, is the expectation for malicious account removal really <60 minutes in all circumstances?

I'm not sure I've ever been on a site of any size that had anywhere near that response time.

This is not criticising the criticism, I'm genuinely curious if that is/should be the expectation.
GOLD TOPIC:  True
PRED TOPIC:  False
--------------------
TOPIC - FALSE POSITIVE
TEXT:  *A LOT of trust and safety experts* suddenly appear
GOLD TOPIC:  False
PRED TOPIC:  True
--------------------
TOPIC - FALSE POSITIVE
TEXT:  i would like to nominate myself for T

In [25]:
# CoT v4

system_prompt_v4 = """You are an NLP expert, tasked with annotating posts from the social network Bluesky to determine the post's stance on the Bluesky team's approach to and handling of moderation policies, trust and safety (T&S), and user removal. Bluesky's moderation approach is defined by their tech-first, decentralized focus, where users/communities are enabled to leverage moderation tooling instead of relying on centralized decisions by the Bluesky team. 

If the post is not CLEARLY referencing the BLUESKY PLATFORM ITSELF and ITS moderation approach/decisions/policies, classify it's stance as 'unrelated'. Otherwise, classify the stance of the post as 'favor', 'against', or 'none'. Provide evidence of your classification that references phrases from the post."""

experiment(system_prompt_v4, StanceReasoningV1)

# TOPIC RESULTS

# TP: 38 (33.93%) | FP: 9  (8.04%)
# FN: 11 (9.82%) | TN: 54 (48.21%)

# Precision: 0.8085
# Recall:    0.7755
# F1 Score:  0.7917
# Accuracy:  0.8214
# ====================
# STANCE RESULTS

# TP: 2  (5.26%) | FP: 2  (5.26%)
# FN: 4  (10.53%) | TN: 30 (78.95%)

# Precision: 0.5000
# Recall:    0.3333
# F1 Score:  0.4000
# Accuracy:  0.8421
# ====================


TOPIC - FALSE POSITIVE
TEXT:  If I wanted to experiment with somehow adding spoiler tags to bsky is that something I could do myself?
GOLD TOPIC:  False
PRED TOPIC:  True
GOLD STANCE:  <nan>
PRED STANCE:  favor
REASONING:  [Evidence(phrase='experiment with somehow adding spoiler tags to bsky', reasoning='The post discusses the possibility of adding features (spoiler tags) to the Bluesky platform, indicating an interest in user-driven modifications.', supports_stance='favor')]
--------------------
TOPIC - FALSE NEGATIVE
TEXT:  we've taken down the account. I need yall to understand this doesn't happen fast. I noticed the mention about it about 30 minutes after I got tagged. I escalated to moderation and we gathered evidence

on the first pass, because we were trying to move fast, we missed some posts. 1/n
GOLD TOPIC:  True
PRED TOPIC:  False
GOLD STANCE:  favor
PRED STANCE:  none
REASONING:  [Evidence(phrase='we escalated to moderation and we gathered evidence', reasoning="This phrase i

In [28]:
# CoT v5 -- few-shot

system_prompt_v5 = """You are an NLP expert, tasked with annotating posts from the social network Bluesky to determine the post's stance on the Bluesky team's approach to and handling of moderation policies, trust and safety (T&S), and user removal. 

If the post is not CLEARLY referencing the BLUESKY PLATFORM ITSELF and ITS moderation approach/decisions/policies, classify it's stance as 'unrelated'. Otherwise, classify the stance of the post as 'favor', 'against', or 'none'.

Examples:

Post: "I've moderated in several spaces and visibility and transparency was something I learned. I've been on hot seat myself and had to grow.\n\nOne thing that failed was hiding. When people try to hide in the groups I've moderates, it crumbled.\n\nI don't want to see things crumble again :("
{
    "evidence": [
        {
            "phrase": "one thing that failed was hiding",
            "reasoning": "The user criticizes the Bluesky team's lack of visibility and transparency in handling moderation decisions.",
            "supports_stance": "against"
        },
        {
            "phrase": "I don't want to see things crumble again",
            "reasoning": "The user expresses a concern about the potential failure of Bluesky's moderation system.",
            "supports_stance": "against"
        }
    ],
    "final_stance": "against"
}

Post: "Jack Dorsey's key insight was that there's no way to build a social media app without becoming a punching bag as people complain about content moderation or feed/ad recommendations.\n\nSo he decided to fund an app where the developers couldn't be blamed for these things. Bluesky is that dream realized"
{
    "evidence": [
        {
            "phrase": "no way to build a social media app without becoming a punching bag",
            "reasoning": "The user describes the difficulty of desiging moderation systems on social networks in general.",
            "supports_stance": "none"
        },
        {
            "phrase": "So he decided to fund an app where the developers couldn't be blamed for these things. Bluesky is that dream realized",
            "reasoning": "The user uses sarcasm to suggest that Bluesky was created to avoid responsibility for moderation decisions, criticizing Bluesky's lack of ownership over their moderation decisions.",
            "supports_stance": "against"
        }
    ],
    "final_stance": "against"
}

Post: "around an hour seems like a reasonable turnaround time for a permaban imo ¯\\(ツ)_/¯"
{
    "evidence": [
        {
            "phrase": "around an hour seems like a reasonable turnaround time for a permaban imo",
            "reasoning": "The post discusses the turnaround time for a permanent ban, which relates directly to moderation policies on the Bluesky platform.",
            "supports_stance": "favor"
        }
    ],
    "final_stance": "favor"
}

Post: "Please just play the game, I'm. Begging you. If you think moderation is easy and the team should just get it right, humble yourself https://moderatormayhem.engine.is"
{
    "evidence": [
        {
            "phrase": "If you think moderation is easy and the team should just get it right, humble yourself",
            "reasoning": "he post suggests that the Bluesky moderation team faces challenges and implies that criticism of their efforts may be unwarranted. This indicates a supportive stance towards the moderation team.",
            "supports_stance": "favor"
        }
    ],
    "supports_stance": "favor"
}

Post: "just encountered the classic moderation trick on reddit DM from \"reddit\" saying they took action on my report, found it violated the rules, linked to what i reported! Reader, the blatant transphobia they said they investigated and found violated their rules, was still up"
{
    "evidence": [
        {
            "phrase": "the blatant transphobia they said they investigated and found violated their rules, was still up",
            "reasoning": "The post criticizes moderation practices on Reddit, highlighting a failure to remove content that violates rules. However, it does not reference Bluesky's moderation policies or decisions, making it unrelated to the Bluesky platform.",
            "supports_stance": "unrelated"
        }
    ],
    "supports_stance": "unrelated"
}
"""

experiment(system_prompt_v5, StanceReasoningV1)

# TOPIC RESULTS

# TP: 39 (34.82%) | FP: 6  (5.36%)
# FN: 10 (8.93%) | TN: 57 (50.89%)

# Precision: 0.8667
# Recall:    0.7959
# F1 Score:  0.8298
# Accuracy:  0.8571
# ====================
# STANCE RESULTS

# TP: 5  (12.82%) | FP: 4  (10.26%)
# FN: 1  (2.56%) | TN: 29 (74.36%)

# Precision: 0.5556
# Recall:    0.8333
# F1 Score:  0.6667
# Accuracy:  0.8718
# ====================

TOPIC - FALSE NEGATIVE
TEXT:  Pretty sure the org I manage is bigger than the whole bsky team.

Just a gentle expectation-setting reminder.
GOLD TOPIC:  True
PRED TOPIC:  False
GOLD STANCE:  favor
PRED STANCE:  none
REASONING:  [Evidence(phrase='the org I manage is bigger than the whole bsky team', reasoning="The user compares their organization to the Bluesky team, implying that Bluesky may be under-resourced or lacking in capacity. This suggests a critical view of the Bluesky team's effectiveness without directly addressing moderation policies.", supports_stance='none')]
--------------------
TOPIC - FALSE NEGATIVE
TEXT:  I'm curious, is the expectation for malicious account removal really <60 minutes in all circumstances?

I'm not sure I've ever been on a site of any size that had anywhere near that response time.

This is not criticising the criticism, I'm genuinely curious if that is/should be the expectation.
GOLD TOPIC:  True
PRED TOPIC:  False
GOLD STANCE:  favor
PRED STANCE:  n

In [41]:
# CoT v6 -- with confidence

system_prompt_v6 = """You are an NLP expert, tasked with annotating posts from the social network Bluesky. Your goal is to detect the post's stance on Bluesky's moderation policies, trust and safety, and user removal, especially in relation to the Bluesky development team. 

If the post is not CLEARLY referencing the BLUESKY PLATFORM ITSELF and ITS moderation policies, classify it's stance as 'unrelated'. Otherwise, classify the stance of the post as 'favor', 'against', or 'none'.
"""


experiment(system_prompt_v6, StanceReasoningV1)


TOPIC - FALSE POSITIVE
TEXT:  We should let Donald J Trump on here until he is imprisoned if he double dog promises to only skeet trolls of Pudding Rob
GOLD TOPIC:  False
PRED TOPIC:  True
GOLD STANCE:  <nan>
PRED STANCE:  favor
CONFIDENCE:  medium
REASONING:  [Evidence(phrase='We should let Donald J Trump on here until he is imprisoned', reasoning='The post suggests allowing a controversial figure (Donald J Trump) on the platform, which implies a stance on moderation policies regarding user removal. It indicates a preference for leniency in moderation until a specific condition is met, which can be interpreted as favoring less strict moderation.', supports_stance='favor')]
--------------------
STANCE - FALSE POSITIVE
TEXT:  Unsurprisingly, BlueSky is also going to need a trust and safety team if it doesn’t want to be overrun with fascists. Content moderation is hard and not something that can be done by a handful of devs 🤷🏻‍♀️
GOLD STANCE:  against
PRED STANCE:  favor
CONFIDENCE:  hig