In [None]:
import langsmith
from dotenv import load_dotenv
import numpy as np

# Load variables from a .env file before building the client.
# NOTE(review): presumably this supplies the LangSmith credentials that
# Client() picks up from the environment -- confirm against the deployment setup.
load_dotenv()
# Shared LangSmith client used by every query cell below.
client = langsmith.Client()

In [2]:
# Fetch the root runs of the "geochat-local" project and keep only those
# whose feedback_stats is truthy, i.e. runs with some feedback recorded.
root_runs = client.list_runs(project_name="geochat-local", is_root=True)
runs = [candidate for candidate in root_runs if candidate.feedback_stats]

In [3]:
# Collect the raw feedback entries of every retained run, keyed by run id.
# feedback_stats is not used here because the per-entry fields
# (key, value, score, correction) are needed by the analysis below.
feedbacks = {}
for scored_run in runs:
    # One request per run; list() materialises everything the client yields.
    feedbacks[scored_run.id] = list(client.list_feedback(run_ids=[scored_run.id]))

In [6]:
# Tallies filled in by the loop below and read back by the statistics cell.
ab_total_count = 0      # number of 'ab' feedback entries seen
ab_context_count = 0    # 'ab' entries whose value equals their correction['context_response']
ab_stars_context = []   # star scores whose original_position matches the context position
ab_stars_alt = []       # star scores at any other position within a run that has 'ab' feedback
ab_stars_solo = []      # star scores from runs that carry no 'ab' feedback

for k, run_feedbacks in feedbacks.items():
    print('-'*50)
    ab_feedbacks = [entry for entry in run_feedbacks if entry.key == 'ab']
    star_feedbacks = [entry for entry in run_feedbacks if entry.key == 'stars']
    print(f"Run id: {k} AB: {len(ab_feedbacks)} Stars: {len(star_feedbacks)}")

    if not ab_feedbacks:
        # No 'ab' feedback on this run: every star rating is recorded as "solo".
        for rating in star_feedbacks:
            score = rating.score
            print(f"Stars for solo response: {score}")
            ab_stars_solo.append(score)
        continue

    # Star ratings are classified against the first 'ab' entry only, while
    # the counters below take every 'ab' entry of the run into account.
    first_vote = ab_feedbacks[0]
    context_position = first_vote.correction['context_response']
    print("Context msg position:", context_position)
    ab_total_count += len(ab_feedbacks)
    ab_context_count += sum(
        1
        for vote in ab_feedbacks
        if vote.value == vote.correction['context_response']
    )
    print("Selected msg position:", first_vote.value)

    for rating in star_feedbacks:
        score = rating.score
        if rating.correction['original_position'] == context_position:
            print(f"Stars for context response: {score}")
            ab_stars_context.append(score)
        else:
            print(f"Stars for alternative response: {score}")
            ab_stars_alt.append(score)

--------------------------------------------------
Run id: 9f6367cb-ea26-46dd-9045-b28db0bfcf92 AB: 0 Stars: 1
Stars for solo response: 4.0
--------------------------------------------------
Run id: 71944ae3-38bf-4bf5-812e-1c428adc7652 AB: 1 Stars: 0
Context msg position: Option A
Selected msg position: Option A
--------------------------------------------------
Run id: bd4464f1-d03b-4d65-bf12-30d59f818811 AB: 1 Stars: 0
Context msg position: Option B
Selected msg position: Option B
--------------------------------------------------
Run id: 9b9a4cef-a856-46dc-b225-a71fa6a1f6cf AB: 1 Stars: 0
Context msg position: Option A
Selected msg position: Option A
--------------------------------------------------
Run id: 3bbb0709-40ad-4ae8-96ea-8fc92cba4b0f AB: 1 Stars: 0
Context msg position: Option A
Selected msg position: Option A
--------------------------------------------------
Run id: 1fffee41-484c-4981-8607-6139b817ba15 AB: 1 Stars: 0
Context msg position: Option B
Selected msg position:

In [7]:
def _mean_or_nan(scores):
    """Return the arithmetic mean of ``scores``, or NaN when it is empty.

    ``np.array([]).mean()`` also evaluates to NaN but emits RuntimeWarnings
    along the way; returning NaN directly keeps the printed value the same
    without the warning noise when a category received no star ratings.
    """
    if len(scores) == 0:
        return float("nan")
    return np.array(scores).mean()


# Summary of the tallies accumulated by the aggregation cell above.
print("-"*50)
print("Statistics:")
print("AB total count:", ab_total_count)
print("AB context count:", ab_context_count)
print("AB context stars:", ab_stars_context)
print("AB context stars mean:", _mean_or_nan(ab_stars_context))
print("AB alternative stars:", ab_stars_alt)
print("AB alternative stars mean:", _mean_or_nan(ab_stars_alt))
print("AB solo stars:", ab_stars_solo)
print("AB solo stars mean:", _mean_or_nan(ab_stars_solo))

--------------------------------------------------
Statistics:
AB total count: 14
AB context count: 10
AB context stars: [2.0, 3.0, 0.0]
AB context stars mean: 1.6666666666666667
AB alternative stars: [2.0, 4.0]
AB alternative stars mean: 3.0
AB solo stars: [4.0, 2.0, 1.0]
AB solo stars mean: 2.3333333333333335
