In [15]:
import dspy
import pandas

In [16]:
# Load the annotated spreadsheet into a DataFrame and show which columns it has.
# The path is relative, so the file must sit in the notebook's working directory.
df_data = pandas.read_excel("300 revised.xlsx")
df_data.columns

Index(['discussion_post_id', 'discussion_post_content_clean',
       'discussion_post_classification', 'discussion_topic', 'course_title',
       'course_desc', 'context_independence', 'parent_discussion_post_id',
       'llama_70b_ta_response',
       'Clarify misunderstandings and address questions',
       'Deepen Disciplinary Understanding', 'Develop Higher-Order Thinking',
       'Enhance Metacognitive Awareness',
       'Foster Collaborative Knowledge Construction and Social Presence'],
      dtype='object')

In [17]:
# Load the second annotation file (CSV) into a DataFrame and show its columns.
# The path is relative, so the file must sit in the notebook's working directory.
df_topic_ann = pandas.read_csv("300_with_context_annotation.csv")
df_topic_ann.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'discussion_post_id',
       'discussion_post_content_clean', 'discussion_post_classification',
       'discussion_topic', 'course_title', 'course_desc',
       'context_independence', 'parent_discussion_post_id', 'mellon_id',
       'discussion_context', 'llama_70b_ta_response_with_similar',
       'Clarify misunderstandings and address questions',
       'Deepen Disciplinary Understanding', 'Develop Higher-Order Thinking',
       'Enhance Metacognitive Awareness',
       'Foster Collaborative Knowledge Construction and Social Presence',
       'Unnamed: 18', 'Unnamed: 19'],
      dtype='object')

In [None]:
from typing import List, Literal

class SimpleClassifier(dspy.Signature):
    """Given a discussion forum post and the corresponding teaching assistant response, classify how well this response adheres to pedagogical goals specified with a rubric. Provide your classification as integers."""
    # NOTE(review): dspy treats a Signature's docstring and each field's `desc`
    # as prompt text, so rewording any of them changes what the model is asked.
    # Confirm against the dspy Signature docs before editing the strings.

    # Inputs: one value of each is supplied per example.
    discussion_forum_post = dspy.InputField(desc="The original student forum post.")
    ta_response = dspy.InputField(desc="The corresponding teaching assistant response.")
    pedagogical_rubric = dspy.InputField(desc="The pedagogical rubric you should follow when evaluating the response.")
    discussion_topic = dspy.InputField(desc="Description and instruction for the specific discussion topic for students.")
    course_info = dspy.InputField(desc="Information about the course.")

    # Output: constrained to the integers 0, 1, 2 or the string "NA".
    rating: Literal[0, 1, 2, "NA"] = dspy.OutputField()

class ComplexSelfReflector(dspy.Module):
    """dspy module that rates a TA response via a chain-of-thought classifier.

    Despite the name, ``forward`` performs a single ``SimpleClassifier`` call
    and returns only the resulting ``rating``; there is no second reflection
    pass in this class.
    """

    def __init__(self, callbacks=None):
        super().__init__(callbacks)
        self.classifier = dspy.ChainOfThought(SimpleClassifier)

    def forward(self, discussion_forum_post, ta_response, pedagogical_rubric, discussion_topic, course_info):
        """Classify one (post, response) pair.

        All five arguments are forwarded unchanged, by keyword, to the wrapped
        classifier.

        Returns:
            dspy.Prediction carrying a single field, ``rating``.
        """
        classifier_inputs = {
            "discussion_forum_post": discussion_forum_post,
            "ta_response": ta_response,
            "pedagogical_rubric": pedagogical_rubric,
            "discussion_topic": discussion_topic,
            "course_info": course_info,
        }
        result = self.classifier(**classifier_inputs)
        # Only the rating is passed on; any other fields on `result` are dropped.
        return dspy.Prediction(rating=result.rating)

# Unoptimized program instance: evaluated as-is further down and also handed to
# the optimizer's compile() call as the starting point.
classifier = ComplexSelfReflector()

In [19]:
import random

import numpy as np

# Seed both the stdlib and the legacy numpy global RNG so the draws below are
# reproducible on a fresh kernel.
random.seed(42)
np.random.seed(42)

# Two consecutive, independent draws of 150 distinct positions out of 300.
# The draw order matters: selector_1 is drawn first, selector_2 second.
selector_1, selector_2 = (
    np.random.choice(300, size=150, replace=False) for _ in range(2)
)

print(selector_1.size, selector_2.size)

150 150


In [None]:
# Rubric text for the single level evaluated in this notebook. The context
# manager closes the file handle once the text has been read.
with open("rubrics/level_1.txt") as rubric_file:
    LEVEL_1 = rubric_file.read()

# (annotation column, rubric text) pairs; each pair yields one example per row
# that has a value in that column.
RUBRIC_COLUMNS = [
    ("Clarify misunderstandings and address questions", LEVEL_1),
]

# Fields marked as model inputs; "rating" and "level" stay as labels/metadata.
INPUT_FIELDS = (
    "discussion_forum_post",
    "ta_response",
    "pedagogical_rubric",
    "discussion_topic",
    "course_info",
)


def _collect_examples(frame, response_col, val_positions):
    """Build dspy examples from `frame` and split them into train and validation.

    Args:
        frame: DataFrame with the post, course, topic and annotation columns
            read below.
        response_col: name of the column holding the TA response text.
        val_positions: iterable of 0-based row positions that go to the
            validation split; every other row goes to train.

    Returns:
        (train_examples, val_examples) as two lists, in row order.
    """
    # Set lookup instead of scanning the whole selector array for every row.
    held_out = {int(position) for position in val_positions}
    train_examples, val_examples = [], []

    # Use the row position, not the index label, so rows without an annotation
    # still advance the position counter.
    for position, (_, row) in enumerate(frame.iterrows()):
        for col, desc in RUBRIC_COLUMNS:
            if pandas.isna(row[col]):
                continue  # no annotation for this level -> no example
            if pandas.notna(row["course_desc"]):
                course_info = row["course_title"] + "\n\n" + row["course_desc"]
            else:
                course_info = row["course_title"]
            example = dspy.Example({
                "discussion_forum_post": row["discussion_post_content_clean"],
                "ta_response": row[response_col],
                "pedagogical_rubric": desc,
                "discussion_topic": row["discussion_topic"],
                "course_info": course_info,
                "rating": int(row[col]),
                "level": col,
            }).with_inputs(*INPUT_FIELDS)
            if position in held_out:
                val_examples.append(example)
            else:
                train_examples.append(example)
    return train_examples, val_examples


all_train_examples = []
all_val_examples = []

# NOTE(review): selector_1 and selector_2 are independent draws. If both frames
# hold the same posts in the same order, a post can be training data in one
# frame and validation data in the other -- confirm this overlap is intended.
new_train, new_val = _collect_examples(df_data, "llama_70b_ta_response", selector_1)
all_train_examples.extend(new_train)
all_val_examples.extend(new_val)

print(len(all_train_examples), len(all_val_examples))

new_train, new_val = _collect_examples(
    df_topic_ann, "llama_70b_ta_response_with_similar", selector_2
)
all_train_examples.extend(new_train)
all_val_examples.extend(new_val)

# `train` / `val` are aliases of the accumulated lists, not copies.
train = all_train_examples
val = all_val_examples
print(len(train), len(val))

150 150
293 296


In [21]:
import threading

# Per-rubric-level stores filled as a side effect of every acc_metric call.
# Index i of each list under a given level refers to the same call.
all_gold_labels, all_pred_labels = {}, {}
all_examples = {}

# The metric is handed to evaluators that are configured to run several worker
# threads. Without this lock, two calls could interleave their appends and the
# gold / pred / example lists would stop lining up index-for-index.
_labels_lock = threading.Lock()


def acc_metric(gold, pred, trace=None):
    """Exact-match metric that also records every comparison it makes.

    Args:
        gold: object exposing `rating`, `level`, `discussion_forum_post` and
            `ta_response`.
        pred: object exposing `rating`.
        trace: when not None, the result is returned as a bool instead of int.

    Returns:
        1/0 (or True/False when `trace` is not None) for whether the ratings
        match after both have been converted to strings.

    Side effects:
        Appends the gold rating, the predicted rating and the
        [post, response] pair to the module-level stores under `gold.level`.
    """
    gold_rating = str(gold.rating)
    pred_rating = str(pred.rating)
    # A gold rating of 3 is compared as the string "NA".
    if gold_rating == "3":
        gold_rating = "NA"

    level = gold.level
    with _labels_lock:
        # setdefault on each store keeps the three dicts in sync even if only
        # some of them were reset between runs.
        all_gold_labels.setdefault(level, []).append(gold_rating)
        all_pred_labels.setdefault(level, []).append(pred_rating)
        all_examples.setdefault(level, []).append([gold.discussion_forum_post,
                                                   gold.ta_response])

    fin_score = int(gold_rating == pred_rating)
    if trace is not None:
        return fin_score == 1
    return fin_score

In [None]:
import os
import dotenv
# Read environment variables (e.g. API credentials) from the local .env file.
dotenv.load_dotenv(".env")

lm = dspy.LM("openai/gpt-4o", cache=True, max_tokens=4000)
dspy.configure(lm=lm)

# acc_metric appends to these stores on every call; start from empty ones so
# re-running this cell does not double-count earlier runs.
all_gold_labels, all_pred_labels, all_examples = {}, {}, {}

# Score the unoptimized classifier on the validation split.
# The worker-count keyword is `num_threads`; the former spelling `nsum_threads`
# is not a dspy.Evaluate parameter, so the intended value of 8 was never set.
evaluate = dspy.Evaluate(metric=acc_metric, devset=val, num_threads=8, display_progress=True, display_table=5, max_errors=100, provide_traceback=True)
eval_score = evaluate(classifier)

In [23]:
# Number of rubric levels recorded in each of the three stores (should agree).
tuple(len(store) for store in (all_gold_labels, all_pred_labels, all_examples))

(1, 1, 1)

In [24]:
# Rubric levels for which acc_metric has recorded at least one comparison.
all_gold_labels.keys()

dict_keys(['Clarify misunderstandings and address questions'])

In [25]:
from sklearn.metrics import precision_recall_fscore_support

# Support-weighted precision / recall / F1 for every rubric level, computed
# from the labels recorded by acc_metric.
for level, gold in all_gold_labels.items():
    print(level)
    scores = precision_recall_fscore_support(
        y_true=gold,
        y_pred=all_pred_labels[level],
        average="weighted",
    )
    print(scores)

Clarify misunderstandings and address questions
(0.8397398234354756, 0.875, 0.8562545291070613, None)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from collections import Counter

# Fixed label order so the matrix axes stay the same even when a class is
# absent from the recorded labels.
rating_labels = ["0", "1", "2", "NA"]

for level in all_gold_labels:
    gold, predicted = all_gold_labels[level], all_pred_labels[level]

    # Text summary: level name, gold class distribution, weighted P/R/F1.
    print(level)
    print(Counter(gold))
    print(precision_recall_fscore_support(y_true=gold,
                                          y_pred=predicted,
                                          average="weighted"))

    # Confusion matrix: rows are gold labels, columns are predictions.
    matrix = confusion_matrix(gold, predicted, labels=rating_labels)
    ConfusionMatrixDisplay(confusion_matrix=matrix,
                           display_labels=rating_labels).plot()
    plt.show()

In [None]:
# Optimize the classifier with SIMBA on the training split, score the result on
# the validation split, and save the optimized program to disk.
# NOTE(review): if a ValueError is raised before `compiled_prompt_opt` is
# assigned, it is only printed; cells that use `compiled_prompt_opt` afterwards
# will then fail with a NameError.
try:
    # NOTE(review): `kwargs` is not used anywhere in this cell; kept only so the
    # name stays defined for any code that may rely on it.
    kwargs = dict(num_threads=8, display_progress=True, display_table=0)

    teleprompter = dspy.SIMBA(metric=acc_metric, max_demos=10)

    # acc_metric records every call in the global label stores, so any metric
    # calls made while optimizing land there too. The final evaluation cell
    # resets those stores before computing the reported scores.
    compiled_prompt_opt = teleprompter.compile(classifier, trainset=train)

    # The worker-count keyword is `num_threads`; the former spelling
    # `nsum_threads` is not a dspy.Evaluate parameter, so 8 was never applied.
    evaluate = dspy.Evaluate(metric=acc_metric, devset=val, num_threads=8, display_progress=True, display_table=5, max_errors=100, provide_traceback=True)
    eval_score = evaluate(compiled_prompt_opt)
    print(eval_score)

    compiled_prompt_opt.save("optimized_prompt_4o_level_1.json")
except ValueError as e:
    print(e)

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Start from empty stores so the scores below cover only this evaluation run
# and none of the metric calls recorded earlier in the session.
all_gold_labels, all_pred_labels = {}, {}
all_examples = {}

# Re-score the optimized program on the validation split.
# The worker-count keyword is `num_threads`; the former spelling `nsum_threads`
# is not a dspy.Evaluate parameter, so the intended value of 8 was never set.
evaluate = dspy.Evaluate(metric=acc_metric, devset=val, num_threads=8, display_progress=True, display_table=5, max_errors=100, provide_traceback=True)
eval_score = evaluate(compiled_prompt_opt)

# Per-level report: weighted precision / recall / F1, then plain accuracy.
for k in all_gold_labels:
    print(k)
    print(precision_recall_fscore_support(y_true=all_gold_labels[k],
                                          y_pred=all_pred_labels[k],
                                          average="weighted"))
    print(accuracy_score(y_true=all_gold_labels[k], y_pred=all_pred_labels[k]))