In [1]:
import io
import re
import asyncio

from tqdm.auto import tqdm
import numpy as np
import pandas as pd
import jinja2  # Templating library (prompt templates)
from dotenv import load_dotenv
import openai
from pingouin import intraclass_corr

# Load environment variables from the repo-level .env file before building the client.
# openai.OpenAI() is constructed without explicit credentials, so it presumably relies
# on values supplied by that file (e.g. the API key) — verify against your .env.
load_dotenv(dotenv_path="../.env")
client = openai.OpenAI()

In [2]:
# Read the essay dataset (one row per essay, including the `stratified_split` column
# used later to select test essays) and display it as the cell output.
df = pd.read_csv(filepath_or_buffer="../data/ASAP2_new_IN_data_only_w_splits.csv")
df

Unnamed: 0,essay_id,score,full_text,set,pubpriv,assignment,prompt_name,economically_disadvantaged,student_disability_status,ell_status,race_ethnicity,gender,grade_level,essay_word_count,source,task,stratified_split
0,AAAOPP13416000019936,4,"Being a Seagoing Cowboy is really fun, but als...",test,private,"You have just read the article, 'A Cowboy Who ...","""A Cowboy Who Rode the Waves""",Economically disadvantaged,Not identified as having disability,No,Black/African American,M,6,538.0,MI,Text dependent,train
1,AAAOPP13416000055926,5,"To whom ever reads this at the end, you might ...",test,public,"You have just read the article, 'A Cowboy Who ...","""A Cowboy Who Rode the Waves""",Not economically disadvantaged,Not identified as having disability,No,Hispanic/Latino,F,6,367.0,MI,Text dependent,train
2,AAAOPP13416000005911,3,Have you ever wondered about what you might do...,train,0,"You have just read the article, 'A Cowboy Who ...","""A Cowboy Who Rode the Waves""",Economically disadvantaged,Not identified as having disability,No,Two or more races/Other,F,6,498.0,MI,Text dependent,train
3,AAAOPP13416000019800,3,I believe that you should join the Seagoing Co...,train,0,"You have just read the article, 'A Cowboy Who ...","""A Cowboy Who Rode the Waves""",Not economically disadvantaged,Not identified as having disability,No,Asian/Pacific Islander,F,6,317.0,MI,Text dependent,train
4,AAAOPP13416000019823,4,Would you like to participate in the Seagoing ...,train,0,"You have just read the article, 'A Cowboy Who ...","""A Cowboy Who Rode the Waves""",Not economically disadvantaged,Not identified as having disability,No,Asian/Pacific Islander,F,6,443.0,MI,Text dependent,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11821,AAAOPP13416000218424,1,We've been discovering things almost everyday....,test,private,You have read the article 'Unmasking the Face ...,The Face on Mars,Economically disadvantaged,Not identified as having disability,No,Black/African American,M,8,259.0,MI,Text dependent,train
11822,AAAOPP13416000254175,1,In 1976 the queation was and everyone was also...,train,0,You have read the article 'Unmasking the Face ...,The Face on Mars,Not economically disadvantaged,Not identified as having disability,No,Two or more races/Other,M,8,226.0,MI,Text dependent,test
11823,AAAOPP13416000260386,1,25 years ago there was a face discovered by na...,test,private,You have read the article 'Unmasking the Face ...,The Face on Mars,Not economically disadvantaged,Not identified as having disability,No,Black/African American,M,8,163.0,MI,Text dependent,train
11824,AAAOPP13416000292300,1,The\n\nfirst thing that i should include in my...,train,0,You have read the article 'Unmasking the Face ...,The Face on Mars,Not economically disadvantaged,Not identified as having disability,No,Black/African American,F,8,292.0,MI,Text dependent,dev


## Prompt Template

In [3]:
# Templates are looked up relative to the local "prompt_templates" directory.
template_loader = jinja2.FileSystemLoader("prompt_templates")
environment = jinja2.Environment(loader=template_loader)

# The scoring prompt; rendered later with `assignment` and `full_text`.
template = environment.get_template("scoring_prompt.jinja2")

In [27]:
# Collect Datasets
# Upload the training and development JSONL files for fine-tuning. The order of the
# paths below determines the order in which the responses are unpacked afterwards.
dataset_paths = (
    "../bin/openai_finetune_train.jsonl",
    "../bin/openai_finetune_dev.jsonl",
)

upload_responses = []
for dataset_path in dataset_paths:
    with open(dataset_path, "rb") as file:
        upload_responses.append(client.files.create(file=file, purpose="fine-tune"))

train_response, dev_response = upload_responses

# The file IDs are what the fine-tuning job request refers to.
print(f"Training File ID: {train_response.id}")
print(f"Development File ID: {dev_response.id}")

Training File ID: file-SQqEvaWqamBmihQU2MFLYg
Development File ID: file-Sv1R7t31uvov8s5DSZBCG2


In [28]:
# Start job
# Describe the fine-tuning request: the uploaded train/dev files, the base model to
# fine-tune, and the suffix given to the resulting model name.
job_request = {
    "training_file": train_response.id,
    "validation_file": dev_response.id,
    "model": "gpt-4o-mini-2024-07-18",
    "suffix": "asap-scoring",
}
response = client.fine_tuning.jobs.create(**job_request)

# Keep the job ID so the job can be looked up again later.
job_id = response.id

print("Job ID:", job_id)
print("Status:", response.status)

Job ID: ftjob-MRf8vjZABAws2x41IdEqi3TC
Status: validating_files


In [47]:
# Check finetuning job status
# Fetch the current state of the fine-tuning job by its ID; the returned job object
# (status, fine_tuned_model, hyperparameters, ...) is shown as the cell output.
client.fine_tuning.jobs.retrieve(fine_tuning_job_id="ftjob-MRf8vjZABAws2x41IdEqi3TC")

FineTuningJob(id='ftjob-MRf8vjZABAws2x41IdEqi3TC', created_at=1741790190, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-mini-2024-07-18:ai-aloe-georgia-institute-of-technology:asap-scoring:BAJl0DXv', finished_at=1741798923, hyperparameters=Hyperparameters(batch_size=11, learning_rate_multiplier=1.8, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-T8buG9LSyviQmc1hjja0RRCV', result_files=['file-6rQYZsBTwZd4jGQJfXPeV3'], seed=259069256, status='succeeded', trained_tokens=24114963, training_file='file-SQqEvaWqamBmihQU2MFLYg', validation_file='file-Sv1R7t31uvov8s5DSZBCG2', estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=11, learning_rate_multiplier=1.8, n_epochs=3)), type='supervised'), user_provided_suffix='asap-scoring', metadata=None)

In [46]:
# Retrieve the fine tuned model id
# Look the job up by the ID captured when it was created (`job_id`) instead of taking
# the first entry of `jobs.list()`: the listing is not restricted to this notebook's
# job, so its first entry may belong to a different (or unfinished/failed) job.
job = client.fine_tuning.jobs.retrieve(job_id)
finetuned_model = job.fine_tuned_model

if finetuned_model is None:
    # No model name exists until the job has finished successfully; say why it is missing.
    print(f"Job {job.id} has no fine-tuned model yet (status: {job.status})")

print(finetuned_model)

ft:gpt-4o-mini-2024-07-18:ai-aloe-georgia-institute-of-technology:asap-scoring:BAJl0DXv


## Inference

In [9]:
# Configuration
RANDOM_SEED = 42
SAMPLE_SIZE = 500
REPETITIONS = 5
FINETUNED_MODEL = "ft:gpt-4o-mini-2024-07-18:ai-aloe-georgia-institute-of-technology:asap-scoring:BAJl0DXv"

# Sample texts: a seeded random draw from the test split, so the same essays are
# selected on every run.
test_frame = df[df["stratified_split"] == "test"]
sampled_texts = test_frame.sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)

# Store results with metadata: one record per (essay, repetition) pair.
reliability_results = []

print(f"Scoring {SAMPLE_SIZE} texts, {REPETITIONS} times each...")

for row in tqdm(sampled_texts.itertuples(), total=len(sampled_texts)):
    text_id = row.essay_id

    # The prompt depends only on the essay, so render it once per essay rather than
    # once per repetition.
    prompt = template.render(
        assignment=row.assignment,
        full_text=row.full_text,
    )

    # Score the same text REPETITIONS times with identical input, so the spread of
    # scores reflects the model's own run-to-run variation.
    for rep in range(REPETITIONS):
        response = client.chat.completions.create(
            model=FINETUNED_MODEL,
            messages=[{
                "role": "user",
                "content": prompt,
            }],
            temperature=1.0 # default
        )

        # Extract score from response (adjust based on your scoring format).
        # The reply is expected to be a bare integer; if it is not (or is missing),
        # fail with an error that identifies the essay and repetition instead of an
        # anonymous int() failure deep into the run.
        raw_content = response.choices[0].message.content
        try:
            score = int(raw_content)
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"Could not parse an integer score for essay {text_id} "
                f"(repetition {rep + 1}): {raw_content!r}"
            ) from exc

        # Store results for this text
        reliability_results.append({
            'text_id': text_id,
            'repetition': rep + 1,
            'score': score,
        })

# Convert to DataFrame for analysis
reliability_df = pd.DataFrame(reliability_results)
reliability_df

Scoring 500 texts, 5 times each...


  0%|          | 0/500 [00:00<?, ?it/s]

Unnamed: 0,text_id,repetition,score
0,AAAOPP13416000004289,1,2
1,AAAOPP13416000004289,2,2
2,AAAOPP13416000004289,3,2
3,AAAOPP13416000004289,4,3
4,AAAOPP13416000004289,5,3
...,...,...,...
2495,AAAVUP14319000014481,1,5
2496,AAAVUP14319000014481,2,5
2497,AAAVUP14319000014481,3,4
2498,AAAVUP14319000014481,4,4


## Calculate ICC(2,1)

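ICC(2,1) — the two-way random-effects, absolute-agreement, single-measurement intraclass correlation — is used here as a measure of test-retest reliability (the model's "internal consistency"): how closely repeated scorings of the same essay agree with each other. If the model gives the same score to the same essay every time, then ICC(2,1) is 1.0.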

INTERPRETATION
  * \>= 0.90: "Excellent internal reliability"
  * \>= 0.75: "Good internal reliability"
  * \>= 0.50: "Moderate internal reliability"
  * < 0.50: "Poor internal reliability"

In [14]:
# Prepare data for ICC calculation: long format with one row per (essay, repetition).
# Each essay is a rating target and each repetition number plays the role of a rater.
icc_data = reliability_df[['text_id', 'repetition', 'score']].copy()
icc_data.columns = ['targets', 'raters', 'ratings']

# Calculate ICC (returns one row per ICC type)
icc_result = intraclass_corr(data=icc_data, targets='targets',
                             raters='raters', ratings='ratings')

# Select each ICC row once instead of re-filtering the table for every value.
icc2_row = icc_result[icc_result['Type'] == 'ICC2'].iloc[0]
icc2k_row = icc_result[icc_result['Type'] == 'ICC2k'].iloc[0]

# Extract ICC(2,1) - two-way random effects, single measurement, absolute agreement
icc_21 = icc2_row['ICC']
icc_21_ci_lower, icc_21_ci_upper = icc2_row['CI95%'][0], icc2_row['CI95%'][1]

# Extract ICC(2,k) - two-way random effects, average of k measurements, absolute agreement
# Just calculating this out of curiosity. It tells us what the expected reliability is if we take the average of all scores
# Rather than the reliability of each individual score
icc_2k = icc2k_row['ICC']
icc_2k_ci_lower, icc_2k_ci_upper = icc2k_row['CI95%'][0], icc2k_row['CI95%'][1]

# Display results
print("\n=== INTERNAL RELIABILITY RESULTS ===")
print(f"Sample size: {SAMPLE_SIZE} texts")
print(f"Repetitions per text: {REPETITIONS}")
print(f"ICC(2,1) = {icc_21:.3f} [{icc_21_ci_lower:.3f}, {icc_21_ci_upper:.3f}]")
print(f"ICC(2,k) = {icc_2k:.3f} [{icc_2k_ci_lower:.3f}, {icc_2k_ci_upper:.3f}]")

# Save results, and report the path that was actually written (the message previously
# named a different file than the one passed to to_csv).
# NOTE(review): the file name says "o3_mini" although this notebook scores with a
# fine-tuned gpt-4o-mini model — confirm nothing reads this path before renaming it.
results_path = '../results/finetuned_o3_mini_reliability_results.csv'
reliability_df.to_csv(results_path, index=False)
print(f"\n\nResults saved to: {results_path}")


=== INTERNAL RELIABILITY RESULTS ===
Sample size: 500 texts
Repetitions per text: 5
ICC(2,1) = 0.845 [0.830, 0.860]
ICC(2,k) = 0.965 [0.960, 0.970]


Results saved to: gpt_reliability_results.csv


In [106]:
# Pull the text of the first choice out of every chat completion.
# NOTE(review): `responses` is not defined in the visible cells — presumably the output
# of a batch-inference cell over the test split; verify before re-running.
response_contents = [completion.choices[0].message.content for completion in responses]

# Each reply must be a bare integer score; int() raises on anything else.
scores_4o = list(map(int, response_contents))

In [107]:
# Attach the fine-tuned model's scores to the test-split rows, then display those rows.
# The list is assigned positionally, so this assumes `scores_4o` is ordered like the
# test rows of `df` — verify against the cell that produced `responses`.
is_test_row = df["stratified_split"].eq("test")
df.loc[is_test_row, "4o_finetuned_score"] = scores_4o
df[is_test_row]

Unnamed: 0,essay_id,score,full_text,set,pubpriv,assignment,prompt_name,economically_disadvantaged,student_disability_status,ell_status,race_ethnicity,gender,grade_level,essay_word_count,source,task,stratified_split,4o_finetuned_score
4,AAAOPP13416000019823,4,Would you like to participate in the Seagoing ...,train,0,"You have just read the article, 'A Cowboy Who ...","""A Cowboy Who Rode the Waves""",Not economically disadvantaged,Not identified as having disability,No,Asian/Pacific Islander,F,6,443.0,MI,Text dependent,test,4.0
6,AAAOPP13416000049976,2,"""The cattle-boat trips were an unbelieveable o...",train,0,"You have just read the article, 'A Cowboy Who ...","""A Cowboy Who Rode the Waves""",Economically disadvantaged,Not identified as having disability,No,White,F,6,477.0,MI,Text dependent,test,3.0
8,AAAOPP13416000072005,3,"*point of view Luke*\n\n""I just don't get why ...",test,private,"You have just read the article, 'A Cowboy Who ...","""A Cowboy Who Rode the Waves""",Economically disadvantaged,Not identified as having disability,No,Black/African American,F,6,374.0,MI,Text dependent,test,3.0
11,AAAOPP13416000008070,4,I strongly suggest that anyone should join the...,test,private,"You have just read the article, 'A Cowboy Who ...","""A Cowboy Who Rode the Waves""",Economically disadvantaged,Not identified as having disability,No,Two or more races/Other,F,6,389.0,MI,Text dependent,test,4.0
14,AAAOPP13416000055458,4,Join the Seagoing Cowboys program!\n\nThe Seag...,test,private,"You have just read the article, 'A Cowboy Who ...","""A Cowboy Who Rode the Waves""",Economically disadvantaged,Identified as having disability,No,White,F,6,497.0,MI,Text dependent,test,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11808,AAAOPP13416000111581,1,The Face wasn't made by aliens. The Face is ju...,test,public,You have read the article 'Unmasking the Face ...,The Face on Mars,Economically disadvantaged,Not identified as having disability,No,White,M,8,165.0,MI,Text dependent,test,2.0
11811,AAAOPP13416000116879,1,If anything this could all be a misunderstandi...,test,public,You have read the article 'Unmasking the Face ...,The Face on Mars,Economically disadvantaged,Not identified as having disability,No,Black/African American,F,8,178.0,MI,Text dependent,test,2.0
11817,AAAOPP13416000148184,1,"Unmasking the Face on Mars\n\nIn May of 2001, ...",test,private,You have read the article 'Unmasking the Face ...,The Face on Mars,Economically disadvantaged,Not identified as having disability,No,Hispanic/Latino,F,8,257.0,MI,Text dependent,test,3.0
11818,AAAOPP13416000174761,1,I'm saying this matter of factly not out of ru...,test,public,You have read the article 'Unmasking the Face ...,The Face on Mars,Economically disadvantaged,Not identified as having disability,No,White,M,8,284.0,MI,Text dependent,test,2.0
