# Import & Setup

In [None]:
# Install required packages
!pip install pandas numpy matplotlib seaborn scikit-learn openai

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import cohen_kappa_score, confusion_matrix
import re
from scipy import stats


In [None]:
!pip install fitz
!pip install pymupdf
!pip install PyPDF2
!pip install pingouin

import PyPDF2

from dotenv import load_dotenv
load_dotenv()


API_KEY_SENTINEL=os.getenv("OPENAI_API_KEY")

In [None]:
import fitz  # PyMuPDF
import pandas as pd
import re

# === 1. Load and extract essays from PDF ===
pdf_path = "essay_grading_content/essays_for_grading_Ray Zhou.pdf"

# Open the document in a context manager so the file handle is released even
# if text extraction fails, and join the page texts once instead of growing a
# string inside the loop.
with fitz.open(pdf_path) as doc:
    full_text = "".join(page.get_text("text") + "\n" for page in doc)

# Split essays using "ID:" markers (the zero-width lookahead keeps each marker
# at the start of its chunk).
essays_raw = re.split(r"(?=ID: )", full_text)
essays = []
for chunk in essays_raw:
    # Text before the first marker does not start with "ID:" and is skipped.
    if not chunk.strip().startswith("ID:"):
        continue

    # Extract ID
    m_id = re.search(r"ID:\s*([a-zA-Z0-9]+)", chunk)
    if not m_id:
        continue
    essay_id = m_id.group(1).strip()

    # Extract prompt (text between "PROMPT:" and the first "---")
    m_prompt = re.search(r"PROMPT:(.*?)---", chunk, re.S)
    prompt = m_prompt.group(1).strip() if m_prompt else ""

    # Extract essay text (everything after the first "---")
    m_essay = re.search(r"---(.*)", chunk, re.S)
    essay_text = m_essay.group(1).strip() if m_essay else ""

    essays.append({
        "ID": essay_id,
        "Prompt": prompt,
        "Essay": essay_text
    })

# Explicit columns keep the frame mergeable on "ID" even when nothing parsed.
essays_df = pd.DataFrame(essays, columns=["ID", "Prompt", "Essay"])
print(f"✅ Parsed {len(essays_df)} essays from PDF")

# An empty essay means the "---" separator was not found in that chunk; surface
# it here rather than sending an empty essay to the grader later.
n_empty_essays = int((essays_df["Essay"] == "").sum())
if n_empty_essays:
    print(f"⚠️ {n_empty_essays} parsed essays have empty text (no '---' separator found)")

# === 2. Load grades CSV ===
grades_path = "essay_grading_content/essay_grades.csv"
grades_df = pd.read_csv(grades_path)

# Clean column names
grades_df.columns = grades_df.columns.str.strip()

# Rename Prolific Id → ID (rename ignores a missing key, so no guard is needed)
grades_df = grades_df.rename(columns={"Prolific Id": "ID"})

# IDs parsed from the PDF are strings; make the CSV side comparable so the
# merge does not fail or miss rows because of dtype or stray whitespace.
grades_df["ID"] = grades_df["ID"].astype(str).str.strip()

print("✅ Loaded grades file with columns:", grades_df.columns.tolist())

# === 3. Merge essays + grades ===
# Report IDs that exist on only one side, then keep only rows that have BOTH an
# essay and grades: an outer merge would pass NaN essays / NaN scores into the
# train/test split, the grading calls and the agreement metrics downstream.
essay_ids = set(essays_df["ID"])
grade_ids = set(grades_df["ID"])
ids_without_grades = sorted(essay_ids - grade_ids)
ids_without_essay = sorted(grade_ids - essay_ids)
if ids_without_grades:
    print(f"⚠️ {len(ids_without_grades)} essays have no grades and are dropped: {ids_without_grades}")
if ids_without_essay:
    print(f"⚠️ {len(ids_without_essay)} graded IDs have no essay and are dropped: {ids_without_essay}")

merged_df = pd.merge(essays_df, grades_df, on="ID", how="inner")
print(f"✅ Merged dataset has {len(merged_df)} rows")

# === 4. Preview ===
merged_df.head()


In [None]:
# Collect the extracted text of every rubric page into one string, with a
# newline appended after each page.
rubric_text = ""
with open("essay_grading_content/Band-Descriptors-Task-2.pdf", "rb") as f:  # "rb" = read binary
    # Pages are read from the open file handle, so extraction stays inside `with`.
    rubric_text = "".join(
        page.extract_text() + "\n" for page in PyPDF2.PdfReader(f).pages
    )

# Preview only the first 1000 characters.
print(rubric_text[:1000])

In [None]:
from sklearn.model_selection import train_test_split

# An integer test_size holds out that many rows (20) rather than a fraction;
# the fixed random_state makes the split repeatable.
examples_df, test_df = train_test_split(
    merged_df,
    random_state=42,
    test_size=20,
)
print(f"Examples: {len(examples_df)}, Test set: {len(test_df)}")


def make_examples(df, n=30):
    """Render the first ``n`` rows of ``df`` as few-shot examples for the prompt.

    Each row contributes its ``Essay`` text followed by its three scores
    (``Quality of argument``, ``Coherence and cohesion``,
    ``Lexical resource and grammar``) and a ``---`` separator line.

    Returns the rendered examples concatenated into one string; the string is
    empty when no rows are selected.
    """
    def _render(row):
        # The template's indentation is part of the emitted text.
        return f"""
        Essay:
        {row['Essay']}

        Scores:
        - Quality of argument: {row['Quality of argument']}
        - Coherence and cohesion: {row['Coherence and cohesion']}
        - Lexical resource and grammar: {row['Lexical resource and grammar']}

        ---
        """

    return "".join(_render(row) for _, row in df.head(n).iterrows())

# Render up to 30 graded examples from the example split for the prompt.
examples = make_examples(df=examples_df, n=30)
# Show only the first 1500 characters as a sanity check.
print(examples[:1500])  # preview

In [None]:
# The template is built in two stages: this f-string inserts the rubric and the
# graded examples now, and grade_essay() later calls
# `prompt_template.format(essay_text=...)`. Any literal "{" or "}" inside the
# rubric or the example essays would be read by `.format()` as a replacement
# field (raising KeyError/ValueError/IndexError or silently altering the text),
# so braces in the inserted content are doubled here; `.format()` turns them
# back into single braces.
_rubric_for_template = rubric_text.replace("{", "{{").replace("}", "}}")
_examples_for_template = examples.replace("{", "{{").replace("}", "}}")

# NOTE(review): the prompt says the rubric is "truncated", but the rubric text
# is inserted here as-is — confirm whether truncation was intended.
prompt_template = f"""
You are an expert IELTS examiner. Your task is to evaluate student essays according to the IELTS Task 2 rubric.

You will give **three separate scores (0–9)**:
1. Quality of argument – how well the student develops and supports their ideas.
2. Coherence and cohesion – how logically the essay flows and how ideas are connected.
3. Lexical resource and grammar – vocabulary range, accuracy, and grammar use.

Rubric (truncated to stay within context length):
{_rubric_for_template}

Here are 30 graded examples for reference:
{_examples_for_template}

Now grade the following essay: {{essay_text}}



Assign scores that reflect the standards from the examples. Align with the reference grader.

1. Quality of argument: Look for weak reasoning, insufficient support, or underdeveloped ideas.
   Avoid giving “safe” midrange scores unless they truly match the examples. Align with the reference grader.

2. Coherence and cohesion: Penalize logical gaps, poor transitions, or unclear organization.
   Align with the reference grader.

3. Lexical resource and grammar: Evaluate normally based on the rubric.
   Align with the reference grader.


Output format:
Quality of argument: X
Coherence and cohesion: Y
Lexical resource and grammar: Z
"""


from openai import OpenAI
import os, re

# os.environ only accepts string values, so a missing key (None) would fail
# here with an unrelated-looking TypeError. Fail fast with an actionable message.
if not API_KEY_SENTINEL:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file "
        "before creating the OpenAI client."
    )
os.environ["OPENAI_API_KEY"] = API_KEY_SENTINEL  # keep safe!
client = OpenAI()

def grade_essay(essay_text):
    """Send one essay to the model and return the reply text.

    The essay is substituted into ``prompt_template`` and sent as a single
    user message through ``client.responses.create`` (model ``o4-mini`` with
    reasoning effort ``high``). Returns the response's ``output_text``.
    """
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "input_text",
                "text": prompt_template.format(essay_text=essay_text),
            }
        ],
    }
    response = client.responses.create(
        model="o4-mini",
        reasoning={"effort": "high"},
        input=[user_message],
    )
    return response.output_text

# Criterion names in the order the scores are returned.
_SCORE_LABELS = (
    "Quality of argument",
    "Coherence and cohesion",
    "Lexical resource and grammar",
)


def extract_scores(text):
    """Parse the three 0-9 scores out of a grader reply.

    The score written next to each criterion label is preferred (e.g.
    ``"Quality of argument: 6"``), so list numbering such as
    ``"1. Quality of argument: 6"`` is not mistaken for a score. If any label
    is missing, the first three standalone single digits in the text are used.

    Args:
        text: Reply text from the grader.

    Returns:
        ``[argument, coherence, lexical]`` as ints, or ``[None, None, None]``
        when ``text`` is not a string or fewer than three scores are found.
    """
    if not isinstance(text, str):
        return [None, None, None]

    labelled = []
    for label in _SCORE_LABELS:
        # Allow only separators / markdown emphasis between label and digit so
        # a number inside a following sentence is not picked up.
        match = re.search(
            rf"{re.escape(label)}[ \t:*_\-–—=]*([0-9])\b", text, re.IGNORECASE
        )
        if match is None:
            break
        labelled.append(int(match.group(1)))
    if len(labelled) == len(_SCORE_LABELS):
        return labelled

    # Fallback: first three standalone single digits (0–9) anywhere in the text.
    digits = re.findall(r"\b([0-9])\b", text)
    if len(digits) >= 3:
        return list(map(int, digits[:3]))
    return [None, None, None]

# Compare to Ground Truth

In [None]:
#################################################################################
#################################################################################
################# ########.   Grade Here    .######## ###########################
#################################################################################
#################################################################################

# Grade every held-out essay and keep the human scores, the parsed model
# scores and the raw model reply side by side.
results = []
for _, essay_row in test_df.iterrows():
    raw_reply = grade_essay(essay_row["Essay"])
    record = {
        "ID": essay_row["ID"],
        "Human": [
            essay_row[column]
            for column in (
                "Quality of argument",
                "Coherence and cohesion",
                "Lexical resource and grammar",
            )
        ],
        "GPT": extract_scores(raw_reply),
        "Raw": raw_reply,
    }
    results.append(record)

results_df = pd.DataFrame(results)

In [None]:
# Display the per-essay table: ID, human scores, parsed GPT scores and raw reply.
results_df

In [None]:
# Save DataFrame to CSV (no index column). The "Human" and "GPT" columns hold
# Python lists at this point, so they are written as their string form.
results_df.to_csv("output.csv", index=False)

In [None]:
# `result_df` is bound to the same DataFrame object as `results_df` (no copy is
# made), so columns added through either name below are visible through both.
result_df = results_df
print("DataFrame info:")
print(f"Shape: {result_df.shape}")
print("\nColumns:", result_df.columns.tolist())
print("\nFirst few rows:")
print(result_df.head())


import pandas as pd

# Expand Human and GPT score lists into one column per criterion
_suffixes = ["Arg", "Coh", "Lex"]
for _rater in ("Human", "GPT"):
    _rater_cols = [f"{_rater}_{_suffix}" for _suffix in _suffixes]
    results_df[_rater_cols] = pd.DataFrame(results_df[_rater].tolist(), index=results_df.index)

# Signed differences are GPT minus Human (positive = GPT scored higher)
for _suffix in _suffixes:
    results_df[f"Diff_{_suffix}"] = results_df[f"GPT_{_suffix}"] - results_df[f"Human_{_suffix}"]

# Absolute differences
for _suffix in _suffixes:
    results_df[f"AbsDiff_{_suffix}"] = results_df[f"Diff_{_suffix}"].abs()

# Summary metrics: per-criterion mean absolute difference plus the mean of those means
summary = {
    f"Mean Abs Diff ({_suffix})": results_df[f"AbsDiff_{_suffix}"].mean()
    for _suffix in _suffixes
}
summary["Overall Mean Abs Diff"] = results_df[[f"AbsDiff_{_suffix}" for _suffix in _suffixes]].mean().mean()

print("=== Score Differences Summary ===")
for metric_name, metric_value in summary.items():
    print(f"{metric_name}: {metric_value:.2f}")

# Show sample rows
print("\n=== Sample Comparison ===")
_sample_cols = ["ID"] + [
    f"{_prefix}_{_suffix}"
    for _suffix in _suffixes
    for _prefix in ("Human", "GPT", "Diff")
]
print(results_df[_sample_cols].head(10))


import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr
import pingouin as pg

# === 1. Scatterplots ===
# (human column, GPT column, panel title) for each scoring criterion
dims = [("Human_Arg","GPT_Arg","Quality of Argument"),
        ("Human_Coh","GPT_Coh","Coherence & Cohesion"),
        ("Human_Lex","GPT_Lex","Lexical & Grammar")]
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for ax, (h, g, label) in zip(axes, dims):
    sns.scatterplot(x=results_df[h], y=results_df[g], ax=ax)
    # Points on the dashed diagonal are exact human/GPT agreement
    ax.plot([0,9],[0,9], 'r--')  # 1:1 line
    ax.set(title=label, xlabel="Human", ylabel="GPT")

plt.tight_layout()
plt.show()

# === 2. Correlations ===
# Pearson and Spearman correlation between human and GPT scores, per criterion.
for h, g, label in dims:
    human_col, gpt_col = results_df[h], results_df[g]
    pearson = pearsonr(human_col, gpt_col)
    spearman = spearmanr(human_col, gpt_col)
    pearson_r, pearson_p = pearson[0], pearson[1]
    spearman_rho, spearman_p = spearman[0], spearman[1]
    print(f"\n{label}")
    print(f"  Pearson r: {pearson_r:.3f} (p={pearson_p:.3g})")
    print(f"  Spearman rho: {spearman_rho:.3f} (p={spearman_p:.3g})")

# === 3. Intraclass Correlation Coefficient (ICC) ===
# Long form: one row per (essay, rater, criterion) with columns
# ID, Rater, Score, Dimension. Row order: per essay, Human then GPT for each
# criterion in the order Argument, Coherence, Lexical.
icc_data = [
    [row["ID"], rater, row[f"{rater}_{suffix}"], dimension]
    for idx, row in results_df.iterrows()
    for suffix, dimension in (("Arg", "Argument"), ("Coh", "Coherence"), ("Lex", "Lexical"))
    for rater in ("Human", "GPT")
]

icc_df = pd.DataFrame(icc_data, columns=["ID","Rater","Score","Dimension"])

# Run ICC separately for each dimension
for dim in icc_df["Dimension"].unique():
    sub = icc_df.loc[icc_df["Dimension"] == dim]
    icc = pg.intraclass_corr(data=sub, targets="ID", raters="Rater", ratings="Score")
    print(f"\nICC for {dim}:\n", icc[["Type","ICC","CI95%","pval"]])



import matplotlib.pyplot as plt
import seaborn as sns

# Unpack the score lists into one column per rater and criterion
for _rater in ("Human", "GPT"):
    for _position, _suffix in enumerate(("Arg", "Coh", "Lex")):
        # Bind the position as a default so each lambda keeps its own index
        results_df[f"{_rater}_{_suffix}"] = results_df[_rater].apply(
            lambda scores, _position=_position: scores[_position]
        )

# (human column, GPT column, panel title) for each scoring criterion
dims = [("Human_Arg","GPT_Arg","Quality of Argument"),
        ("Human_Coh","GPT_Coh","Coherence & Cohesion"),
        ("Human_Lex","GPT_Lex","Lexical & Grammar")]

# One panel per criterion, all on the same 0-9 integer grid
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
score_ticks = range(0,10)

for ax, (h, g, label) in zip(axes, dims):
    sns.scatterplot(x=results_df[h], y=results_df[g], ax=ax)
    ax.plot([0,9],[0,9], 'r--')  # 1:1 reference line
    ax.set_title(label)
    ax.set_xlabel("Human Score")
    ax.set_ylabel("GPT Score")
    ax.set_xticks(score_ticks)
    ax.set_yticks(score_ticks)
    ax.grid(True)

plt.tight_layout()
plt.show()




from sklearn.metrics import cohen_kappa_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# This section previously appeared twice, verbatim, so every kappa value was
# printed twice and every heatmap drawn twice. It now runs once.

# === Cohen's kappa (unweighted: only exact matches count as agreement) ===
print("Cohen's Kappa:")
print("  Argument:", cohen_kappa_score(results_df["Human_Arg"], results_df["GPT_Arg"]))
print("  Cohesion:", cohen_kappa_score(results_df["Human_Coh"], results_df["GPT_Coh"]))
print("  Lexical :", cohen_kappa_score(results_df["Human_Lex"], results_df["GPT_Lex"]))

# === Confusion Matrix Heatmaps ===
# (human column, GPT column, plot title) for each scoring criterion
dims = [("Human_Arg","GPT_Arg","Quality of Argument"),
        ("Human_Coh","GPT_Coh","Coherence & Cohesion"),
        ("Human_Lex","GPT_Lex","Lexical & Grammar")]

for h,g,label in dims:
    # Fixed labels 0-9 give every matrix the same 10x10 layout, including
    # scores that never occur.
    cm = confusion_matrix(results_df[h], results_df[g], labels=range(0,10))
    plt.figure(figsize=(7,5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=range(10), yticklabels=range(10))
    plt.title(f"Confusion Matrix: {label}")
    plt.xlabel("GPT Score")
    plt.ylabel("Human Score")
    plt.show()


# Extract each dimension from the raw score lists
human_arg     = results_df["Human"].apply(lambda x: x[0])
human_cohes   = results_df["Human"].apply(lambda x: x[1])
human_lexical = results_df["Human"].apply(lambda x: x[2])

gpt_arg     = results_df["GPT"].apply(lambda x: x[0])
gpt_cohes   = results_df["GPT"].apply(lambda x: x[1])
gpt_lexical = results_df["GPT"].apply(lambda x: x[2])

# Weighted kappas (the weighting scheme is passed via `weights`)
print("Weighted Kappa (quadratic):")
print("  Argument:", cohen_kappa_score(human_arg, gpt_arg, weights="quadratic"))
print("  Cohesion:", cohen_kappa_score(human_cohes, gpt_cohes, weights="quadratic"))
print("  Lexical :", cohen_kappa_score(human_lexical, gpt_lexical, weights="quadratic"))

print("\nWeighted Kappa (linear):")
print("  Argument:", cohen_kappa_score(human_arg, gpt_arg, weights="linear"))
print("  Cohesion:", cohen_kappa_score(human_cohes, gpt_cohes, weights="linear"))
print("  Lexical :", cohen_kappa_score(human_lexical, gpt_lexical, weights="linear"))



# Calculate absolute differences if not already present
if 'AbsDiff_Arg' not in result_df.columns:
    result_df['AbsDiff_Arg'] = abs(result_df['Diff_Arg'])
    result_df['AbsDiff_Coh'] = abs(result_df['Diff_Coh'])
    result_df['AbsDiff_Lex'] = abs(result_df['Diff_Lex'])

# Summary statistics
print("\n" + "="*60)
print("SUMMARY STATISTICS")
print("="*60)

# Column suffixes and their display names, in matching order
categories = ['Arg', 'Coh', 'Lex']
category_names = ['Quality of Argument', 'Coherence & Cohesion', 'Lexical Resource & Grammar']

for i, cat in enumerate(categories):
    human_col = f'Human_{cat}'
    gpt_col = f'GPT_{cat}'
    diff_col = f'Diff_{cat}'
    abs_diff_col = f'AbsDiff_{cat}'

    print(f"\n{category_names[i]}:")
    print(f"  Human - Mean: {result_df[human_col].mean():.2f}, Std: {result_df[human_col].std():.2f}, Range: {result_df[human_col].min()}-{result_df[human_col].max()}")
    print(f"  GPT   - Mean: {result_df[gpt_col].mean():.2f}, Std: {result_df[gpt_col].std():.2f}, Range: {result_df[gpt_col].min()}-{result_df[gpt_col].max()}")
    # The Diff_* columns are computed as GPT minus Human earlier in this
    # notebook, so the label must say so (it previously read "Human-GPT").
    print(f"  Mean difference (GPT-Human): {result_df[diff_col].mean():.2f}")
    print(f"  Mean absolute difference: {result_df[abs_diff_col].mean():.2f}")
    print(f"  Correlation: {result_df[human_col].corr(result_df[gpt_col]):.3f}")

# Share of score pairs that agree within a given tolerance
def calculate_agreement_with_tolerance(human_scores, gpt_scores, tolerance=0):
    """Return the fraction of pairs whose absolute difference is <= ``tolerance``.

    ``human_scores`` and ``gpt_scores`` are compared element by element after
    conversion to NumPy arrays; ``tolerance=0`` counts exact matches only.
    """
    gaps = np.abs(np.array(human_scores) - np.array(gpt_scores))
    within_tolerance = gaps <= tolerance
    return np.mean(within_tolerance)

# Function to calculate kappa where "agreement" means within a tolerance
def calculate_weighted_kappa_tolerance(human_scores, gpt_scores, tolerance=0):
    """Chance-corrected agreement where scores within ``tolerance`` count as agreeing.

    For ``tolerance == 0`` this is plain Cohen's kappa (delegated to
    ``cohen_kappa_score``). For ``tolerance > 0`` it computes
    ``(p_o - p_e) / (1 - p_e)`` where

    * ``p_o`` is the observed share of pairs with ``|human - gpt| <= tolerance``
    * ``p_e`` is the share expected by chance under the same rule, taken from
      the two raters' own score distributions: the sum of
      ``P(human = i) * P(gpt = j)`` over all ``|i - j| <= tolerance``.

    The earlier approximation used ``p_e = 1 / number_of_categories``, which is
    the chance of an exact match under uniform scoring and therefore does not
    correspond to the tolerance rule used for ``p_o``.

    Args:
        human_scores: Sequence of numeric human scores.
        gpt_scores: Sequence of numeric model scores, same length and order.
        tolerance: Largest absolute difference still counted as agreement.

    Returns:
        Kappa bounded to [-1, 1]. When chance agreement is already 1 the
        statistic is undefined; 1.0 is returned for perfect observed agreement
        and 0.0 otherwise.
    """
    if tolerance == 0:
        return cohen_kappa_score(human_scores, gpt_scores)

    human = np.asarray(human_scores, dtype=float)
    gpt = np.asarray(gpt_scores, dtype=float)

    # Observed agreement under the tolerance rule
    agreement_rate = float(np.mean(np.abs(human - gpt) <= tolerance))

    # Chance agreement from the marginal distributions of each rater
    human_values, human_counts = np.unique(human, return_counts=True)
    gpt_values, gpt_counts = np.unique(gpt, return_counts=True)
    within = np.abs(human_values[:, None] - gpt_values[None, :]) <= tolerance
    joint_by_chance = np.outer(human_counts / human.size, gpt_counts / gpt.size)
    expected_agreement = float((joint_by_chance * within).sum())

    if expected_agreement >= 1.0:
        return 1.0 if agreement_rate == 1.0 else 0.0

    kappa = (agreement_rate - expected_agreement) / (1 - expected_agreement)
    return max(-1, min(1, kappa))  # Bound between -1 and 1

# Calculate Cohen's Kappa for different tolerance levels
print("\n" + "="*60)
print("COHEN'S KAPPA ANALYSIS")
print("="*60)

tolerance_levels = [0, 1, 2, 3]
# kappa_results[category][tolerance] -> {'kappa': ..., 'agreement': ...}
kappa_results = {}

for cat, cat_name in zip(categories, category_names):
    human_scores = result_df[f'Human_{cat}'].values
    gpt_scores = result_df[f'GPT_{cat}'].values

    print(f"\n{cat_name}:")
    per_tolerance = {}

    for tol in tolerance_levels:
        # The helper itself returns plain Cohen's kappa when tol == 0, so no
        # separate branch is needed here.
        kappa = calculate_weighted_kappa_tolerance(human_scores, gpt_scores, tol)
        agreement = calculate_agreement_with_tolerance(human_scores, gpt_scores, tol)

        print(f"  Tolerance ±{tol}: κ = {kappa:.3f}, Agreement = {agreement:.3f} ({agreement*100:.1f}%)")
        per_tolerance[tol] = {'kappa': kappa, 'agreement': agreement}

    kappa_results[cat] = per_tolerance

# Print the kappa interpretation bands as a legend for the numbers above
_rule = "=" * 50
print("\n" + _rule)
print("KAPPA INTERPRETATION:")
print(_rule)
for _band in (
    "< 0.00: Poor agreement",
    "0.00-0.20: Slight agreement",
    "0.21-0.40: Fair agreement",
    "0.41-0.60: Moderate agreement",
    "0.61-0.80: Substantial agreement",
    "0.81-1.00: Almost perfect agreement",
):
    print(_band)

# Create visualizations
plt.style.use('default')
fig = plt.figure(figsize=(20, 15))

# 3x3 grid: scatter plots on top, score distributions in the middle and one
# full-width agreement bar chart at the bottom.
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

# Scatter plots comparing Human vs GPT scores (top row)
for i, (cat, cat_name, cat_color) in enumerate(zip(categories, category_names, colors)):
    ax = fig.add_subplot(gs[0, i])
    human_scores = result_df[f'Human_{cat}']
    gpt_scores = result_df[f'GPT_{cat}']

    ax.scatter(human_scores, gpt_scores, alpha=0.7, s=100, color=cat_color)

    # Diagonal (perfect agreement), drawn across the observed score range
    min_score = min(human_scores.min(), gpt_scores.min())
    max_score = max(human_scores.max(), gpt_scores.max())
    score_span = [min_score, max_score]
    ax.plot(score_span, score_span, 'k--', alpha=0.5, label='Perfect Agreement')

    # Degree-1 polynomial fit of GPT scores on human scores
    p = np.poly1d(np.polyfit(human_scores, gpt_scores, 1))
    ax.plot(human_scores, p(human_scores), "r-", alpha=0.7, label='Trend')

    correlation = np.corrcoef(human_scores, gpt_scores)[0,1]
    ax.set_xlabel('Human Scores', fontsize=12)
    ax.set_ylabel('GPT Scores', fontsize=12)
    ax.set_title(f'{cat_name}\n(r = {correlation:.3f})', fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.legend()

# Distribution plots (middle row)
for i, (cat, cat_name, cat_color) in enumerate(zip(categories, category_names, colors)):
    ax = fig.add_subplot(gs[1, i])

    human_scores = result_df[f'Human_{cat}']
    gpt_scores = result_df[f'GPT_{cat}']

    # Bin edges sit at half-integers across the observed score range
    min_score = min(human_scores.min(), gpt_scores.min())
    max_score = max(human_scores.max(), gpt_scores.max())
    bins = np.arange(min_score, max_score + 2) - 0.5

    for scores, rater_label, rater_color in (
        (human_scores, 'Human', cat_color),
        (gpt_scores, 'GPT', 'orange'),
    ):
        ax.hist(scores, bins=bins, alpha=0.6, label=rater_label, color=rater_color, density=True)

    ax.set_xlabel('Scores', fontsize=12)
    ax.set_ylabel('Density', fontsize=12)
    ax.set_title(f'{cat_name} Distribution', fontsize=12, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Agreement by tolerance (bottom row - spans all columns)
ax = fig.add_subplot(gs[2, :])

tolerance_range = range(len(tolerance_levels))
width = 0.25

for i, (cat, cat_name, cat_color) in enumerate(zip(categories, category_names, colors)):
    agreements = [kappa_results[cat][tol]['agreement'] for tol in tolerance_levels]
    # Shift each category's bars by one bar width so they sit side by side
    x_pos = np.array(tolerance_range) + i * width
    ax.bar(x_pos, agreements, width, label=f'{cat_name}', color=cat_color, alpha=0.7)

    # Add value labels on bars
    for x, agreement in zip(x_pos, agreements):
        ax.text(x, agreement + 0.01, f'{agreement:.2f}', ha='center', va='bottom', fontsize=10)

ax.set_xlabel('Tolerance Level (±points)', fontsize=14)
ax.set_ylabel('Agreement Rate', fontsize=14)
ax.set_title('Agreement Rate by Tolerance Level', fontsize=16, fontweight='bold')
# Put each tick under the middle bar of its group
ax.set_xticks(np.array(tolerance_range) + width)
ax.set_xticklabels([f'±{t}' for t in tolerance_levels])
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3, axis='y')
ax.set_ylim(0, 1.1)

plt.suptitle('Human vs GPT Scoring Analysis', fontsize=20, fontweight='bold')
plt.show()

# Statistical tests
print("\n" + "="*60)
print("STATISTICAL TESTS")
print("="*60)

# (upper p-value bound, message) pairs, checked from strictest to loosest
_significance_bands = (
    (0.001, "  → Highly significant difference (p < 0.001)"),
    (0.01, "  → Very significant difference (p < 0.01)"),
    (0.05, "  → Significant difference (p < 0.05)"),
)

for cat, cat_name in zip(categories, category_names):
    human_scores = result_df[f'Human_{cat}']
    gpt_scores = result_df[f'GPT_{cat}']

    # Paired t-test
    t_stat, p_value = stats.ttest_rel(human_scores, gpt_scores)

    # Wilcoxon signed-rank test (non-parametric alternative); a ValueError from
    # scipy is reported in the output instead of stopping the loop
    try:
        w_stat, w_p_value = stats.wilcoxon(human_scores, gpt_scores, zero_method='wilcox')
    except ValueError:
        wilcoxon_result = "Cannot compute (identical distributions)"
    else:
        wilcoxon_result = f"W = {w_stat:.1f}, p = {w_p_value:.4f}"

    print(f"\n{cat_name}:")
    print(f"  Paired t-test: t = {t_stat:.3f}, p = {p_value:.4f}")
    print(f"  Wilcoxon test: {wilcoxon_result}")

    # The verdict is based on the paired t-test p-value only
    for upper_bound, verdict in _significance_bands:
        if p_value < upper_bound:
            print(verdict)
            break
    else:
        print("  → No significant difference (p ≥ 0.05)")

# Detailed breakdown by score differences
print("\n" + "="*60)
print("SCORE DIFFERENCE BREAKDOWN")
print("="*60)

for i, cat in enumerate(categories):
    diff_col = f'Diff_{cat}'
    # The Diff_* columns are computed as GPT minus Human earlier in this
    # notebook, so a positive difference means GPT scored higher. The heading
    # and direction labels previously stated the opposite.
    print(f"\n{category_names[i]} - Score Differences (GPT - Human):")

    diff_counts = result_df[diff_col].value_counts().sort_index()
    total = len(result_df)

    for diff_val, count in diff_counts.items():
        percentage = (count / total) * 100
        direction = "GPT higher" if diff_val > 0 else "Human higher" if diff_val < 0 else "Equal"
        # The difference column can be float (e.g. when some scores are
        # missing), and the "d" format code rejects floats. Print whole numbers
        # as integers and anything else with one decimal.
        if float(diff_val).is_integer():
            diff_label = f"{int(diff_val):+2d}"
        else:
            diff_label = f"{diff_val:+.1f}"
        print(f"  Difference {diff_label}: {count:2d} cases ({percentage:5.1f}%) - {direction}")

# Create summary table
print("\n" + "="*80)
print("KAPPA SUMMARY TABLE")
print("="*80)

# One row per (category, tolerance); values are pre-formatted as strings
summary_data = [
    {
        'Category': cat_name,
        'Tolerance': f'±{tol}',
        'Kappa': f"{kappa_results[cat][tol]['kappa']:.3f}",
        'Agreement': f"{kappa_results[cat][tol]['agreement']:.3f}",
        'Agreement %': f"{kappa_results[cat][tol]['agreement']*100:.1f}%"
    }
    for cat, cat_name in zip(categories, category_names)
    for tol in tolerance_levels
]

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

# Confusion matrices for exact agreement (tolerance = 0)
print("\n" + "="*60)
print("CONFUSION MATRICES (Exact Agreement)")
print("="*60)

for cat, cat_name in zip(categories, category_names):
    human_scores = result_df[f'Human_{cat}']
    gpt_scores = result_df[f'GPT_{cat}']

    print(f"\n{cat_name}:")
    cm = confusion_matrix(human_scores, gpt_scores)

    # Label rows/columns with the sorted union of scores used by either rater
    unique_scores = sorted(set(human_scores).union(gpt_scores))
    row_labels = [f'H_{s}' for s in unique_scores]
    col_labels = [f'G_{s}' for s in unique_scores]

    # Rows are human scores (H_*), columns are GPT scores (G_*)
    cm_df = pd.DataFrame(cm, index=row_labels, columns=col_labels)
    print(cm_df)

print(f"\n\nAnalysis complete! Dataset contains {len(result_df)} essays.")
print("Key findings will be in the statistical tests and kappa values above.")

# Grade Real Essays

In [None]:
import pandas as pd
from dotenv import load_dotenv
load_dotenv()

from sqlalchemy import create_engine, inspect
import os

from urllib.parse import quote_plus

# Read the connection settings and fail with a clear message if any is unset;
# otherwise the literal text "None" would be formatted into the URL.
_db_settings = {
    name: os.getenv(name)
    for name in ("DB_USER", "DB_PASSWORD", "DB_HOST", "DB_PORT", "DB_NAME")
}
_missing_settings = [name for name, value in _db_settings.items() if value is None]
if _missing_settings:
    raise RuntimeError(
        "Missing database settings in environment/.env: " + ", ".join(_missing_settings)
    )

# build URL from the locally-forwarded port
user     = _db_settings["DB_USER"]
pw       = _db_settings["DB_PASSWORD"]
host     = _db_settings["DB_HOST"]
port     = _db_settings["DB_PORT"]
db       = _db_settings["DB_NAME"]

# Percent-encode the credentials so characters such as "@", ":" or "/" in the
# user name or password are not read as URL delimiters.
engine   = create_engine(
    f"postgresql://{quote_plus(user)}:{quote_plus(pw)}@{host}:{port}/{db}"
)

In [None]:
# Load every row and column of the text_snapshots table into memory; later
# cells filter this frame by participant_id, type and stage.
snapshots = pd.read_sql("SELECT * FROM text_snapshots;", engine)
snapshots.head()

## Get list of accepted participants

In [None]:
# One participant ID per line of the file; surrounding whitespace is stripped.
pid_list = []

with open("pid_accepted.txt", "r") as fle:
    pid_list = [line.strip() for line in fle]

# Quick check: how many IDs were read, and what the first one looks like
len(pid_list), pid_list[0]

## Function to get final essay

In [None]:
def get_final_essay(pid):
    """Return the final revision-stage essay text for one participant.

    Looks in the module-level ``snapshots`` frame for rows whose
    ``participant_id`` equals ``pid``, whose ``type`` is ``"final"`` and whose
    ``stage`` is ``"revision"``, and returns the ``text_content`` of the match.

    If several rows match, a warning is printed and the last matching row (in
    the frame's current row order) is used.

    Raises:
        IndexError: If no row matches. The message names the participant.
    """
    is_final_revision = (
        (snapshots["participant_id"] == pid)
        & (snapshots["type"] == "final")
        & (snapshots["stage"] == "revision")
    )
    filtered = snapshots.loc[is_final_revision]

    # Name the participant instead of surfacing the bare "indexer is
    # out-of-bounds" error from .iloc on an empty selection.
    if filtered.empty:
        raise IndexError(f"no final revision snapshot found for participant {pid!r}")

    if len(filtered) > 1:
        print(f"warning [{pid}]: more than one final submission")

    # Last matching row wins, as with drop_duplicates(keep="last").
    return filtered.iloc[-1]["text_content"]

# Try the lookup on one hard-coded participant ID and display the returned text.
get_final_essay("63e584009bf1aa55a39e1d53")

In [None]:
#################################################################################
#################################################################################
################# #  Import Essays to Get Grades    # ###########################
#################################################################################
#################################################################################


# Hand-written sample rows (essay_id, essay_text, human_band) kept inline as
# CSV text; they are written to disk and read back below.
fake_data = """essay_id,essay_text,human_band
1,"In modern society, technology plays an essential role. While some argue it isolates people, I believe it connects us more deeply by enabling communication across cultures.",7
2,"The graph shows cars go up. People buy more cars because economy is better. This is good.",5
3,"Some people think children should learn music at school, while others believe it wastes time. In my opinion, music education develops creativity and discipline that benefit students beyond the classroom.",8
4,"Nowadays, pollution is big problem. Government must do something fast to stop smoke. If not, people sick.",4
5,"It is often argued whether university education should be free. I strongly agree because access to education promotes equality and social progress.",7
6,"People are different in cities. More building, more traffic, more jobs. But village is quiet and good life.",5
7,"Although many believe social media harms young people, I contend it can foster valuable networks and learning opportunities if used responsibly.",8
8,"Internet is useful. Student use it. Sometime problem is addiction. That bad.",4
9,"Some claim space exploration wastes resources. However, it pushes science forward and inspires humanity to solve problems on Earth.",7
10,"The economy is important. People need job. If no job, then bad. Government help job is good.",3
"""

# Overwrites essays_with_scores.csv in the working directory on every run.
with open("essays_with_scores.csv", "w") as f:
    f.write(fake_data)

# Read the file back into a DataFrame. The .head() below is not the last
# expression of this cell, so its result is evaluated but not displayed.
essays_not_from_the_sets = pd.read_csv("essays_with_scores.csv", encoding="utf-8-sig")
essays_not_from_the_sets.head()


#################################################################################
#################################################################################
################# ########.   Grade Here    .######## ###########################
#################################################################################
#################################################################################

# Grade the final essay of every accepted participant.
# - Uses the notebook's module-level extract_scores() instead of re-defining an
#   identical copy inside the loop on every iteration.
# - A participant without a final essay is recorded with empty values and the
#   loop continues, so one missing submission does not discard the grades that
#   were already obtained for earlier participants.
results = []

for count, pid in enumerate(pid_list, start=1):
    print(f"[{count}] pid: {pid}")

    try:
        essay_text = get_final_essay(pid)
    except IndexError as err:
        # get_final_essay raises IndexError when no matching snapshot exists
        print(f"warning [{pid}]: no final essay found, skipping grading ({err})")
        essay_text = None
        quality, cohesion, grammar = None, None, None
    else:
        # Call your grading function
        graded_text = grade_essay(essay_text)

        # Extract the three scores ([None, None, None] if they cannot be parsed)
        quality, cohesion, grammar = extract_scores(graded_text)

    results.append({
        "participant_id": pid,
        "essay_text": essay_text,
        "Quality_of_argument": quality,
        "Coherence_and_cohesion": cohesion,
        "Lexical_resource_and_grammar": grammar
    })

# Convert to DataFrame
results_df = pd.DataFrame(results)
results_df.head()

In [None]:
# DataFrame.to_csv does not create missing parent folders, so make sure the
# export directory exists before writing.
os.makedirs('csv_exports', exist_ok=True)
results_df.to_csv('csv_exports/graded_essays_with_text.csv', index=False)

In [None]:
# Same table without the essay text column; results_df itself is left unchanged.
slim_results_df = results_df.drop(columns="essay_text")
slim_results_df

In [None]:
# Write the slim table (no essay text) to CSV without the index column.
slim_results_df.to_csv('csv_exports/graded_essays.csv', index=False)

In [None]:
# Also store the full graded table (including essay text) as a pickle file.
results_df.to_pickle('essay_grading_df.pkl')