# Export data to CSVs for regressions

## Connect to Postgres database

In [None]:
import pandas as pd
from dotenv import load_dotenv
load_dotenv()

from sqlalchemy import create_engine, inspect
import os

# Build the connection URL from the DB_* environment variables (populated by
# load_dotenv() above); host/port are expected to be the locally-forwarded port.
from urllib.parse import quote_plus

_required_vars = ("DB_USER", "DB_PASSWORD", "DB_HOST", "DB_PORT", "DB_NAME")
_missing_vars = [name for name in _required_vars if os.getenv(name) is None]
if _missing_vars:
    # Fail early with a clear message instead of building a URL containing "None".
    raise RuntimeError(f"missing database settings: {', '.join(_missing_vars)}")

user     = os.getenv("DB_USER")
pw       = os.getenv("DB_PASSWORD")
host     = os.getenv("DB_HOST")
port     = os.getenv("DB_PORT")
db       = os.getenv("DB_NAME")
# Percent-encode the credentials so characters such as "@", ":" or "/" in the
# password are not mistaken for URL delimiters.
engine   = create_engine(
    f"postgresql://{quote_plus(user)}:{quote_plus(pw)}@{host}:{port}/{db}"
)

In [None]:
# Load every row of the interaction_logs table into a DataFrame and preview it.
ilogs = pd.read_sql(sql="SELECT * FROM interaction_logs;", con=engine)
ilogs.head()

In [None]:
# Load every row of the survey_responses table into a DataFrame and preview it.
surveys = pd.read_sql(sql="SELECT * FROM survey_responses;", con=engine)
surveys.head()

In [None]:
# Load every row of the text_snapshots table into a DataFrame and preview it.
snapshots = pd.read_sql(sql="SELECT * FROM text_snapshots;", con=engine)
snapshots.head()

## Get list of accepted participants

In [None]:
# Read the accepted participant IDs, one per line. Blank lines (for example a
# trailing empty line) are skipped so they do not turn into an empty ID that
# matches no rows in the per-participant lookups further down.
with open("pid_accepted.txt", "r", encoding="utf-8") as fle:
    pid_list = [pid for pid in (line.strip() for line in fle) if pid]

# Quick check: number of IDs and the first one.
len(pid_list), pid_list[0]

## Export pre-survey data

In [None]:
# Select the "pre" survey rows of accepted participants, keeping only the last
# row per participant when a participant has several.
is_accepted = surveys["participant_id"].isin(pid_list)
is_pre_survey = surveys["survey_type"] == "pre"
filtered = surveys.loc[is_accepted & is_pre_survey].drop_duplicates(
    subset="participant_id", keep="last"
)

# The two counts should match if every accepted participant has a pre-survey.
print(len(filtered), len(pid_list))

filtered.head()

In [None]:
import json

def enrich_response(row):
    """Merge a survey row's response payload with its identifying columns.

    Args:
        row: Mapping-like survey row (e.g. a DataFrame row) providing the keys
            "responses", "participant_id", "prompt_id" and "condition".
            "responses" may be a dict or a JSON-encoded string.

    Returns:
        dict: The response keys plus "participant_id", "prompt_id" and
        "condition". The three identifying keys win over same-named response
        keys. A payload that cannot be parsed, or is not a JSON object,
        contributes no keys.
    """
    resp = row["responses"]
    if isinstance(resp, str):
        try:
            resp = json.loads(resp)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass; other errors propagate.
            print(f"warning [{row['participant_id']}]: could not parse survey responses")
            resp = {}

    if not isinstance(resp, dict):
        # None/NaN or a JSON list/scalar cannot be unpacked with ** below.
        print(f"warning [{row['participant_id']}]: survey responses are not a JSON object")
        resp = {}

    return {
        **resp,  # original answer keys first, identifying columns override
        "participant_id": row["participant_id"],
        "prompt_id": row["prompt_id"],
        "condition": row["condition"],
    }

# Build one enriched dict per survey row and store it in a new column.
enriched_rows = filtered.apply(enrich_response, axis=1)
filtered["responses_enriched"] = enriched_rows
filtered

In [None]:
# Turn the list of enriched dicts into a table with one column per key.
responses = filtered["responses_enriched"].tolist()

presurvey = pd.DataFrame(data=responses)
presurvey.head()

In [None]:
# Create the export directory on first run; to_csv does not create missing folders.
os.makedirs("csv_exports", exist_ok=True)
presurvey.to_csv('csv_exports/presurvey.csv', index=False)

## Export post-survey data

In [None]:
# Select the "post" survey rows of accepted participants, keeping only the last
# row per participant when a participant has several.
is_accepted = surveys["participant_id"].isin(pid_list)
is_post_survey = surveys["survey_type"] == "post"
filtered = surveys.loc[is_accepted & is_post_survey].drop_duplicates(
    subset="participant_id", keep="last"
)

# The two counts should match if every accepted participant has a post-survey.
print(len(filtered), len(pid_list))

# Build one enriched dict per survey row and store it in a new column.
enriched_rows = filtered.apply(enrich_response, axis=1)
filtered["responses_enriched"] = enriched_rows
filtered.head()

In [None]:
# Turn the list of enriched dicts into a table with one column per key.
responses = filtered["responses_enriched"].tolist()

postsurvey = pd.DataFrame(data=responses)
postsurvey.head()

In [None]:
# Create the export directory on first run; to_csv does not create missing folders.
os.makedirs("csv_exports", exist_ok=True)
postsurvey.to_csv('csv_exports/postsurvey.csv', index=False)

## Export behavioral data

- time in each stage
- keystrokes in each stage
- number of AI support requests
- edit distance between draft and revision

### Test edit distance

In [None]:
!pip install editdistance

In [None]:
import editdistance
# Sanity check of the library on a pair that differs by two substitutions
# (n -> h and n -> m).
editdistance.eval('banana', 'bahama')

In [None]:
def get_ai_draft(pid):
    """Return the AI-written draft logged for a participant.

    Searches the participant's rows in ``ilogs`` for events whose
    ``event_data`` is a dict with a "prompt" containing the draft-generation
    request, and returns that event's "response".

    Args:
        pid: Participant ID to look up in ``ilogs["participant_id"]``.

    Returns:
        The "response" value of the matching event (the last one in row order
        when several match), or the string "null" when there is no match.
    """
    phrase = "Please write a complete draft essay based on this outline and prompt"

    def is_draft_request(data):
        # event_data may be missing or non-dict, and "prompt" may be present
        # but null; none of those can contain the phrase.
        if not isinstance(data, dict):
            return False
        prompt = data.get("prompt")
        return isinstance(prompt, str) and phrase in prompt

    events = ilogs.loc[ilogs["participant_id"] == pid]
    # astype(bool) keeps the mask boolean even when the participant has no rows.
    matches = events[events["event_data"].apply(is_draft_request).astype(bool)]

    if matches.empty:
        print(f"error [{pid}]: no ai draft found")
        return "null"

    if len(matches) > 1:
        print(f"warning [{pid}]: more than one ai draft found")

    # All rows belong to the same participant, so "keep the last duplicate"
    # is simply the last matching row.
    return matches.iloc[-1]["event_data"]["response"]

def get_essay(pid, stage):
    """Return the text a participant submitted as final for a stage.

    Args:
        pid: Participant ID to look up in ``snapshots["participant_id"]``.
        stage: Stage name compared against ``snapshots["stage"]``
            (the notebook uses "outline", "draft" and "revision").

    Returns:
        The "text_content" of the participant's final snapshot for the stage
        (the last one in row order when there are several). When there is no
        final snapshot: the result of ``get_ai_draft(pid)`` for the draft
        stage of a condition-3 participant, otherwise the string "error".
    """
    finals = snapshots.loc[
        (snapshots["participant_id"] == pid) &
        (snapshots["type"] == "final") &
        (snapshots["stage"] == stage)
    ]

    if finals.empty:
        participant_surveys = surveys.loc[surveys["participant_id"] == pid]
        # A participant without survey rows has no known condition.
        condition = (
            participant_surveys.iloc[0]["condition"]
            if not participant_surveys.empty
            else None
        )
        # In condition 3 the draft comes from the API request rather than a
        # snapshot. Only the draft stage may fall back to it: a missing
        # outline or revision must not be reported as the AI draft.
        # str() lets the check work whether condition is stored as "3" or 3.
        if stage == "draft" and str(condition) == "3":
            return get_ai_draft(pid)
        print(f"error [{pid}]: no final submission [{stage}]")
        return "error"

    if len(finals) > 1:
        print(f"warning [{pid}]: more than one final submission [{stage}]")

    # All rows belong to the same participant, so "keep the last duplicate"
    # is simply the last matching row.
    return finals.iloc[-1]["text_content"]


# Smoke test on the first accepted participant: show the edit distance and
# both essay versions.
first_pid = pid_list[0]
draft = get_essay(first_pid, "draft")
revision = get_essay(first_pid, "revision")

divider = "------------------"
print("edit distance:", editdistance.eval(draft, revision))
print(divider, draft, divider, revision, sep="\n")

### Prep functions

In [None]:
def get_num_keystrokes(pid, stage):
    """Count a participant's logged events in a stage whose type contains 'keystroke'.

    The stage column is lower-cased before comparison, so ``stage`` should be
    given in lower case.
    """
    same_participant = ilogs["participant_id"] == pid
    same_stage = ilogs["stage"].str.lower() == stage
    event_types = ilogs.loc[same_participant & same_stage, "event_type"]
    return event_types.str.contains('keystroke').sum()

def get_time_on_task(pid, stage):
    """Return the time a participant spent on a stage, in minutes.

    Reads "time_from_stage_start" from the participant's final snapshot for
    the stage and divides it by 60.

    Args:
        pid: Participant ID to look up in ``snapshots["participant_id"]``.
        stage: Stage name compared against ``snapshots["stage"]``.

    Returns:
        The time in minutes, or the string "null" when the participant has no
        final snapshot for the stage.
    """
    finals = snapshots.loc[
        (snapshots["participant_id"] == pid) &
        (snapshots["type"] == "final") &
        (snapshots["stage"] == stage)
    ]

    if finals.empty:
        return "null"

    if len(finals) > 1:
        print(f"warning [{pid}]: more than one stage [{stage}]")

    # Use the last final snapshot, the same row get_essay takes its text from,
    # so the reported time and the analysed text belong to one submission.
    return finals.iloc[-1]["time_from_stage_start"]/60

def get_keystroke_events(pid, keyword):
    """Count a participant's logged events whose type contains ``keyword``.

    ``keyword`` is passed to ``str.contains`` and is therefore interpreted as
    a regular expression. Events from all stages are counted.
    """
    same_participant = ilogs["participant_id"] == pid
    type_matches = ilogs["event_type"].str.contains(keyword)
    matching_events = ilogs.loc[same_participant & type_matches]
    return len(matching_events)

def get_api_requests(pid):
    """Count a participant's logged events whose type contains 'api_call'."""
    event_types = ilogs.loc[ilogs["participant_id"] == pid, "event_type"]
    is_api_call = event_types.str.contains('api_call')
    return is_api_call.sum()

def get_edit_distance(pid):
    """Return the edit distance between a participant's draft and revision.

    Args:
        pid: Participant ID passed on to ``get_essay``.

    Returns:
        The value of ``editdistance.eval(draft, revision)``, or the string
        "null" when either text is unavailable.
    """
    draft = get_essay(pid, "draft")
    revision = get_essay(pid, "revision")

    # get_essay / get_ai_draft report a missing text with the placeholder
    # strings "error" / "null". Comparing a placeholder against a real essay
    # would yield a plausible-looking but meaningless number, so report the
    # value as missing instead (same "null" marker get_time_on_task uses).
    for text in (draft, revision):
        if not isinstance(text, str) or text in ("error", "null"):
            return "null"

    return editdistance.eval(draft, revision)

def get_condition(pid):
    """Return the "condition" value of the participant's first survey row."""
    participant_rows = surveys[surveys["participant_id"] == pid]
    first_row = participant_rows.iloc[0]
    return first_row["condition"]

def get_prompt_id(pid):
    """Return the "prompt_id" value of the participant's first survey row."""
    participant_rows = surveys[surveys["participant_id"] == pid]
    first_row = participant_rows.iloc[0]
    return first_row["prompt_id"]

### Build new dataframe and export

In [None]:
# Placeholder strings get_essay / get_ai_draft return when no text is available.
_MISSING_TEXT = ("error", "null")


def _word_count(text):
    """Count whitespace-separated words; "null" when the essay is unavailable."""
    if not isinstance(text, str) or text in _MISSING_TEXT:
        return "null"
    # split() without an argument treats newlines, tabs and repeated spaces as
    # a single separator, so paragraphs and double spaces do not skew the count.
    return len(text.split())


# One output row per accepted participant.
rows = []

for pid in pid_list:
    k_outline = get_num_keystrokes(pid, "outline")
    k_draft = get_num_keystrokes(pid, "draft")
    k_revision = get_num_keystrokes(pid, "revision")
    k_total = k_outline + k_draft + k_revision
    k_backspace = get_keystroke_events(pid, "backspace")

    rows.append({
        "participant_id": pid,
        "condition": get_condition(pid),
        "prompt_id": get_prompt_id(pid),
        "time_on_outline": get_time_on_task(pid, "outline"),
        "time_on_draft": get_time_on_task(pid, "draft"),
        "time_on_revision": get_time_on_task(pid, "revision"),
        "keystrokes_outline": k_outline,
        "keystrokes_draft": k_draft,
        "keystrokes_revision": k_revision,
        "wc_outline": _word_count(get_essay(pid, "outline")),
        "wc_draft": _word_count(get_essay(pid, "draft")),
        "wc_revision": _word_count(get_essay(pid, "revision")),
        "num_paste_events": get_keystroke_events(pid, "paste"),
        "num_backspace_events": k_backspace,
        # A participant with no logged keystrokes has no defined fraction;
        # use the "null" marker instead of dividing by zero.
        "backspace_frac": k_backspace / k_total if k_total else "null",
        "api_requests": get_api_requests(pid),
        "revision_edit_distance": get_edit_distance(pid),
    })

behavioral_df = pd.DataFrame(rows)
behavioral_df

In [None]:
# Create the export directory on first run; to_csv does not create missing folders.
os.makedirs("csv_exports", exist_ok=True)
behavioral_df.to_csv('csv_exports/behavioraldata.csv', index=False)