# Export data to CSVs for regressions

## Connect to Postgres database

In [None]:
import pandas as pd
from dotenv import load_dotenv
load_dotenv()

from sqlalchemy import create_engine, inspect
import os

# Build the Postgres URL for the locally-forwarded port from environment settings.
from urllib.parse import quote

user     = os.getenv("DB_USER")
pw       = os.getenv("DB_PASSWORD")
host     = os.getenv("DB_HOST")
port     = os.getenv("DB_PORT")
db       = os.getenv("DB_NAME")

# Percent-encode the credentials so characters such as "@", ":", "/" or "%" in
# the username or password cannot be mistaken for URL delimiters.
# str() keeps the previous behaviour for unset variables (they render as "None").
user_enc = quote(str(user), safe="")
pw_enc   = quote(str(pw), safe="")

engine   = create_engine(f"postgresql://{user_enc}:{pw_enc}@{host}:{port}/{db}")

In [None]:
# Load every row of the interaction_logs table into a DataFrame and preview it.
ilogs_sql = "SELECT * FROM interaction_logs;"
ilogs = pd.read_sql(ilogs_sql, con=engine)
ilogs.head()

In [None]:
# Load every row of the survey_responses table into a DataFrame and preview it.
surveys_sql = "SELECT * FROM survey_responses;"
surveys = pd.read_sql(surveys_sql, con=engine)
surveys.head()

In [None]:
# Load every row of the text_snapshots table into a DataFrame and preview it.
snapshots_sql = "SELECT * FROM text_snapshots;"
snapshots = pd.read_sql(snapshots_sql, con=engine)
snapshots.head()

## Get list of accepted participants

In [None]:
# Read the accepted participant IDs, one per line, from pid_accepted.txt.
with open("pid_accepted.txt", "r") as fle:
    # Strip surrounding whitespace and skip blank lines so an empty line in the
    # file does not become a bogus "" participant ID that inflates the count.
    pid_list = [pid for pid in (line.strip() for line in fle) if pid]

# Preview: number of IDs and the first one (None when the file held no IDs).
len(pid_list), (pid_list[0] if pid_list else None)

## Export pre-survey data

In [None]:
# Keep only pre-survey rows that belong to accepted participants.
is_accepted = surveys["participant_id"].isin(pid_list)
is_pre = surveys["survey_type"] == "pre"

# One row per participant: keep the last occurrence in the frame's current order.
# NOTE(review): `surveys` is loaded without an ORDER BY, so "last" may not mean
# "most recent" — confirm, or sort by a timestamp column before de-duplicating.
# .copy() detaches the result from `surveys` so that adding columns to
# `filtered` later does not trigger pandas' SettingWithCopyWarning.
filtered = (
    surveys.loc[is_accepted & is_pre]
    .drop_duplicates(subset="participant_id", keep="last")
    .copy()
)

# Sanity check: rows kept vs. number of accepted participants.
print(len(filtered), len(pid_list))

filtered.head()

In [None]:
import json

def enrich_response(row):
    """Merge a survey row's responses with its identifying columns.

    Args:
        row: Mapping-like row (for example a pandas Series) that provides
            "responses", "participant_id", "prompt_id" and "condition".
            "responses" may be a dict or a JSON-encoded string.

    Returns:
        A new dict with every key of the parsed responses plus
        "participant_id", "prompt_id" and "condition" taken from the row.
        The row's own values win when a response key has the same name.
        Responses that are missing, are not valid JSON, or do not decode to
        a JSON object contribute no keys.
    """
    resp = row["responses"]
    if isinstance(resp, str):
        try:
            resp = json.loads(resp)
        except json.JSONDecodeError:
            # Malformed JSON: fall back to an empty response set.
            resp = {}

    # None/NaN, or JSON that decoded to a list/scalar, cannot be unpacked
    # with ** below, so treat it as "no responses" instead of raising.
    if not isinstance(resp, dict):
        resp = {}

    return {
        **resp,  # original response keys first
        "participant_id": row["participant_id"],
        "prompt_id": row["prompt_id"],
        "condition": row["condition"],
    }

# Build one enriched dict per row. A plain list keeps this working when
# `filtered` has no rows (apply(axis=1) on an empty frame may return a DataFrame
# that cannot be stored in a single column), and assign() returns a new frame
# rather than writing into what may be a slice of `surveys`, which avoids
# pandas' SettingWithCopyWarning.
filtered = filtered.assign(
    responses_enriched=[enrich_response(row) for _, row in filtered.iterrows()]
)
filtered

In [None]:
# Turn the column of enriched dicts into a flat table: one column per key.
responses = filtered["responses_enriched"].tolist()

presurvey = pd.DataFrame(data=responses)
presurvey.head()

In [None]:
# Make sure the output folder exists first; to_csv does not create missing directories.
os.makedirs("csv_exports", exist_ok=True)
presurvey.to_csv('csv_exports/presurvey.csv', index=False)

## Export post-survey data

In [None]:
# Keep only post-survey rows that belong to accepted participants.
is_accepted = surveys["participant_id"].isin(pid_list)
is_post = surveys["survey_type"] == "post"

# One row per participant: keep the last occurrence in the frame's current order.
# NOTE(review): `surveys` is loaded without an ORDER BY, so "last" may not mean
# "most recent" — confirm, or sort by a timestamp column before de-duplicating.
# .copy() detaches the result from `surveys` so the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
filtered = (
    surveys.loc[is_accepted & is_post]
    .drop_duplicates(subset="participant_id", keep="last")
    .copy()
)

# Sanity check: rows kept vs. number of accepted participants.
print(len(filtered), len(pid_list))

# A plain list (instead of apply(axis=1)) also works when no rows matched.
filtered["responses_enriched"] = [
    enrich_response(row) for _, row in filtered.iterrows()
]
filtered.head()

In [None]:
# Turn the column of enriched dicts into a flat table: one column per key.
responses = filtered["responses_enriched"].tolist()

postsurvey = pd.DataFrame(data=responses)
postsurvey.head()

In [None]:
# Make sure the output folder exists first; to_csv does not create missing directories.
os.makedirs("csv_exports", exist_ok=True)
postsurvey.to_csv('csv_exports/postsurvey.csv', index=False)