# Set-up

Ensure the required libraries are installed, connect to the database, and load all tables into pandas DataFrames.

In [None]:
!pip install psycopg2 dotenv sqlalchemy pandas numpy==1.26.4 bokeh

In [None]:
import os

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, inspect
from sqlalchemy.engine import URL

load_dotenv()

# Build the connection URL from the DB_* environment variables (the port is the
# locally-forwarded one).
user     = os.getenv("DB_USER")
pw       = os.getenv("DB_PASSWORD")
host     = os.getenv("DB_HOST")
port     = os.getenv("DB_PORT")
db       = os.getenv("DB_NAME")

# Fail fast with a readable message instead of building a URL out of "None" strings.
missing_vars = [
    name
    for name, value in (("DB_USER", user), ("DB_HOST", host), ("DB_PORT", port), ("DB_NAME", db))
    if not value
]
if missing_vars:
    raise RuntimeError(f"Missing database settings in environment/.env: {', '.join(missing_vars)}")

# URL.create escapes special characters (e.g. "@", "/", ":") in the username and
# password; interpolating them into an f-string URL would leave them unescaped and
# the URL would be mis-parsed.
engine   = create_engine(
    URL.create(
        "postgresql",
        username=user,
        password=pw,
        host=host,
        port=int(port),
        database=db,
    )
)

In [None]:
# Load the whole interaction_logs table; later cells filter it per participant.
ilogs = pd.read_sql(sql="SELECT * FROM interaction_logs;", con=engine)
ilogs.head()

In [None]:
# Load the whole survey_responses table; later cells filter it per participant.
surveys = pd.read_sql(sql="SELECT * FROM survey_responses;", con=engine)
surveys.head()

In [None]:
# Load the whole text_snapshots table; later cells filter it per participant.
snapshots = pd.read_sql(sql="SELECT * FROM text_snapshots;", con=engine)
snapshots.head()

# Check for potential cheating

Given a list of participant IDs, run some simple checks (number of keystrokes, largest paste event, time on task) for prohibited AI usage.

In [None]:
def get_condition(participant_id):
    """Return the ``condition`` value of the participant's first row in ``surveys``.

    Raises:
        IndexError: if ``surveys`` has no row for ``participant_id``.
    """
    is_participant = surveys["participant_id"] == participant_id
    return surveys.loc[is_participant, "condition"].iloc[0]

def get_prompt(participant_id):
    """Return the ``prompt_id`` value of the participant's first row in ``surveys``.

    Raises:
        IndexError: if ``surveys`` has no row for ``participant_id``.
    """
    is_participant = surveys["participant_id"] == participant_id
    return surveys.loc[is_participant, "prompt_id"].iloc[0]

def get_survey_responses(participant_id, survey_type):
    """Return the ``responses`` value for one participant and survey type.

    Returns:
        The single matching ``responses`` value, or ``False`` when zero or more
        than one row in ``surveys`` matches.
    """
    matches = surveys.loc[
        (surveys["participant_id"] == participant_id)
        & (surveys["survey_type"] == survey_type),
        "responses",
    ]
    # Anything other than exactly one match is treated as "no usable response".
    if len(matches) != 1:
        return False
    return matches.iloc[0]

In [None]:
# Distinct event categories: the part of each event_type before the first ":".
{name.split(":")[0] for name in ilogs["event_type"].unique()}

In [None]:
import json

def get_num_keystrokes(pid):
    """Count the participant's ``ilogs`` rows whose event_type contains "keystroke"."""
    is_participant = ilogs["participant_id"] == pid
    event_types = ilogs.loc[is_participant, "event_type"]
    return event_types.str.contains('keystroke').sum()

def get_time_on_task(pid, stage):
    """Return the final snapshot's ``time_from_stage_start`` divided by 60.

    The original comment says the result is in minutes, so
    ``time_from_stage_start`` is presumably stored in seconds here — TODO confirm.

    Returns:
        The scaled time when exactly one "final" snapshot exists for this
        participant and stage, otherwise the string "null".
    """
    final_rows = snapshots.loc[
        (snapshots["participant_id"] == pid)
        & (snapshots["type"] == "final")
        & (snapshots["stage"] == stage)
    ]
    # Zero or duplicate final snapshots are reported as "null".
    if len(final_rows) != 1:
        return "null"
    only_row = final_rows.iloc[0]
    return only_row["time_from_stage_start"] / 60

def get_large_paste_event(pid):
    """Return the length of the participant's largest pasted text.

    Looks at the participant's ``ilogs`` rows whose event_type contains "paste"
    and measures ``len(event_data["text"])`` for each; null event_data counts as 0.

    Returns:
        The maximum pasted-text length, or the string "null" when the
        participant has no paste events.
    """
    filtered = ilogs.loc[
        (ilogs["participant_id"] == pid) &
        (ilogs["event_type"].str.contains("paste"))
    ]

    # Must test `filtered`: the previous check used an undefined name (`local`),
    # which raised NameError on a fresh kernel or silently read an unrelated
    # notebook-global frame.
    if len(filtered) > 0:
        sizes = filtered["event_data"].apply(
            lambda x: len(x["text"]) if pd.notnull(x) else 0
        )
        return sizes.max()
    else:
        return "null"

In [None]:
# Participant IDs to run through the checks below.
id_list = [
    "6779c8fd10ec19bf89b5f9d8",
    "67cf28a2280e7e75fbbacbff",
    "67d09ba717ec412519ebf44b",
    "67e71c9c4c220e0f0df2ef30",
]

In [None]:
# Write one comma-separated line per participant in id_list to check_ids.csv
# (no header row). Field order: id, condition, prompt, pilot flag, keystroke
# count, largest paste, outline/draft/revision time on task, then 1/0 flags for
# whether a truthy pre / post survey response was found.
with open("check_ids.csv", "w") as fle:
    for i in id_list:
        condition, prompt = get_condition(i), get_prompt(i)
        pilot = "N"
        keystrokes = get_num_keystrokes(i)
        paste = get_large_paste_event(i)
        outline, draft, revision = (
            get_time_on_task(i, stage_name)
            for stage_name in ("outline", "draft", "revision")
        )
        pre, post = (
            int(bool(get_survey_responses(i, survey_type)))
            for survey_type in ("pre", "post")
        )
        fields = (i, condition, prompt, pilot, keystrokes, paste,
                  outline, draft, revision, pre, post)
        fle.write(",".join(f"{field}" for field in fields) + "\n")

# Visually inspect a single participant

Look at survey results and timelines for a single participant.

In [None]:
from bokeh.transform import factor_cmap
from bokeh.palettes import Category10
from bokeh.transform import factor_cmap, factor_mark
from bokeh.layouts import column
from bokeh.models import ColumnDataSource, HoverTool, Segment, Text, Div
from bokeh.plotting import figure, show
from bokeh.io import output_notebook


# Configure bokeh so that show() renders plots inline in the notebook.
output_notebook()

In [None]:
def add_extra_col(participant_id):
    """Return a copy of the participant's ``ilogs`` rows with two derived columns.

    Added columns:
        cat_event_type: coarse label derived from ``event_type`` by substring
            match ("paste", "cut/copy", "keystroke", "api_call",
            "browser_event", "button_press", "text_selection", else "null").
        minutes_from_stage_start: ``time_from_stage_start / 60000``
            (presumably milliseconds converted to minutes — TODO confirm units).

    Returns an empty frame (with both columns present) when the participant has
    no rows in ``ilogs``.
    """
    # Order matters: the specific "keystroke:*" patterns must be tested before
    # the generic "keystroke" one.
    label_rules = (
        ("keystroke:paste", "paste"),
        ("keystroke:copy", "cut/copy"),
        ("keystroke:cut", "cut/copy"),
        ("keystroke", "keystroke"),
        ("api_call", "api_call"),
        ("browser", "browser_event"),
        ("button", "button_press"),
        ("text_selection", "text_selection"),
    )

    def label_event(event_type):
        # Missing / non-string event types fall through to the catch-all label
        # instead of raising on the substring test.
        if not isinstance(event_type, str):
            return "null"
        for needle, label in label_rules:
            if needle in event_type:
                return label
        return "null"

    local = ilogs.loc[ilogs["participant_id"] == participant_id].copy()

    # Map over the single column rather than DataFrame.apply(axis=1): row-wise
    # apply returns a DataFrame for an empty selection, which cannot be assigned
    # to one column.
    local["cat_event_type"] = local["event_type"].map(label_event)

    local["minutes_from_stage_start"] = local["time_from_stage_start"] / 60000

    return local

In [None]:
def make_whole_timeline(participant_id):
    """Show a scatter of all of a participant's logged events.

    Plots ``created_at`` (x) against ``time_from_stage_start`` (y) for the
    participant's ``ilogs`` rows, coloured by ``stage``, with the raw
    ``event_type`` available on hover.
    """
    # Colour mapping assumes ilogs["stage"] uses exactly these labels — TODO confirm.
    stage_factors = ["Outline", "Draft", "Revision"]
    stage_colors = factor_cmap(
        'stage', f'Category10_{len(stage_factors)}', stage_factors
    )

    participant_events = ilogs.loc[ilogs["participant_id"] == participant_id].copy()

    timeline = figure(
        width=800,
        height=400,
        title=f"High-level Timeline for Participant {participant_id}",
        x_axis_label="Timestamp",
        y_axis_label="Time from Stage Start",
    )

    timeline.scatter(
        x="created_at",
        y="time_from_stage_start",
        color=stage_colors,
        source=ColumnDataSource(data=participant_events),
        size=12,
        fill_alpha=.4,
        legend_field="stage",
    )

    timeline.add_tools(HoverTool(tooltips=[('event_type', '@event_type')]))

    show(timeline)

In [None]:
def make_stages_timeline(participant_id, max_minutes=16):
    """Show one annotated event-timeline strip per stage for a participant.

    Each strip plots the participant's events for one stage along a shared
    x-axis of minutes from stage start, coloured by ``cat_event_type`` (see
    ``add_extra_col``). Selected event categories additionally get a stem and
    an icon above or below the strip.

    Args:
        participant_id: ID used to select rows via ``add_extra_col`` and to
            look up the condition via ``get_condition``.
        max_minutes: Upper bound of the initial x-axis range shared by all
            strips. Defaults to 16, the previously hard-coded value.

    Side effects:
        Prints the icon legend and displays a heading followed by the stage plots.
    """
    local = add_extra_col(participant_id)
    # The condition is the same for every stage, so look it up once.
    condition = get_condition(participant_id)

    stages = ["Outline", "Draft", "Revision"]
    event_factors = list(local["cat_event_type"].unique())
    # Slice the 10-colour palette instead of naming f"Category10_{n}": named
    # Category10 palettes are not guaranteed to exist for fewer than 3 colours,
    # and the smaller palettes are prefixes of the 10-colour one, so colours for
    # 3-10 categories are unchanged.
    event_palette = Category10[10][:max(len(event_factors), 1)]

    shared_x_range = (0, max_minutes)
    shared_y_range = (0, 2)

    # Vertical offset (relative to the strip at y=1) and glyph for annotated events.
    annotation_map = {
        "browser_event": {"offset": 0.9, "icon": "⌕"},
        "api_call": {"offset": -0.3, "icon": "⚡︎"},
        "button_press": {"offset": 0.3, "icon": "▶"},
        "paste": {"offset": -0.6, "icon": "⤵"},
        "cut/copy": {"offset": 0.6, "icon": "✂"},
    }

    # Text legend for the icons, since they are not part of the bokeh legend.
    for key, val in annotation_map.items():
        print(f'{val["icon"]} - {key}')

    def make_stage_plot(stage, x_range, show_legend=False):
        """Build the timeline strip for a single stage."""
        stage_data = local.loc[local["stage"] == stage].copy()
        # All events sit on one horizontal line; annotations branch off it.
        stage_data["y"] = 1

        source = ColumnDataSource(data=stage_data)

        p = figure(
            width=800,
            height=150,
            x_range=x_range,
            y_range=shared_y_range,
            title=f"Stage: {stage}; Condition: {condition}; Id: {participant_id}",
            x_axis_label="Minutes from Stage Start",
            y_axis_label=""
        )

        p.scatter(
            x="minutes_from_stage_start",
            y="y",
            color=factor_cmap('cat_event_type', event_palette, event_factors),
            source=source,
            size=12,
            fill_alpha=.5,
            line_color=None,
            legend_field="cat_event_type"
        )

        # Add annotations
        for event_type, props in annotation_map.items():
            event_rows = stage_data[stage_data["cat_event_type"] == event_type]
            if event_rows.empty:
                continue

            x = event_rows["minutes_from_stage_start"]
            y0 = event_rows["y"]
            y1 = y0 + props["offset"]

            annotation_source = ColumnDataSource(data=dict(
                x=x,
                x2=x,
                y0=y0,
                y1=y1,
                icon=[props["icon"]] * len(x),
                icon_y=y1,
            ))

            # Add line (stem)
            p.segment(x0="x", y0="y0", x1="x2", y1="y1", source=annotation_source,
                      line_width=1, line_color="grey")

            # Add icon at end
            p.text(x="x", y="icon_y", text="icon", source=annotation_source,
                   text_align="center", text_baseline="middle", text_font_size="10pt")

        hover = HoverTool(tooltips=[('event_type', '@event_type')])
        p.add_tools(hover)

        if show_legend:
            p.add_layout(p.legend[0], 'above')  # Moves legend out of main plot area
        else:
            p.legend.visible = False

        p.yaxis.visible = False         # hides the axis (line, ticks, and labels)

        return p

    plots = [make_stage_plot(stage, shared_x_range, show_legend=False) for stage in stages]
    header = Div(text=f"<h2>Event Timeline for Participant {participant_id}</h2>", width=800)
    # The header was previously built but never added to the layout.
    show(column(header, *plots))

In [None]:
# Participant inspected by the cells that follow.
pid = "65e1f75ab3efd22e847cfbf0"
# (pre, post) survey responses; False means zero or several rows matched.
tuple(get_survey_responses(pid, survey_type) for survey_type in ("pre", "post"))

In [None]:
# Overview scatter of every logged event for the selected participant.
make_whole_timeline(pid)

In [None]:
# Print the participant's condition and prompt, then draw the per-stage timelines.
print(get_condition(pid), get_prompt(pid))
make_stages_timeline(pid)

In [None]:
# Dump the raw event_data of every paste event for the selected participant,
# labelled with the stage it happened in.
local = ilogs.loc[
    (ilogs["participant_id"] == pid)
    & ilogs["event_type"].str.contains("paste")
]

print(pid)
for stage, event_data in zip(local["stage"], local["event_data"]):
    print(f"\n{stage}\n\n{event_data}\n")

In [None]:
# Print the final text snapshot of each stage for the selected participant.
local = snapshots.loc[
    (snapshots["participant_id"] == pid)
    & (snapshots["type"] == "final")
]

# Alternative rich display of the same rows:
# with pd.option_context('display.max_colwidth', None):
#   display(local)

print(pid, "\n")

for stage, created_at, text_content in zip(
    local["stage"], local["created_at"], local["text_content"]
):
    print(stage, "\n\n", created_at, "\n\n", text_content, "\n\n*****\n")

In [None]:
# Print the selected participant's partial text snapshots in chronological order.
local = snapshots.loc[
    (snapshots["participant_id"] == pid)
    & (snapshots["type"] == "partial")
].sort_values(by="created_at")

print(pid, "\n")

for stage, created_at, text_content in zip(
    local["stage"], local["created_at"], local["text_content"]
):
    print(stage, "\n\n", created_at, "\n\n", text_content, "\n\n*****\n")

In [None]:
# Switch to another participant and show all of their survey rows.
pid = "6710074f43e0ed058e49ccc6"

filtered = surveys[surveys["participant_id"] == pid]
filtered