In [106]:
import os

import pandas as pd
import random

In [12]:
# Participant logs are stored in ../streamlit_study/study_data, resolved
# relative to the current working directory of the kernel.
data_path = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, 'streamlit_study', 'study_data')
)

# Keep only entries whose name begins with "0_" or "1_".
files = [name for name in os.listdir(data_path) if name.startswith(('0_', '1_'))]

print("Number of participant files (total):", len(files))

Number of participant files (total): 184


In [80]:
# Logs excluded by hand: this one contains the attention test twice.
known_errors = ['0_653ec70763f99a4e8bfb0192.csv']

# Logs without an attention test; all but the last have a double line instead.
# Listed for reference only - this cell does not filter on them.
known_errors_att = [
    '1_604d603b057fd944c370a451.csv',
    '1_647922d23b058e3c3bf8a974.csv',
    '1_615ded1350849e2a7c332bef.csv',
    '0_6165e0b51a883e9db8cc7146.csv',
    '1_5e81efd1fdc8d601701db62b.csv',
    '0_6135ce2a32b53777552a46b0.csv',
]

# Drop the hand-excluded logs from the working file list.
files = [f for f in files if f not in known_errors]
print(len(files))

182


In [103]:
# Parse every participant log into
#   data[prolific_id]       -> dict of questionnaire answers + attention-test results
#   data_tasks[prolific_id] -> DataFrame with one row per student task
#
# Layout of a complete, de-duplicated log as implied by the slicing below:
#   row 0        "Prolific ID,<id>"
#   row 1        header of the task table
#   row 2        tutorial task (ignored)
#   next rows    the task rows, plus one "attention_2309" row when the first
#                attention test was recorded (32 rows in total, otherwise 31)
#   last 18 rows questionnaire answers, including the second attention test
N_QUESTIONNAIRE_LINES = 18  # len of questionnaire
FIRST_TASK_LINE = 3         # rows 0-2 are Prolific ID, task header and tutorial
ATTENTION_PREFIX = "attention_2309"

data = {}
data_tasks = {}
complete_files = []  # names of the logs that were parsed successfully

for filename in files:

    path = os.path.abspath(os.path.join(data_path, filename))
    with open(path) as file:
        lines = file.readlines()

    # A participant may have restarted the study: keep only the last run,
    # i.e. everything from the last "Prolific ID," row onwards.
    i_prolific = [i for i, line in enumerate(lines) if line.startswith('Prolific ID,')][-1]
    lines = lines[i_prolific:]

    # Remove duplicate consecutive rows. Only the first 20 characters are
    # compared because the trailing timestamp may differ between duplicates.
    unique_lines = [lines[0]]
    for prev_line, line in zip(lines, lines[1:]):
        if line[:20] != prev_line[:20]:
            unique_lines.append(line)
    lines = unique_lines

    # Check the number of rows.
    if len(lines) < 31:
        continue  # fewer than 31 rows: participant did not finish
    elif len(lines) == 31:
        # Only acceptable if the first attention test was not recorded.
        if any(l.startswith(ATTENTION_PREFIX) for l in lines):
            continue
    elif len(lines) > 32:
        # Can only happen if e.g. the questionnaire was saved twice:
        # verify both copies are identical and drop the second one.
        q_starts = [i for i, line in enumerate(lines) if line.startswith("1,")]
        assert len(q_starts) == 2
        for i in range(N_QUESTIONNAIRE_LINES):
            assert lines[q_starts[0]+i] == lines[q_starts[1]+i]
        lines = lines[:q_starts[1]]

    # Make sure we now have a good file with 31 or 32 rows.
    assert len(lines) in [31, 32]

    # Answers collected for this participant.
    new_p = {}

    # Prolific ID row: ["Prolific ID", "<id>"]
    p_id = lines[0].strip().split(",")

    # First attention test (absent in 31-row logs).
    i_att = [i for i, line in enumerate(lines) if line.startswith(ATTENTION_PREFIX)]
    if len(lines) == 31:
        assert len(i_att) == 0
        new_p["attention_2309"] = "No"
    else:
        assert len(i_att) == 1
        i_att = i_att[0]
        att1 = lines[i_att].strip().split(",")
        assert att1[0] == "attention_2309"
        new_p[att1[0]] = att1[1].split(" ")[-1]  # last word of the recorded result

    # The questionnaire always occupies the last N_QUESTIONNAIRE_LINES rows, so
    # its start is derived from the end of the log: index 14 when the attention
    # row is present, 13 when it is missing. (A fixed index of 14 would shift
    # the first questionnaire answer of 31-row logs into the task table.)
    q_start = len(lines) - N_QUESTIONNAIRE_LINES

    # Student tasks - without tutorial and attention test.
    if len(lines) == 32:
        task_rows = lines[FIRST_TASK_LINE:i_att] + lines[(i_att+1):q_start]
    else:
        task_rows = lines[FIRST_TASK_LINE:q_start]
    task_lines = [l.strip().split(",") for l in task_rows]
    task_df = pd.DataFrame(task_lines, columns=lines[1].strip().split(","))

    # Questionnaire and second attention test.
    for q_line in lines[q_start:]:
        fields = q_line.strip().split(",")
        if len(fields) == 2:
            new_p[fields[0]] = fields[1]
        elif len(fields) == 1:  # attention test: keep the last word only
            new_p["attention_questionnaire"] = fields[0].split(" ")[-1]
        else:  # free text for AI knowledge might include commas
            new_p[fields[0]] = fields[1]
            new_p["3_where"] = ",".join(fields[2:])

    # Add to the result dicts, keyed by Prolific ID.
    data[p_id[1]] = new_p
    data_tasks[p_id[1]] = task_df
    complete_files.append(filename)

print("Number of participant files (completed):", len(complete_files))

Number of participant files (completed): 121


In [104]:
# One row per entry of `data` (its keys become the index), one column per
# answer key found in the per-participant dicts.
df_data = pd.DataFrame.from_dict(data, orient="index")
df_data

Unnamed: 0,attention_2309,1,2,3,3_where,4,know,fai_2,com_1,fai_5,fai_4,com_3,com_2,attention_questionnaire,com_5,rel_2,rel_1,rel_5,fai_1,fai_3
610d4f4fe67b7a3fc45cc2cd,failed,25,FEMALE,YES,"Translation tools such as DeepL, chatbots",9,AGREE,AGREE,AGREE,AGREE,AGREE,NEUTRAL,NEUTRAL,completed,AGREE,AGREE,AGREE,AGREE,NEUTRAL,NEUTRAL
5bf2e3ee6d4c3e00014420d6,failed,27,MALE,YES,usually i was chatting a lot with AI,8,AGREE,STRONGLY AGREE,STRONGLY AGREE,AGREE,AGREE,STRONGLY AGREE,STRONGLY AGREE,completed,AGREE,STRONGLY AGREE,DISAGREE,STRONGLY AGREE,AGREE,STRONGLY AGREE
64d3a84f1de6820d7ad2e336,failed,24,MALE,YES,I study Ai in my masters degree,10,AGREE,DISAGREE,NEUTRAL,AGREE,NEUTRAL,NEUTRAL,AGREE,completed,AGREE,NEUTRAL,NEUTRAL,AGREE,NEUTRAL,AGREE
610d5d9b3a1f468828283794,failed,24,MALE,YES,i use chat gpt daily,8,STRONGLY AGREE,STRONGLY AGREE,NEUTRAL,AGREE,NEUTRAL,AGREE,AGREE,completed,STRONGLY AGREE,NEUTRAL,DISAGREE,AGREE,STRONGLY DISAGREE,STRONGLY DISAGREE
5b9e10d4f074140001051011,failed,27,FEMALE,NO,,7,NEUTRAL,NEUTRAL,AGREE,NEUTRAL,NEUTRAL,STRONGLY AGREE,STRONGLY AGREE,completed,AGREE,NEUTRAL,AGREE,STRONGLY AGREE,NEUTRAL,AGREE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64e8c14bbe184c20cb9d0583,failed,19,MALE,YES,With chatGPT.,10,NEUTRAL,NEUTRAL,AGREE,AGREE,NEUTRAL,AGREE,AGREE,completed,STRONGLY AGREE,STRONGLY AGREE,AGREE,AGREE,AGREE,AGREE
584823aed2be990001174e56,completed,31,MALE,YES,I am very curious about new technology and sta...,9,STRONGLY AGREE,DISAGREE,AGREE,AGREE,DISAGREE,AGREE,AGREE,completed,AGREE,AGREE,NEUTRAL,AGREE,AGREE,AGREE
61330f324c6c15a907dc2706,failed,24,FEMALE,YES,Social media,10,STRONGLY AGREE,AGREE,STRONGLY AGREE,AGREE,NEUTRAL,STRONGLY AGREE,AGREE,completed,AGREE,AGREE,AGREE,AGREE,AGREE,AGREE
5bf801e3361a9b00012654c5,failed,24,FEMALE,YES,on the internet,6,NEUTRAL,AGREE,AGREE,AGREE,AGREE,AGREE,AGREE,completed,AGREE,AGREE,AGREE,AGREE,AGREE,AGREE


In [111]:
# Spot check: show the task table of one randomly chosen participant.
# No seed is set, so a different participant may be shown on every run.
p_id = random.choice(list(data_tasks))
data_tasks[p_id]

Unnamed: 0,student_id,target,first_choice,ai_pred,second_choice,time1,time2,time3
0,2609,DROPOUT,DROPOUT,DROPOUT,DROPOUT,1710441565.9106066,1710441566.3352263,1710441573.44552
1,533,DROPOUT,DROPOUT,DROPOUT,DROPOUT,1710441584.194938,1710441584.5592413,1710441589.2383056
2,1042,GRADUATE,DROPOUT,GRADUATE,GRADUATE,1710441619.3403523,1710441619.582763,1710441622.7551277
3,3027,DROPOUT,DROPOUT,DROPOUT,DROPOUT,1710441631.9271753,1710441632.6013937,1710441634.597552
4,872,GRADUATE,GRADUATE,DROPOUT,DROPOUT,1710441638.294881,1710441638.5715065,1710441645.3629916
5,487,DROPOUT,DROPOUT,DROPOUT,DROPOUT,1710441667.9044623,1710441668.1361315,1710441670.5947454
6,223,GRADUATE,GRADUATE,GRADUATE,GRADUATE,1710441691.7085168,1710441692.5352318,1710441694.7632666
7,2642,DROPOUT,DROPOUT,DROPOUT,DROPOUT,1710441702.1592076,1710441702.379002,1710441704.714928
8,66,GRADUATE,DROPOUT,GRADUATE,DROPOUT,1710441712.9271443,1710441713.178619,1710441737.978631
9,3151,DROPOUT,GRADUATE,DROPOUT,DROPOUT,1710441751.1199496,1710441751.5267515,1710441755.7493124
