In [19]:
import pandas as pd

In [20]:
#answer key cleaning/setup

# answer_key = pd.read_csv("raw_data/answers-answerkey.csv")
# answer_key = answer_key.drop(columns=["time", "version"])
# answer_key.to_csv("raw_data/answerKey.csv", index=False)
answer_key = pd.read_csv("raw_data/answerKey.csv")
answer_key = answer_key[answer_key["pageName"] != "Background Questions"]
answer_key.head(30)
# answer_key.to_csv("raw_data/answerKey.csv", index=False)

Unnamed: 0,pageName,question,answer
0,P1,qID-1,BC
1,P1,qID-2,MN
2,P1,qID-3,HJ
3,P2,qID-1,Angle GHJ
4,P2,qID-3,Angle NQP
5,P2,qID-2,Angle DEF
6,P5,qID-1,AAS (Angle-Angle-Side)
7,P5,qID-11,"be the same shape and size, can be mirror imag..."
8,P5,qID-12,No
9,P5,qID-13,No


In [53]:
participant = "corey"

In [54]:
answers_df = pd.read_csv(f'raw_data/answers-{participant}.csv')
event_logs_df = pd.read_csv(f'raw_data/eventLogs-{participant}.csv')

In [55]:
event_logs_df.rename(columns={'t': 'time'}, inplace=True)
event_logs_df.rename(columns={'e': 'event'}, inplace=True)

# Convert the 'time' columns to datetime format
answers_df['time'] = pd.to_datetime(answers_df['time'], unit='ms')
event_logs_df['time'] = pd.to_datetime(event_logs_df['time'], unit='ms')

# Sort both DataFrames by the 'time' column
answers_df = answers_df.sort_values(by='time')
event_logs_df = event_logs_df.sort_values(by='time')

# Merge the DataFrames based on the 'time' column
merged_df = pd.concat([answers_df, event_logs_df]).sort_values(by='time').reset_index(drop=True)

# Display the first few rows of the merged DataFrame
merged_df.head(20)

Unnamed: 0,pageName,question,answer,time,version,event,c,v
0,Background Questions,0,25,2024-07-19 23:38:45.573,undefined,,,
1,Background Questions,2,3 degrees gradient,2024-07-19 23:38:45.573,undefined,,,
2,,,,2024-07-19 23:38:45.573,,n,b,
3,Background Questions,1,2015,2024-07-19 23:38:45.573,undefined,,,
4,P1,qID-3,HJ,2024-07-19 23:39:31.589,static,,,
5,P1,qID-2,MN,2024-07-19 23:39:57.440,static,,,
6,P1,qID-1,BC,2024-07-19 23:40:05.488,static,,,
7,P2,qID-3,Angle NQP,2024-07-19 23:40:21.908,static,,,
8,P2,qID-2,Angle DEF,2024-07-19 23:40:32.164,static,,,
9,P2,qID-1,Angle GHJ,2024-07-19 23:40:39.288,static,,,


In [56]:
# Add a new column to answers_df to indicate the next answer time
answers_df['next_time'] = answers_df['time'].shift(-1)

# Merge each event log with the corresponding answer based on time interval
merged_df = pd.merge_asof(event_logs_df, answers_df, on='time', direction='backward')

event_count_dict = {}

# Define event types
event_types = ['n', 'c', 'h', 'm', 'p']

# Iterate through each answer to count events between time intervals
for index, row in answers_df.iterrows():
    answer_time = row['time']
    next_time = row['next_time'] if not pd.isnull(row['next_time']) else pd.Timestamp.max
    
    # Select events that fall within the time interval
    filtered_events = merged_df[(merged_df['time'] >= answer_time) & (merged_df['time'] < next_time)]
    
    # Filter out "hover" events (h) that occur within 0.01 second of another event [debounce]
    filtered_events = filtered_events[~((filtered_events['event'] == 'h') &
                                       (filtered_events['time'].diff().dt.total_seconds() <= 0.01))]
    
    # Count occurrences of each event type
    event_counts = filtered_events['event'].value_counts()
    
    # Initialize counts for all event types
    counts = {event: event_counts.get(event, 0) for event in event_types}
    
    # Store the counts in the dictionary
    event_count_dict[answer_time] = counts

# Convert the event count dictionary into a DataFrame
event_counts_df = pd.DataFrame.from_dict(event_count_dict, orient='index').fillna(0)

# Join the event counts with the answers_df
final_df = pd.concat([answers_df.set_index('time'), event_counts_df], axis=1).reset_index()

# Drop the next_time column as it's no longer needed
final_df = final_df.drop(columns=['next_time'])

# rename version to condition column
final_df = final_df.rename(columns={"version" : "condition"})

# Rename columns
final_df.rename(columns={'n': 'next'}, inplace=True)
final_df.rename(columns={'c': 'click'}, inplace=True)
final_df.rename(columns={'m': 'mouse'}, inplace=True)
final_df.rename(columns={'h': 'hover'}, inplace=True)
final_df.rename(columns={'p': 'pointer'}, inplace=True)


# Display the first few rows of the final DataFrame
final_df.head(30)

Unnamed: 0,index,pageName,question,answer,condition,next,click,hover,mouse,pointer
0,2024-07-19 23:38:45.573,Background Questions,0,25,undefined,1,0,0,0,0
1,2024-07-19 23:38:45.573,Background Questions,1,2015,undefined,1,0,0,0,0
2,2024-07-19 23:38:45.573,Background Questions,2,3 degrees gradient,undefined,1,0,0,0,0
3,2024-07-19 23:39:31.589,P1,qID-3,HJ,static,0,0,0,0,0
4,2024-07-19 23:39:57.440,P1,qID-2,MN,static,0,0,0,0,0
5,2024-07-19 23:40:05.488,P1,qID-1,BC,static,0,0,0,0,0
6,2024-07-19 23:40:21.908,P2,qID-3,Angle NQP,static,0,0,0,0,0
7,2024-07-19 23:40:32.164,P2,qID-2,Angle DEF,static,0,0,0,0,0
8,2024-07-19 23:40:39.288,P2,qID-1,Angle GHJ,static,0,0,0,0,0
9,2024-07-19 23:40:51.493,P7,qID-1,SAS (Side-Angle-Side),static,0,0,0,0,0


In [57]:
# Score the test
scores, answers = [], []
valid_ids = set(["qID-11", "qID-12", "qID-13"])

for index, row in final_df.iterrows():
    question, proof = row['question'], row['pageName']
    a = row.loc['answer']

    # page is not included in the answer key
    if not proof in set(answer_key['pageName']):
        scores.append(None)
        answers.append(None)
        continue
    
    #special case for pretest where some questions are inserted at the 1st question about triangle congruence
    if question in valid_ids:
        ans_row = answer_key.loc[(answer_key.question==question)]
    else:
        # find the proof and the question being scored in answer key
        ans_row = answer_key.loc[(answer_key.question==question) & (answer_key.pageName==proof)]
    # this question/proof combination is not in the answer key
    if len(ans_row) == 0:
        scores.append(None)
        answers.append(None)
        continue

    # add score to list
    correct = ans_row['answer'].values[0] == a
    scores.append(1 if correct else 0)
    answers.append(ans_row['answer'].values[0])

# add columns to answer dataframe
final_df["score"] = pd.Series(scores).values
final_df["key"] = pd.Series(answers).values
final_df.head(60)

Unnamed: 0,index,pageName,question,answer,condition,next,click,hover,mouse,pointer,score,key
0,2024-07-19 23:38:45.573,Background Questions,0,25,undefined,1,0,0,0,0,,
1,2024-07-19 23:38:45.573,Background Questions,1,2015,undefined,1,0,0,0,0,,
2,2024-07-19 23:38:45.573,Background Questions,2,3 degrees gradient,undefined,1,0,0,0,0,,
3,2024-07-19 23:39:31.589,P1,qID-3,HJ,static,0,0,0,0,0,1.0,HJ
4,2024-07-19 23:39:57.440,P1,qID-2,MN,static,0,0,0,0,0,1.0,MN
5,2024-07-19 23:40:05.488,P1,qID-1,BC,static,0,0,0,0,0,1.0,BC
6,2024-07-19 23:40:21.908,P2,qID-3,Angle NQP,static,0,0,0,0,0,1.0,Angle NQP
7,2024-07-19 23:40:32.164,P2,qID-2,Angle DEF,static,0,0,0,0,0,1.0,Angle DEF
8,2024-07-19 23:40:39.288,P2,qID-1,Angle GHJ,static,0,0,0,0,0,1.0,Angle GHJ
9,2024-07-19 23:40:51.493,P7,qID-1,SAS (Side-Angle-Side),static,0,0,0,0,0,0.0,RHL (Right-Hypotenuse-Leg)


In [58]:
# add time elapsed column
final_df['delta'] = (final_df['index']-final_df['index'].shift()).dt.total_seconds().fillna(0)
final_df.head(40)

Unnamed: 0,index,pageName,question,answer,condition,next,click,hover,mouse,pointer,score,key,delta
0,2024-07-19 23:38:45.573,Background Questions,0,25,undefined,1,0,0,0,0,,,0.0
1,2024-07-19 23:38:45.573,Background Questions,1,2015,undefined,1,0,0,0,0,,,0.0
2,2024-07-19 23:38:45.573,Background Questions,2,3 degrees gradient,undefined,1,0,0,0,0,,,0.0
3,2024-07-19 23:39:31.589,P1,qID-3,HJ,static,0,0,0,0,0,1.0,HJ,46.016
4,2024-07-19 23:39:57.440,P1,qID-2,MN,static,0,0,0,0,0,1.0,MN,25.851
5,2024-07-19 23:40:05.488,P1,qID-1,BC,static,0,0,0,0,0,1.0,BC,8.048
6,2024-07-19 23:40:21.908,P2,qID-3,Angle NQP,static,0,0,0,0,0,1.0,Angle NQP,16.42
7,2024-07-19 23:40:32.164,P2,qID-2,Angle DEF,static,0,0,0,0,0,1.0,Angle DEF,10.256
8,2024-07-19 23:40:39.288,P2,qID-1,Angle GHJ,static,0,0,0,0,0,1.0,Angle GHJ,7.124
9,2024-07-19 23:40:51.493,P7,qID-1,SAS (Side-Angle-Side),static,0,0,0,0,0,0.0,RHL (Right-Hypotenuse-Leg),12.205


In [59]:
# add participant id column
final_df["participant"] = pd.Series([participant for i in range(len(final_df))]).values
final_df.head(20)

Unnamed: 0,index,pageName,question,answer,condition,next,click,hover,mouse,pointer,score,key,delta,participant
0,2024-07-19 23:38:45.573,Background Questions,0,25,undefined,1,0,0,0,0,,,0.0,corey
1,2024-07-19 23:38:45.573,Background Questions,1,2015,undefined,1,0,0,0,0,,,0.0,corey
2,2024-07-19 23:38:45.573,Background Questions,2,3 degrees gradient,undefined,1,0,0,0,0,,,0.0,corey
3,2024-07-19 23:39:31.589,P1,qID-3,HJ,static,0,0,0,0,0,1.0,HJ,46.016,corey
4,2024-07-19 23:39:57.440,P1,qID-2,MN,static,0,0,0,0,0,1.0,MN,25.851,corey
5,2024-07-19 23:40:05.488,P1,qID-1,BC,static,0,0,0,0,0,1.0,BC,8.048,corey
6,2024-07-19 23:40:21.908,P2,qID-3,Angle NQP,static,0,0,0,0,0,1.0,Angle NQP,16.42,corey
7,2024-07-19 23:40:32.164,P2,qID-2,Angle DEF,static,0,0,0,0,0,1.0,Angle DEF,10.256,corey
8,2024-07-19 23:40:39.288,P2,qID-1,Angle GHJ,static,0,0,0,0,0,1.0,Angle GHJ,7.124,corey
9,2024-07-19 23:40:51.493,P7,qID-1,SAS (Side-Angle-Side),static,0,0,0,0,0,0.0,RHL (Right-Hypotenuse-Leg),12.205,corey


In [60]:
#save the dataframe
final_df.to_csv(f"out/{participant}.csv", index=False)