In [70]:
import pandas as pd

In [71]:
# Raw exports: one CSV of submitted answers, one CSV of logged UI events.
ANSWERS_CSV = 'raw_data/answers-1720620742588.csv'
EVENT_LOGS_CSV = 'raw_data/eventLogs-1720620744220.csv'

answers_df, event_logs_df = (
    pd.read_csv(csv_path) for csv_path in (ANSWERS_CSV, EVENT_LOGS_CSV)
)

In [72]:
# Give the terse event-log columns readable names ('t' -> 'time', 'e' -> 'event').
event_logs_df = event_logs_df.rename(columns={'t': 'time', 'e': 'event'})

# Convert the 'time' columns from epoch milliseconds to datetime
answers_df['time'] = pd.to_datetime(answers_df['time'], unit='ms')
event_logs_df['time'] = pd.to_datetime(event_logs_df['time'], unit='ms')

# Sort both DataFrames chronologically. Several rows share a timestamp (e.g. the
# three background answers), and the default quicksort gives no guarantee about
# the relative order of ties, so use a stable sort to keep the original row order.
answers_df = answers_df.sort_values(by='time', kind='stable')
event_logs_df = event_logs_df.sort_values(by='time', kind='stable')

# Interleave answers and events into one chronological timeline. This is a
# row-wise concatenation, not a key-based merge: columns that exist in only one
# source are NaN on rows coming from the other. With the stable sort, an answer
# precedes any event carrying the same timestamp because answers are listed first.
merged_df = (
    pd.concat([answers_df, event_logs_df])
    .sort_values(by='time', kind='stable')
    .reset_index(drop=True)
)

# Display the first few rows of the merged DataFrame
merged_df.head(20)

Unnamed: 0,pageName,question,answer,time,event,c,v
0,Background Questions,0.0,12.0,2024-07-10 14:03:05.310,,,
1,Background Questions,1.0,1.0,2024-07-10 14:03:05.310,,,
2,Background Questions,2.0,1.0,2024-07-10 14:03:05.310,,,
3,,,,2024-07-10 14:03:05.311,n,b,
4,,,,2024-07-10 14:03:07.053,h,t,triangle.ABC
5,,,,2024-07-10 14:03:07.115,h,t,triangle.ABC
6,,,,2024-07-10 14:03:07.116,h,s,segment.AC
7,,,,2024-07-10 14:03:07.132,h,s,segment.AC
8,,,,2024-07-10 14:03:07.132,h,a,BAC
9,,,,2024-07-10 14:03:07.148,h,a,BAC


In [79]:
# Hover events this close (in seconds) to the preceding event are treated as
# jitter and ignored when counting [debounce].
HOVER_DEBOUNCE_SECONDS = 0.01

# Event codes counted per answer, mapped to the readable column names in final_df.
EVENT_COLUMN_NAMES = {'n': 'next', 'c': 'click', 'h': 'hover', 'm': 'mouse', 'p': 'pointer'}
event_types = ['n', 'c', 'h', 'm', 'p']

# Each answer owns the events from its own timestamp up to (but excluding) the
# next *distinct* answer timestamp. A plain shift(-1) would hand answers that
# share a timestamp a zero-width window, so mask out "next" values that are not
# strictly later and back-fill from the following row. Relies on answers_df being
# sorted by 'time'. Answers at the latest timestamp keep NaT (open-ended window).
following_time = answers_df['time'].shift(-1)
answers_df['next_time'] = following_time.where(following_time > answers_df['time']).bfill()

# Attach to each event the most recent answer at or before it
merged_df = pd.merge_asof(event_logs_df, answers_df, on='time', direction='backward')

event_count_dict = {}

# One counting pass per distinct answer timestamp (answers sharing a timestamp
# share the same window and therefore the same counts).
answer_windows = answers_df[['time', 'next_time']].drop_duplicates(subset='time')

for answer_time, next_time in answer_windows.itertuples(index=False, name=None):
    if pd.isnull(next_time):
        next_time = pd.Timestamp.max

    # Select events that fall within the half-open interval [answer_time, next_time)
    in_window = merged_df['time'].ge(answer_time) & merged_df['time'].lt(next_time)
    window_events = merged_df.loc[in_window, ['time', 'event']]

    # Drop hover events (h) that follow the previous event in this window too
    # closely. The first event of a window has no predecessor and is always kept.
    seconds_since_previous = window_events['time'].diff().dt.total_seconds()
    is_hover_jitter = window_events['event'].eq('h') & (
        seconds_since_previous <= HOVER_DEBOUNCE_SECONDS
    )

    # Count occurrences of each tracked event type, defaulting to 0 when absent
    event_counts = window_events.loc[~is_hover_jitter, 'event'].value_counts()
    event_count_dict[answer_time] = {
        event: event_counts.get(event, 0) for event in event_types
    }

# Convert the event count dictionary into a DataFrame indexed by answer time
event_counts_df = pd.DataFrame.from_dict(event_count_dict, orient='index').fillna(0)

# Join the counts onto the answers by timestamp, drop the helper column and give
# the count columns readable names.
# NOTE(review): the timestamp column comes back from reset_index() named 'index'
# rather than 'time' (see the displayed output); kept as-is in case later cells
# rely on that name.
final_df = (
    pd.concat([answers_df.set_index('time'), event_counts_df], axis=1)
    .reset_index()
    .drop(columns=['next_time'])
    .rename(columns=EVENT_COLUMN_NAMES)
)

# Display the first few rows of the final DataFrame
final_df.head(30)

Unnamed: 0,index,pageName,question,answer,next,click,hover,mouse,pointer
0,2024-07-10 14:03:05.310,Background Questions,0,12,1,3,12,0,0
1,2024-07-10 14:03:05.310,Background Questions,1,1,1,3,12,0,0
2,2024-07-10 14:03:05.310,Background Questions,2,1,1,3,12,0,0
3,2024-07-10 14:03:42.587,TutorialProof1,qID-1,No,0,0,10,0,0
4,2024-07-10 14:04:06.148,TutorialProof1,qID-2,No,0,0,3,0,0
5,2024-07-10 14:04:18.465,TutorialProof2,qID-1,No,0,1,0,0,0
6,2024-07-10 14:04:30.705,T1_S1_IN1,qID-1,Yes,0,5,60,0,0
7,2024-07-10 14:05:00.690,T1_S1_IN1,qID-2,Yes,0,1,7,0,0
8,2024-07-10 14:05:12.224,T1_S1_C2,qID-3,Yes,0,1,31,0,0
9,2024-07-10 14:05:32.001,T1_S1_C2,qID-1,No,0,3,22,0,0


2024-07-10 14:03:05.311000
