In [1]:
# Import relevant libaries

import pandas as pd
import os
from pathlib import Path
import openpyxl  # Ensure openpyxl is installed

## 1. Drop duplicates of datasets with *One to One* relations

## 2. Drop duplications of datasets with *One to Many* relations

#### **Remarks**
1. Drop duplicates of datasets with *One to One* relations and datasets with *One to Many* relations are mixed below to follow the sequence of the experiment pages: welcome page -> background survey -> task instruction -> search interface -> search result log -> post-task survey
2. **Values in free-text fields need to be standardized**, for example "United States" and "USA", "Iran" and "Iranian", etc.
**Columns need to be standardized are** "q5_nationality", "q6_residence", "q10_search_tools", "q12_brand_model" (only keep brand?), "q13_storage", "q14_color", "q15_price" (fit into the price range?), "q18_important_features" (features need to be separated),"q39_contradictory_info" (items need to be sparated), "q42_comments" (text need to be processed).

### 1.1. Import participants_rows.csv file and drop duplicates

In [2]:
# Load the participants_rows.csv file
data_dir = Path('chat_raw_data_20251121_1801')
df_participant = pd.read_csv(data_dir / 'participants_rows.csv', sep=',')

# Preview the dataset
df_participant.head()

Unnamed: 0,participant_id,created_at,ip_address,device_type
0,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:22:11.281835+00,"84.17.45.206,84.17.45.206, 99.82.172.145",desktop
1,228fff26-6609-4d09-a10b-94449cfcc4a2,2025-11-20 16:22:10.68133+00,"84.17.45.206,84.17.45.206, 99.82.172.144",desktop
2,5b24e045-facc-4697-998c-3a72d57a42e6,2025-11-20 16:22:10.134939+00,"84.17.45.206,84.17.45.206, 99.82.172.144",desktop
3,ddebd93e-0d14-4d91-b2af-3e9647c3d695,2025-11-20 16:22:10.134909+00,"84.17.45.206,84.17.45.206, 99.82.172.144",desktop
4,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:15:57.927115+00,"193.197.8.143,193.197.8.143, 99.82.163.77",desktop


In [3]:
# Check for duplicates based on participant_id
duplicates = df_participant[df_participant.duplicated('participant_id', keep=False)]
print(f"Number of duplicate rows: {len(duplicates)}")
duplicates.head()

# Count unique participants based on participant_id
num_unique = df_participant['participant_id'].nunique()
print(f"Number of unique participants in df_participant: {num_unique}")

Number of duplicate rows: 0
Number of unique participants in df_participant: 146


### 1.2. Import background_survey_rows.csv file and drop duplicates

In [4]:
# Load the background_survey_rows.csv file
df_background = pd.read_csv(data_dir / 'background_survey_rows.csv', sep=',')

# Preview the dataset
df_background.head()

Unnamed: 0,id,participant_id,submitted_at,q1_age_group,q2_gender,q3_education,q4_employment,q5_nationality,q6_residence,q7_chatbot_familiarity,q8_data_quality,q9_chatbot_usage,q10_search_tools
0,8e250286-faa0-49b3-a1a4-1406c71e8a9c,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:23:00.392222+00,18-24,Female,High school or below,Student,Indo,China,7,1,More than 10 times,tiktok
1,f473691a-6c60-48ad-b6d8-915ba83c0787,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:17:10.869409+00,35-44,Male,Master's degree,Employed,Iranian,Deutschland,5,1,More than 10 times,Google
2,1117999f-f29a-4dbc-9008-bf9fdc9c373a,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:17:05.061733+00,35-44,Male,Master's degree,Employed,Iranian,Deutschland,5,1,More than 10 times,Goodgle
3,9f1725f2-0e2b-46d6-a956-9081d0d37c6a,5c4cab45-aeba-4c55-a4c4-6b227b569474,2025-11-17 17:12:33.53304+00,18-24,Female,Bachelor's degree,Student,German,Germany,6,1,More than 10 times,"Google, Amazon, BestSecret"
4,aaa85758-33f8-457f-a614-5c70d54acb53,2bc7c0dd-423b-4c5c-bbff-fd89aa61ae8a,2025-11-16 19:56:55.964839+00,25-34,Female,Master's degree,Employed,Pakistan,Pakistan,5,6,3-5 times,Pakistan


In [5]:
# Check for duplicates based on participant_id
duplicates = df_background[df_background.duplicated('participant_id', keep=False)]
print(f"Number of duplicate rows: {len(duplicates)}")
duplicates

Number of duplicate rows: 6


Unnamed: 0,id,participant_id,submitted_at,q1_age_group,q2_gender,q3_education,q4_employment,q5_nationality,q6_residence,q7_chatbot_familiarity,q8_data_quality,q9_chatbot_usage,q10_search_tools
1,f473691a-6c60-48ad-b6d8-915ba83c0787,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:17:10.869409+00,35-44,Male,Master's degree,Employed,Iranian,Deutschland,5,1,More than 10 times,Google
2,1117999f-f29a-4dbc-9008-bf9fdc9c373a,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:17:05.061733+00,35-44,Male,Master's degree,Employed,Iranian,Deutschland,5,1,More than 10 times,Goodgle
39,f9dff9c4-281b-4d28-b476-f45838b6015b,b9fede00-231a-4c78-8324-6691b1533caf,2025-11-03 20:44:12.55231+00,18-24,Male,Bachelor's degree,Student,Israeli,Germany,7,1,More than 10 times,"Amazon, chatgpt(to do research on the product)..."
41,bc852c32-0a5e-402e-8907-59a541705f92,b9fede00-231a-4c78-8324-6691b1533caf,2025-11-03 20:37:59.99122+00,18-24,Male,Bachelor's degree,Student,Israeli,Germany,7,1,More than 10 times,"Amazon, chatgpt(to do research on the product)..."
81,ad0376fa-1de0-491f-9b5a-f8a67a637074,5210feaa-5c2a-4444-b912-dc52d88c72c5,2025-10-27 13:12:33.301455+00,18-24,Male,Master's degree,Student,Uzbek,Germany,7,1,More than 10 times,Google
82,9d4fb5ca-b80a-4285-b269-62567c628018,5210feaa-5c2a-4444-b912-dc52d88c72c5,2025-10-27 13:11:06.810905+00,18-24,Male,Master's degree,Student,Uzbek,Germany,7,7,More than 10 times,Google


In [6]:
# Remove duplicate entries, keeping the latest occurrence

# Ensure submitted_at is in datetime format
df_background["submitted_at"] = pd.to_datetime(df_background["submitted_at"])

# Sort by participant_id and submitted_at (latest first)
df_background = df_background.sort_values(
    by=["participant_id", "submitted_at"], ascending=[True, False]
)

# Drop duplicates, keeping the first (which is now the latest due to sorting)
df_background = df_background.drop_duplicates(subset=["participant_id"], keep="first")

# Display the cleaned DataFrame
df_background.head()

Unnamed: 0,id,participant_id,submitted_at,q1_age_group,q2_gender,q3_education,q4_employment,q5_nationality,q6_residence,q7_chatbot_familiarity,q8_data_quality,q9_chatbot_usage,q10_search_tools
28,63be650a-84a9-4b98-8830-954148e95404,05d6363b-4f7a-47b5-a91e-8f9a45d3bf07,2025-11-07 13:23:53.965032+00:00,18-24,Female,Master's degree,Student,German,Germany,6,4,6-10 times,Google
25,a45f022f-7003-4d4d-a109-8d1a65b2dd8f,090b95b5-f52a-4e45-8665-1cc819388638,2025-11-07 15:02:50.369086+00:00,18-24,Male,Bachelor's degree,Other,Syrian,Germany,6,1,More than 10 times,Chatgpt
31,7931724c-d393-4c73-97b0-4a4d1280f836,0a4d4062-b718-43a8-a9be-95af5db59952,2025-11-07 12:41:14.251610+00:00,25-34,Female,Master's degree,Student,Taiwan,Germany,6,1,3-5 times,Shoppee
16,e398ecf5-22ce-41a3-9b6d-5931ef66081a,0ba4f0a8-673f-4054-80e0-690aaf497b51,2025-11-11 03:08:07.187937+00:00,Under 18,Female,High school or below,Student,Canada,Canada,6,1,6-10 times,Google
58,dd47d7ec-2686-4b62-b5b7-1a823e5d2e51,0e8e279b-2364-4850-9f67-b682817f8378,2025-10-29 21:05:19.372164+00:00,25-34,Male,Master's degree,Employed,peruvian,Germany,6,1,6-10 times,amazon


In [7]:
# Count unique participants based on participant_id after dropping duplicates
num_unique = df_background['participant_id'].nunique()
print(f"Number of unique participants in df_background: {num_unique}")

Number of unique participants in df_background: 88


In [8]:
# Drop unnecessary columns and rename for clarity (Preparing for Join datasets)

df_background = df_background.drop(columns=['id'])
df_background = df_background.rename(columns={'submitted_at': 'background_submitted_at'})

df_background.head()

Unnamed: 0,participant_id,background_submitted_at,q1_age_group,q2_gender,q3_education,q4_employment,q5_nationality,q6_residence,q7_chatbot_familiarity,q8_data_quality,q9_chatbot_usage,q10_search_tools
28,05d6363b-4f7a-47b5-a91e-8f9a45d3bf07,2025-11-07 13:23:53.965032+00:00,18-24,Female,Master's degree,Student,German,Germany,6,4,6-10 times,Google
25,090b95b5-f52a-4e45-8665-1cc819388638,2025-11-07 15:02:50.369086+00:00,18-24,Male,Bachelor's degree,Other,Syrian,Germany,6,1,More than 10 times,Chatgpt
31,0a4d4062-b718-43a8-a9be-95af5db59952,2025-11-07 12:41:14.251610+00:00,25-34,Female,Master's degree,Student,Taiwan,Germany,6,1,3-5 times,Shoppee
16,0ba4f0a8-673f-4054-80e0-690aaf497b51,2025-11-11 03:08:07.187937+00:00,Under 18,Female,High school or below,Student,Canada,Canada,6,1,6-10 times,Google
58,0e8e279b-2364-4850-9f67-b682817f8378,2025-10-29 21:05:19.372164+00:00,25-34,Male,Master's degree,Employed,peruvian,Germany,6,1,6-10 times,amazon


### 1.3. Import task_instruction_rows.csv file and drop duplicates

In [9]:
# Load the task_instruction_rows.csv file
df_instruction = pd.read_csv(data_dir / 'task_instruction_rows.csv', sep=',')

# Preview the dataset
df_instruction.head()

Unnamed: 0,id,participant_id,q11_budget,completed_at
0,fe2704b0-30f1-4a67-91c2-a034af66c2d7,fe4a79bb-642e-49d8-920a-a5f39035ffe3,Over €800,2025-11-20 16:23:08.15456+00
1,ae9c09f8-42eb-4e70-b0aa-09e151415f4f,151939c4-8aae-490b-92eb-c7d353a4a95c,€600-799,2025-11-18 18:17:25.690481+00
2,08f476f3-d31b-451b-b7ce-04fe6f787198,5c4cab45-aeba-4c55-a4c4-6b227b569474,€450-599,2025-11-17 17:13:24.879267+00
3,f55c5695-d676-4083-90a7-e1da75e2ee41,2bc7c0dd-423b-4c5c-bbff-fd89aa61ae8a,€150-299,2025-11-16 19:57:03.504302+00
4,95ebee41-fbf7-4099-85e2-f9f566bf3c1a,e2f70344-8588-468c-9dc2-8c2993f5b6c9,€600-799,2025-11-15 23:30:13.982834+00


In [10]:
# Check for duplicates based on participant_id
duplicates = df_instruction[df_instruction.duplicated('participant_id', keep=False)]
print(f"Number of duplicate rows: {len(duplicates)}")
duplicates

Number of duplicate rows: 24


Unnamed: 0,id,participant_id,q11_budget,completed_at
14,c492b78d-6067-4071-a5ac-49009160afe2,0ba4f0a8-673f-4054-80e0-690aaf497b51,€450-599,2025-11-11 03:10:00.06837+00
15,89d956a8-1484-4e02-9879-a6b0a14c5a9d,0ba4f0a8-673f-4054-80e0-690aaf497b51,€450-599,2025-11-11 03:08:52.053916+00
16,c2dd06be-b062-49ee-a33d-9dbd55e1288d,0ba4f0a8-673f-4054-80e0-690aaf497b51,€450-599,2025-11-11 03:08:34.307695+00
17,bb5c8078-0d87-4403-b58b-4b31131549a2,548569b2-6aeb-4403-9dfe-4b84cd9cbf25,€150-299,2025-11-10 22:31:14.471017+00
18,d4d8817b-b313-4ea9-b637-5c56a0124424,548569b2-6aeb-4403-9dfe-4b84cd9cbf25,€150-299,2025-11-10 22:30:49.44762+00
24,c9616b47-61cc-4383-9be0-b63f534a98e0,2c298a5e-7070-4fd7-a6fe-ff3907029e8c,€600-799,2025-11-09 12:17:20.020105+00
25,a96c87e4-b632-42aa-b532-5944a9d8ade0,2c298a5e-7070-4fd7-a6fe-ff3907029e8c,€600-799,2025-11-09 12:16:59.691731+00
26,bb879b17-ac13-43d9-b3b4-6b6c1155d870,21121fd2-00ef-4b2a-974f-823f26f3f7fd,Over €800,2025-11-08 13:31:16.716906+00
27,e1981cab-e2cf-4c03-9165-663d7a511042,21121fd2-00ef-4b2a-974f-823f26f3f7fd,Over €800,2025-11-08 13:28:24.631875+00
28,8edb32b1-5fa6-4d21-88f6-d5f4097d8e34,21121fd2-00ef-4b2a-974f-823f26f3f7fd,Over €800,2025-11-08 13:25:23.088201+00


In [11]:
# Remove duplicate entries, keeping the latest occurrence

# Ensure completed_at is in datetime format
df_instruction["completed_at"] = pd.to_datetime(df_instruction["completed_at"])

# Sort by participant_id and completed_at (latest first)
df_instruction = df_instruction.sort_values(
    by=["participant_id", "completed_at"], ascending=[True, False]
)

# Drop duplicates, keeping the first (which is now the latest due to sorting)
df_instruction = df_instruction.drop_duplicates(subset=["participant_id"], keep="first")

# Display the cleaned DataFrame
df_instruction.head()

Unnamed: 0,id,participant_id,q11_budget,completed_at
29,fd76061d-d067-4517-9ef2-4dfe203e4265,090b95b5-f52a-4e45-8665-1cc819388638,€450-599,2025-11-07 15:05:24.138584+00:00
14,c492b78d-6067-4071-a5ac-49009160afe2,0ba4f0a8-673f-4054-80e0-690aaf497b51,€450-599,2025-11-11 03:10:00.068370+00:00
65,039f313a-b918-4ac4-92cc-07de39925d5a,0e8e279b-2364-4850-9f67-b682817f8378,€150-299,2025-10-29 21:09:38.500944+00:00
56,0c05947a-f4bb-4572-beec-07e10df36231,145a5e3a-5d68-4457-9cc1-eaffd6662734,€300-449,2025-10-31 14:01:24.763496+00:00
76,36e55ef5-b68c-467b-add8-e27caf317e46,14ff50af-72a9-4095-a217-88246441fe94,Over €800,2025-10-29 09:51:43.515248+00:00


In [12]:
# Count unique participants based on participant_id after dropping duplicates
num_unique = df_instruction['participant_id'].nunique()
print(f"Number of unique participants in df_instruction: {num_unique}")

Number of unique participants in df_instruction: 82


In [13]:
# Drop unnecessary columns and rename for clarity (Preparing for Join datasets)

df_instruction = df_instruction.drop(columns=['id'])
df_instruction = df_instruction.rename(columns={'completed_at': 'instruction_submitted_at'})

df_instruction.head()

Unnamed: 0,participant_id,q11_budget,instruction_submitted_at
29,090b95b5-f52a-4e45-8665-1cc819388638,€450-599,2025-11-07 15:05:24.138584+00:00
14,0ba4f0a8-673f-4054-80e0-690aaf497b51,€450-599,2025-11-11 03:10:00.068370+00:00
65,0e8e279b-2364-4850-9f67-b682817f8378,€150-299,2025-10-29 21:09:38.500944+00:00
56,145a5e3a-5d68-4457-9cc1-eaffd6662734,€300-449,2025-10-31 14:01:24.763496+00:00
76,14ff50af-72a9-4095-a217-88246441fe94,Over €800,2025-10-29 09:51:43.515248+00:00


### 2.1. Import search_logs_rows.csv file and drop duplicates

In [14]:
# Load the search_logs_rows.csv file
df_logs = pd.read_csv(data_dir / 'search_logs_rows.csv', sep=',')

# Preview the dataset
df_logs.head()

Unnamed: 0,id,participant_id,prompt,response,logged_at,query_start_time,query_submit_time,query_duration_ms,query_sequence_number
0,3db2a856-67c0-47b6-83c8-35a3833b4ea6,fe4a79bb-642e-49d8-920a-a5f39035ffe3,in euros? iphone 16 pro price,The official starting price for the **iPhone 1...,2025-11-20 16:27:45.421413+00,2025-11-20 16:27:28.282+00,2025-11-20 16:27:37.91+00,9628.0,2
1,556e0853-4a47-4a52-a5a5-c4eb472b2dd0,fe4a79bb-642e-49d8-920a-a5f39035ffe3,i am a gen z that is currently in the apple ec...,Based on your situation as a Gen Z user in the...,2025-11-20 16:26:36.976967+00,2025-11-20 16:23:40.106+00,2025-11-20 16:26:25.248+00,165142.0,1
2,9c4f56b0-c117-4cd0-ad7a-acf865bb6aa1,151939c4-8aae-490b-92eb-c7d353a4a95c,I meant which of the phones you listed above y...,"Based on the latest expert reviews, the **Sams...",2025-11-18 18:22:14.869105+00,2025-11-18 18:21:40.219+00,2025-11-18 18:22:00.949+00,20730.0,6
3,a71bc03e-2066-47fe-adf9-a6d99fc96bb3,151939c4-8aae-490b-92eb-c7d353a4a95c,"all in all, which one do you recommend?","To provide a recommendation, I need to know wh...",2025-11-18 18:21:31.038379+00,2025-11-18 18:21:16.929+00,2025-11-18 18:21:26.716+00,9787.0,5
4,42b495a7-0222-4644-b9e1-c4d315652753,151939c4-8aae-490b-92eb-c7d353a4a95c,"compare the batteries, which one do you recome...",For the longest battery life and minimal charg...,2025-11-18 18:20:58.632253+00,2025-11-18 18:20:17.925+00,2025-11-18 18:20:39.321+00,21396.0,4


In [15]:
# Check for duplicates in df_logs based on participant_id and prompt
duplicates = df_logs[df_logs.duplicated(['participant_id', 'prompt'], keep=False)]
print(f"Number of duplicate rows: {len(duplicates)}")
duplicates.head()

Number of duplicate rows: 2


Unnamed: 0,id,participant_id,prompt,response,logged_at,query_start_time,query_submit_time,query_duration_ms,query_sequence_number
142,b1edddcc-bec3-4409-91f4-eac09fa1d1d6,193a0563-b869-4bbe-aa9e-d81bacf59a3e,Search for smartphones that have an excellent ...,Looking at smartphones with excellent price-to...,2025-10-27 16:25:11.095845+00,2025-10-27 16:24:43.276+00,2025-10-27 16:24:47.046+00,3770.0,2
143,561a7512-524b-4930-b1a2-3074f4236688,193a0563-b869-4bbe-aa9e-d81bacf59a3e,Search for smartphones that have an excellent ...,For a smartphone with an excellent price-to-qu...,2025-10-27 16:24:28.648307+00,2025-10-27 16:21:46.082+00,2025-10-27 16:24:11.2+00,145118.0,1


In [16]:
# Remove duplicate entries in df_logs, keeping the latest occurrence based on query_submit_time

# Ensure query_submit_time is in datetime format
df_logs["query_submit_time"] = pd.to_datetime(df_logs["query_submit_time"])

# Sort by participant_id, prompt and query_submit_time (latest first)
df_logs = df_logs.sort_values(by=["participant_id", "prompt", "query_submit_time"], ascending=[True, True, False])

# Drop duplicates, keeping the first (which is now the latest due to sorting)
df_logs = df_logs.drop_duplicates(subset=["participant_id", "prompt"], keep="first")

# Display the cleaned DataFrame
df_logs.head()

Unnamed: 0,id,participant_id,prompt,response,logged_at,query_start_time,query_submit_time,query_duration_ms,query_sequence_number
57,081f5514-6b20-4531-8f53-241b96ddc6e1,090b95b5-f52a-4e45-8665-1cc819388638,What is a realistic phone when it comes to cos...,A realistic phone with the best cost-to-qualit...,2025-11-07 15:08:07.503559+00,2025-11-07 15:05:33.382+00,2025-11-07 15:07:50.895000+00:00,137513.0,1
37,74c4060e-a301-45c1-a880-5109fcb9c036,0ba4f0a8-673f-4054-80e0-690aaf497b51,What phones are currently available on the mar...,Several **excellent budget and mid-range smart...,2025-11-11 03:09:30.955579+00,2025-11-11 03:08:53.203+00,2025-11-11 03:09:17.501000+00:00,24298.0,1
33,cf51fc28-d3f0-46d2-a9d0-da661e0c7391,0ba4f0a8-673f-4054-80e0-690aaf497b51,give me prices in euros,"To provide **prices in euros**, you need to co...",2025-11-11 03:16:25.604953+00,2025-11-11 03:16:17.415+00,2025-11-11 03:16:19.917000+00:00,2502.0,5
34,ed47f17f-7dc1-42a7-949c-f6a64bd56d0c,0ba4f0a8-673f-4054-80e0-690aaf497b51,give the lowest price you can find for the app...,"The **lowest price for a new, unlocked Apple i...",2025-11-11 03:14:46.851105+00,2025-11-11 03:14:19.384+00,2025-11-11 03:14:40.420000+00:00,21036.0,4
35,8b17b23e-b61b-461e-9a01-55edf53e693c,0ba4f0a8-673f-4054-80e0-690aaf497b51,"no, i mean tell me about (for the phones you m...",Based on the latest expert rankings and review...,2025-11-11 03:12:34.977885+00,2025-11-11 03:11:37.266+00,2025-11-11 03:12:16.250000+00:00,38984.0,3


In [17]:
# Count unique participants based on participant_id after dropping duplicates
num_unique = df_logs['participant_id'].nunique()
print(f"Number of unique participants in df_logs: {num_unique}")

Number of unique participants in df_logs: 59


In [18]:
# Count the number of rows in df_logs
print(f"Number of rows in df_logs: {len(df_logs)}")

Number of rows in df_logs: 169


In [19]:
# Drop unnecessary columns (Preparing for Join datasets)
df_logs = df_logs.drop(columns=['id'])

df_logs.head()

Unnamed: 0,participant_id,prompt,response,logged_at,query_start_time,query_submit_time,query_duration_ms,query_sequence_number
57,090b95b5-f52a-4e45-8665-1cc819388638,What is a realistic phone when it comes to cos...,A realistic phone with the best cost-to-qualit...,2025-11-07 15:08:07.503559+00,2025-11-07 15:05:33.382+00,2025-11-07 15:07:50.895000+00:00,137513.0,1
37,0ba4f0a8-673f-4054-80e0-690aaf497b51,What phones are currently available on the mar...,Several **excellent budget and mid-range smart...,2025-11-11 03:09:30.955579+00,2025-11-11 03:08:53.203+00,2025-11-11 03:09:17.501000+00:00,24298.0,1
33,0ba4f0a8-673f-4054-80e0-690aaf497b51,give me prices in euros,"To provide **prices in euros**, you need to co...",2025-11-11 03:16:25.604953+00,2025-11-11 03:16:17.415+00,2025-11-11 03:16:19.917000+00:00,2502.0,5
34,0ba4f0a8-673f-4054-80e0-690aaf497b51,give the lowest price you can find for the app...,"The **lowest price for a new, unlocked Apple i...",2025-11-11 03:14:46.851105+00,2025-11-11 03:14:19.384+00,2025-11-11 03:14:40.420000+00:00,21036.0,4
35,0ba4f0a8-673f-4054-80e0-690aaf497b51,"no, i mean tell me about (for the phones you m...",Based on the latest expert rankings and review...,2025-11-11 03:12:34.977885+00,2025-11-11 03:11:37.266+00,2025-11-11 03:12:16.250000+00:00,38984.0,3


### 2.2. Import search_clicks_rows.csv file and check unique participats

In [20]:
# Load the search_clicks_rows.csv file
df_clicks = pd.read_csv(data_dir / 'search_clicks_rows.csv', sep=',')

# Preview the dataset
df_clicks.head()

Unnamed: 0,id,participant_id,prompt_index,url,clicked_at
0,91069fed-a389-4d0e-bf76-6b556aeef03c,151939c4-8aae-490b-92eb-c7d353a4a95c,1,https://www.accio.com/business/best-selling-sm...,2025-11-18 18:23:34.693101+00
1,60f5d232-2c51-456b-afa5-f3f0bcb68231,5c4cab45-aeba-4c55-a4c4-6b227b569474,3,https://www.apple.com/shop/buy-iphone,2025-11-17 17:16:32.5906+00
2,84b7ab19-cd0f-4f73-9fb8-f17b85a2ab62,233c60f2-995c-4779-970d-dbbbd7c1da52,3,https://www.samsung.com/de/smartphones/galaxy-...,2025-11-14 23:22:22.601176+00
3,a700f17d-f80e-4f54-8ab4-ee62df4f58c8,0ba4f0a8-673f-4054-80e0-690aaf497b51,4,https://www.apple.com/shop/buy-iphone/iphone-1...,2025-11-11 03:17:08.006738+00
4,709ed9ce-9852-4a3e-a9dd-70e74b9d0f34,548569b2-6aeb-4403-9dfe-4b84cd9cbf25,1,https://www.notebookcheck.net/Ranking-Best-sma...,2025-11-10 22:33:40.496327+00


In [21]:
# Count unique participants in df_clicks based on participant_id

unique_participants_df_clicks = df_clicks['participant_id'].nunique()
print(f"Number of unique participants in df_clicks: {unique_participants_df_clicks}")

Number of unique participants in df_clicks: 21


In [22]:
# Count the number of rows in df_clicks
print(f"Number of rows in df_clicks: {len(df_clicks)}")

Number of rows in df_clicks: 33


In [23]:
# Drop unnecessary columns (Preparing for Join datasets)
df_clicks = df_clicks.drop(columns=['id'])

df_clicks.head()

Unnamed: 0,participant_id,prompt_index,url,clicked_at
0,151939c4-8aae-490b-92eb-c7d353a4a95c,1,https://www.accio.com/business/best-selling-sm...,2025-11-18 18:23:34.693101+00
1,5c4cab45-aeba-4c55-a4c4-6b227b569474,3,https://www.apple.com/shop/buy-iphone,2025-11-17 17:16:32.5906+00
2,233c60f2-995c-4779-970d-dbbbd7c1da52,3,https://www.samsung.com/de/smartphones/galaxy-...,2025-11-14 23:22:22.601176+00
3,0ba4f0a8-673f-4054-80e0-690aaf497b51,4,https://www.apple.com/shop/buy-iphone/iphone-1...,2025-11-11 03:17:08.006738+00
4,548569b2-6aeb-4403-9dfe-4b84cd9cbf25,1,https://www.notebookcheck.net/Ranking-Best-sma...,2025-11-10 22:33:40.496327+00


### 2.3. Import session_timing_rows.csv file and check unique participats

In [24]:
# Load the session_timing_rows.csv file
df_timing = pd.read_csv(data_dir / 'session_timing_rows.csv', sep=',')

# Preview the dataset
df_timing.head()

Unnamed: 0,id,participant_id,session_start_time,session_end_time,session_duration_ms,record_created_at
0,fd78bd23-3cf7-421e-bdeb-c14a5bfb4b1f,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:28:00.829+00,2025-11-20 16:28:10.12+00,9291.0,2025-11-20 16:28:02.231156+00
1,20b08b71-f6e3-45ca-a318-b2a6881d9e26,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:27:27.603+00,2025-11-20 16:27:50.572+00,22969.0,2025-11-20 16:27:29.591439+00
2,2a313e5e-4e25-4be1-94c9-ed83ebda8ecc,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:23:39.474+00,2025-11-20 16:26:48.525+00,189051.0,2025-11-20 16:23:41.070405+00
3,629db404-f4e6-4ee9-98d1-419673b41937,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:23:08.415+00,2025-11-20 16:23:30.53+00,22115.0,2025-11-20 16:23:31.852449+00
4,20479efd-6636-48f9-83f4-e0923213ee76,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:23:08.415+00,2025-11-20 16:23:29.747+00,21332.0,2025-11-20 16:23:09.530918+00


In [25]:
# Count unique participants in df_timing based on participant_id

unique_participants_df_timing = df_timing['participant_id'].nunique()
print(f"Number of unique participants in df_timing: {unique_participants_df_timing}")

Number of unique participants in df_timing: 83


In [26]:
# Drop unnecessary columns (Preparing for Join datasets)
df_timing = df_timing.drop(columns=['id'])

df_timing.head()

Unnamed: 0,participant_id,session_start_time,session_end_time,session_duration_ms,record_created_at
0,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:28:00.829+00,2025-11-20 16:28:10.12+00,9291.0,2025-11-20 16:28:02.231156+00
1,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:27:27.603+00,2025-11-20 16:27:50.572+00,22969.0,2025-11-20 16:27:29.591439+00
2,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:23:39.474+00,2025-11-20 16:26:48.525+00,189051.0,2025-11-20 16:23:41.070405+00
3,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:23:08.415+00,2025-11-20 16:23:30.53+00,22115.0,2025-11-20 16:23:31.852449+00
4,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:23:08.415+00,2025-11-20 16:23:29.747+00,21332.0,2025-11-20 16:23:09.530918+00


### 1.4. Import search_results_rows.csv file and drop duplicates

In [27]:
# Load the search_results_rows.csv file
df_results = pd.read_csv(data_dir / 'search_results_rows.csv', sep=',')

# Preview the dataset
df_results.head()

Unnamed: 0,id,participant_id,submitted_at,q12_brand_model,q13_storage,q14_color,q15_price,q16_website,q17_price_importance,q18_important_features
0,d69f5fd2-8647-446d-bf5f-d7d4d95bd5bd,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:28:44.825722+00,iphone 16 pro,256,,1359,https://fr.mobileinto.com/Apple-iPhone-16-Pro/,3,"[""Software support & updates""]"
1,e08ef69c-e2cf-4ca7-b1bd-8ba6c6969286,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:24:07.194885+00,Galaxy S25 Ultra,256,not mentioned,740,https://www.accio.com/business/best-selling-sm...,5,"[""Battery life/ fast charging"",""Display Qualit..."
2,61a8b287-daa2-4f2a-81fb-8b986b5846bf,e2f70344-8588-468c-9dc2-8c2993f5b6c9,2025-11-15 23:33:02.200087+00,iphone 16,i dont remember,black,600,i dont have it anymore,7,"[""Battery life/ fast charging"",""Storage capaci..."
3,cf915025-bcc2-4601-a95c-3c100486db9f,82e40cf7-aadd-46a9-a1b5-f68aed30f6af,2025-11-15 06:41:10.904399+00,Iphon 15 plus,,,430,https://www.xfinity.com/mobile/shop/phone/ipho...,4,"[""Performance/ speed"",""Storage capacity"",""Disp..."
4,6779e0a5-d521-430b-92a4-c5a1e2399b09,88584cfe-a550-4209-a1f3-be628bb2647b,2025-11-13 16:33:03.325266+00,Iphone 17 pro,128,Black,1200,https://www.rebuy.fr/acheter/mobile/apple/gene...,3,"[""Battery life/ fast charging"",""Camera quality..."


In [28]:
# Check for duplicates based on participant_id

duplicates = df_results[df_results.duplicated('participant_id', keep=False)]
print(f"Number of duplicate rows: {len(duplicates)}")
duplicates

Number of duplicate rows: 7


Unnamed: 0,id,participant_id,submitted_at,q12_brand_model,q13_storage,q14_color,q15_price,q16_website,q17_price_importance,q18_important_features
26,2cb55638-c38a-468f-a5f0-1447ca89cbc9,1cde5bc6-87ef-4a39-87da-3e93c51cfffa,2025-11-03 21:05:01.499197+00,Samsung S25,256,blue,639,https://www.kaufland.de/product/524765710/?kwd...,5,"[""Camera quality"",""Storage capacity"",""Software..."
27,c5c13cbf-19b7-4655-81e1-176af095c033,1cde5bc6-87ef-4a39-87da-3e93c51cfffa,2025-11-03 21:02:18.638279+00,Samsung S25,256,blue,639,https://www.kaufland.de/product/524765710/?kwd...,5,"[""Camera quality"",""Storage capacity"",""Software..."
28,8f493290-3008-461f-b6a4-88eb783ba000,1cde5bc6-87ef-4a39-87da-3e93c51cfffa,2025-11-03 20:59:39.161288+00,Samsung S25,256,blue,639,https://www.kaufland.de/product/524765710/?kwd...,5,"[""Camera quality"",""Storage capacity"",""Software..."
30,4684ef91-5d38-4b16-9757-13a94cd8c28d,b9fede00-231a-4c78-8324-6691b1533caf,2025-11-03 20:44:16.476432+00,Samsung galaxy S25 Ultra,512GB,Black,859 EUR,https://www.backmarket.de/de-de/p/samsung-gb/f...,2,"[""Battery life/ fast charging"",""Performance/ s..."
31,91acb4ca-bc0f-4265-94bb-706ee697da4f,b9fede00-231a-4c78-8324-6691b1533caf,2025-11-03 20:41:28.783288+00,Samsung galaxy S25 Ultra,512GB,Black,859 EUR,https://www.backmarket.de/de-de/p/samsung-gb/f...,2,"[""Battery life/ fast charging"",""Performance/ s..."
44,0ab143af-b624-4008-b73c-27facfd89550,a9ea56d9-8f16-4706-8730-7759a9ae43ff,2025-10-30 18:05:07.809727+00,Iphone 17,256GB,Purple,€620,https://lp.pinduoduo.com/poros/h5?ads_channel=...,5,"[""Display Quality"",""Software support & updates..."
45,ac98e705-b2c1-4c6f-bc01-989b7e217a64,a9ea56d9-8f16-4706-8730-7759a9ae43ff,2025-10-30 18:04:54.328411+00,Iphone 17,256GB,Purple,€620,https://lp.pinduoduo.com/poros/h5?ads_channel=...,5,"[""Display Quality"",""Software support & updates..."


In [29]:
# Remove duplicate entries, keeping the latest occurrence

# Ensure completed_at is in datetime format
df_results["submitted_at"] = pd.to_datetime(df_results["submitted_at"])

# Sort by participant_id and submitted_at (latest first)
df_results = df_results.sort_values(
    by=["participant_id", "submitted_at"], ascending=[True, False]
)

# Drop duplicates, keeping the first (which is now the latest due to sorting)
df_results = df_results.drop_duplicates(subset=["participant_id"], keep="first")

# Display the cleaned DataFrame
df_results.head()

Unnamed: 0,id,participant_id,submitted_at,q12_brand_model,q13_storage,q14_color,q15_price,q16_website,q17_price_importance,q18_important_features
18,3abfcc65-1106-424d-941a-f5832008e460,090b95b5-f52a-4e45-8665-1cc819388638,2025-11-07 15:09:47.643924+00:00,Google pixel 9a,,,,,6,"[""Battery life/ fast charging"",""Display Qualit..."
11,60c8394a-04aa-421e-ba59-0835ef943d9c,0ba4f0a8-673f-4054-80e0-690aaf497b51,2025-11-11 03:17:49.786245+00:00,Apple iPhone 17 Pro Max,,,€801,https://www.apple.com/shop/buy-iphone/iphone-1...,3,"[""Performance/ speed"",""Camera quality""]"
47,b859c304-2ba8-46dd-95a8-dc384f0690eb,0e8e279b-2364-4850-9f67-b682817f8378,2025-10-29 21:40:57.407264+00:00,Xiaomi Poco X7 Pro,512 GB,Black,299,https://www.ebay.com/itm/326414840756,4,"[""Battery life/ fast charging"",""Camera quality..."
39,35149535-51ac-4a4f-b081-17000b345b61,145a5e3a-5d68-4457-9cc1-eaffd6662734,2025-10-31 14:13:37.345332+00:00,Google Pixel 9a,128 GB,Porcelain,399,https://www.phonearena.com/news/pixel-9a-still...,4,"[""Battery life/ fast charging"",""Camera quality..."
1,e08ef69c-e2cf-4ca7-b1bd-8ba6c6969286,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:24:07.194885+00:00,Galaxy S25 Ultra,256,not mentioned,740,https://www.accio.com/business/best-selling-sm...,5,"[""Battery life/ fast charging"",""Display Qualit..."


In [30]:
# Count unique participants based on participant_id after dropping duplicates
num_unique = df_results['participant_id'].nunique()
print(f"Number of unique participants in df_results: {num_unique}")

Number of unique participants in df_results: 60


In [31]:
# Drop unnecessary columns and rename for clarity (Preparing for Join datasets)

df_results = df_results.drop(columns=['id'])
df_results = df_results.rename(columns={'submitted_at': 'results_submitted_at'})

df_results.head()

Unnamed: 0,participant_id,results_submitted_at,q12_brand_model,q13_storage,q14_color,q15_price,q16_website,q17_price_importance,q18_important_features
18,090b95b5-f52a-4e45-8665-1cc819388638,2025-11-07 15:09:47.643924+00:00,Google pixel 9a,,,,,6,"[""Battery life/ fast charging"",""Display Qualit..."
11,0ba4f0a8-673f-4054-80e0-690aaf497b51,2025-11-11 03:17:49.786245+00:00,Apple iPhone 17 Pro Max,,,€801,https://www.apple.com/shop/buy-iphone/iphone-1...,3,"[""Performance/ speed"",""Camera quality""]"
47,0e8e279b-2364-4850-9f67-b682817f8378,2025-10-29 21:40:57.407264+00:00,Xiaomi Poco X7 Pro,512 GB,Black,299,https://www.ebay.com/itm/326414840756,4,"[""Battery life/ fast charging"",""Camera quality..."
39,145a5e3a-5d68-4457-9cc1-eaffd6662734,2025-10-31 14:13:37.345332+00:00,Google Pixel 9a,128 GB,Porcelain,399,https://www.phonearena.com/news/pixel-9a-still...,4,"[""Battery life/ fast charging"",""Camera quality..."
1,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:24:07.194885+00:00,Galaxy S25 Ultra,256,not mentioned,740,https://www.accio.com/business/best-selling-sm...,5,"[""Battery life/ fast charging"",""Display Qualit..."


### 1.5. Import post_task_survey_rows.csv file and drop duplicates

In [32]:
# Load the post_task_survey_rows.csv file
df_postsurvey = pd.read_csv(data_dir / 'post_task_survey_rows.csv', sep=',')

# Preview the dataset
df_postsurvey.head()

Unnamed: 0,id,participant_id,submitted_at,q38_attention,q41_duration,q42_comments,q39_contradictory_info,q19_task_easy,q20_task_quick,q21_task_familiar,...,q29_tool_easier_task,q30_tool_useful,q31_tool_too_much_info,q32_tool_difficult_focus,q33_results_accurate,q34_results_trust,q35_results_complete,q36_results_relevant,q37_results_useful,q40_purchase_likelihood
0,aebb3361-0bba-46d2-9622-2785acb7c8b3,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:30:12.966171+00,3,Less than 2 minutes,yea,"[""Other""]",6,4,6,...,6,5,7,6,7,5,5,6,7,2
1,b74b4dc8-b7b2-4816-965d-cc8ed24df3fc,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:27:05.061232+00,3,6–10 minutes,I might if the information is provided in a la...,"[""I did not find any contradictions""]",5,5,5,...,6,7,6,4,4,5,5,5,5,6
2,0697e7ca-d908-49e5-be65-18ade5105ddd,e2f70344-8588-468c-9dc2-8c2993f5b6c9,2025-11-15 23:36:06.862026+00,3,3–5 minutes,it was a little slow,"[""I trusted the first response I got"",""I relie...",6,5,7,...,5,5,6,5,3,4,4,6,3,5
3,d651caff-2c77-4613-b3c2-a3132a7fdf1d,82e40cf7-aadd-46a9-a1b5-f68aed30f6af,2025-11-15 06:42:42.68637+00,3,Less than 2 minutes,"No, I felt like it was not that complicated, b...","[""I trusted the first response I got""]",2,5,3,...,4,5,4,4,5,6,3,4,3,6
4,efc9e4af-24aa-4025-87c4-d93cc6113a01,233c60f2-995c-4779-970d-dbbbd7c1da52,2025-11-14 23:28:53.32169+00,3,3–5 minutes,"Yes, helps save time and provides me with rele...","[""I chose the response that seemed most detail...",7,7,3,...,7,7,7,7,6,6,6,6,6,5


In [33]:
# Check for duplicates based on participant_id
duplicates = df_postsurvey[df_postsurvey.duplicated('participant_id', keep=False)]
print(f"Number of duplicate rows: {len(duplicates)}")
duplicates.head()

# Count unique participants based on participant_id
num_unique = df_postsurvey['participant_id'].nunique()
print(f"Number of unique participants in df_participant: {num_unique}")

Number of duplicate rows: 0
Number of unique participants in df_participant: 60


In [34]:
# Drop unnecessary columns and rename for clarity (Preparing for Join datasets)

df_postsurvey = df_postsurvey.drop(columns=['id'])
df_postsurvey = df_postsurvey.rename(columns={'submitted_at': 'postsurvey_submitted_at'})

df_postsurvey.head()

Unnamed: 0,participant_id,postsurvey_submitted_at,q38_attention,q41_duration,q42_comments,q39_contradictory_info,q19_task_easy,q20_task_quick,q21_task_familiar,q22_tool_reliable,...,q29_tool_easier_task,q30_tool_useful,q31_tool_too_much_info,q32_tool_difficult_focus,q33_results_accurate,q34_results_trust,q35_results_complete,q36_results_relevant,q37_results_useful,q40_purchase_likelihood
0,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:30:12.966171+00,3,Less than 2 minutes,yea,"[""Other""]",6,4,6,6,...,6,5,7,6,7,5,5,6,7,2
1,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:27:05.061232+00,3,6–10 minutes,I might if the information is provided in a la...,"[""I did not find any contradictions""]",5,5,5,5,...,6,7,6,4,4,5,5,5,5,6
2,e2f70344-8588-468c-9dc2-8c2993f5b6c9,2025-11-15 23:36:06.862026+00,3,3–5 minutes,it was a little slow,"[""I trusted the first response I got"",""I relie...",6,5,7,3,...,5,5,6,5,3,4,4,6,3,5
3,82e40cf7-aadd-46a9-a1b5-f68aed30f6af,2025-11-15 06:42:42.68637+00,3,Less than 2 minutes,"No, I felt like it was not that complicated, b...","[""I trusted the first response I got""]",2,5,3,5,...,4,5,4,4,5,6,3,4,3,6
4,233c60f2-995c-4779-970d-dbbbd7c1da52,2025-11-14 23:28:53.32169+00,3,3–5 minutes,"Yes, helps save time and provides me with rele...","[""I chose the response that seemed most detail...",7,7,3,6,...,7,7,7,7,6,6,6,6,6,5


## 3.  Check unique participants in page (n+1) but not in page n

In [35]:
# Count unique participant_ids in df_background but not in df_participant
background_ids = set(df_background['participant_id'])
participant_ids = set(df_participant['participant_id'])
unique_in_background = background_ids - participant_ids
print(f"Number of unique participant_ids in df_background but not in df_participant: {len(unique_in_background)}")

Number of unique participant_ids in df_background but not in df_participant: 0


In [36]:
# Count unique participant_ids in df_instruction but not in df_background
# Show rows with unique participant_ids in df_instruction but not in df_background
instruction_ids = set(df_instruction['participant_id'])
background_ids = set(df_background['participant_id'])
unique_in_instruction = instruction_ids - background_ids
rows_unique_in_instruction = df_instruction[df_instruction['participant_id'].isin(unique_in_instruction)]
print(f"Number of unique participant_ids in df_instruction but not in df_background: {len(rows_unique_in_instruction)}")
rows_unique_in_instruction

Number of unique participant_ids in df_instruction but not in df_background: 2


Unnamed: 0,participant_id,q11_budget,instruction_submitted_at
52,a5192e43-ba28-42e5-84ef-bae523f81e89,Over €800,2025-10-31 16:16:57.359415+00:00
42,bf56d3db-8489-45ed-9e06-f24f499e0bc8,€600-799,2025-11-03 21:13:31.029275+00:00


In [37]:
# Count unique participant_ids in df_logs but not in df_instruction
# Show rows with unique participant_ids in df_logs but not in df_instruction
logs_ids = set(df_logs['participant_id'])
instruction_ids = set(df_instruction['participant_id'])
unique_in_logs = logs_ids - instruction_ids
rows_unique_in_logs = df_logs[df_logs['participant_id'].isin(unique_in_logs)]
print(f"Number of unique participant_ids in df_logs but not in df_instruction: {len(unique_in_logs)}")
rows_unique_in_logs

Number of unique participant_ids in df_logs but not in df_instruction: 1


Unnamed: 0,participant_id,prompt,response,logged_at,query_start_time,query_submit_time,query_duration_ms,query_sequence_number
141,9c7b3c49-1422-474e-a77d-11f8ddb408e2,Which smartphone will be in demand in next 5 y...,"Over the next five years, **foldable smartphon...",2025-10-27 21:38:22.586228+00,2025-10-27 21:37:55.683+00,2025-10-27 21:38:11.811000+00:00,16128.0,1


In [38]:
# Count unique participant_ids in df_clicks but not in df_logs
# Show rows with unique participant_ids in df_clicks but not in df_logs
clicks_ids = set(df_clicks['participant_id'])
logs_ids = set(df_logs['participant_id'])
unique_in_clicks = clicks_ids - logs_ids
rows_unique_in_clicks = df_clicks[df_clicks['participant_id'].isin(unique_in_clicks)]
print(f"Number of unique participant_ids in df_clicks but not in df_logs: {len(unique_in_clicks)}")
rows_unique_in_clicks

Number of unique participant_ids in df_clicks but not in df_logs: 0


Unnamed: 0,participant_id,prompt_index,url,clicked_at


In [39]:
# Count unique participant_ids in df_results but not in df_logs
# Show rows with unique participant_ids in df_results but not in df_logs
results_ids = set(df_results['participant_id'])
logs_ids = set(df_logs['participant_id'])
unique_in_results = results_ids - logs_ids
rows_unique_in_results = df_results[df_results['participant_id'].isin(unique_in_results)]
print(f"Number of unique participant_ids in df_results but not in df_logs: {len(unique_in_results)}")
rows_unique_in_results

Number of unique participant_ids in df_results but not in df_logs: 12


Unnamed: 0,participant_id,results_submitted_at,q12_brand_model,q13_storage,q14_color,q15_price,q16_website,q17_price_importance,q18_important_features
52,337cc878-f0b7-4807-9005-08dcc0228a6a,2025-10-27 17:46:32.549561+00:00,Iphone 15,256,Black,700,Z,4,"[""Camera quality"",""Battery life/ fast charging""]"
54,41c0b6d2-f05f-4378-90a6-45c6fff68e44,2025-10-27 16:06:53.929083+00:00,Test,Test,Test,Teat,Teat,5,"[""Display Quality"",""Battery life/ fast chargin..."
19,47ead152-a4ca-406f-b621-4baca17ac0b0,2025-11-07 13:54:01.944450+00:00,Galaxy s22,256 gb,Green,500,.,3,"[""Battery life/ fast charging"",""Performance/ s..."
3,82e40cf7-aadd-46a9-a1b5-f68aed30f6af,2025-11-15 06:41:10.904399+00:00,Iphon 15 plus,,,430,https://www.xfinity.com/mobile/shop/phone/ipho...,4,"[""Performance/ speed"",""Storage capacity"",""Disp..."
22,95a96e83-18a8-47ad-919e-faee2ccf6d1d,2025-11-07 13:09:23.500488+00:00,vivo s20pro,256GB,Silver,300 euro,www.taobao.com,7,"[""Camera quality"",""Performance/ speed"",""Storag..."
35,a5192e43-ba28-42e5-84ef-bae523f81e89,2025-10-31 16:20:26.519274+00:00,Iphone 14 pro,256 GN,Gold,Around 1k eur 52k TRY,https://www.akakce.com/cep-telefonu/en-ucuz-ip...,3,"[""Battery life/ fast charging"",""Display Qualit..."
32,a543481c-04cb-4b98-a616-eaf896f1df99,2025-11-03 18:28:01.870550+00:00,Huawei,128gb,Black,€130,Xxx,4,"[""Display Quality"",""Performance/ speed"",""Softw..."
44,a9ea56d9-8f16-4706-8730-7759a9ae43ff,2025-10-30 18:05:07.809727+00:00,Iphone 17,256GB,Purple,€620,https://lp.pinduoduo.com/poros/h5?ads_channel=...,5,"[""Display Quality"",""Software support & updates..."
30,b9fede00-231a-4c78-8324-6691b1533caf,2025-11-03 20:44:16.476432+00:00,Samsung galaxy S25 Ultra,512GB,Black,859 EUR,https://www.backmarket.de/de-de/p/samsung-gb/f...,2,"[""Battery life/ fast charging"",""Performance/ s..."
25,bfa88b12-3ef7-44b4-b63a-edfae1b18d71,2025-11-04 16:21:08.242726+00:00,Xiaomi 17 pro Max,512GB,Black,999$,https://mstore.ie/,5,"[""Storage capacity"",""Battery life/ fast chargi..."


In [40]:
# Count unique participant_ids in df_postsurvey but not in df_results
# Show rows with unique participant_ids in df_postsurvey but not in df_results
postsurvey_ids = set(df_postsurvey['participant_id'])
results_ids = set(df_results['participant_id'])
unique_in_postsurvey = postsurvey_ids - results_ids
rows_unique_in_postsurvey = df_postsurvey[df_postsurvey['participant_id'].isin(unique_in_postsurvey)]
print(f"Number of unique participant_ids in df_postsurvey but not in df_results: {len(unique_in_postsurvey)}")
rows_unique_in_postsurvey

Number of unique participant_ids in df_postsurvey but not in df_results: 2


Unnamed: 0,participant_id,postsurvey_submitted_at,q38_attention,q41_duration,q42_comments,q39_contradictory_info,q19_task_easy,q20_task_quick,q21_task_familiar,q22_tool_reliable,...,q29_tool_easier_task,q30_tool_useful,q31_tool_too_much_info,q32_tool_difficult_focus,q33_results_accurate,q34_results_trust,q35_results_complete,q36_results_relevant,q37_results_useful,q40_purchase_likelihood
4,233c60f2-995c-4779-970d-dbbbd7c1da52,2025-11-14 23:28:53.32169+00,3,3–5 minutes,"Yes, helps save time and provides me with rele...","[""I chose the response that seemed most detail...",7,7,3,6,...,7,7,7,7,6,6,6,6,6,5
40,ca0c9bac-e5be-4d79-af83-2c5358af9657,2025-10-30 20:13:31.17622+00,3,3–5 minutes,Yes,"[""I did not find any contradictions""]",5,5,7,7,...,7,7,7,2,7,7,7,7,7,6


## 4. Join datasets with *One to One* relations

### 4.1. **Version 1:** Only keep participants that completed all the pages, exclude the test rows

#### **Results**
In total 45 participants including those failed attention checks - 7 participants failed attention checks

= 38 participants passed attention checks

In [41]:
# First join df_participant, df_background, and df_instruction using inner join
df_merged = df_participant.merge(df_background, on='participant_id', how='inner')
df_merged = df_merged.merge(df_instruction, on='participant_id', how='inner')

In [42]:
# Only keep df_merged rows where participant_id exists in df_logs
logs_ids = set(df_logs['participant_id'])
df_merged = df_merged[df_merged['participant_id'].isin(logs_ids)]
print(f"Number of rows in filtered df_merged: {len(df_merged)}")
df_merged.head()

Number of rows in filtered df_merged: 57


Unnamed: 0,participant_id,created_at,ip_address,device_type,background_submitted_at,q1_age_group,q2_gender,q3_education,q4_employment,q5_nationality,q6_residence,q7_chatbot_familiarity,q8_data_quality,q9_chatbot_usage,q10_search_tools,q11_budget,instruction_submitted_at
0,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:22:11.281835+00,"84.17.45.206,84.17.45.206, 99.82.172.145",desktop,2025-11-20 16:23:00.392222+00:00,18-24,Female,High school or below,Student,Indo,China,7,1,More than 10 times,tiktok,Over €800,2025-11-20 16:23:08.154560+00:00
1,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:15:57.927115+00,"193.197.8.143,193.197.8.143, 99.82.163.77",desktop,2025-11-18 18:17:10.869409+00:00,35-44,Male,Master's degree,Employed,Iranian,Deutschland,5,1,More than 10 times,Google,€600-799,2025-11-18 18:17:25.690481+00:00
2,5c4cab45-aeba-4c55-a4c4-6b227b569474,2025-11-17 17:11:04.390536+00,"104.28.45.21,104.28.45.21, 99.83.99.144",mobile,2025-11-17 17:12:33.533040+00:00,18-24,Female,Bachelor's degree,Student,German,Germany,6,1,More than 10 times,"Google, Amazon, BestSecret",€450-599,2025-11-17 17:13:24.879267+00:00
3,2bc7c0dd-423b-4c5c-bbff-fd89aa61ae8a,2025-11-16 19:56:03.579659+00,"59.103.194.47,59.103.194.47, 13.248.127.234",desktop,2025-11-16 19:56:55.964839+00:00,25-34,Female,Master's degree,Employed,Pakistan,Pakistan,5,6,3-5 times,Pakistan,€150-299,2025-11-16 19:57:03.504302+00:00
4,e2f70344-8588-468c-9dc2-8c2993f5b6c9,2025-11-15 23:28:20.372283+00,"178.84.99.93,178.84.99.93, 3.2.59.182",desktop,2025-11-15 23:29:47.088239+00:00,18-24,Female,Bachelor's degree,Student,Nederland,Nederland,6,1,More than 10 times,Nederland,€600-799,2025-11-15 23:30:13.982834+00:00


In [43]:
# Next, join df_results, and df_postsurvey to df_merged using inner join
df_merged = df_merged.merge(df_results, on='participant_id', how='inner')
df_merged = df_merged.merge(df_postsurvey, on='participant_id', how='inner')
print(f"Number of rows in final df_merged: {len(df_merged)}")
df_merged.head()

Number of rows in final df_merged: 46


Unnamed: 0,participant_id,created_at,ip_address,device_type,background_submitted_at,q1_age_group,q2_gender,q3_education,q4_employment,q5_nationality,...,q29_tool_easier_task,q30_tool_useful,q31_tool_too_much_info,q32_tool_difficult_focus,q33_results_accurate,q34_results_trust,q35_results_complete,q36_results_relevant,q37_results_useful,q40_purchase_likelihood
0,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:22:11.281835+00,"84.17.45.206,84.17.45.206, 99.82.172.145",desktop,2025-11-20 16:23:00.392222+00:00,18-24,Female,High school or below,Student,Indo,...,6,5,7,6,7,5,5,6,7,2
1,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:15:57.927115+00,"193.197.8.143,193.197.8.143, 99.82.163.77",desktop,2025-11-18 18:17:10.869409+00:00,35-44,Male,Master's degree,Employed,Iranian,...,6,7,6,4,4,5,5,5,5,6
2,e2f70344-8588-468c-9dc2-8c2993f5b6c9,2025-11-15 23:28:20.372283+00,"178.84.99.93,178.84.99.93, 3.2.59.182",desktop,2025-11-15 23:29:47.088239+00:00,18-24,Female,Bachelor's degree,Student,Nederland,...,5,5,6,5,3,4,4,6,3,5
3,88584cfe-a550-4209-a1f3-be628bb2647b,2025-11-13 16:29:27.470605+00,"89.92.129.28,89.92.129.28, 99.82.161.171",desktop,2025-11-13 16:30:20.934922+00:00,35-44,Male,Master's degree,Self-employed,French,...,6,6,6,4,5,5,5,5,5,4
4,25e47297-047a-4582-bc17-c13f5b2ef07b,2025-11-13 12:54:47.011689+00,"153.92.90.3,153.92.90.3, 99.82.163.72",mobile,2025-11-13 12:55:37.488309+00:00,18-24,Female,Bachelor's degree,Student,Georgian,...,5,5,4,5,7,7,7,7,7,4


In [44]:
# Remove test rows
df_merged = df_merged[df_merged['q42_comments'].str.lower() != 'test']

print(f"Number of rows in df_merged after deleting test rows: {len(df_merged)}")
df_merged.head()

Number of rows in df_merged after deleting test rows: 45


Unnamed: 0,participant_id,created_at,ip_address,device_type,background_submitted_at,q1_age_group,q2_gender,q3_education,q4_employment,q5_nationality,...,q29_tool_easier_task,q30_tool_useful,q31_tool_too_much_info,q32_tool_difficult_focus,q33_results_accurate,q34_results_trust,q35_results_complete,q36_results_relevant,q37_results_useful,q40_purchase_likelihood
0,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:22:11.281835+00,"84.17.45.206,84.17.45.206, 99.82.172.145",desktop,2025-11-20 16:23:00.392222+00:00,18-24,Female,High school or below,Student,Indo,...,6,5,7,6,7,5,5,6,7,2
1,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:15:57.927115+00,"193.197.8.143,193.197.8.143, 99.82.163.77",desktop,2025-11-18 18:17:10.869409+00:00,35-44,Male,Master's degree,Employed,Iranian,...,6,7,6,4,4,5,5,5,5,6
2,e2f70344-8588-468c-9dc2-8c2993f5b6c9,2025-11-15 23:28:20.372283+00,"178.84.99.93,178.84.99.93, 3.2.59.182",desktop,2025-11-15 23:29:47.088239+00:00,18-24,Female,Bachelor's degree,Student,Nederland,...,5,5,6,5,3,4,4,6,3,5
3,88584cfe-a550-4209-a1f3-be628bb2647b,2025-11-13 16:29:27.470605+00,"89.92.129.28,89.92.129.28, 99.82.161.171",desktop,2025-11-13 16:30:20.934922+00:00,35-44,Male,Master's degree,Self-employed,French,...,6,6,6,4,5,5,5,5,5,4
4,25e47297-047a-4582-bc17-c13f5b2ef07b,2025-11-13 12:54:47.011689+00,"153.92.90.3,153.92.90.3, 99.82.163.72",mobile,2025-11-13 12:55:37.488309+00:00,18-24,Female,Bachelor's degree,Student,Georgian,...,5,5,4,5,7,7,7,7,7,4


#### **Remark**
df_merged, which has **45** participants **including** those failed attention checks, is saved for further cleaning and standardization.

In [45]:
'''
# Save df_merged to .csv file 
output_path = 'chat_exports/df_merged.csv'
df_merged.to_csv(output_path, index=False)
print(f"df_merged saved to {output_path}")
'''

'\n# Save df_merged to .csv file \noutput_path = \'chat_exports/df_merged.csv\'\ndf_merged.to_csv(output_path, index=False)\nprint(f"df_merged saved to {output_path}")\n'

In [46]:
# Create df_before_standard with selected columns from df_merged
selected_columns = [
    "participant_id",
    "q5_nationality",
    "q6_residence",
    "q10_search_tools",
    "q12_brand_model",
    "q13_storage",
    "q14_color",
    "q15_price"
 ]
df_before_standard = df_merged[selected_columns]
print(f"df_before_standard shape: {df_before_standard.shape}")
df_before_standard.head()

df_before_standard shape: (45, 8)


Unnamed: 0,participant_id,q5_nationality,q6_residence,q10_search_tools,q12_brand_model,q13_storage,q14_color,q15_price
0,fe4a79bb-642e-49d8-920a-a5f39035ffe3,Indo,China,tiktok,iphone 16 pro,256,,1359
1,151939c4-8aae-490b-92eb-c7d353a4a95c,Iranian,Deutschland,Google,Galaxy S25 Ultra,256,not mentioned,740
2,e2f70344-8588-468c-9dc2-8c2993f5b6c9,Nederland,Nederland,Nederland,iphone 16,i dont remember,black,600
3,88584cfe-a550-4209-a1f3-be628bb2647b,French,rance,Google Search,Iphone 17 pro,128,Black,1200
4,25e47297-047a-4582-bc17-c13f5b2ef07b,Georgian,Germany,Amazon,Iphone 17 pro,128,White,700


#### **Remark**
df_before_standard, which only has non-standardized columns , is saved for manual processing.

In [47]:
'''
# Save df_before_standard to an Excel file
output_path = 'chat_exports/df_before_standard.xlsx'
df_before_standard.to_excel(output_path, index=False)
print(f"df_before_standard saved to {output_path}")
'''

'\n# Save df_before_standard to an Excel file\noutput_path = \'chat_exports/df_before_standard.xlsx\'\ndf_before_standard.to_excel(output_path, index=False)\nprint(f"df_before_standard saved to {output_path}")\n'

In [48]:
# Remove participants who failed attention checks
df_merged_ex = df_merged[(df_merged['q8_data_quality'] == 1) & (df_merged['q38_attention'] == 3)]
print(f"Number of rows in df_merged after removing failed attention checks: {len(df_merged_ex)}")
df_merged_ex.head()

Number of rows in df_merged after removing failed attention checks: 38


Unnamed: 0,participant_id,created_at,ip_address,device_type,background_submitted_at,q1_age_group,q2_gender,q3_education,q4_employment,q5_nationality,...,q29_tool_easier_task,q30_tool_useful,q31_tool_too_much_info,q32_tool_difficult_focus,q33_results_accurate,q34_results_trust,q35_results_complete,q36_results_relevant,q37_results_useful,q40_purchase_likelihood
0,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:22:11.281835+00,"84.17.45.206,84.17.45.206, 99.82.172.145",desktop,2025-11-20 16:23:00.392222+00:00,18-24,Female,High school or below,Student,Indo,...,6,5,7,6,7,5,5,6,7,2
1,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:15:57.927115+00,"193.197.8.143,193.197.8.143, 99.82.163.77",desktop,2025-11-18 18:17:10.869409+00:00,35-44,Male,Master's degree,Employed,Iranian,...,6,7,6,4,4,5,5,5,5,6
2,e2f70344-8588-468c-9dc2-8c2993f5b6c9,2025-11-15 23:28:20.372283+00,"178.84.99.93,178.84.99.93, 3.2.59.182",desktop,2025-11-15 23:29:47.088239+00:00,18-24,Female,Bachelor's degree,Student,Nederland,...,5,5,6,5,3,4,4,6,3,5
3,88584cfe-a550-4209-a1f3-be628bb2647b,2025-11-13 16:29:27.470605+00,"89.92.129.28,89.92.129.28, 99.82.161.171",desktop,2025-11-13 16:30:20.934922+00:00,35-44,Male,Master's degree,Self-employed,French,...,6,6,6,4,5,5,5,5,5,4
4,25e47297-047a-4582-bc17-c13f5b2ef07b,2025-11-13 12:54:47.011689+00,"153.92.90.3,153.92.90.3, 99.82.163.72",mobile,2025-11-13 12:55:37.488309+00:00,18-24,Female,Bachelor's degree,Student,Georgian,...,5,5,4,5,7,7,7,7,7,4


### 4.2. **Version 2:** Keep participants that completed pages except for the *search interface page*, exclude the test rows

In [49]:
# First join df_participant, df_background, df_instruction, df_results, and df_postsurvey using inner join
df_merged_2 = df_participant.merge(df_background, on='participant_id', how='inner')
df_merged_2 = df_merged_2.merge(df_instruction, on='participant_id', how='inner')
df_merged_2 = df_merged_2.merge(df_results, on='participant_id', how='inner')
df_merged_2 = df_merged_2.merge(df_postsurvey, on='participant_id', how='inner')
print(f"Number of rows in df_merged_2: {len(df_merged_2)}")
df_merged_2.head()

Number of rows in df_merged_2: 57


Unnamed: 0,participant_id,created_at,ip_address,device_type,background_submitted_at,q1_age_group,q2_gender,q3_education,q4_employment,q5_nationality,...,q29_tool_easier_task,q30_tool_useful,q31_tool_too_much_info,q32_tool_difficult_focus,q33_results_accurate,q34_results_trust,q35_results_complete,q36_results_relevant,q37_results_useful,q40_purchase_likelihood
0,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:22:11.281835+00,"84.17.45.206,84.17.45.206, 99.82.172.145",desktop,2025-11-20 16:23:00.392222+00:00,18-24,Female,High school or below,Student,Indo,...,6,5,7,6,7,5,5,6,7,2
1,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:15:57.927115+00,"193.197.8.143,193.197.8.143, 99.82.163.77",desktop,2025-11-18 18:17:10.869409+00:00,35-44,Male,Master's degree,Employed,Iranian,...,6,7,6,4,4,5,5,5,5,6
2,e2f70344-8588-468c-9dc2-8c2993f5b6c9,2025-11-15 23:28:20.372283+00,"178.84.99.93,178.84.99.93, 3.2.59.182",desktop,2025-11-15 23:29:47.088239+00:00,18-24,Female,Bachelor's degree,Student,Nederland,...,5,5,6,5,3,4,4,6,3,5
3,82e40cf7-aadd-46a9-a1b5-f68aed30f6af,2025-11-15 06:38:39.702331+00,"76.33.233.173,76.33.233.173, 13.248.99.203",desktop,2025-11-15 06:39:15.015137+00:00,Under 18,Male,High school or below,Student,United States,...,4,5,4,4,5,6,3,4,3,6
4,88584cfe-a550-4209-a1f3-be628bb2647b,2025-11-13 16:29:27.470605+00,"89.92.129.28,89.92.129.28, 99.82.161.171",desktop,2025-11-13 16:30:20.934922+00:00,35-44,Male,Master's degree,Self-employed,French,...,6,6,6,4,5,5,5,5,5,4


In [50]:
# Remove test rows
df_merged_2 = df_merged_2[df_merged_2['q42_comments'].str.lower() != 'test']

print(f"Number of rows in df_merged_2 after deleting test rows: {len(df_merged_2)}")
df_merged_2.head()

Number of rows in df_merged_2 after deleting test rows: 54


Unnamed: 0,participant_id,created_at,ip_address,device_type,background_submitted_at,q1_age_group,q2_gender,q3_education,q4_employment,q5_nationality,...,q29_tool_easier_task,q30_tool_useful,q31_tool_too_much_info,q32_tool_difficult_focus,q33_results_accurate,q34_results_trust,q35_results_complete,q36_results_relevant,q37_results_useful,q40_purchase_likelihood
0,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:22:11.281835+00,"84.17.45.206,84.17.45.206, 99.82.172.145",desktop,2025-11-20 16:23:00.392222+00:00,18-24,Female,High school or below,Student,Indo,...,6,5,7,6,7,5,5,6,7,2
1,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:15:57.927115+00,"193.197.8.143,193.197.8.143, 99.82.163.77",desktop,2025-11-18 18:17:10.869409+00:00,35-44,Male,Master's degree,Employed,Iranian,...,6,7,6,4,4,5,5,5,5,6
2,e2f70344-8588-468c-9dc2-8c2993f5b6c9,2025-11-15 23:28:20.372283+00,"178.84.99.93,178.84.99.93, 3.2.59.182",desktop,2025-11-15 23:29:47.088239+00:00,18-24,Female,Bachelor's degree,Student,Nederland,...,5,5,6,5,3,4,4,6,3,5
3,82e40cf7-aadd-46a9-a1b5-f68aed30f6af,2025-11-15 06:38:39.702331+00,"76.33.233.173,76.33.233.173, 13.248.99.203",desktop,2025-11-15 06:39:15.015137+00:00,Under 18,Male,High school or below,Student,United States,...,4,5,4,4,5,6,3,4,3,6
4,88584cfe-a550-4209-a1f3-be628bb2647b,2025-11-13 16:29:27.470605+00,"89.92.129.28,89.92.129.28, 99.82.161.171",desktop,2025-11-13 16:30:20.934922+00:00,35-44,Male,Master's degree,Self-employed,French,...,6,6,6,4,5,5,5,5,5,4


In [51]:
# Remove participants who failed attention checks
df_merged_2_ex = df_merged_2[(df_merged_2['q8_data_quality'] == 1) & (df_merged_2['q38_attention'] == 3)]
print(f"Number of rows in df_merged_2 after removing failed attention checks: {len(df_merged_2_ex)}")
df_merged_2_ex.head()

Number of rows in df_merged_2 after removing failed attention checks: 43


Unnamed: 0,participant_id,created_at,ip_address,device_type,background_submitted_at,q1_age_group,q2_gender,q3_education,q4_employment,q5_nationality,...,q29_tool_easier_task,q30_tool_useful,q31_tool_too_much_info,q32_tool_difficult_focus,q33_results_accurate,q34_results_trust,q35_results_complete,q36_results_relevant,q37_results_useful,q40_purchase_likelihood
0,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:22:11.281835+00,"84.17.45.206,84.17.45.206, 99.82.172.145",desktop,2025-11-20 16:23:00.392222+00:00,18-24,Female,High school or below,Student,Indo,...,6,5,7,6,7,5,5,6,7,2
1,151939c4-8aae-490b-92eb-c7d353a4a95c,2025-11-18 18:15:57.927115+00,"193.197.8.143,193.197.8.143, 99.82.163.77",desktop,2025-11-18 18:17:10.869409+00:00,35-44,Male,Master's degree,Employed,Iranian,...,6,7,6,4,4,5,5,5,5,6
2,e2f70344-8588-468c-9dc2-8c2993f5b6c9,2025-11-15 23:28:20.372283+00,"178.84.99.93,178.84.99.93, 3.2.59.182",desktop,2025-11-15 23:29:47.088239+00:00,18-24,Female,Bachelor's degree,Student,Nederland,...,5,5,6,5,3,4,4,6,3,5
3,82e40cf7-aadd-46a9-a1b5-f68aed30f6af,2025-11-15 06:38:39.702331+00,"76.33.233.173,76.33.233.173, 13.248.99.203",desktop,2025-11-15 06:39:15.015137+00:00,Under 18,Male,High school or below,Student,United States,...,4,5,4,4,5,6,3,4,3,6
4,88584cfe-a550-4209-a1f3-be628bb2647b,2025-11-13 16:29:27.470605+00,"89.92.129.28,89.92.129.28, 99.82.161.171",desktop,2025-11-13 16:30:20.934922+00:00,35-44,Male,Master's degree,Self-employed,French,...,6,6,6,4,5,5,5,5,5,4


## 5. Join datasets with *One to Many* relations

In [52]:
# First, join df_logs and df_clicks on participant_id and query_sequence_number == prompt_index using left join
df_merged_logs = df_logs.merge(df_clicks, left_on=['participant_id', 'query_sequence_number'], right_on=['participant_id', 'prompt_index'], how='left')
print(f"Number of rows in df_merged_logs: {len(df_merged_logs)}")
df_merged_logs.head()

Number of rows in df_merged_logs: 178


Unnamed: 0,participant_id,prompt,response,logged_at,query_start_time,query_submit_time,query_duration_ms,query_sequence_number,prompt_index,url,clicked_at
0,090b95b5-f52a-4e45-8665-1cc819388638,What is a realistic phone when it comes to cos...,A realistic phone with the best cost-to-qualit...,2025-11-07 15:08:07.503559+00,2025-11-07 15:05:33.382+00,2025-11-07 15:07:50.895000+00:00,137513.0,1,,,
1,0ba4f0a8-673f-4054-80e0-690aaf497b51,What phones are currently available on the mar...,Several **excellent budget and mid-range smart...,2025-11-11 03:09:30.955579+00,2025-11-11 03:08:53.203+00,2025-11-11 03:09:17.501000+00:00,24298.0,1,,,
2,0ba4f0a8-673f-4054-80e0-690aaf497b51,give me prices in euros,"To provide **prices in euros**, you need to co...",2025-11-11 03:16:25.604953+00,2025-11-11 03:16:17.415+00,2025-11-11 03:16:19.917000+00:00,2502.0,5,,,
3,0ba4f0a8-673f-4054-80e0-690aaf497b51,give the lowest price you can find for the app...,"The **lowest price for a new, unlocked Apple i...",2025-11-11 03:14:46.851105+00,2025-11-11 03:14:19.384+00,2025-11-11 03:14:40.420000+00:00,21036.0,4,4.0,https://www.apple.com/shop/buy-iphone/iphone-1...,2025-11-11 03:17:08.006738+00
4,0ba4f0a8-673f-4054-80e0-690aaf497b51,"no, i mean tell me about (for the phones you m...",Based on the latest expert rankings and review...,2025-11-11 03:12:34.977885+00,2025-11-11 03:11:37.266+00,2025-11-11 03:12:16.250000+00:00,38984.0,3,,,


In [53]:
# Count non-missing values in prompt_index column of df_merged_logs
non_missing_prompt_index = df_merged_logs['prompt_index'].notna().sum()
print(f"Number of non-missing prompt_index values in df_merged_logs: {non_missing_prompt_index}")

Number of non-missing prompt_index values in df_merged_logs: 32


In [54]:
# Find click events in df_clicks that did not match any log event in the merge
merged_keys = set(zip(df_merged_logs['participant_id'], df_merged_logs['prompt_index']))
click_keys = set(zip(df_clicks['participant_id'], df_clicks['prompt_index']))
unmatched_click_keys = click_keys - merged_keys
unmatched_clicks = df_clicks[df_clicks.apply(lambda row: (row['participant_id'], row['prompt_index']) in unmatched_click_keys, axis=1)]
print(f"Number of unmatched click events: {len(unmatched_clicks)}")
unmatched_clicks

Number of unmatched click events: 1


Unnamed: 0,participant_id,prompt_index,url,clicked_at
20,a65764e7-9c2f-4219-aa12-fd54da7bf875,6,https://support.apple.com/en-us/108044,2025-10-31 14:32:11.705759+00


In [55]:
# Only keep rows of df_merged_logs where participant_id exists in df_merged (means including participants who failed attention checks)
merged_ids = set(df_merged['participant_id'])
df_merged_logs_filtered = df_merged_logs[df_merged_logs['participant_id'].isin(merged_ids)]
print(f"Number of rows in filtered df_merged_logs: {len(df_merged_logs_filtered)}")
df_merged_logs_filtered.head()

Number of rows in filtered df_merged_logs: 143


Unnamed: 0,participant_id,prompt,response,logged_at,query_start_time,query_submit_time,query_duration_ms,query_sequence_number,prompt_index,url,clicked_at
0,090b95b5-f52a-4e45-8665-1cc819388638,What is a realistic phone when it comes to cos...,A realistic phone with the best cost-to-qualit...,2025-11-07 15:08:07.503559+00,2025-11-07 15:05:33.382+00,2025-11-07 15:07:50.895000+00:00,137513.0,1,,,
1,0ba4f0a8-673f-4054-80e0-690aaf497b51,What phones are currently available on the mar...,Several **excellent budget and mid-range smart...,2025-11-11 03:09:30.955579+00,2025-11-11 03:08:53.203+00,2025-11-11 03:09:17.501000+00:00,24298.0,1,,,
2,0ba4f0a8-673f-4054-80e0-690aaf497b51,give me prices in euros,"To provide **prices in euros**, you need to co...",2025-11-11 03:16:25.604953+00,2025-11-11 03:16:17.415+00,2025-11-11 03:16:19.917000+00:00,2502.0,5,,,
3,0ba4f0a8-673f-4054-80e0-690aaf497b51,give the lowest price you can find for the app...,"The **lowest price for a new, unlocked Apple i...",2025-11-11 03:14:46.851105+00,2025-11-11 03:14:19.384+00,2025-11-11 03:14:40.420000+00:00,21036.0,4,4.0,https://www.apple.com/shop/buy-iphone/iphone-1...,2025-11-11 03:17:08.006738+00
4,0ba4f0a8-673f-4054-80e0-690aaf497b51,"no, i mean tell me about (for the phones you m...",Based on the latest expert rankings and review...,2025-11-11 03:12:34.977885+00,2025-11-11 03:11:37.266+00,2025-11-11 03:12:16.250000+00:00,38984.0,3,,,


In [56]:
'''
# Save df_merged_logs_filtered to .csv file 
output_path = 'chat_exports/df_merged_logs_filtered.csv'
df_merged_logs_filtered.to_csv(output_path, index=False)
print(f"df_merged_logs_filtered saved to {output_path}")
'''

'\n# Save df_merged_logs_filtered to .csv file \noutput_path = \'chat_exports/df_merged_logs_filtered.csv\'\ndf_merged_logs_filtered.to_csv(output_path, index=False)\nprint(f"df_merged_logs_filtered saved to {output_path}")\n'

## 6. Save filtered search session timing data

In [57]:
# Only keep rows of df_timing where participant_id exists in df_merged (means including participants who failed attention checks)
merged_ids = set(df_merged['participant_id'])
df_timing_filtered = df_timing[df_timing['participant_id'].isin(merged_ids)]
print(f"Number of rows in df_timing_filtered: {len(df_timing_filtered)}")

# Count unique participants in df_timing_filtered
num_unique_timing = df_timing_filtered['participant_id'].nunique()
print(f"Number of unique participants in df_timing_filtered: {num_unique_timing}")

df_timing_filtered.head()

Number of rows in df_timing_filtered: 102
Number of unique participants in df_timing_filtered: 45


Unnamed: 0,participant_id,session_start_time,session_end_time,session_duration_ms,record_created_at
0,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:28:00.829+00,2025-11-20 16:28:10.12+00,9291.0,2025-11-20 16:28:02.231156+00
1,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:27:27.603+00,2025-11-20 16:27:50.572+00,22969.0,2025-11-20 16:27:29.591439+00
2,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:23:39.474+00,2025-11-20 16:26:48.525+00,189051.0,2025-11-20 16:23:41.070405+00
3,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:23:08.415+00,2025-11-20 16:23:30.53+00,22115.0,2025-11-20 16:23:31.852449+00
4,fe4a79bb-642e-49d8-920a-a5f39035ffe3,2025-11-20 16:23:08.415+00,2025-11-20 16:23:29.747+00,21332.0,2025-11-20 16:23:09.530918+00


In [None]:
'''
# Save df_timing_filtered to .csv file 
output_path = 'chat_exports/df_search_session_filtered.csv'
df_timing_filtered.to_csv(output_path, index=False)
print(f"df_timing_filtered saved to {output_path}")
'''

df_timing_filtered saved to chat_exports/df_search_session_filtered.csv
