In [4]:
import linecache
import os
import glob
import pandas as pd
import random

# Configuration
source_folder = "/Users/elisapavarino/Documents/Work_Directory/Kreiman_Lab/Turing Test Paper/TuringGithub/conversation/conversation_task_for_revisions/new_dataset"  # CHANGE THIS: Path to folder containing conv*.html files
excel_file = "esChat.xlsx"  # Path to your Excel file
output_folder = "dataset"  # Output folder name
length_list = [3, 6, 9, 12, 15, 18, 21, 24]  # All required snippet lengths

# Create the output folder. exist_ok=True makes this a no-op when the folder
# is already there (no check-then-create race) and still raises
# FileExistsError right away if the path is occupied by a regular file,
# instead of failing later on the first write.
os.makedirs(output_folder, exist_ok=True)

# Read the Excel file.
# Only the read itself is guarded, so the "Error loading" message is never
# printed for an unrelated failure in the reporting code below.
try:
    df = pd.read_excel(excel_file)
except Exception as e:
    print(f"Error loading Excel file: {e}")
    # `exit()` is the interactive-shell helper installed by `site`; the Python
    # docs advise against using it in programs. Raising SystemExit directly
    # stops execution with status 1 without relying on that helper.
    raise SystemExit(1) from e

print(f"Successfully loaded Excel file with {len(df)} rows")

# Debug: Show DataFrame info
print(f"\nDataFrame shape: {df.shape}")
print("\nColumn names:")
for i, col in enumerate(df.columns):
    print(f"  {i}: '{col}'")

print("\nFirst few rows:")
print(df.head())

# Abort early if any column used by the rest of the script is absent
required_columns = ['conversation_id', 'topic', 'topic_label']
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    print(f"\nERROR: Missing required columns: {missing_columns}")
    print("Please check your Excel file column names.")
    raise SystemExit(1)

# Collect every distinct topic in the sheet. Nothing is excluded here beyond
# blank cells (dropna), which must never be offered as an answer option; the
# correct topic of each conversation is filtered out later, per conversation.
all_topics = df['topic'].dropna().unique().tolist()
print(f"Found {len(all_topics)} unique topics: {all_topics}")

# Find all conversation files in the source folder
conv_files = glob.glob(os.path.join(source_folder, "conv*.html"))
conv_numbers = []

# Extract conversation numbers from filenames like "conv123.html"
for conv_path in conv_files:
    filename = os.path.basename(conv_path)
    if not (filename.startswith("conv") and filename.endswith(".html")):
        continue

    num_str = filename[4:-5]  # Remove "conv" and ".html"
    if not num_str.isdigit():
        continue  # e.g. "conv_backup.html"

    try:
        conv_numbers.append(int(num_str))
    except ValueError:
        # str.isdigit() accepts a few characters int() rejects (e.g. "²").
        # Only that case is skipped; a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        continue

# Sort conversation numbers
conv_numbers.sort()
print(f"Found conversation files: {conv_numbers}")

# Process each conversation: build its 5 topic options once, then for every
# length in length_list write one snippet file plus 5 option files.
processed_count = 0
for conv_num in conv_numbers:
    # Find this conversation in the Excel file
    conv_row = df[df['conversation_id'] == conv_num]

    if conv_row.empty:
        print(f"Warning: Conversation {conv_num} not found in Excel file, skipping...")
        continue

    conv_data = conv_row.iloc[0]  # the first matching row is used
    correct_topic = conv_data['topic']
    raw_topic_label = conv_data['topic_label']

    # A missing cell cannot be converted with int() nor written to a file,
    # so skip such rows instead of crashing half-way through the dataset.
    if pd.isna(correct_topic) or pd.isna(raw_topic_label):
        print(f"Warning: Conversation {conv_num} has an empty topic or topic_label, skipping...")
        continue

    topic_label = int(raw_topic_label)  # Which position (1-5) the correct topic should be in

    # Reject out-of-range labels explicitly: 0 would silently wrap around to
    # the last slot through index -1, and 6 or more would raise IndexError.
    if not 1 <= topic_label <= 5:
        print(f"Warning: Conversation {conv_num} has topic_label {topic_label} outside 1-5, skipping...")
        continue

    print(f"\nProcessing conversation {conv_num}")
    print(f"  Correct topic: '{correct_topic}' (will be in position {topic_label})")

    original_file = os.path.join(source_folder, f'conv{conv_num}.html')

    # A name such as "conv007.html" is discovered as number 7, whose canonical
    # file name "conv7.html" may not exist, hence this check.
    if not os.path.exists(original_file):
        print(f"Warning: {original_file} not found, skipping...")
        continue

    # linecache never raises: an empty or unreadable file just yields '' for
    # every line, which would produce empty snippet files without any notice.
    # checkcache() first drops a stale cached copy left by an earlier run.
    linecache.checkcache(original_file)
    if not linecache.getline(original_file, 1):
        print(f"Warning: {original_file} is empty or unreadable, skipping...")
        continue

    # Draw 4 distractor topics (excluding the correct one and blank cells)
    other_topics = [
        topic for topic in all_topics
        if topic != correct_topic and not pd.isna(topic)
    ]
    if len(other_topics) < 4:
        print(f"Warning: only {len(other_topics)} alternative topics available for conversation {conv_num}, need 4, skipping...")
        continue
    random.shuffle(other_topics)

    # Inserting the correct topic at its 0-based slot pushes the distractors
    # around it while keeping their shuffled order.
    topics_list = other_topics[:4]
    topics_list.insert(topic_label - 1, correct_topic)

    print(f"  Topic options: {topics_list}")

    # Create snippets for each length
    for length in length_list:
        # Create main snippet file
        snippet_filename = os.path.join(output_folder, f'conv{conv_num}_len{length}.html')

        # NOTE(review): `length` counts physical lines of the source file, so
        # blank lines within the first `length` lines shorten the snippet;
        # confirm the source files hold one utterance per line.
        with open(snippet_filename, "w", encoding='utf-8') as file_html:
            for line in range(1, length + 1):
                particular_line = linecache.getline(original_file, line)
                if particular_line.strip():  # Only write non-empty lines
                    file_html.write(particular_line)

        # Create 5 option files with the topics
        for option_num, option_topic in enumerate(topics_list, start=1):
            option_filename = os.path.join(output_folder, f'conv{conv_num}_len{length}option{option_num}.html')

            with open(option_filename, "w", encoding='utf-8') as option_file:
                # str() keeps non-text topic cells (e.g. numbers) writable
                option_file.write(str(option_topic))

    processed_count += 1

print("\nProcessing complete!")
print(f"Successfully processed {processed_count} conversations")
print(f"All files created in '{output_folder}' folder")
print("Files created:")
print("- Conversation snippets: conv*_len*.html")
print("- Topic options: conv*_len*option*.html")

# Clear linecache to free memory
linecache.clearcache()

Successfully loaded Excel file with 210 rows

DataFrame shape: (210, 17)

Column names:
  0: 'conversation_id'
  1: 'Conv_label'
  2: 'mcq'
  3: 'A_label'
  4: 'A_age'
  5: 'A_gender'
  6: 'B_label'
  7: 'B_age'
  8: 'B_gender'
  9: 'A_judgeByB'
  10: 'A_age_judgeByB'
  11: 'A_gender_judgeByB'
  12: 'B_judgeByA'
  13: 'B_age_judgeByA'
  14: 'B_gender_judgebyA'
  15: 'topic'
  16: 'topic_label'

First few rows:
   conversation_id           Conv_label  mcq A_label              A_age  \
0               34         GPT-4o-Human   24      AI  26 - 30 years old   
1               35          Human-Human   12   Human  31 - 35 years old   
2               46          Human-Human   12   Human  21 - 25 years old   
3               55         GPT-4o-Human   24      AI  31 - 35 years old   
4               57  Human-Llama-3.3-70B   12   Human  26 - 30 years old   

  A_gender B_label               B_age B_gender A_judgeByB     A_age_judgeByB  \
0   Female   Human   26 - 30 years old     Male       

In [1]:
import linecache
import os
import glob
import pandas as pd
import random

# Configuration
source_folder = "/Users/elisapavarino/Documents/Work_Directory/Kreiman_Lab/Turing Test Paper/TuringGithub/conversation/conversation_task_for_revisions/new_dataset"  # CHANGE THIS: Path to folder containing conv*.html files
excel_file = "esChat.xlsx"  # Path to your Excel file
output_folder = "dataset"  # Output folder name
length_list = [3, 6, 9, 12, 15, 18, 21, 24]  # All required snippet lengths

# Create the output folder. exist_ok=True makes this a no-op when the folder
# is already there (no check-then-create race) and still raises
# FileExistsError right away if the path is occupied by a regular file,
# instead of failing later on the first write.
os.makedirs(output_folder, exist_ok=True)

# Read the Excel file.
# Only the read itself is guarded, so the "Error loading" message is never
# printed for an unrelated failure in the reporting code below.
try:
    df = pd.read_excel(excel_file)
except Exception as e:
    print(f"Error loading Excel file: {e}")
    # `exit()` is the interactive-shell helper installed by `site`; the Python
    # docs advise against using it in programs. Raising SystemExit directly
    # stops execution with status 1 without relying on that helper.
    raise SystemExit(1) from e

print(f"Successfully loaded Excel file with {len(df)} rows")

# Debug: Show DataFrame info
print(f"\nDataFrame shape: {df.shape}")
print("\nColumn names:")
for i, col in enumerate(df.columns):
    print(f"  {i}: '{col}'")

print("\nFirst few rows:")
print(df.head())

# Abort early if any column used by the rest of the script is absent
required_columns = ['conversation_id', 'topic', 'topic_label']
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    print(f"\nERROR: Missing required columns: {missing_columns}")
    print("Please check your Excel file column names.")
    raise SystemExit(1)

# Collect every distinct topic in the sheet. Nothing is excluded here beyond
# blank cells (dropna), which must never be offered as an answer option; the
# correct topic of each conversation is filtered out later, per conversation.
all_topics = df['topic'].dropna().unique().tolist()
print(f"Found {len(all_topics)} unique topics: {all_topics}")

# Find all conversation files in the source folder
conv_files = glob.glob(os.path.join(source_folder, "conv*.html"))
conv_numbers = []

# Extract conversation numbers from filenames like "conv123.html"
for conv_path in conv_files:
    filename = os.path.basename(conv_path)
    if not (filename.startswith("conv") and filename.endswith(".html")):
        continue

    num_str = filename[4:-5]  # Remove "conv" and ".html"
    if not num_str.isdigit():
        continue  # e.g. "conv_backup.html"

    try:
        conv_numbers.append(int(num_str))
    except ValueError:
        # str.isdigit() accepts a few characters int() rejects (e.g. "²").
        # Only that case is skipped; a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        continue

# Sort conversation numbers
conv_numbers.sort()
print(f"Found conversation files: {conv_numbers}")

# Process each conversation: build its 5 topic options once, then for every
# length in length_list write one colour-formatted snippet file plus 5 option
# files.
processed_count = 0
for conv_num in conv_numbers:
    # Find this conversation in the Excel file
    conv_row = df[df['conversation_id'] == conv_num]

    if conv_row.empty:
        print(f"Warning: Conversation {conv_num} not found in Excel file, skipping...")
        continue

    conv_data = conv_row.iloc[0]  # the first matching row is used
    correct_topic = conv_data['topic']
    raw_topic_label = conv_data['topic_label']

    # A missing cell cannot be converted with int() nor written to a file,
    # so skip such rows instead of crashing half-way through the dataset.
    if pd.isna(correct_topic) or pd.isna(raw_topic_label):
        print(f"Warning: Conversation {conv_num} has an empty topic or topic_label, skipping...")
        continue

    topic_label = int(raw_topic_label)  # Which position (1-5) the correct topic should be in

    # Reject out-of-range labels explicitly: 0 would silently wrap around to
    # the last slot through index -1, and 6 or more would raise IndexError.
    if not 1 <= topic_label <= 5:
        print(f"Warning: Conversation {conv_num} has topic_label {topic_label} outside 1-5, skipping...")
        continue

    print(f"\nProcessing conversation {conv_num}")
    print(f"  Correct topic: '{correct_topic}' (will be in position {topic_label})")

    original_file = os.path.join(source_folder, f'conv{conv_num}.html')

    # A name such as "conv007.html" is discovered as number 7, whose canonical
    # file name "conv7.html" may not exist, hence this check.
    if not os.path.exists(original_file):
        print(f"Warning: {original_file} not found, skipping...")
        continue

    # linecache never raises: an empty or unreadable file just yields '' for
    # every line, which would produce header-only snippet files without any
    # notice. checkcache() first drops a stale cached copy left by an earlier
    # run.
    linecache.checkcache(original_file)
    if not linecache.getline(original_file, 1):
        print(f"Warning: {original_file} is empty or unreadable, skipping...")
        continue

    # Draw 4 distractor topics (excluding the correct one and blank cells)
    other_topics = [
        topic for topic in all_topics
        if topic != correct_topic and not pd.isna(topic)
    ]
    if len(other_topics) < 4:
        print(f"Warning: only {len(other_topics)} alternative topics available for conversation {conv_num}, need 4, skipping...")
        continue
    random.shuffle(other_topics)

    # Inserting the correct topic at its 0-based slot pushes the distractors
    # around it while keeping their shuffled order.
    topics_list = other_topics[:4]
    topics_list.insert(topic_label - 1, correct_topic)

    print(f"  Topic options: {topics_list}")

    # Create snippets for each length
    for length in length_list:
        # Create main snippet file
        snippet_filename = os.path.join(output_folder, f'conv{conv_num}_len{length}.html')

        with open(snippet_filename, "w", encoding='utf-8') as file_html:
            # Header: on load, the page posts its body size to the top window
            file_html.write("<script>\n")
            file_html.write(" window.addEventListener('load', function() { let message = { height: document.body.scrollHeight, width: document.body.scrollWidth }; window.top.postMessage(message, '*'); }); \n")
            file_html.write(" </script> \n")
            file_html.write(" <font size='3'>\n")

            # NOTE(review): `length` counts physical lines of the source file,
            # and lines not starting with <p> are dropped, so a snippet may
            # hold fewer than `length` utterances; confirm the source format.
            for line in range(1, length + 1):
                particular_line = linecache.getline(original_file, line).strip()
                if not particular_line.startswith('<p>'):
                    continue

                # Expected format: <p> A: content </p> or <p> B: content </p>
                content = particular_line.replace('<p>', '').replace('</p>', '').strip()

                if content.startswith('A:'):
                    # Speaker A in dark blue
                    formatted_line = f'<FONT COLOR=darkblue><p style="margin-bottom:-18px;">{content}</p> \n</FONT>'
                elif content.startswith('B:'):
                    # Speaker B in crimson
                    formatted_line = f'<FONT COLOR=crimson><p style="margin-bottom:-18px;">{content}</p> \n</FONT>'
                else:
                    # Fallback: keep original format
                    formatted_line = particular_line + '\n'

                file_html.write(formatted_line)

            # Close the font tag
            file_html.write("</font>\n")

        # Create 5 option files with the topics
        for option_num, option_topic in enumerate(topics_list, start=1):
            option_filename = os.path.join(output_folder, f'conv{conv_num}_len{length}option{option_num}.html')

            with open(option_filename, "w", encoding='utf-8') as option_file:
                # str() keeps non-text topic cells (e.g. numbers) writable
                option_file.write(str(option_topic))

    processed_count += 1

print("\nProcessing complete!")
print(f"Successfully processed {processed_count} conversations")
print(f"All files created in '{output_folder}' folder")
print("Files created:")
print("- Conversation snippets: conv*_len*.html")
print("- Topic options: conv*_len*option*.html")

# Clear linecache to free memory
linecache.clearcache()

Successfully loaded Excel file with 210 rows

DataFrame shape: (210, 17)

Column names:
  0: 'conversation_id'
  1: 'Conv_label'
  2: 'mcq'
  3: 'A_label'
  4: 'A_age'
  5: 'A_gender'
  6: 'B_label'
  7: 'B_age'
  8: 'B_gender'
  9: 'A_judgeByB'
  10: 'A_age_judgeByB'
  11: 'A_gender_judgeByB'
  12: 'B_judgeByA'
  13: 'B_age_judgeByA'
  14: 'B_gender_judgebyA'
  15: 'topic'
  16: 'topic_label'

First few rows:
   conversation_id           Conv_label  mcq A_label              A_age  \
0               34         GPT-4o-Human   24      AI  26 - 30 years old   
1               35          Human-Human   12   Human  31 - 35 years old   
2               46          Human-Human   12   Human  21 - 25 years old   
3               55         GPT-4o-Human   24      AI  31 - 35 years old   
4               57  Human-Llama-3.3-70B   12   Human  26 - 30 years old   

  A_gender B_label               B_age B_gender A_judgeByB     A_age_judgeByB  \
0   Female   Human   26 - 30 years old     Male       

# Format conversations in other folders without parsing

In [4]:
import os
import glob
import pandas as pd
import random

# Configuration
source_folder = "/Users/elisapavarino/Documents/Work_Directory/Kreiman_Lab/Turing Test Paper/TuringGithub/conversation/conversation_task_for_revisions/conversation_24long_AIAI"  # CHANGE THIS: Path to folder containing conv*.html files
excel_file = "esChat.xlsx"  # Path to your Excel file
output_folder = "conversation_24long_AIAI_formatted"  # CHANGE THIS: Folder the formatted files are written to

# Create the output folder. exist_ok=True makes this a no-op when the folder
# is already there (no check-then-create race) and still raises
# FileExistsError right away if the path is occupied by a regular file,
# instead of failing later on the first write.
os.makedirs(output_folder, exist_ok=True)

# Read the Excel file.
# Only the read itself is guarded, so the "Error loading" message is never
# printed for an unrelated failure in the reporting code below.
try:
    df = pd.read_excel(excel_file)
except Exception as e:
    print(f"Error loading Excel file: {e}")
    # `exit()` is the interactive-shell helper installed by `site`; the Python
    # docs advise against using it in programs. Raising SystemExit directly
    # stops execution with status 1 without relying on that helper.
    raise SystemExit(1) from e

print(f"Successfully loaded Excel file with {len(df)} rows")

# Debug: Show DataFrame info
print(f"\nDataFrame shape: {df.shape}")
print("\nColumn names:")
for i, col in enumerate(df.columns):
    print(f"  {i}: '{col}'")

# Abort early if any column used by the rest of the script is absent
required_columns = ['conversation_id', 'topic', 'topic_label']
missing_columns = [col for col in required_columns if col not in df.columns]

if missing_columns:
    print(f"\nERROR: Missing required columns: {missing_columns}")
    print("Please check your Excel file column names.")
    raise SystemExit(1)

# Collect every distinct topic in the sheet. Nothing is excluded here beyond
# blank cells (dropna), which must never be offered as an answer option; the
# correct topic of each conversation is filtered out later, per conversation.
all_topics = df['topic'].dropna().unique().tolist()
print(f"Found {len(all_topics)} unique topics: {all_topics}")

# Find all conversation files in the source folder
conv_files = glob.glob(os.path.join(source_folder, "conv*.html"))
conv_numbers = []

# Extract conversation numbers from filenames like "conv123.html"
for conv_path in conv_files:
    filename = os.path.basename(conv_path)
    if not (filename.startswith("conv") and filename.endswith(".html")):
        continue

    num_str = filename[4:-5]  # Remove "conv" and ".html"
    if not num_str.isdigit():
        continue  # e.g. "conv_backup.html"

    try:
        conv_numbers.append(int(num_str))
    except ValueError:
        # str.isdigit() accepts a few characters int() rejects (e.g. "²").
        # Only that case is skipped; a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        continue

# Sort conversation numbers
conv_numbers.sort()
print(f"Found conversation files: {conv_numbers}")

# Process each conversation: build its 5 topic options, write the whole
# conversation re-formatted with per-speaker colours, then write 5 option
# files.
processed_count = 0
for conv_num in conv_numbers:
    # Find this conversation in the Excel file
    conv_row = df[df['conversation_id'] == conv_num]

    if conv_row.empty:
        print(f"Warning: Conversation {conv_num} not found in Excel file, skipping...")
        continue

    conv_data = conv_row.iloc[0]  # the first matching row is used
    correct_topic = conv_data['topic']
    raw_topic_label = conv_data['topic_label']

    # A missing cell cannot be converted with int() nor written to a file,
    # so skip such rows instead of crashing half-way through the dataset.
    if pd.isna(correct_topic) or pd.isna(raw_topic_label):
        print(f"Warning: Conversation {conv_num} has an empty topic or topic_label, skipping...")
        continue

    topic_label = int(raw_topic_label)  # Which position (1-5) the correct topic should be in

    # Reject out-of-range labels explicitly: 0 would silently wrap around to
    # the last slot through index -1, and 6 or more would raise IndexError.
    if not 1 <= topic_label <= 5:
        print(f"Warning: Conversation {conv_num} has topic_label {topic_label} outside 1-5, skipping...")
        continue

    print(f"\nFormatting conversation {conv_num}")
    print(f"  Correct topic: '{correct_topic}' (will be in position {topic_label})")

    original_file = os.path.join(source_folder, f'conv{conv_num}.html')
    output_file = os.path.join(output_folder, f'conv{conv_num}.html')

    # A name such as "conv007.html" is discovered as number 7, whose canonical
    # file name "conv7.html" may not exist, hence this check.
    if not os.path.exists(original_file):
        print(f"Warning: {original_file} not found, skipping...")
        continue

    # Draw 4 distractor topics (excluding the correct one and blank cells)
    other_topics = [
        topic for topic in all_topics
        if topic != correct_topic and not pd.isna(topic)
    ]
    if len(other_topics) < 4:
        print(f"Warning: only {len(other_topics)} alternative topics available for conversation {conv_num}, need 4, skipping...")
        continue
    random.shuffle(other_topics)

    # Inserting the correct topic at its 0-based slot pushes the distractors
    # around it while keeping their shuffled order.
    topics_list = other_topics[:4]
    topics_list.insert(topic_label - 1, correct_topic)

    print(f"  Topic options: {topics_list}")

    # Read the source completely before opening the output, so a read or
    # decoding failure cannot leave a truncated output file behind.
    with open(original_file, 'r', encoding='utf-8') as input_file:
        source_lines = [raw_line.strip() for raw_line in input_file]

    # Create formatted main conversation file
    with open(output_file, "w", encoding='utf-8') as file_html:
        # Header: on load, the page posts its body size to the top window
        file_html.write("<script>\n")
        file_html.write(" window.addEventListener('load', function() { let message = { height: document.body.scrollHeight, width: document.body.scrollWidth }; window.top.postMessage(message, '*'); }); \n")
        file_html.write(" </script> \n")
        file_html.write(" <font size='3'>\n")

        # Format the entire conversation
        for line in source_lines:
            if not line:
                continue  # blank lines are dropped

            if not line.startswith('<p>'):
                # Write other non-empty lines as-is
                file_html.write(line + '\n')
                continue

            # Expected format: <p> A: content </p> or <p> B: content </p>
            content = line.replace('<p>', '').replace('</p>', '').strip()

            if content.startswith('A:'):
                # Speaker A in dark blue
                formatted_line = f'<FONT COLOR=darkblue><p style="margin-bottom:-18px;">{content}</p> \n</FONT>'
            elif content.startswith('B:'):
                # Speaker B in crimson
                formatted_line = f'<FONT COLOR=crimson><p style="margin-bottom:-18px;">{content}</p> \n</FONT>'
            else:
                # Fallback: no speaker colour, same paragraph spacing
                formatted_line = f'<p style="margin-bottom:-18px;">{content}</p>\n'

            file_html.write(formatted_line)

        # Close the font tag
        file_html.write("</font>\n")

    # Create 5 option files with the topics
    for option_num, option_topic in enumerate(topics_list, start=1):
        option_filename = os.path.join(output_folder, f'conv{conv_num}option{option_num}.html')

        with open(option_filename, "w", encoding='utf-8') as option_file:
            # str() keeps non-text topic cells (e.g. numbers) writable
            option_file.write(str(option_topic))

    processed_count += 1

print("\nFormatting complete!")
print(f"Successfully processed {processed_count} conversations")
print(f"All files created in '{output_folder}' folder")
print("Files created:")
print("- Formatted conversations: conv*.html")
print("- Topic options: conv*option*.html")

Successfully loaded Excel file with 210 rows

DataFrame shape: (210, 17)

Column names:
  0: 'conversation_id'
  1: 'Conv_label'
  2: 'mcq'
  3: 'A_label'
  4: 'A_age'
  5: 'A_gender'
  6: 'B_label'
  7: 'B_age'
  8: 'B_gender'
  9: 'A_judgeByB'
  10: 'A_age_judgeByB'
  11: 'A_gender_judgeByB'
  12: 'B_judgeByA'
  13: 'B_age_judgeByA'
  14: 'B_gender_judgebyA'
  15: 'topic'
  16: 'topic_label'
Found 14 unique topics: ['sports', 'science', 'politics', 'movies', 'fashion', 'music', 'general_entertainment', 'books', 'food', 'technology', 'quarrel', 'traffic', 'emotions', 'ageing']
Found conversation files: [267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281]

Formatting conversation 267
  Correct topic: 'food' (will be in position 4)
  Topic options: ['technology', 'sports', 'ageing', 'food', 'movies']

Formatting conversation 268
  Correct topic: 'movies' (will be in position 4)
  Topic options: ['sports', 'ageing', 'quarrel', 'movies', 'emotions']

Formatting conv