In [None]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np
import sys

# Load the data
print("Loading data...", file=sys.stderr)
# Candidate encodings, tried in order until one decodes the file.
# NOTE: 'latin-1' and 'iso-8859-1' name the same codec, and that codec maps
# every byte value, so the entries after 'latin-1' are effectively never
# reached. The list is kept as-is so the same encoding keeps being selected.
encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']
df = None
for encoding in encodings:
    try:
        df = pd.read_csv('Speed Dating Data.csv', encoding=encoding)
    except UnicodeError:
        # Wrong codec for this file: move on to the next candidate.
        continue
    except OSError as exc:
        # A missing or unreadable file cannot be fixed by switching the
        # encoding, so report the real cause instead of retrying.
        print(f"Error: Could not read CSV file: {exc}", file=sys.stderr)
        sys.exit(1)
    # Any other exception (e.g. a malformed CSV) is left to propagate so the
    # traceback shows the actual problem.
    print(f"Data loaded successfully with {encoding} encoding: {len(df)} rows, {len(df.columns)} columns", file=sys.stderr)
    break
if df is None:
    print("Error: Could not load CSV file with any encoding", file=sys.stderr)
    sys.exit(1)

# Rating columns recorded at each of the three stages:
#   Stage 1: attr1_1, sinc1_1, intel1_1, fun1_1, amb1_1, shar1_1
#   Stage 2: attr2_1, sinc2_1, intel2_1, fun2_1, amb2_1, shar2_1
#   Stage 3: attr3_1, sinc3_1, intel3_1, fun3_1, amb3_1 (there is no shar3_1)
stage1_cols = ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']
stage2_cols = ['attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1']
stage3_cols = ['attr3_1', 'sinc3_1', 'intel3_1', 'fun3_1', 'amb3_1']

_csv_columns = set(df.columns)


def _present(candidates):
    """Return the candidates that exist as columns of df, in their given order."""
    return [name for name in candidates if name in _csv_columns]


# Work out which of the expected columns the loaded file actually contains
available_stage1 = _present(stage1_cols)
available_stage2 = _present(stage2_cols)
available_stage3 = _present(stage3_cols)

# Report the counts first, then the column names of every non-empty stage
print(f"Stage 1 columns available: {len(available_stage1)}", file=sys.stderr)
print(f"Stage 2 columns available: {len(available_stage2)}", file=sys.stderr)
print(f"Stage 3 columns available: {len(available_stage3)}", file=sys.stderr)
if available_stage1:
    print(f"  Stage 1: {available_stage1}", file=sys.stderr)
if available_stage2:
    print(f"  Stage 2: {available_stage2}", file=sys.stderr)
if available_stage3:
    print(f"  Stage 3: {available_stage3}", file=sys.stderr)

# Restrict the analysis to the attributes rated at all three stages:
# attr (attractiveness), intel (intelligence), fun (fun)
common_attrs = ['attr', 'intel', 'fun']
stage1_attrs = ['attr1_1', 'intel1_1', 'fun1_1']
stage2_attrs = ['attr2_1', 'intel2_1', 'fun2_1']
stage3_attrs = ['attr3_1', 'intel3_1', 'fun3_1']

# The ratings of all three stages plus the decision are needed for a usable row
required_cols = [*stage1_attrs, *stage2_attrs, *stage3_attrs, 'dec']
available_cols = list(filter(lambda name: name in df.columns, required_cols))

# available_cols is an ordered subset of required_cols, so the two lists
# differ exactly when at least one required column is absent.
if available_cols != required_cols:
    print(f"Warning: Some columns missing. Available: {available_cols}", file=sys.stderr)
    # Carry on with whichever rating columns are present
    stage1_attrs = [name for name in stage1_attrs if name in available_cols]
    stage2_attrs = [name for name in stage2_attrs if name in available_cols]
    stage3_attrs = [name for name in stage3_attrs if name in available_cols]
    print(f"Using Stage 1: {stage1_attrs}", file=sys.stderr)
    print(f"Using Stage 2: {stage2_attrs}", file=sys.stderr)
    print(f"Using Stage 3: {stage3_attrs}", file=sys.stderr)

# Keep only the rows in which every selected rating and the decision are set.
# The decision comes from 'dec' when present, otherwise from 'match', and as a
# last resort it is filled with a constant 0.
_rating_cols = stage1_attrs + stage2_attrs + stage3_attrs
_dec_present = 'dec' in df.columns

if not _dec_present:
    print("Warning: 'dec' column not found. Using match column instead.", file=sys.stderr)

if _dec_present:
    complete_data = df[_rating_cols + ['dec']].dropna()
    print(f"Rows with complete data: {len(complete_data)}", file=sys.stderr)
elif 'match' in df.columns:
    complete_data = df[_rating_cols + ['match']].dropna()
    # Expose the match flag under the name the rest of the script reads
    complete_data['dec'] = complete_data['match']
    print(f"Rows with complete data: {len(complete_data)}", file=sys.stderr)
else:
    print("Error: Neither 'dec' nor 'match' column found.", file=sys.stderr)
    complete_data = df[_rating_cols].dropna()
    complete_data['dec'] = 0  # Default value
    print(f"Rows with complete data (no decision): {len(complete_data)}", file=sys.stderr)

# Fallback: when no row survived the filtering above, retry with the final
# ratings (attr, intel, fun) standing in for the stage 3 columns.
if len(complete_data) == 0:
    print("Error: No complete data found. Checking alternative column names...", file=sys.stderr)
    # Try alternative approach: use final ratings as stage 3
    if 'attr' in df.columns and 'intel' in df.columns and 'fun' in df.columns:
        print("Using final ratings (attr, intel, fun) as stage 3", file=sys.stderr)
        stage1_attrs = ['attr1_1', 'intel1_1', 'fun1_1']
        stage2_attrs = ['attr2_1', 'intel2_1', 'fun2_1']
        stage3_attrs = ['attr', 'intel', 'fun']

        required_cols = stage1_attrs + stage2_attrs + stage3_attrs
        available_cols = [col for col in required_cols if col in df.columns]

        if len(available_cols) == len(required_cols):
            if 'dec' in df.columns:
                complete_data = df[required_cols + ['dec']].dropna()
            elif 'match' in df.columns:
                # Same substitution the primary filtering step uses when
                # 'dec' is absent, so both paths pick the decision the same way.
                complete_data = df[required_cols + ['match']].dropna()
                complete_data['dec'] = complete_data['match']
            else:
                complete_data = df[required_cols].dropna()
                complete_data['dec'] = 0
            print(f"Rows with complete data (alternative): {len(complete_data)}", file=sys.stderr)
        else:
            # Make the reason for an unusable fallback visible instead of
            # silently leaving complete_data empty.
            missing_cols = [col for col in required_cols if col not in df.columns]
            print(f"Warning: Alternative approach skipped, missing columns: {missing_cols}", file=sys.stderr)

print(f"Final dataset size: {len(complete_data)}", file=sys.stderr)

# Normalize ratings using Log transformation + Min-Max to 30-point scale
# Log transformation helps compress high values and expand low values for better distribution
if len(complete_data) > 0:
    def log_normalize_to_30(df_subset):
        """Rescale every column of df_subset onto a 0-30 range after a log transform.

        Each column is processed independently:
          1. shift the values so that the column minimum becomes 1,
          2. apply ``np.log1p`` (so the value actually logged is ``x - min + 2``),
          3. min-max scale the logged values to the interval [0, 30].

        A column whose logged values are all equal cannot be min-max scaled
        and is set to the midpoint 15 instead.

        Args:
            df_subset: DataFrame of numeric columns. It is not modified.

        Returns:
            A new DataFrame with the same index and columns as df_subset,
            holding the rescaled values.
        """
        normalized = df_subset.copy()
        for col in df_subset.columns:
            col_data = df_subset[col]
            # Shift so the smallest value maps to 1. The same shift applies
            # whatever the sign of the minimum, so no branching is needed.
            shifted = col_data - col_data.min() + 1

            # log1p(x) = log(1 + x); the argument is >= 1 here, so the result
            # is always finite and positive.
            col_log = np.log1p(shifted)

            # Min-max normalize to 0-30
            log_min = col_log.min()
            log_max = col_log.max()
            if log_max > log_min:
                normalized[col] = ((col_log - log_min) / (log_max - log_min)) * 30
            else:
                # Constant column: place it at the middle of the scale
                normalized[col] = 15
        return normalized

    # Rescale the ratings of every stage onto the shared 0-30 scale
    stage1_normalized, stage2_normalized, stage3_normalized = (
        log_normalize_to_30(complete_data[stage_attrs])
        for stage_attrs in (stage1_attrs, stage2_attrs, stage3_attrs)
    )

    # Row-wise mean of the normalized ratings, stored as one column per stage
    complete_data['stage1_avg'] = stage1_normalized.mean(axis='columns')
    complete_data['stage2_avg'] = stage2_normalized.mean(axis='columns')
    complete_data['stage3_avg'] = stage3_normalized.mean(axis='columns')

    # Summarize the spread of each stage average on stderr
    print(f"Stage 1 average range (log-normalized to 0-30): {complete_data['stage1_avg'].min():.2f} - {complete_data['stage1_avg'].max():.2f}, Mean: {complete_data['stage1_avg'].mean():.2f}", file=sys.stderr)
    print(f"Stage 2 average range (log-normalized to 0-30): {complete_data['stage2_avg'].min():.2f} - {complete_data['stage2_avg'].max():.2f}, Mean: {complete_data['stage2_avg'].mean():.2f}", file=sys.stderr)
    print(f"Stage 3 average range (log-normalized to 0-30): {complete_data['stage3_avg'].min():.2f} - {complete_data['stage3_avg'].max():.2f}, Mean: {complete_data['stage3_avg'].mean():.2f}", file=sys.stderr)

    # Create 3D line plot
    fig = go.Figure()

    # Split by decision; these two frames are only used for the summary counts
    yes_data = complete_data[complete_data['dec'] == 1]
    no_data = complete_data[complete_data['dec'] == 0]

    print(f"Decisions - Yes: {len(yes_data)}, No: {len(no_data)}", file=sys.stderr)

    # Sample a subset for visualization (too many lines would be cluttered).
    # The fixed random_state keeps the sample identical between runs.
    sample_size = min(500, len(complete_data))
    sampled_data = complete_data.sample(n=sample_size, random_state=42)

    # Separate yes and no for different styling
    yes_sampled = sampled_data[sampled_data['dec'] == 1]
    no_sampled = sampled_data[sampled_data['dec'] == 0]

    # Blue-pink color scheme
    yes_color = '#104e8b'  # deep blue for "Yes" decisions
    no_color = '#d89090'   # pink for "No" decisions

    def _trajectory_coords(rows):
        """Flatten rows into x/y/z lists holding one 3-point polyline per row.

        Consecutive polylines are separated by a None entry, which plotly
        treats as a gap, so all rows can share a single trace without being
        joined to each other.

        Args:
            rows: DataFrame with 'stage1_avg', 'stage2_avg', 'stage3_avg'
                and 'dec' columns.

        Returns:
            Tuple (xs, ys, zs) of equally long lists. For an empty frame each
            list is [None], so the trace still gets its legend entry.
        """
        xs, ys, zs = [], [], []
        for s1, s2, s3, dec in zip(rows['stage1_avg'], rows['stage2_avg'],
                                   rows['stage3_avg'], rows['dec']):
            xs.extend((1, 2, 3, None))  # the three stages
            ys.extend((float(s1), float(s2), float(s3), None))
            zs.extend((float(dec), float(dec), float(dec), None))  # decision stays constant
        if not xs:
            return [None], [None], [None]
        return xs, ys, zs

    # One trace per decision instead of one trace per respondent: the figure
    # stays small and responsive, and the legend entries toggle the lines
    # they describe.
    for label, rows, color in (('Yes', yes_sampled, yes_color),
                               ('No', no_sampled, no_color)):
        xs, ys, zs = _trajectory_coords(rows)
        fig.add_trace(go.Scatter3d(
            x=xs,
            y=ys,
            z=zs,
            mode='lines+markers',
            line=dict(color=color, width=2),
            marker=dict(size=4, color=color),
            connectgaps=False,  # keep the None separators as breaks between trajectories
            name=label,
            showlegend=True,
            hovertemplate='Stage: %{x}<br>Rating: %{y:.2f}<br>Decision: ' + label + '<extra></extra>'
        ))

    # Update layout
    # Axis titles are given as title=dict(text=..., font=...). The older
    # `titlefont` shorthand is deprecated and, it appears, no longer accepted
    # by recent plotly releases, so it is avoided here.
    axis_title_font = dict(size=14, color='#104e8b')
    axis_tick_font = dict(size=12, color='#376b9e')
    fig.update_layout(
        title={
            'text': 'Interaction Trajectory Map: Attraction Over Time',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 20, 'color': '#104e8b'}  # Deep blue title
        },
        scene=dict(
            xaxis=dict(
                title=dict(text='Stage', font=axis_title_font),
                tickvals=[1, 2, 3],
                ticktext=['Stage 1', 'Stage 2', 'Stage 3'],
                tickfont=axis_tick_font
            ),
            yaxis=dict(
                title=dict(text='Average Rating', font=axis_title_font),
                tickfont=axis_tick_font
            ),
            zaxis=dict(
                title=dict(text='Final Decision', font=axis_title_font),
                tickvals=[0, 1],
                ticktext=['No', 'Yes'],
                tickfont=axis_tick_font
            ),
            camera=dict(
                eye=dict(x=1.5, y=1.5, z=1.2),
                center=dict(x=0, y=0, z=0)
            ),
            bgcolor='white'  # Pure white background
        ),
        width=1200,
        height=900,
        legend=dict(
            x=0.02,
            y=0.98,
            bgcolor='rgba(242, 218, 218, 0.9)',  # Light pink background (#f2dada)
            # #104e8b is rgb(16, 78, 139); the red channel must be 16 for the
            # border to match the blue used for the titles.
            bordercolor='rgba(16, 78, 139, 0.8)',
            borderwidth=1.5,
            title_text='Decision',
            title_font=dict(size=13, color='#104e8b')
        ),
        font=dict(family="Arial, sans-serif", size=12)
    )

    # Save the figure as a standalone HTML file
    output_file = 'interaction_trajectory_map.html'
    fig.write_html(output_file)
    print(f"\nVisualization saved to {output_file}", file=sys.stderr)
    print("Open the HTML file in your browser to view the interactive 3D plot.", file=sys.stderr)
    print(f"SUCCESS: File saved as {output_file}")

    # Also show the figure. Displaying is best-effort because the HTML file
    # has already been written, but only ordinary errors are absorbed:
    # KeyboardInterrupt and SystemExit still propagate, and the reason for
    # the failure is reported instead of being dropped.
    try:
        fig.show()
    except Exception as exc:
        print(f"Note: Could not display the figure ({exc}); open {output_file} instead.", file=sys.stderr)
else:
    print("Error: No data available for visualization.", file=sys.stderr)
    sys.exit(1)


Loading data...
Data loaded successfully with latin-1 encoding: 8378 rows, 195 columns
Stage 1 columns available: 6
Stage 2 columns available: 6
Stage 3 columns available: 5
  Stage 1: ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']
  Stage 2: ['attr2_1', 'sinc2_1', 'intel2_1', 'fun2_1', 'amb2_1', 'shar2_1']
  Stage 3: ['attr3_1', 'sinc3_1', 'intel3_1', 'fun3_1', 'amb3_1']
Rows with complete data: 8263
Final dataset size: 8263
Stage 1 average range (log-normalized to 0-30): 10.00 - 24.07, Mean: 20.14
Stage 2 average range (log-normalized to 0-30): 8.29 - 24.75, Mean: 20.33
Stage 3 average range (log-normalized to 0-30): 7.00 - 30.00, Mean: 24.56
Decisions - Yes: 3469, No: 4794


SUCCESS: File saved as interaction_trajectory_map.html



Visualization saved to interaction_trajectory_map.html
Open the HTML file in your browser to view the interactive 3D plot.
