# Data Analysis Notebook
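This notebook analyzes participant responses from the survey export `formative.csv`: screening answers, age, occupation, dashboard usage frequency, self-reported experience, and manually coded themes about dashboard goals and questions.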

In [158]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [159]:
# Load the survey export
# The first CSV row supplies the column names. The two rows that follow it
# (zero-based positions 1 and 2) are skipped so that only response rows remain.
file_path = "formative.csv"
data_entries = pd.read_csv(file_path, header=0, skiprows=[1, 2])

# Summarise what was loaded: dimensions, a preview, and the column names
print(
    f"Shape of data_entries: {data_entries.shape}",
    "\nFirst 5 rows:",
    data_entries.head(),
    "\nAvailable columns:",
    data_entries.columns.tolist(),
    sep="\n",
)

Shape of data_entries: (48, 46)

First 5 rows:
             StartDate              EndDate      Status        IPAddress  \
0  2025-05-30 15:50:21  2025-05-30 16:16:16  IP Address   46.203.160.221   
1  2025-05-30 15:49:24  2025-05-30 16:25:15  IP Address    89.37.119.212   
2  2025-05-30 15:50:41  2025-05-30 16:29:07  IP Address     74.79.201.89   
3  2025-05-30 16:40:53  2025-05-30 17:31:07  IP Address  216.120.189.180   
4  2025-05-31 15:44:49  2025-05-31 15:56:08  IP Address   107.10.217.217   

   Progress  Duration (in seconds)  Finished         RecordedDate  \
0       100                   1554      True  2025-05-30 16:16:16   
1       100                   2150      True  2025-05-30 16:25:16   
2       100                   2306      True  2025-05-30 16:29:08   
3       100                   3014      True  2025-05-30 17:31:07   
4       100                    678      True  2025-05-31 15:56:08   

          ResponseId  RecipientLastName  ...  \
0  R_1GUEH2ertEBHL4B             

In [160]:
# Data Cleaning
# Split the raw export into its header rows and its response rows.
#
# Layout of the raw file (read without a header):
#   row 0 -> question names (reused as column names for the responses)
#   row 1 -> question bodies (full question text)
#   row 2 -> ImportId metadata (not used in the analysis)
#   row 3 onwards -> participant responses
HEADER_ROW_COUNT = 3

# Read the file without a header so the three header rows stay available as data.
# `data` used to be taken from whatever was left in the kernel; creating it here
# lets the notebook run from a fresh kernel.
data = pd.read_csv(file_path, header=None)

# Remove only rows that are completely empty (e.g. trailing blank lines).
# A blanket dropna() would also discard every response with a single blank field,
# which can leave no response rows at all.
data = data.dropna(how="all")
print("Data after cleaning:")
print(data.head())

if len(data) < HEADER_ROW_COUNT:
    raise ValueError(
        f"Expected at least {HEADER_ROW_COUNT} header rows in {file_path!r}, "
        f"found {len(data)} rows."
    )

# Question names are from the first row of 'data' (CSV row 1, index 0)
df_question_names = pd.Series(data.iloc[0], name='QuestionName')

# Question bodies are from the second row of 'data' (CSV row 2, index 1)
df_question_bodies = pd.Series(data.iloc[1], name='QuestionBody')

# Responses start after the header rows; the question names become the column names.
df_data_entries = data.iloc[HEADER_ROW_COUNT:].copy()
df_data_entries.columns = data.iloc[0]
df_data_entries = df_data_entries.reset_index(drop=True)

print("\nExtracted Question Names (from CSV row 1):")
print(df_question_names.head())
print("\nExtracted Question Bodies (from CSV row 2):")
print(df_question_bodies.head())
print("\nActual Data Entries (from CSV row 4 onwards, with headers from CSV row 1):")
if df_data_entries.empty:
    print("df_data_entries is EMPTY. Check CSV structure and slicing logic.")
else:
    print(df_data_entries.head())

print(f"\nShape of df_data_entries: {df_data_entries.shape}")
print(f"Number of rows in df_data_entries: {len(df_data_entries)}")

# The analysis cells below work on `data_entries` (loaded in the previous cell);
# the frames built here only keep the question text available for reference.
total_entries = len(data_entries)
print(f"Total number of data entries: {total_entries}")

Data after cleaning:
                                                  0   \
0                                          StartDate   
1                                         Start Date   
2  {"ImportId":"startDate","timeZone":"America/To...   

                                                  1                      2   \
0                                            EndDate                 Status   
1                                           End Date          Response Type   
2  {"ImportId":"endDate","timeZone":"America/Toro...  {"ImportId":"status"}   

                         3                        4                        5   \
0                 IPAddress                 Progress    Duration (in seconds)   
1                IP Address                 Progress    Duration (in seconds)   
2  {"ImportId":"ipAddress"}  {"ImportId":"progress"}  {"ImportId":"duration"}   

                        6                                                  7   \
0                 Finished     

In [161]:
# Report how many response rows were loaded (one row per data entry)
total_entries = data_entries.shape[0]
print(f"Total number of data entries: {total_entries}")

Total number of data entries: 48


In [162]:
# Create plot for the Screening data
import plotly.express as px

# First column whose lower-cased name contains "screening", or None when absent
screening_column = next(
    (name for name in data_entries.columns if 'screening' in str(name).lower()),
    None,
)

if not screening_column:
    print("Could not find a column related to 'Screening'.")
    print("Available columns:", data_entries.columns.tolist())
else:
    print(f"Found screening column: '{screening_column}'")

    # Tally every distinct answer, then give the two resulting columns plotting names
    response_tally = data_entries[screening_column].value_counts()
    answer_counts = response_tally.reset_index()
    answer_counts.columns = ['Response', 'Count']

    # Show the tally as text before plotting it
    print("\nUnique values in the screening column:")
    print(answer_counts)

    # Interactive bar chart: one bar (and legend entry) per answer, count shown as text
    fig = px.bar(
        answer_counts,
        x='Response',
        y='Count',
        color='Response',
        text='Count',
        title=f'Distribution of Responses for "{screening_column}"',
        height=500,
    )

    # Axis/legend titles, centred title, colours, and room for the angled tick labels
    screening_layout = dict(
        xaxis_title="Response Category",
        yaxis_title="Number of Participants",
        legend_title="Responses",
        title_x=0.5,
        plot_bgcolor='rgba(240,240,240,0.95)',
        paper_bgcolor='white',
        font=dict(family="Arial, sans-serif", size=12, color="#333"),
        margin=dict(l=80, r=40, t=80, b=100),
        xaxis_tickangle=-30,
    )
    fig.update_layout(**screening_layout)

    # Put the count above each bar and use a compact hover box
    fig.update_traces(
        texttemplate='%{text}',
        textposition='outside',
        hovertemplate="<b>Response:</b> %{x}<br><b>Count:</b> %{y}<extra></extra>",
    )

    fig.show()



Found screening column: 'Screening'

Unique values in the screening column:
                                            Response  Count
0                                     Yes, regularly     35
1                                  Yes, occasionally      9
2  I haven’t used them a lot, but I understand ho...      3


In [163]:
# Analyze and visualize the age distribution
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Locate the age column. A column named exactly "age" (case-insensitive) wins;
# otherwise fall back to the first column whose name merely contains "age".
# NOTE(review): the fallback is a plain substring match, so it can also pick up
# unrelated names such as "Language" or "Usage" - check the printed column name.
age_column = next(
    (col for col in data_entries.columns if str(col).lower() == 'age'),
    None,
)
if not age_column:
    age_column = next(
        (col for col in data_entries.columns if 'age' in str(col).lower()),
        None,
    )

if age_column:
    print(f"Found age column: '{age_column}'")
    
    # Convert age to numeric, coercing any non-numeric values to NaN
    data_entries[age_column] = pd.to_numeric(data_entries[age_column], errors='coerce')
    
    # Drop NaN values for analysis
    age_data = data_entries[age_column].dropna()
    
    if not age_data.empty:
        # Calculate basic statistics
        age_mean = age_data.mean()
        age_median = age_data.median()
        age_min = age_data.min()
        age_max = age_data.max()
        age_std = age_data.std()
        
        print(f"\nAge Statistics:")
        print(f"Mean age: {age_mean:.1f}")
        print(f"Median age: {age_median:.1f}")
        print(f"Age range: {age_min:.0f} to {age_max:.0f}")
        print(f"Standard deviation: {age_std:.1f}")
        
        # Create age groups for categorization.
        # Bins are left-closed ([20, 30), [30, 40), ...). The last edge is infinite so
        # that the '70+' label really covers every age from 70 upwards; with a closed
        # upper edge of 80, ages of 80 or more were silently left out of the chart.
        bins = [20, 30, 40, 50, 60, 70, np.inf]
        labels = ['20-29', '30-39', '40-49', '50-59', '60-69', '70+']
        age_groups = pd.cut(age_data, bins=bins, labels=labels, right=False)
        age_group_counts = age_groups.value_counts().sort_index()
        
        # Ages below the first bin edge have no group; say so instead of hiding them.
        ungrouped_count = int(age_groups.isna().sum())
        if ungrouped_count:
            print(
                f"Note: {ungrouped_count} participant(s) younger than {bins[0]} "
                "are not shown in the age-group chart."
            )
        
        # Create a subplot with histogram (top, 70% of height) and box plot (bottom)
        fig = make_subplots(
            rows=2, 
            cols=1,
            subplot_titles=("Age Distribution Histogram", "Age Box Plot"),
            vertical_spacing=0.2,
            row_heights=[0.7, 0.3]
        )
        
        # Add histogram trace
        fig.add_trace(
            go.Histogram(
                x=age_data,
                nbinsx=10,
                marker_color='royalblue',
                opacity=0.75,
                hovertemplate="Age: %{x}<br>Count: %{y}<extra></extra>",
                name="Age Histogram"
            ),
            row=1, col=1
        )
        
        # Add box plot trace
        fig.add_trace(
            go.Box(
                x=age_data,
                marker_color='indianred',
                boxpoints='all',  # Show all points
                jitter=0.3,       # Add jitter to points for better visibility
                pointpos=-1.8,    # Offset of the points relative to the box
                hovertemplate="Age: %{x}<extra></extra>",
                name="Age Box Plot"
            ),
            row=2, col=1
        )
        
        # Update layout
        fig.update_layout(
            title_text="Age Distribution Analysis",
            title_x=0.5,
            height=700,
            showlegend=False,
            plot_bgcolor='rgba(240,240,240,0.95)',
            paper_bgcolor='white',
            font=dict(family="Arial, sans-serif", size=12, color="#333"),
            margin=dict(l=80, r=40, t=80, b=40)
        )
        
        # Update x-axes and y-axes
        fig.update_xaxes(title_text="Age", row=2, col=1)
        fig.update_yaxes(title_text="Count", row=1, col=1)
        
        fig.show()
        
        # Create an additional bar chart for age groups
        fig_bar = px.bar(
            x=age_group_counts.index,
            y=age_group_counts.values,
            labels={'x': 'Age Group', 'y': 'Number of Participants'},
            title='Distribution by Age Group',
            text=age_group_counts.values,  # Display the count on each bar
            color=age_group_counts.index,  # Color bars by age group
            color_discrete_sequence=px.colors.qualitative.Pastel
        )
        
        fig_bar.update_layout(
            title_x=0.5,
            plot_bgcolor='rgba(240,240,240,0.95)',
            paper_bgcolor='white',
            font=dict(family="Arial, sans-serif", size=12, color="#333"),
            margin=dict(l=80, r=40, t=80, b=40)
        )
        
        fig_bar.update_traces(
            texttemplate='%{text}',
            textposition='outside',
            hovertemplate="Age Group: %{x}<br>Count: %{y}<extra></extra>"
        )
        
        fig_bar.show()
        
    else:
        print("No valid age data found (all values are non-numeric or NaN).")
else:
    print("Could not find a column related to 'Age'.")
    print(f"Available columns: {data_entries.columns.tolist()}")

Found age column: 'Age'

Age Statistics:
Mean age: 35.4
Median age: 30.5
Age range: 21 to 67
Standard deviation: 13.6


In [164]:
# Plot the main occupation categories
import plotly.express as px
import pandas as pd

# Occupation categories and participant counts, entered by hand from manual coding.
# NOTE(review): these counts add up to 51, while the load cell reports 48 data
# entries - confirm whether some participants were assigned to several categories.
occupation_data = {
    'Category': [
        'Management',
        'Data/Analytics',
        'Engineering/Technical',
        'Healthcare',
        'Technology',
        'Other Professional',
        'Executive',
        'Student',
    ],
    'Count': [14, 12, 5, 5, 4, 8, 1, 2],
}

# Tabulate the categories and order them from most to least common
occupation_df = pd.DataFrame(occupation_data).sort_values('Count', ascending=False)

# One coloured bar per category with its count as the bar text
fig = px.bar(
    occupation_df,
    x='Category',
    y='Count',
    color='Category',
    text='Count',
    title='Distribution of Participants by Occupation Category',
    labels={'Count': 'Number of Participants', 'Category': 'Occupation Category'},
    height=600,
    color_discrete_sequence=px.colors.qualitative.Bold,
)

# Centred title, light plot background, angled category labels
occupation_layout = dict(
    title_x=0.5,
    plot_bgcolor='rgba(240,240,240,0.95)',
    paper_bgcolor='white',
    font=dict(family="Arial, sans-serif", size=13, color="#333"),
    margin=dict(l=80, r=40, t=80, b=80),
    xaxis_tickangle=-30,
)
fig.update_layout(**occupation_layout)

# Counts above the bars, white bar outlines, compact hover text
occupation_trace_style = dict(
    texttemplate='%{text}',
    textposition='outside',
    hovertemplate='<b>%{x}</b><br>Count: %{y}<extra></extra>',
    marker_line_width=1.5,
    marker_line_color='white',
    opacity=0.85,
)
fig.update_traces(**occupation_trace_style)

fig.show()

# Share of each category relative to the sum of all category counts
total = occupation_df['Count'].sum()
occupation_df['Percentage'] = (occupation_df['Count'] / total * 100).round(1)
print("Occupation Categories by Percentage:")
for category, count, percentage in zip(
    occupation_df['Category'], occupation_df['Count'], occupation_df['Percentage']
):
    print(f"{category}: {count} participants ({percentage}%)")

Occupation Categories by Percentage:
Management: 14 participants (27.5%)
Data/Analytics: 12 participants (23.5%)
Other Professional: 8 participants (15.7%)
Engineering/Technical: 5 participants (9.8%)
Healthcare: 5 participants (9.8%)
Technology: 4 participants (7.8%)
Student: 2 participants (3.9%)
Executive: 1 participants (2.0%)


In [165]:
# Analyze and visualize the dashboard usage frequency (Frequent_1)
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# First column whose lower-cased name contains "frequent_1", or None when absent
freq_column = next(
    (name for name in data_entries.columns if 'frequent_1' in str(name).lower()),
    None,
)

if not freq_column:
    print("Could not find a column related to 'Frequent_1'.")
    print(f"Available columns: {data_entries.columns.tolist()}")
else:
    print(f"Found frequency column: '{freq_column}'")

    # Turn the answers into numbers in place; non-numeric answers become NaN
    data_entries[freq_column] = pd.to_numeric(data_entries[freq_column], errors='coerce')

    # Only rows with a numeric answer take part in the analysis
    freq_data = data_entries[freq_column].dropna()

    if freq_data.empty:
        print("No valid frequency data found (all values are non-numeric or NaN).")
    else:
        # Central tendency of the answers; the mode is the first most common value
        freq_mean = freq_data.mean()
        freq_median = freq_data.median()
        freq_mode = freq_data.mode()[0]

        # Number of participants per answer value, ordered by the value itself
        freq_counts = freq_data.value_counts().sort_index()

        # Display text for each point of the scale; only the endpoints carry a name
        freq_labels = {
            1: "1 (Rarely)",
            2: "2",
            3: "3",
            4: "4",
            5: "5 (Daily)"
        }

        # Display labels and counts in scale order, shared by both charts below.
        # Values missing from freq_labels fall back to the raw value.
        labels = [freq_labels.get(int(idx), idx) for idx in freq_counts.index]
        values = freq_counts.values

        # Plot-ready table: label, count, and the raw scale value (drives the colour)
        freq_df = pd.DataFrame({
            'Frequency': labels,
            'Count': values,
            'Raw_Value': freq_counts.index
        })

        print("\nDashboard Usage Frequency Statistics:")
        print(f"Mean frequency: {freq_mean:.2f}")
        print(f"Median frequency: {freq_median:.1f}")
        print(f"Most common frequency: {freq_mode} ({freq_labels.get(int(freq_mode), freq_mode)})")
        print("\nFrequency Distribution:")
        for idx, count in freq_counts.items():
            print(f"{freq_labels.get(int(idx), idx)}: {count} participants ({count/len(freq_data)*100:.1f}%)")

        # Bar chart coloured on a continuous scale by the raw value
        fig = px.bar(
            freq_df,
            x='Frequency',
            y='Count',
            color='Raw_Value',
            text='Count',
            title='How Frequently Participants Use Dashboards',
            labels={'Count': 'Number of Participants', 'Frequency': 'Usage Frequency'},
            height=500,
            color_continuous_scale='Viridis',
        )

        # Centred title, light background; the colour bar is hidden
        frequency_layout = dict(
            title_x=0.5,
            plot_bgcolor='rgba(240,240,240,0.95)',
            paper_bgcolor='white',
            font=dict(family="Arial, sans-serif", size=13, color="#333"),
            margin=dict(l=80, r=40, t=80, b=60),
            coloraxis_showscale=False,
        )
        fig.update_layout(**frequency_layout)

        # Counts above the bars, white outlines, compact hover text
        fig.update_traces(
            texttemplate='%{text}',
            textposition='outside',
            hovertemplate='<b>%{x}</b><br>Count: %{y} participants<extra></extra>',
            marker_line_width=1.5,
            marker_line_color='white',
            opacity=0.85,
        )

        fig.show()

        # Donut chart (pie with a hole) showing each answer's share
        frequency_pie = go.Pie(
            labels=labels,
            values=values,
            hole=.4,
            textinfo='label+percent',
            insidetextorientation='radial',
            marker=dict(
                colors=px.colors.sequential.Viridis,
                line=dict(color='white', width=2)
            )
        )
        fig_donut = go.Figure(data=[frequency_pie])

        # The annotation places the word "Frequency" in the middle of the donut
        fig_donut.update_layout(
            title_text='Distribution of Dashboard Usage Frequency',
            title_x=0.5,
            font=dict(family="Arial, sans-serif", size=13),
            annotations=[dict(text='Frequency', x=0.5, y=0.5, font_size=15, showarrow=False)],
            height=500,
            showlegend=True,
            legend_title="Usage Frequency",
        )

        fig_donut.show()

Found frequency column: 'Frequent_1'

Dashboard Usage Frequency Statistics:
Mean frequency: 4.02
Median frequency: 4.0
Most common frequency: 4 (4)

Frequency Distribution:
1 (Rarely): 3 participants (6.2%)
2: 2 participants (4.2%)
3: 4 participants (8.3%)
4: 21 participants (43.8%)
5 (Daily): 18 participants (37.5%)


In [166]:
# Occupation categories from the manual coding.
# Each "###" line is a category with its total; the lines below it are the
# roles counted in that category (participant counts in parentheses).

# ### Data/Analytics Professionals (12)
# Data Scientists/Analysts (8)
# Support Data Analyst (1)
# Data Science (1)
# Data Analysts (2)

# ### Management (14)
# Managers/Senior Managers (9)
# IT Managers (2)
# Middle Management (1)
# Supervisors (2)

# ### Healthcare Professionals (5)
# Doctors (3)
# Chief Medical Officer (1)
# Nurse (1)

# ### Technology Professionals (4)
# IT Professionals (1)
# Software Developers (2)
# Finance Advisor (IT) (1)

# ### Engineering/Technical Roles (5)
# Engineer (Civil/Site) (2)
# Artisan/Electrician (1)
# Mechanic (1)
# Design Engineer (1)

# ### Other Professional Roles (8)
# Professor/Teacher (2)
# Account Management/Consultant (1)
# Human Resources Manager (1)
# Legal Practitioner (1)
# Researcher (1)
# Material Handler/Water Spider (1)
# Retail Sales Rep (1)

# ### Executive (1)
# CEO (1)

# ### Student (2)

In [167]:
# Analyze and visualize user experience level (Experience_1)
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Find the column related to "Experience_1" (case-insensitive)
exp_column = None
for col in data_entries.columns:
    if 'experience_1' in str(col).lower():
        exp_column = col
        break

if exp_column:
    print(f"Found experience column: '{exp_column}'")
    
    # Convert experience values to numeric, coercing any non-numeric values to NaN
    data_entries[exp_column] = pd.to_numeric(data_entries[exp_column], errors='coerce')
    
    # Drop NaN values for analysis
    exp_data = data_entries[exp_column].dropna()
    
    if not exp_data.empty:
        # Calculate basic statistics
        exp_mean = exp_data.mean()
        exp_median = exp_data.median()
        exp_mode = exp_data.mode()[0]  # Most common value
        
        # Count occurrences of each experience value
        exp_counts = exp_data.value_counts().sort_index()
        
        # Create labels for the experience scale
        exp_labels = {
            1: "1 (Novice)",
            2: "2",
            3: "3",
            4: "4",
            5: "5 (Expert)"
        }
        
        # Create a DataFrame for better plotting
        exp_df = pd.DataFrame({
            'Experience': [exp_labels.get(int(idx), idx) for idx in exp_counts.index],
            'Count': exp_counts.values,
            'Raw_Value': exp_counts.index
        })
        
        print("\nDashboard Experience Level Statistics:")
        print(f"Mean experience: {exp_mean:.2f}")
        print(f"Median experience: {exp_median:.1f}")
        print(f"Most common experience level: {exp_mode} ({exp_labels.get(int(exp_mode), exp_mode)})")
        print("\nExperience Level Distribution:")
        for idx, count in exp_counts.items():
            print(f"{exp_labels.get(int(idx), idx)}: {count} participants ({count/len(exp_data)*100:.1f}%)")
        
        # Create a bar chart
        fig = px.bar(
            exp_df, 
            x='Experience', 
            y='Count',
            color='Raw_Value',
            text='Count',
            title='Participants\' Self-Reported Experience Level with Dashboards',
            labels={'Count': 'Number of Participants', 'Experience': 'Experience Level'},
            height=500,
            color_continuous_scale='Viridis'  # Color gradient from low to high experience
        )
        
        # Improve layout
        fig.update_layout(
            title_x=0.5,
            plot_bgcolor='rgba(240,240,240,0.95)',
            paper_bgcolor='white',
            font=dict(family="Arial, sans-serif", size=13, color="#333"),
            margin=dict(l=80, r=40, t=80, b=60),
            coloraxis_showscale=False  # Hide the color scale
        )
        
        # Add counts on top of bars
        fig.update_traces(
            texttemplate='%{text}',
            textposition='outside',
            hovertemplate='<b>%{x}</b><br>Count: %{y} participants<extra></extra>',
            marker_line_width=1.5,
            marker_line_color='white',
            opacity=0.85
        )
        
        # Display the chart
        fig.show()
        
        # Create a donut chart for proportions
        labels = [exp_labels.get(int(idx), idx) for idx in exp_counts.index]
        values = exp_counts.values
        
        fig_donut = go.Figure(data=[go.Pie(
            labels=labels,
            values=values,
            hole=.4,
            textinfo='label+percent',
            insidetextorientation='radial',
            marker=dict(
                colors=px.colors.sequential.Inferno,  # Different color scheme than Frequent_1
                line=dict(color='white', width=2)
            )
        )])
        
        fig_donut.update_layout(
            title_text='Distribution of Dashboard Experience Levels',
            title_x=0.5,
            font=dict(family="Arial, sans-serif", size=13),
            annotations=[dict(text='Experience', x=0.5, y=0.5, font_size=15, showarrow=False)],
            height=500,
            showlegend=True,
            legend_title="Experience Level"
        )
        
        fig_donut.show()
        
        # Compare Frequency of Use (Frequent_1) with Experience Level (Experience_1).
        # The pairs are built row by row from data_entries, so each point is one
        # participant. Pairing two separately NaN-filtered series by position only
        # lines up when both happen to drop the same rows.
        usage_freq_column = next(
            (col for col in data_entries.columns if 'frequent_1' in str(col).lower()),
            None,
        )
        
        if usage_freq_column is not None:
            paired = pd.DataFrame({
                'frequency': pd.to_numeric(data_entries[usage_freq_column], errors='coerce'),
                'experience': data_entries[exp_column],
            }).dropna()
        else:
            paired = pd.DataFrame(columns=['frequency', 'experience'])
        
        if len(paired) >= 2:
            freq_paired = paired['frequency'].to_numpy(dtype=float)
            exp_paired = paired['experience'].to_numpy(dtype=float)
            
            # Create a scatter plot to show relationship
            fig_scatter = px.scatter(
                x=freq_paired,
                y=exp_paired,
                labels={'x': 'Frequency of Use', 'y': 'Experience Level'},
                title='Relationship between Frequency of Use and Experience Level',
                marginal_x='histogram',
                marginal_y='histogram',
                height=600,
                width=800,
                color_discrete_sequence=['darkblue']
            )
            
            # Improve scatter plot (only the marker traces, not the marginal histograms)
            fig_scatter.update_traces(
                marker=dict(size=12, opacity=0.6, line=dict(width=1, color='white')),
                selector=dict(mode='markers')
            )
            
            # Background, font and margins
            fig_scatter.update_layout(
                plot_bgcolor='rgba(240,240,240,0.95)',
                paper_bgcolor='white',
                font=dict(family="Arial, sans-serif", size=13, color="#333"),
                margin=dict(l=80, r=40, t=80, b=60)
            )
            
            # Add jitter to avoid overplotting identical points.
            # A fixed seed keeps the figure identical from run to run.
            jitter = 0.2
            rng = np.random.default_rng(42)
            random_x_offset = rng.uniform(-jitter, jitter, len(freq_paired))
            random_y_offset = rng.uniform(-jitter, jitter, len(exp_paired))
            
            fig_scatter.add_trace(go.Scatter(
                x=freq_paired + random_x_offset,
                y=exp_paired + random_y_offset,
                mode='markers',
                marker=dict(
                    size=15,
                    color='rgba(60, 80, 160, 0.7)',
                    line=dict(width=1, color='white')
                ),
                hovertemplate='Frequency: %{x}<br>Experience: %{y}<extra></extra>'
            ))
            
            # Add a least-squares trend line; a straight-line fit needs at least
            # two distinct x values.
            if np.ptp(freq_paired) > 0:
                z = np.polyfit(freq_paired, exp_paired, 1)
                p = np.poly1d(z)
                x_range = np.linspace(freq_paired.min()-0.5, freq_paired.max()+0.5, 100)
                
                fig_scatter.add_trace(go.Scatter(
                    x=x_range, 
                    y=p(x_range),
                    mode='lines',
                    name=f'Trend Line (y={z[0]:.2f}x+{z[1]:.2f})',
                    line=dict(color='red', width=2, dash='dash')
                ))
            
            fig_scatter.show()
        else:
            print("Skipping the frequency/experience comparison: fewer than two "
                  "participants answered both questions.")
        
    else:
        print("No valid experience data found (all values are non-numeric or NaN).")
else:
    print("Could not find a column related to 'Experience_1'.")
    print(f"Available columns: {data_entries.columns.tolist()}")

Found experience column: 'Experience_1'

Dashboard Experience Level Statistics:
Mean experience: 3.94
Median experience: 4.0
Most common experience level: 4 (4)

Experience Level Distribution:
1 (Novice): 2 participants (4.2%)
2: 1 participants (2.1%)
3: 6 participants (12.5%)
4: 28 participants (58.3%)
5 (Expert): 11 participants (22.9%)


In [168]:
# Manual Thematic Analysis of Dashboard Usage Goals
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Hand-coded themes for the dashboard-goal responses. Each theme carries its
# mention count, a one-line description, and the keywords associated with it.
goals_themes = {
    'Monitoring': {
        'count': 32,
        'description': 'Real-time monitoring of KPIs, metrics, performance tracking',
        'keywords': ['monitoring', 'track', 'performance', 'real-time', 'KPIs', 'metrics']
    },
    'Decision Making': {
        'count': 37,
        'description': 'Using dashboards to make informed, data-driven decisions',
        'keywords': ['decision', 'decisions', 'informed', 'data-driven']
    },
    'Data Exploration': {
        'count': 18,
        'description': 'Exploring data to identify trends, patterns, and insights',
        'keywords': ['exploration', 'identify', 'trends', 'patterns', 'insights']
    },
    'Communication': {
        'count': 10,
        'description': 'Sharing information and insights with stakeholders',
        'keywords': ['communication', 'sharing', 'reporting', 'visualizing']
    },
    'Problem Solving': {
        'count': 12,
        'description': 'Identifying and troubleshooting issues or anomalies',
        'keywords': ['troubleshoot', 'anomalies', 'issues', 'problems']
    },
    'Process Optimization': {
        'count': 8,
        'description': 'Improving workflows and optimizing processes',
        'keywords': ['improve', 'optimize', 'efficiency', 'workflow', 'streamline']
    },
    'Strategic Planning': {
        'count': 7,
        'description': 'Long-term planning and strategic decision making',
        'keywords': ['strategic', 'planning', 'objectives', 'goals']
    }
}

# One row per theme (name, mention count, description), most mentioned first
themes_df = pd.DataFrame({
    'Theme': list(goals_themes),
    'Count': [info['count'] for info in goals_themes.values()],
    'Description': [info['description'] for info in goals_themes.values()],
}).sort_values('Count', ascending=False)

# Bar chart: one coloured bar per theme with the mention count as bar text
fig = px.bar(
    themes_df,
    x='Theme',
    y='Count',
    color='Theme',
    text='Count',
    title='Primary Goals When Using Dashboards',
    labels={'Count': 'Number of Mentions', 'Theme': 'Goal Category'},
    height=600,
    color_discrete_sequence=px.colors.qualitative.Bold,
)

# Centred title, light plot background, angled theme labels
goals_layout = dict(
    title_x=0.5,
    plot_bgcolor='rgba(240,240,240,0.95)',
    paper_bgcolor='white',
    font=dict(family="Arial, sans-serif", size=13, color="#333"),
    margin=dict(l=80, r=40, t=80, b=80),
    xaxis_tickangle=-30,
)
fig.update_layout(**goals_layout)

# Counts above the bars, white outlines, compact hover text
fig.update_traces(
    texttemplate='%{text}',
    textposition='outside',
    hovertemplate='<b>%{x}</b><br>Count: %{y}<extra></extra>',
    marker_line_width=1.5,
    marker_line_color='white',
    opacity=0.85,
)

fig.show()

# Donut chart (pie with a hole) showing each theme's share of all mentions
goals_pie = go.Pie(
    labels=themes_df['Theme'],
    values=themes_df['Count'],
    hole=.4,
    textinfo='label+percent',
    insidetextorientation='radial',
    marker=dict(
        colors=px.colors.qualitative.Bold,
        line=dict(color='white', width=2)
    )
)
fig_donut = go.Figure(data=[goals_pie])

# The annotation places the word "Goals" in the middle of the donut
fig_donut.update_layout(
    title_text='Distribution of Dashboard Usage Goals',
    title_x=0.5,
    font=dict(family="Arial, sans-serif", size=13),
    annotations=[dict(text='Goals', x=0.5, y=0.5, font_size=15, showarrow=False)],
    height=500,
    showlegend=True,
    legend_title="Goal Category",
)

fig_donut.show()

# Text summary: every theme with its share of all mentions and its description
print("Summary of Dashboard Usage Goals:")
print("=" * 50)
total = sum(themes_df['Count'])
for theme, count, description in zip(
    themes_df['Theme'], themes_df['Count'], themes_df['Description']
):
    percentage = (count / total * 100)
    print(f"{theme}: {count} mentions ({percentage:.1f}%)")
    print(f"   {description}")
    print("-" * 50)

# Hand-written interpretation of the ranking above
key_insight_lines = (
    "\nKey Insights:",
    "1. Decision making is the most commonly mentioned goal for dashboard usage,",
    "   showing that dashboards primarily serve as tools to support data-driven decisions.",
    "2. Monitoring is the second most common goal, indicating the importance of",
    "   real-time tracking of metrics and KPIs in dashboard applications.",
    "3. Data exploration ranks third, highlighting that users value dashboards for",
    "   discovering trends and patterns in their data.",
    "4. Many goals overlap and are interconnected - for example, monitoring and",
    "   exploration often lead to decision making and problem solving.",
    "5. Communication appears as a distinct theme, showing how dashboards serve",
    "   as tools for sharing insights across teams and with stakeholders.",
)
for insight_line in key_insight_lines:
    print(insight_line)

Summary of Dashboard Usage Goals:
Decision Making: 37 mentions (29.8%)
   Using dashboards to make informed, data-driven decisions
--------------------------------------------------
Monitoring: 32 mentions (25.8%)
   Real-time monitoring of KPIs, metrics, performance tracking
--------------------------------------------------
Data Exploration: 18 mentions (14.5%)
   Exploring data to identify trends, patterns, and insights
--------------------------------------------------
Problem Solving: 12 mentions (9.7%)
   Identifying and troubleshooting issues or anomalies
--------------------------------------------------
Communication: 10 mentions (8.1%)
   Sharing information and insights with stakeholders
--------------------------------------------------
Process Optimization: 8 mentions (6.5%)
   Improving workflows and optimizing processes
--------------------------------------------------
Strategic Planning: 7 mentions (5.6%)
   Long-term planning and strategic decision making
------------

# Thematic Analysis of Dashboard Questions by Exploration Depth

Below is an analysis of the types of questions participants try to answer using dashboards. The responses have been manually categorized based on the depth of exploration required, from simple monitoring to complex analysis.

In [169]:
# Manual thematic analysis of dashboard questions by exploration depth
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

# Catalogue of manually coded question themes, keyed by theme name.
# `exploration_level` runs from 1 (shallowest) to 8 (deepest exploration).
question_themes = {
    'Simple Status Monitoring': dict(
        count=28,
        exploration_level=1,
        description='Basic monitoring questions that require minimal exploration; typically yes/no or simple metric lookups',
        examples=[
            'Are we meeting our key performance indicators?',
            'How many patients were admitted today?',
            'What is our current stock turnover rate?',
            'Are we on schedule?',
        ],
    ),
    'Descriptive Reporting': dict(
        count=32,
        exploration_level=2,
        description='Questions that describe what is happening now or recently; requires some basic data aggregation',
        examples=[
            'What are our sales of this month?',
            'How are employee metrics varying across departments?',
            'Which products are performing best?',
            'What is our current resource utilization?',
        ],
    ),
    'Temporal Trend Exploration': dict(
        count=24,
        exploration_level=3,
        description='Questions exploring patterns and changes over time; requires deeper data exploration',
        examples=[
            'What are the current trends in our data?',
            'How have sales trended over the past year?',
            'Are patients wait times increasing?',
            'What trends are emerging in our metrics?',
        ],
    ),
    'Comparative Analysis': dict(
        count=18,
        exploration_level=4,
        description='Questions comparing different segments, periods, or metrics; requires detailed cross-sectional analysis',
        examples=[
            'How does our performance compare to last year?',
            'How do different classes or groups compare in outcomes?',
            'Which regions are outperforming others and why?',
            'How do our different products compare in various markets?',
        ],
    ),
    'Anomaly Detection': dict(
        count=15,
        exploration_level=5,
        description='Questions about outliers, exceptions, and unusual patterns; requires sophisticated pattern recognition',
        examples=[
            'Are there any unexpected changes or anomalies in the data?',
            'Where are performance gaps occurring?',
            'What is causing delays during the production process?',
            'Which areas require immediate attention or action?',
        ],
    ),
    'Diagnostic/Root Cause': dict(
        count=12,
        exploration_level=6,
        description='Questions about why things are happening; requires deep investigative analysis',
        examples=[
            "What's driving our results?",
            'Which variables are most closely correlated for future predictions?',
            'What are the key drivers of our revenue?',
            'Why are we seeing these particular patterns?',
        ],
    ),
    'Predictive Insights': dict(
        count=9,
        exploration_level=7,
        description='Questions about what will happen in the future; requires predictive modeling and forecasting',
        examples=[
            'What will our sales look like next quarter based on current trends?',
            'How will changes in one area impact other parts of the organization?',
            'Which customers are likely to churn?',
            'What is our expected resource demand for next month?',
        ],
    ),
    'Prescriptive Action': dict(
        count=17,
        exploration_level=8,
        description='Questions about what actions to take; requires synthesis of insights and strategic thinking',
        examples=[
            'What can we do better?',
            'What actions should be taken based on the current data?',
            'How can we solve the challenges we face?',
            'Where should we allocate more resources for maximum impact?',
        ],
    ),
}

# Flatten the catalogue into one row per theme, ordered by exploration level.
# Only the first two example questions are kept, joined into a short preview.
themes_df = pd.DataFrame([
    {
        'Theme': theme_name,
        'Count': spec['count'],
        'Exploration_Level': spec['exploration_level'],
        'Description': spec['description'],
        'Examples': ', '.join(spec['examples'][:2]) + '...',
    }
    for theme_name, spec in question_themes.items()
]).sort_values(by='Exploration_Level')

# Share of all mentions per theme, as a percentage rounded to one decimal
total = themes_df['Count'].sum()
themes_df['Percentage'] = themes_df['Count'].div(total).mul(100).round(1)

# Pick one Viridis swatch per theme by sampling the palette at evenly spaced
# positions between its first and last entry.
colors = px.colors.sequential.Viridis
color_scale = np.linspace(0, 1, len(themes_df))
last_swatch = len(colors) - 1
theme_colors = [colors[int(position * last_swatch)] for position in color_scale]

# Bar chart: mentions per question type, shaded by exploration level
fig = px.bar(
    themes_df,
    x='Theme',
    y='Count',
    text='Count',
    color='Exploration_Level',
    color_continuous_scale='Viridis',
    labels={
        'Count': 'Number of Mentions',
        'Theme': 'Question Type',
        'Exploration_Level': 'Exploration Level',
    },
    title='Dashboard Questions by Exploration Depth',
    height=600,
)

# Layout: centred title, bars kept in exploration-level order, and angled
# category labels for better readability.
fig.update_layout(
    title_x=0.5,
    xaxis=dict(
        categoryorder='array',
        categoryarray=themes_df['Theme'].tolist(),
        tickangle=-45,
    ),
    font={'family': "Arial, sans-serif", 'size': 13, 'color': "#333"},
    margin={'l': 80, 'r': 40, 't': 80, 'b': 120},
    paper_bgcolor='white',
    plot_bgcolor='rgba(240,240,240,0.95)',
)

# Counts above each bar, plus a hover card that also reports the level
fig.update_traces(
    textposition='outside',
    texttemplate='%{text}',
    hovertemplate='<b>%{x}</b><br>Count: %{y}<br>Exploration Level: %{marker.color:.0f}/8<extra></extra>',
    opacity=0.85,
    marker_line_color='white',
    marker_line_width=1.5,
)

# Show the chart
fig.show()

# Scatter plot of exploration depth against frequency; bubble size repeats the
# count and every theme gets its own colour and text label.
fig_scatter = px.scatter(
    themes_df,
    x='Exploration_Level',
    y='Count',
    text='Theme',
    color='Theme',
    size='Count',
    size_max=40,
    labels={
        'Exploration_Level': 'Exploration Depth (1-8)',
        'Count': 'Frequency of Questions',
    },
    title='Relationship Between Question Complexity and Frequency',
    height=500,
)

# Labels sit above the bubbles; the hover card repeats theme, level and count
fig_scatter.update_traces(
    hovertemplate='<b>%{text}</b><br>Exploration Level: %{x}<br>Count: %{y}<extra></extra>',
    textposition='top center',
)

# Same look as the bar chart above
fig_scatter.update_layout(
    title_x=0.5,
    legend_title="Question Categories",
    font={'family': "Arial, sans-serif", 'size': 13},
    margin={'l': 80, 'r': 40, 't': 80, 'b': 60},
    paper_bgcolor='white',
    plot_bgcolor='rgba(240,240,240,0.95)',
)

# Degree-1 least-squares fit of count on exploration level, drawn as a dashed
# line evaluated at the sorted levels.
x = themes_df['Exploration_Level']
y = themes_df['Count']
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
sorted_levels = np.sort(x)
trend_trace = go.Scatter(
    x=sorted_levels,
    y=p(sorted_levels),
    mode='lines',
    name='Trend',
    line={'color': 'rgba(0,0,0,0.5)', 'width': 2, 'dash': 'dash'},
)
fig_scatter.add_trace(trend_trace)

# Show the scatter plot
fig_scatter.show()

# Funnel chart: themes listed in themes_df row order, one stage per theme,
# coloured with the sampled Viridis swatches.
level_labels = [f"Level {level}/8" for level in themes_df['Exploration_Level']]
funnel_trace = go.Funnel(
    x=themes_df['Count'],
    y=themes_df['Theme'],
    text=level_labels,
    textinfo="value+percent initial",
    hoverinfo="y+x+text",
    marker={'color': theme_colors},
)
fig_funnel = go.Figure(data=[funnel_trace])

fig_funnel.update_layout(
    title='Dashboard Question Types: From Simple Monitoring to Complex Analysis',
    title_x=0.5,
    height=600,
    font={'family': "Arial, sans-serif", 'size': 13},
    margin={'l': 150, 'r': 40, 't': 80, 'b': 60},
)

fig_funnel.show()

# Text report: one entry per theme, in themes_df row order, with its level,
# count, share, description and the first two example questions.
print("Dashboard Question Types by Exploration Depth:")
print("=" * 75)
for entry in themes_df.itertuples(index=False):
    sample_questions = question_themes[entry.Theme]['examples'][:2]  # only two are quoted

    headline = (
        f"{entry.Theme} (Exploration Level {entry.Exploration_Level}/8): "
        f"{entry.Count} mentions ({entry.Percentage}%)"
    )
    print(headline)
    print(f"   Description: {entry.Description}")
    print(f"   Examples: \"{sample_questions[0]}\", \"{sample_questions[1]}\"...")
    print("-" * 75)

# Define exploration stage progression (stage label -> what that stage covers)
exploration_stages = {
    'Basic Questions (Levels 1-2)': 'Focus on status checking and simple reporting; require minimal data exploration',
    'Intermediate Questions (Levels 3-5)': 'Involve temporal analysis, comparisons, and anomaly detection; require moderate exploration',
    'Advanced Questions (Levels 6-8)': 'Explore root causes, make predictions, and recommend actions; require deep exploration and analysis'
}

# Calculate distribution by exploration stage: sum the mention counts of the
# themes whose level falls in each band (1-2, 3-5, 6-8).
basic_count = themes_df[themes_df['Exploration_Level'] <= 2]['Count'].sum()
intermediate_count = themes_df[(themes_df['Exploration_Level'] > 2) & (themes_df['Exploration_Level'] <= 5)]['Count'].sum()
advanced_count = themes_df[themes_df['Exploration_Level'] > 5]['Count'].sum()
total = basic_count + intermediate_count + advanced_count

# Stage shares in percent; computed once so the narrative below quotes the
# same figures as the distribution table.
basic_share = basic_count / total * 100
intermediate_share = intermediate_count / total * 100
advanced_share = advanced_count / total * 100

print("\nDistribution by Exploration Stage:")
print(f"Basic Questions (Levels 1-2): {basic_count} ({basic_share:.1f}%)")
print(f"Intermediate Questions (Levels 3-5): {intermediate_count} ({intermediate_share:.1f}%)")
print(f"Advanced Questions (Levels 6-8): {advanced_count} ({advanced_share:.1f}%)")

# Print key insights about exploration patterns.
# Insight 1 quotes the computed Levels 1-2 share instead of a hard-coded
# "approximately half", which did not match the counts above.
print("\nKey Insights on Exploration Patterns:")
print("1. Simple monitoring and descriptive reporting questions (Levels 1-2) are the most")
print(f"   frequent, accounting for {basic_share:.1f}% of all dashboard questions, showing")
print("   that dashboards primarily serve quick status-checking needs.")
print("2. There is a general inverse relationship between exploration depth and question")
print("   frequency - as questions require more complex exploration, they become less common.")
print("3. Mid-level exploration questions (temporal trends, comparisons, and anomalies)")
print("   represent a significant portion of dashboard usage, indicating that users do")
print("   regularly engage in more meaningful data exploration.")
print("4. The most complex questions (diagnostic, predictive, and prescriptive) are less")
print("   frequent but still present, suggesting that dashboards are sometimes used for")
print("   sophisticated analyses that go beyond simple monitoring.")
print("5. Prescriptive action questions (Level 8) are relatively common despite their")
print("   complexity, highlighting that dashboards ultimately aim to support decision making.")

# Narrative observations about how exploration progresses; emitted as a single
# newline-joined block of fixed text.
progression_notes = (
    "\nExploration Progression in Dashboard Usage:",
    "1. Users typically start with simple monitoring questions, then progress to deeper",
    "   exploration only when initial monitoring reveals something interesting or concerning.",
    "2. The exploration pathway often follows this sequence:",
    "   Monitoring → Trend Analysis → Anomaly Detection → Root Cause → Action Planning",
    "3. Each level of exploration builds on insights from previous levels, creating a",
    "   natural progression from simple data observation to complex decision support.",
    "4. Different user roles tend to focus on different exploration levels:",
    "   - Operational staff: Primarily Levels 1-3",
    "   - Analysts and managers: Primarily Levels 3-6",
    "   - Executives and strategists: Primarily Levels 6-8",
)
print("\n".join(progression_notes))

# Domain -> one-line description of its typical exploration pattern
domain_patterns = {
    'Healthcare': 'Strong focus on simple monitoring (Levels 1-2) with occasional deep dives into anomalies',
    'Retail/Sales': 'Balanced distribution across levels with emphasis on trends and comparisons (Levels 3-4)',
    'Manufacturing': 'Heavy focus on anomaly detection and diagnostic analysis (Levels 5-6)',
    'HR/Management': 'Emphasis on comparative analysis and prescriptive action (Levels 4 and 8)',
}

print("\nDomain-Specific Exploration Patterns:")
print("\n".join(f"- {name}: {summary}" for name, summary in domain_patterns.items()))

Dashboard Question Types by Exploration Depth:
Simple Status Monitoring (Exploration Level 1/8): 28 mentions (18.1%)
   Description: Basic monitoring questions that require minimal exploration; typically yes/no or simple metric lookups
   Examples: "Are we meeting our key performance indicators?", "How many patients were admitted today?"...
---------------------------------------------------------------------------
Descriptive Reporting (Exploration Level 2/8): 32 mentions (20.6%)
   Description: Questions that describe what is happening now or recently; requires some basic data aggregation
   Examples: "What are our sales of this month?", "How are employee metrics varying across departments?"...
---------------------------------------------------------------------------
Temporal Trend Exploration (Exploration Level 3/8): 24 mentions (15.5%)
   Description: Questions exploring patterns and changes over time; requires deeper data exploration
   Examples: "What are the current trends in 

# Analysis of "Stuck/Lost" Experiences in Dashboard Exploration

In this section, we analyze responses to the question: "Have you ever felt 'stuck' or 'lost' while exploring a dashboard? Please describe the situation."

This analysis will help identify common patterns in user experiences that lead to confusion or frustration during dashboard exploration, which can inform better dashboard design practices.

## Summary of "Stuck/Lost" Experience Analysis

### Key Findings

Our analysis of user experiences reveals that a significant majority of respondents (approximately 80%) have felt "stuck" or "lost" while exploring dashboards. This finding highlights the importance of addressing usability issues in dashboard design.

The most common reasons users reported feeling stuck or lost were:

1. **Unclear Interface Design**: Poor navigation, confusing layouts, and unclear labeling contribute significantly to user confusion.
2. **Data Overload**: Too many metrics, charts, and widgets can overwhelm users, making it difficult to focus on key insights.
3. **Filter Problems**: Issues with filter behavior, complex logic, and unexpected resets cause significant frustration.
4. **Skill/Knowledge Gaps**: Lack of training or familiarity with tools creates barriers to effective dashboard use.
5. **Data Comprehension**: Difficulty understanding what metrics mean or how they relate to each other.
6. **Documentation Gaps**: Insufficient guidance, help resources, and contextual information.

### Implications for Dashboard Design

These findings suggest several principles that could improve dashboard usability:

- **Clear Visual Hierarchy**: Organize information with clear visual priorities and logical groupings.
- **Progressive Disclosure**: Start with high-level metrics and provide ways to drill down for details.
- **Consistent Filter Behavior**: Ensure filters work predictably and provide visual feedback when active.
- **Contextual Help**: Provide tooltips, documentation, and guided tours for new users.
- **Data Dictionary**: Include clear definitions of metrics and calculations.
- **Feedback Mechanisms**: Allow users to report confusion points directly in the interface.
- **User Testing**: Regularly test dashboards with actual users to identify sticking points.

Understanding these pain points can help dashboard designers create more intuitive, user-friendly experiences that reduce confusion and improve data exploration capabilities.

# Analysis of Dashboard Usage Categories

In this section, we analyze the distribution of different dashboard usage categories based on the types of questions users ask. This helps us understand how dashboards are used across the spectrum from exploratory to explanatory purposes.

In [170]:
# Analysis of dashboard usage categories
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

# Manually tallied number of responses per usage category
usage_categories = {
    'Predominantly Exploratory': 27,
    'Predominantly Explanatory': 9,
    'Balanced Use': 7,
    'Context-Dependent or Vague': 6,
}

# One row per category with its count and share of all responses (one
# decimal), ordered from most to least frequent.
usage_df = (
    pd.DataFrame(list(usage_categories.items()), columns=['Usage_Category', 'Count'])
    .assign(Percentage=lambda frame: frame['Count'].div(frame['Count'].sum()).mul(100).round(1))
    .sort_values(by='Count', ascending=False)
)
total_count = usage_df['Count'].sum()

# Print summary statistics
print(f"Total dashboard usage responses analyzed: {total_count}")
for label, responses, share in zip(usage_df['Usage_Category'], usage_df['Count'], usage_df['Percentage']):
    print(f"{label}: {responses} responses ({share}%)")

# Vertical bar chart of response counts, one colour per usage category
fig_bar = px.bar(
    usage_df,
    x='Usage_Category',
    y='Count',
    text='Count',
    color='Usage_Category',
    title='Distribution of Dashboard Usage Categories',
    height=500,
)

# Counts above the bars, plain white background, readable axis titles
fig_bar.update_traces(textposition='outside')
fig_bar.update_layout(
    xaxis_title='Usage Category',
    yaxis_title='Number of Responses',
    font={'size': 14},
    plot_bgcolor='white',
)

fig_bar.show()

# Donut chart of the same counts
fig_pie = px.pie(
    usage_df,
    names='Usage_Category',
    values='Count',
    hole=0.4,
    title='Proportion of Dashboard Usage Types',
    height=500,
)

# Pull the 'Predominantly Exploratory' slice slightly out of the ring;
# all other slices stay in place.
slice_offsets = [
    0.05 if label == 'Predominantly Exploratory' else 0
    for label in usage_df['Usage_Category']
]
fig_pie.update_traces(pull=slice_offsets, textinfo='percent+label+value')

fig_pie.show()

# Horizontal bars showing each category's percentage share, labelled "NN.N%"
share_labels = usage_df['Percentage'].map("{:.1f}%".format).tolist()
fig_hbar = px.bar(
    usage_df,
    x='Percentage',
    y='Usage_Category',
    orientation='h',
    text=share_labels,
    color='Usage_Category',
    labels={'Percentage': 'Percentage of Responses'},
    title='Dashboard Usage Categories (Percentage)',
    height=400,
)

# Labels outside the bars, no y-axis title, percent suffix on the x ticks
fig_hbar.update_traces(textposition='outside')
fig_hbar.update_layout(
    xaxis_title='Percentage',
    xaxis_ticksuffix='%',
    yaxis_title='',
    font={'size': 14},
    plot_bgcolor='white',
)

fig_hbar.show()

# Bar chart (one bar per group, not stacked) contrasting exploration-focused
# and explanation-focused usage.
# NOTE: the 'Balanced' bar is the sum of 'Balanced Use' and
# 'Context-Dependent or Vague', so it also contains the unclassifiable responses.
balanced_total = usage_categories['Balanced Use'] + usage_categories['Context-Dependent or Vague']
exploration_focus = pd.DataFrame(
    [
        ('Exploration-Focused', usage_categories['Predominantly Exploratory']),
        ('Explanation-Focused', usage_categories['Predominantly Explanatory']),
        ('Balanced', balanced_total),
    ],
    columns=['Category', 'Count'],
)

# Share of each group, in percent rounded to one decimal
focus_total = exploration_focus['Count'].sum()
exploration_focus['Percentage'] = exploration_focus['Count'].div(focus_total).mul(100).round(1)

focus_palette = {
    'Exploration-Focused': '#2E86C1',
    'Explanation-Focused': '#E74C3C',
    'Balanced': '#F4D03F',
}
fig_focus = px.bar(
    exploration_focus,
    x='Category',
    y='Count',
    text='Count',
    color='Category',
    color_discrete_map=focus_palette,
    title='Dashboard Usage Focus: Exploration vs. Explanation',
    height=500,
)

fig_focus.update_traces(textposition='outside')
fig_focus.update_layout(
    xaxis_title='Usage Focus',
    yaxis_title='Number of Responses',
    font={'size': 14},
    plot_bgcolor='white',
)

fig_focus.show()

Total dashboard usage responses analyzed: 49
Predominantly Exploratory: 27 responses (55.1%)
Predominantly Explanatory: 9 responses (18.4%)
Balanced Use: 7 responses (14.3%)
Context-Dependent or Vague: 6 responses (12.2%)


In [171]:
# Analysis of question types by usage category

# Question themes observed within each usage category.
# Structure: usage category -> theme name -> keywords associated with the theme.
question_themes_by_category = {
    "Predominantly Exploratory": {
        "Performance Monitoring": ["KPI", "performance", "metrics", "targets", "goals", "monitoring"],
        "Trend Analysis": ["trends", "trending", "over time", "past year"],
        "Resource Allocation": ["allocate", "resources", "budget"],
        "Product/Service Performance": ["products", "selling", "sales", "categories"],
        "Decision Support": ["decision-making", "actions", "solve", "challenges"],
    },
    "Predominantly Explanatory": {
        "Descriptive Statistics": ["descriptive", "current", "status", "monitoring"],
        "Diagnostic Analysis": ["diagnostic", "why", "reason"],
        "Performance Reporting": ["performance", "results", "KPIs"],
        "Clinical/Technical Information": ["vital signs", "lab test", "patient", "errors"],
    },
    "Balanced Use": {
        "Analysis & Monitoring": ["monitoring", "analysis", "performance"],
        "Problem Identification": ["problems", "outliers", "issues", "areas for improvement"],
        "Workflow Optimization": ["workflow", "care", "company performance"],
    },
    "Context-Dependent or Vague": {
        "Status Inquiries": ["status", "stand", "schedule"],
        "Comparative Analysis": ["comparative", "comparison", "forecasting"],
        "General Performance": ["performance", "trends", "KPI", "metrics"],
    },
}

# Create a sunburst chart to show question themes within each usage category.
#
# px.sunburst's `path` argument treats every row as a LEAF and derives the
# parent totals itself. The rows therefore hold only (root, usage category,
# theme) leaves; adding separate category-total rows would turn each category
# into an extra root and count every response twice.
#
# NOTE: the per-theme counts are an approximate distribution for demonstration
# only - each category's total is split evenly across its themes.
sunburst_data = []

for category, themes in question_themes_by_category.items():
    # Get the count for this category
    category_count = usage_categories[category]
    theme_names = list(themes)

    # Every theme gets the floor share and the last theme absorbs the
    # remainder, so the leaves always add up to the category total and can
    # never go negative (a forced minimum of 1 per theme could overshoot a
    # category that has fewer responses than themes).
    base_share = category_count // len(theme_names)
    theme_counts = {theme_name: base_share for theme_name in theme_names[:-1]}
    theme_counts[theme_names[-1]] = category_count - base_share * (len(theme_names) - 1)

    # One leaf row per theme; 'Usage' holds the usage category and 'Category'
    # the theme name (column names kept from the original layout).
    for theme_name, theme_count in theme_counts.items():
        sunburst_data.append({
            'Root': 'All Categories',
            'Usage': category,
            'Category': theme_name,
            'Count': theme_count
        })

# Convert to DataFrame
sunburst_df = pd.DataFrame(sunburst_data)

# Create sunburst chart: single root -> usage category -> theme
fig_question_themes = px.sunburst(
    sunburst_df,
    path=['Root', 'Usage', 'Category'],
    values='Count',
    title='Dashboard Question Types by Usage Category',
    height=700,
    width=700
)

fig_question_themes.update_layout(
    font=dict(size=14)
)

fig_question_themes.show()

# Radar chart comparing the usage categories along six question-type axes.
# Axis order here must match the order of the scores in `radar_data`.
categories = [
    'Performance Monitoring',
    'Trend Analysis',
    'Decision Support',
    'Status Reporting',
    'Problem Identification',
    'Diagnostic Analysis',
]

# Scores on a 0-10 scale. These are estimated values based on the type of
# questions in each category, not measured quantities.
radar_data = {
    'Predominantly Exploratory': [9, 10, 8, 6, 7, 7],
    'Predominantly Explanatory': [7, 5, 3, 10, 4, 9],
    'Balanced Use': [8, 7, 7, 7, 8, 8],
    'Context-Dependent or Vague': [6, 8, 5, 5, 6, 4],
}

# One filled polygon per usage category
fig_radar = go.Figure(data=[
    go.Scatterpolar(
        name=usage_label,
        theta=categories,
        r=axis_scores,
        fill='toself',
    )
    for usage_label, axis_scores in radar_data.items()
])

fig_radar.update_layout(
    title='Question Type Strengths by Usage Category',
    showlegend=True,
    polar={'radialaxis': {'visible': True, 'range': [0, 10]}},
    height=500,
    width=700,
)

fig_radar.show()

## Summary of Dashboard Usage Categories Analysis

Our analysis of 49 coded dashboard-usage responses reveals important insights into how users approach and utilize dashboards:

### Key Findings

1. **Predominantly Exploratory (55.1%)**: The majority of dashboard usage falls into this category, where users actively explore data to discover insights, identify trends, and support decision-making. This suggests dashboards are primarily used as tools for investigation rather than just reporting.

2. **Predominantly Explanatory (18.4%)**: A smaller but significant portion of usage focuses on explanatory functions, where dashboards serve to communicate known information, report on established metrics, and provide status updates.

3. **Balanced Use (14.3%)**: Some users report an even mix of exploratory and explanatory usage, suggesting that dashboards often serve dual purposes, allowing users to both discover and communicate insights.

4. **Context-Dependent or Vague (12.2%)**: Some dashboard usage patterns couldn't be clearly classified, indicating that the boundary between exploration and explanation is sometimes fluid.

### Implications for Dashboard Design

The predominance of exploratory usage has several important implications for dashboard design:

1. **Interactive Features**: Dashboards should prioritize interactive elements that support exploration, such as flexible filtering, drill-down capabilities, and customizable views.

2. **Guidance with Freedom**: While providing structure, dashboards should avoid being overly prescriptive, allowing users the freedom to pursue their own lines of inquiry.

3. **Balanced Information Architecture**: Designs should balance high-level summaries (which serve explanatory functions) with detailed, explorable data (which serves exploratory functions).

4. **Progressive Disclosure**: Information should be organized in layers, allowing users to start with simple explanatory views and then dive deeper into exploratory analysis.

5. **Context Preservation**: When users explore data paths, dashboards should help them maintain context and avoid the "stuck/lost" experiences identified in our earlier analysis.

### Relationship Between Usage Categories and Question Types

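The analysis suggests that each usage category is associated with different types of questions. Note that these associations rest on the estimated strength scores and the illustrative per-theme split used in the charts above, not on measured counts: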

- **Exploratory usage** emphasizes trend analysis, performance monitoring, and decision support questions
- **Explanatory usage** focuses on status reporting, descriptive statistics, and diagnostic analysis
- **Balanced usage** shows more even distribution across question types
- **Context-dependent usage** varies significantly based on specific user needs

Understanding these patterns can help dashboard designers create interfaces that better accommodate the natural workflows of their users.

# Analysis of Dashboard Exploration Difficulties

In this section, we analyze the common challenges users face when trying to explore data within dashboards. Understanding these barriers can help designers create more effective exploration experiences.

In [172]:
# Analysis of themes related to dashboard exploration difficulties
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

# Exploration-difficulty themes: number of supporting quotes, a short
# explanation, and the icon/colour used when plotting each theme.
exploration_difficulty_themes = {
    '🔍 Advanced Analytical Needs': dict(
        quotes=20,
        explanation="Users want to go beyond basic metrics — they're seeking advanced logic, multi-dimensional analysis, causal relationships, and modeling capabilities. Current tools often don't support this depth of exploration.",
        icon='🔍',
        color='#4C72B0',  # Blue
    ),
    '🧱 Fragmented Multi-Source Exploration': dict(
        quotes=12,
        explanation="Dashboards often can't seamlessly bring together data from multiple systems (real-time + historical, structured + unstructured). This makes broad exploration fragmented or incomplete.",
        icon='🧱',
        color='#DD8452',  # Orange/Rust
    ),
    '🔧 Technical Limitations & Performance': dict(
        quotes=6,
        explanation="Users experience slow response times, lag, or limitations due to poor optimization, ETL complexity, and lack of scalable infrastructure or version control.",
        icon='🔧',
        color='#55A868',  # Green
    ),
    '📁 Need to Save, Group, or Annotate Insights': dict(
        quotes=10,
        explanation="While exploring, users want to capture insights, group views, or leave notes — but most dashboards don't support this reflective or iterative behavior.",
        icon='📁',
        color='#C44E52',  # Red
    ),
    '🔄 Conflicting or Unclear Context': dict(
        quotes=2,
        explanation="Lack of metadata, tooltips, or explanatory context makes it hard to interpret data accurately and follow the reasoning behind the numbers.",
        icon='🔄',
        color='#8172B3',  # Purple
    ),
}

# One row per theme, ordered from most to least quoted
difficulty_df = pd.DataFrame([
    {
        'Theme': theme_name,
        'Quotes': spec['quotes'],
        'Explanation': spec['explanation'],
        'Icon': spec['icon'],
        'Color': spec['color'],
    }
    for theme_name, spec in exploration_difficulty_themes.items()
]).sort_values(by='Quotes', ascending=False)

# Each theme's share of all quotes, in percent rounded to one decimal
total_quotes = difficulty_df['Quotes'].sum()
difficulty_df['Percentage'] = difficulty_df['Quotes'].div(total_quotes).mul(100).round(1)

# Text summary of the themes and their frequencies
print("Dashboard Exploration Difficulty Themes Analysis")
print("-" * 80)
print(f"Total quotes analyzed: {total_quotes}")
summary_columns = zip(
    difficulty_df['Theme'],
    difficulty_df['Quotes'],
    difficulty_df['Percentage'],
    difficulty_df['Explanation'],
)
for theme_label, quote_count, share, explanation in summary_columns:
    print(f"\n{theme_label}: {quote_count} quotes ({share}%)")
    print(f"  {explanation}")

# Horizontal bar chart of quote counts, each theme drawn in its assigned colour
theme_palette = dict(zip(difficulty_df['Theme'], difficulty_df['Color']))
fig_diff_bar = px.bar(
    difficulty_df,
    x='Quotes',
    y='Theme',
    orientation='h',
    text='Quotes',
    color='Theme',
    color_discrete_map=theme_palette,
    title='Dashboard Exploration Difficulties (Number of Quotes)',
    height=500,
)

# Counts outside the bars; the y axis needs no title since the labels say it all
fig_diff_bar.update_traces(textposition='outside')
fig_diff_bar.update_layout(
    xaxis_title='Number of Quotes',
    yaxis_title='',
    font={'size': 14},
    plot_bgcolor='white',
)

fig_diff_bar.show()

# Donut chart showing each theme's proportion of all quotes
theme_palette = dict(zip(difficulty_df['Theme'], difficulty_df['Color']))
fig_diff_pie = px.pie(
    difficulty_df,
    names='Theme',
    values='Quotes',
    color='Theme',
    color_discrete_map=theme_palette,
    hole=0.4,
    title='Distribution of Dashboard Exploration Difficulty Themes',
    height=600,
)

# Percentage and label are written inside each slice
fig_diff_pie.update_traces(
    textposition='inside',
    textinfo='percent+label',
    insidetextfont={'size': 14},
)

fig_diff_pie.show()

# Funnel chart of the difficulty themes ("funnel of exploration barriers"),
# stages in difficulty_df row order and joined by dotted connectors.
barrier_trace = go.Funnel(
    x=difficulty_df['Quotes'],
    y=difficulty_df['Theme'],
    textinfo="value+percent initial",
    textposition="inside",
    marker=dict(color=difficulty_df['Color']),
    connector=dict(line=dict(color="royalblue", dash="dot", width=3)),
)
fig_funnel = go.Figure(data=[barrier_trace])

fig_funnel.update_layout(
    title='Exploration Barrier Funnel',
    height=500,
    font={'size': 14},
)

fig_funnel.show()

# Conceptual scatter plot: how often a barrier is mentioned (x, number of
# quotes) against an estimated impact on exploration effectiveness (y, 1-10).
# The impact scores are illustrative estimates, not measured values.
impact_scores = {
    '🔍 Advanced Analytical Needs': 9,
    '🧱 Fragmented Multi-Source Exploration': 8,
    '📁 Need to Save, Group, or Annotate Insights': 7,
    '🔧 Technical Limitations & Performance': 6,
    '🔄 Conflicting or Unclear Context': 9,
}

# Attach the score to each theme row (adds an 'Impact Score' column)
difficulty_df['Impact Score'] = difficulty_df['Theme'].map(impact_scores)

theme_palette = dict(zip(difficulty_df['Theme'], difficulty_df['Color']))
fig_impact = px.scatter(
    difficulty_df,
    x='Quotes',
    y='Impact Score',
    text='Theme',
    size='Quotes',
    color='Theme',
    color_discrete_map=theme_palette,
    labels={
        'Quotes': 'Number of Quotes (Frequency)',
        'Impact Score': 'Estimated Impact on Exploration (1-10)',
    },
    title='Exploration Barriers: Frequency vs. Impact on Exploration Effectiveness',
    height=600,
)

# Outlined, slightly transparent bubbles with the theme name above each one
fig_impact.update_traces(
    marker={'opacity': 0.8, 'line': {'width': 2, 'color': 'DarkSlateGrey'}},
    textposition='top center',
)

fig_impact.update_layout(
    xaxis_title='Number of Quotes (Frequency)',
    yaxis_title='Estimated Impact on Exploration (1-10)',
    font={'size': 14},
    plot_bgcolor='white',
)

fig_impact.show()

# Treemap of the difficulty themes: tile area follows the quote count and
# each tile is labelled with its name, value and share of the root.
theme_palette = dict(zip(difficulty_df['Theme'], difficulty_df['Color']))
fig_treemap = px.treemap(
    difficulty_df,
    path=['Theme'],
    values='Quotes',
    color='Theme',
    color_discrete_map=theme_palette,
    title='Treemap of Dashboard Exploration Difficulty Themes',
    height=600,
)

fig_treemap.update_layout(font={'size': 14})
fig_treemap.update_traces(textinfo='label+value+percent root')

fig_treemap.show()

Dashboard Exploration Difficulty Themes Analysis
--------------------------------------------------------------------------------
Total quotes analyzed: 50

🔍 Advanced Analytical Needs: 20 quotes (40.0%)
  Users want to go beyond basic metrics — they're seeking advanced logic, multi-dimensional analysis, causal relationships, and modeling capabilities. Current tools often don't support this depth of exploration.

🧱 Fragmented Multi-Source Exploration: 12 quotes (24.0%)
  Dashboards often can't seamlessly bring together data from multiple systems (real-time + historical, structured + unstructured). This makes broad exploration fragmented or incomplete.

📁 Need to Save, Group, or Annotate Insights: 10 quotes (20.0%)
  While exploring, users want to capture insights, group views, or leave notes — but most dashboards don't support this reflective or iterative behavior.

🔧 Technical Limitations & Performance: 6 quotes (12.0%)
  Users experience slow response times, lag, or limitations due t

In [173]:
# Analyze the interrelationships between exploration difficulty themes
import random

# Hypothetical co-occurrence counts between pairs of themes.
# NOTE: these numbers are illustrative placeholders, not counts measured from
# the coded quotes; any conclusion drawn from them is equally hypothetical.
# Each unordered pair is listed once; lookups below check both orderings.
theme_relationships = {
    ('🔍 Advanced Analytical Needs', '🧱 Fragmented Multi-Source Exploration'): 7,
    ('🔍 Advanced Analytical Needs', '🔧 Technical Limitations & Performance'): 4,
    ('🔍 Advanced Analytical Needs', '📁 Need to Save, Group, or Annotate Insights'): 5,
    ('🔍 Advanced Analytical Needs', '🔄 Conflicting or Unclear Context'): 2,
    ('🧱 Fragmented Multi-Source Exploration', '🔧 Technical Limitations & Performance'): 3,
    ('🧱 Fragmented Multi-Source Exploration', '📁 Need to Save, Group, or Annotate Insights'): 2,
    ('🧱 Fragmented Multi-Source Exploration', '🔄 Conflicting or Unclear Context'): 1,
    ('🔧 Technical Limitations & Performance', '📁 Need to Save, Group, or Annotate Insights'): 1,
    ('🔧 Technical Limitations & Performance', '🔄 Conflicting or Unclear Context'): 1,
    ('📁 Need to Save, Group, or Annotate Insights', '🔄 Conflicting or Unclear Context'): 0
}

# Node and edge records for a network view of the themes.
# The comprehension variable is deliberately NOT called `data`, so it cannot be
# confused with the notebook-level `data` DataFrame.
network_nodes = [
    {
        'id': theme,
        'label': theme.split(' ', 1)[1],  # drop the leading emoji
        'size': theme_info['quotes'] * 2,
        'color': theme_info['color'],
    }
    for theme, theme_info in exploration_difficulty_themes.items()
]

network_edges = [
    {'from': pair[0], 'to': pair[1], 'width': weight, 'label': str(weight)}
    for pair, weight in theme_relationships.items()
]

# Square relationship matrix, sized from the themes actually present in
# difficulty_df rather than a hard-coded 5x5.
theme_names = list(difficulty_df['Theme'])
n_themes = len(theme_names)
theme_matrix = np.zeros((n_themes, n_themes))

# Fill the matrix.
# Diagonal: the theme's own quote count.
# Off-diagonal: the co-occurrence count for the pair.
# The two kinds of value share one colour scale, so the diagonal dominates it.
for i, theme1 in enumerate(theme_names):
    for j, theme2 in enumerate(theme_names):
        if i == j:
            theme_matrix[i, j] = difficulty_df.loc[difficulty_df['Theme'] == theme1, 'Quotes'].iloc[0]
        else:
            # The dict stores each pair in one order only; fall back to the
            # reversed key when the forward lookup yields 0 / is missing.
            rel_val = (
                theme_relationships.get((theme1, theme2), 0)
                or theme_relationships.get((theme2, theme1), 0)
            )
            theme_matrix[i, j] = rel_val

# Axis labels for the heatmap: theme names without the emoji prefix.
custom_labels = [theme.split(' ', 1)[1] for theme in theme_names]

# Theme colours in matrix order (not used by the heatmap itself).
custom_colors = difficulty_df['Color'].tolist()

# Heatmap of theme relationships.
fig_heatmap = px.imshow(
    theme_matrix,
    x=custom_labels,
    y=custom_labels,
    color_continuous_scale='YlGnBu',
    title='Relationship Heatmap Between Exploration Difficulty Themes',
    labels=dict(x="Theme", y="Theme", color="Connection Strength"),
    height=600,
    width=600
)

fig_heatmap.update_layout(
    font=dict(size=12),
    coloraxis_colorbar=dict(title="Connection Strength")
)

# Write each cell's value on top of it. YlGnBu runs from pale (low) to dark
# (high), so text is white only on cells above the midpoint of the plotted
# range. A fixed cut-off of 3 put white text on pale cells, because the
# diagonal quote counts stretch the scale well beyond the co-occurrence values.
contrast_threshold = theme_matrix.max() / 2 if theme_matrix.size else 0
for i, row_label in enumerate(custom_labels):
    for j, col_label in enumerate(custom_labels):
        cell_value = theme_matrix[i, j]
        fig_heatmap.add_annotation(
            x=col_label,
            y=row_label,
            text=str(int(cell_value)),
            showarrow=False,
            font=dict(color="white" if cell_value > contrast_threshold else "black")
        )

fig_heatmap.show()

# Radar chart: rate every exploration-difficulty theme on the same set of
# aspects so their profiles can be compared on shared axes.
radar_categories = [
    'User Impact',
    'Technical Complexity',
    'Design Challenge',
    'Data Quality Issue',
    'Tool Limitation',
]

# Ratings per theme (scale 0-10). Each list is paired positionally with
# radar_categories when the trace is built below.
radar_data = {
    '🔍 Advanced Analytical Needs': [9, 8, 7, 5, 10],
    '🧱 Fragmented Multi-Source Exploration': [8, 10, 6, 9, 7],
    '🔧 Technical Limitations & Performance': [7, 9, 5, 6, 9],
    '📁 Need to Save, Group, or Annotate Insights': [8, 5, 9, 3, 8],
    '🔄 Conflicting or Unclear Context': [9, 4, 10, 7, 6],
}

fig_exploration_radar = go.Figure()

# One filled polygon per theme, drawn in the colour stored for that theme in
# difficulty_df and named without its emoji prefix.
for theme, scores in radar_data.items():
    theme_color = difficulty_df.loc[difficulty_df['Theme'] == theme, 'Color'].iloc[0]

    theme_trace = go.Scatterpolar(
        r=scores,
        theta=radar_categories,
        fill='toself',
        name=theme.split(' ', 1)[1],
        line={'color': theme_color},
    )
    fig_exploration_radar.add_trace(theme_trace)

# Fix the radial axis to the full 0-10 rating range and show the legend.
fig_exploration_radar.update_layout(
    polar={'radialaxis': {'visible': True, 'range': [0, 10]}},
    showlegend=True,
    title='Multifaceted Analysis of Exploration Difficulties',
    height=600,
)

fig_exploration_radar.show()

## Summary of Dashboard Exploration Difficulties

Our analysis of barriers to effective dashboard exploration identified five major themes across 50 analyzed quotes. These themes highlight significant gaps in current dashboard tools that limit users' ability to effectively explore and make sense of their data.

### Key Findings

1. **Advanced Analytical Needs (40% of quotes)**  
   Users consistently express a desire to perform deeper, more sophisticated analysis than what current dashboard tools typically support. This includes multi-dimensional analysis, causal relationship exploration, and predictive modeling capabilities. This theme dominates user feedback, suggesting a significant gap between standard dashboard functionality and analytical aspirations.

2. **Fragmented Multi-Source Exploration (24% of quotes)**  
   The second most prevalent challenge is the difficulty in bringing together and exploring data from multiple sources or systems simultaneously. Current dashboards often create silos that impede comprehensive analysis across different data types, timeframes, and sources.

3. **Need to Save, Group, or Annotate Insights (20% of quotes)**  
   Users want dashboard tools that support the exploratory process itself, not just the data visualization. This includes capabilities to bookmark interesting views, group related insights, annotate findings, and maintain a record of their exploration path.

4. **Technical Limitations & Performance (12% of quotes)**  
   Performance issues like slow load times, lag during interaction, and other technical constraints frustrate users during exploration. These barriers interrupt the flow of analysis and discourage deeper exploration.

5. **Conflicting or Unclear Context (4% of quotes)**  
   Though mentioned less frequently, the lack of proper context, metadata, and explanatory information creates significant barriers to accurate data interpretation and reduces trust in the exploration process.

### Interrelationships Between Themes

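The co-occurrence counts used in the relationship heatmap above are hypothetical (illustrative estimates, not counts measured from the coded quotes). Read with that caveat, they suggest the following connections between exploration barriers: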

- **Advanced Analytical Needs** and **Fragmented Multi-Source Exploration** frequently co-occur, suggesting that sophisticated analysis often requires drawing connections across diverse data sources
- **Advanced Analytical Needs** also shows strong links to the **Need to Save, Group, or Annotate Insights**, indicating that complex analysis requires better support for the exploratory process itself
- **Technical Limitations** appear to be an underlying factor that exacerbates other exploration difficulties

### Implications for Dashboard Design

These findings suggest several opportunities to enhance dashboard exploration capabilities:

1. **Build for Depth**: Dashboard designers should incorporate features that support more sophisticated analytical capabilities without requiring users to export to specialized tools
   
2. **Bridge Data Silos**: Enable seamless integration and cross-analysis between different data sources, time periods, and data types

3. **Support the Exploration Journey**: Add features for bookmarking, annotating, and recording exploration paths to make the exploratory process more effective and efficient

4. **Optimize Performance**: Ensure technical implementation supports smooth, responsive interactions even with large datasets or complex visualizations

5. **Provide Rich Context**: Incorporate metadata, explanations, and documentation directly into the dashboard experience to facilitate accurate interpretation

By addressing these key barriers, dashboard designers can create tools that better support users' natural exploration behaviors and enable more effective data-driven decision making.

# Analysis of Most Relied-Upon Dashboard Features

In this section, we analyze the dashboard features that users rely on most heavily during their exploration and analysis tasks. Understanding which features are most critical can help dashboard designers prioritize development efforts and ensure that the most important tools are well-designed and accessible.

In [174]:
# Analysis of most relied-upon dashboard features
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

# Source data for the relied-upon features: for every feature, its mention
# count, a plain-language explanation, its emoji icon and its plot colour.
dashboard_features = {
    '📊 Charts/Graphs': {
        'mentions': 33,
        'explanation': 'Charts and graphs are the most relied-on tools for visualizing data patterns, comparisons, and trends over time. Users depend on bar charts, line graphs, trend charts, and visual summaries to quickly interpret key metrics and support decision-making without having to read through tables of numbers.',
        'icon': '📊',
        'color': '#4C72B0'  # Blue
    },
    '🎛️ Filters': {
        'mentions': 28,
        'explanation': 'Filters are essential for narrowing down large datasets to relevant segments. Users rely on them to explore specific timeframes, business units, patient records, or product lines. Filters provide control and precision, making dashboards adaptable to different questions and contexts.',
        'icon': '🎛️',
        'color': '#55A868'  # Green
    },
    '🔽 Dropdowns/Slicers': {
        'mentions': 20,
        'explanation': 'Dropdown menus or slicers allow users to toggle between different dimensions of data (e.g., locations, departments, categories). They make the dashboard flexible and reduce visual clutter by showing only what\'s needed at a given time. Dropdowns are often used in combination with filters to refine data views.',
        'icon': '🔽',
        'color': '#DD8452'  # Orange
    },
    '🔍 Drill-down/Drill-through': {
        'mentions': 6,
        'explanation': 'These features let users explore deeper layers of the data — from high-level summaries to detailed records. Drill-downs are especially important for discovering root causes, investigating anomalies, or answering follow-up questions during exploration.',
        'icon': '🔍',
        'color': '#C44E52'  # Red
    },
    '🎯 Scorecards/KPIs': {
        'mentions': 4,
        'explanation': 'Scorecards and KPI indicators display high-level metrics like sales performance, patient wait times, or employee turnover. Users rely on them for a quick summary of how things are going, usually at a glance. These features are critical for monitoring progress toward specific goals.',
        'icon': '🎯',
        'color': '#8172B3'  # Purple
    }
}

# Flatten the nested dict into one record per feature. The mapping below
# gives the DataFrame column for each inner key, in column order.
column_for_key = {
    'mentions': 'Mentions',
    'explanation': 'Explanation',
    'icon': 'Icon',
    'color': 'Color',
}
features_df = pd.DataFrame([
    {'Feature': feature_name, **{column: spec[key] for key, column in column_for_key.items()}}
    for feature_name, spec in dashboard_features.items()
])

# Most-mentioned feature first.
features_df = features_df.sort_values(by='Mentions', ascending=False)

# Share of all mentions held by each feature, as a percentage to one decimal.
total_mentions = features_df['Mentions'].sum()
features_df['Percentage'] = features_df['Mentions'].div(total_mentions).mul(100).round(1)

# Text summary: header, total, then one entry per feature.
print(
    "Most Relied-Upon Dashboard Features Analysis",
    "-" * 80,
    f"Total mentions analyzed: {total_mentions}",
    "\nFeature Breakdown:",
    sep="\n",
)
for idx, row in features_df.iterrows():
    print(f"\n{row['Feature']}: {row['Mentions']} mentions ({row['Percentage']}%)\n  {row['Explanation']}")

# Feature -> colour lookup shared by the colour-mapped charts below.
feature_color_map = dict(zip(features_df['Feature'], features_df['Color']))

# Vertical bar chart: mentions per feature, with the count above each bar.
fig_features_bar = px.bar(
    data_frame=features_df,
    x='Feature',
    y='Mentions',
    text='Mentions',
    color='Feature',
    color_discrete_map=feature_color_map,
    title='Most Relied-Upon Dashboard Features',
    height=500,
)
fig_features_bar.update_traces(textposition='outside')
fig_features_bar.update_layout(
    xaxis={'title': 'Feature'},
    yaxis={'title': 'Number of Mentions'},
    plot_bgcolor='white',
    font={'size': 14},
)

fig_features_bar.show()

# Pie chart of the same distribution; the Charts/Graphs slice is pulled out
# slightly to emphasise it.
fig_features_pie = px.pie(
    data_frame=features_df,
    names='Feature',
    values='Mentions',
    color='Feature',
    color_discrete_map=feature_color_map,
    title='Distribution of Most Relied-Upon Dashboard Features',
    height=600,
)
slice_offsets = [
    0.05 if feature_name == '📊 Charts/Graphs' else 0
    for feature_name in features_df['Feature']
]
fig_features_pie.update_traces(textinfo='percent+label', pull=slice_offsets)

fig_features_pie.show()

# Horizontal variant of the bar chart; the legend is hidden here.
fig_features_hbar = px.bar(
    data_frame=features_df,
    x='Mentions',
    y='Feature',
    orientation='h',
    text='Mentions',
    color='Feature',
    color_discrete_map=feature_color_map,
    title='Dashboard Features by Mention Frequency',
    height=500,
)
fig_features_hbar.update_traces(textposition='outside')
fig_features_hbar.update_layout(
    yaxis={'title': ''},
    xaxis={'title': 'Number of Mentions'},
    plot_bgcolor='white',
    font={'size': 14},
    showlegend=False,
)

fig_features_hbar.show()

# Funnel chart: features stacked in DataFrame order, each stage labelled with
# its mention count and its percentage of the first stage.
funnel_trace = go.Funnel(
    y=features_df['Feature'],
    x=features_df['Mentions'],
    textposition="inside",
    textinfo="value+percent initial",
    marker=dict(color=features_df['Color'].tolist()),
    connector=dict(line=dict(color="royalblue", dash="dot", width=2)),
)
fig_features_funnel = go.Figure(funnel_trace)
fig_features_funnel.update_layout(
    title='Dashboard Feature Reliance Funnel',
    font={'size': 14},
    height=500,
)

fig_features_funnel.show()

# Sunburst chart of features grouped into broader categories.
feature_categories = {
    'Visualization Tools': ['📊 Charts/Graphs', '🎯 Scorecards/KPIs'],
    'Interactive Controls': ['🎛️ Filters', '🔽 Dropdowns/Slicers'],
    'Exploration Tools': ['🔍 Drill-down/Drill-through']
}

# Build ONE row per individual feature (leaf) and nothing else.
# With `path=...`, px.sunburst treats every input row as a leaf and computes
# each parent sector by summing its children. Adding pre-aggregated
# "All Features -> category" rows therefore made "All Features" a sibling of
# the categories and counted every mention twice. The root is expressed as a
# constant column instead, and the category totals are left to plotly.
sunburst_features = []
for category, features in feature_categories.items():
    for feature in features:
        feature_mentions = features_df.loc[features_df['Feature'] == feature, 'Mentions'].iloc[0]
        sunburst_features.append({
            'Root': 'All Features',
            'Category': category,
            'Feature': feature,
            'Mentions': feature_mentions
        })

# Convert to DataFrame
sunburst_features_df = pd.DataFrame(sunburst_features)

# Three-level hierarchy: All Features -> category -> feature.
fig_features_sunburst = px.sunburst(
    sunburst_features_df,
    path=['Root', 'Category', 'Feature'],
    values='Mentions',
    title='Dashboard Features by Category',
    height=650
)

fig_features_sunburst.update_layout(font=dict(size=14))
fig_features_sunburst.show()

Most Relied-Upon Dashboard Features Analysis
--------------------------------------------------------------------------------
Total mentions analyzed: 91

Feature Breakdown:

📊 Charts/Graphs: 33 mentions (36.3%)
  Charts and graphs are the most relied-on tools for visualizing data patterns, comparisons, and trends over time. Users depend on bar charts, line graphs, trend charts, and visual summaries to quickly interpret key metrics and support decision-making without having to read through tables of numbers.

🎛️ Filters: 28 mentions (30.8%)
  Filters are essential for narrowing down large datasets to relevant segments. Users rely on them to explore specific timeframes, business units, patient records, or product lines. Filters provide control and precision, making dashboards adaptable to different questions and contexts.

🔽 Dropdowns/Slicers: 20 mentions (22.0%)
  Dropdown menus or slicers allow users to toggle between different dimensions of data (e.g., locations, departments, categor

In [175]:
# Analyze relationships between dashboard features and exploration behaviors
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

# Estimated importance (0-10) of each feature at each exploration stage.
# The stages are listed in their intended order, from first look to deepest
# investigation; that order is preserved in the heatmap below.
feature_exploration_mapping = {
    '📊 Charts/Graphs': {
        'Initial Overview': 9,
        'Pattern Recognition': 10,
        'Comparative Analysis': 9,
        'Anomaly Detection': 7,
        'Root Cause Analysis': 6
    },
    '🎛️ Filters': {
        'Initial Overview': 4,
        'Pattern Recognition': 7,
        'Comparative Analysis': 9,
        'Anomaly Detection': 8,
        'Root Cause Analysis': 7
    },
    '🔽 Dropdowns/Slicers': {
        'Initial Overview': 5,
        'Pattern Recognition': 6,
        'Comparative Analysis': 8,
        'Anomaly Detection': 7,
        'Root Cause Analysis': 6
    },
    '🔍 Drill-down/Drill-through': {
        'Initial Overview': 2,
        'Pattern Recognition': 5,
        'Comparative Analysis': 7,
        'Anomaly Detection': 9,
        'Root Cause Analysis': 10
    },
    '🎯 Scorecards/KPIs': {
        'Initial Overview': 10,
        'Pattern Recognition': 3,
        'Comparative Analysis': 5,
        'Anomaly Detection': 6,
        'Root Cause Analysis': 2
    }
}

# Long-format records: one row per (feature, stage) pair.
heatmap_data = [
    {'Feature': feature, 'Exploration Stage': stage, 'Importance': importance}
    for feature, stages in feature_exploration_mapping.items()
    for stage, importance in stages.items()
]

# Convert to DataFrame
heatmap_df = pd.DataFrame(heatmap_data)

# DataFrame.pivot sorts both axes alphabetically, which would scramble the
# stage progression. Reindex to the order in which features and stages were
# defined above.
feature_order = list(feature_exploration_mapping)
stage_order = list(dict.fromkeys(
    stage for stages in feature_exploration_mapping.values() for stage in stages
))
importance_matrix = (
    heatmap_df
    .pivot(index='Feature', columns='Exploration Stage', values='Importance')
    .reindex(index=feature_order, columns=stage_order)
)

# Heatmap of feature importance across exploration stages.
fig_feature_heatmap = px.imshow(
    importance_matrix,
    color_continuous_scale='Viridis',
    title='Feature Importance Across Exploration Stages',
    labels=dict(x="Exploration Stage", y="Feature", color="Importance Score"),
    height=500,
    width=750
)

fig_feature_heatmap.update_layout(
    font=dict(size=12),
    coloraxis_colorbar=dict(title="Importance (0-10)")
)

# Write each score on its cell. Viridis runs from dark (low values) to bright
# yellow (high values), so low scores need white text and high scores need
# black text. The split is the midpoint of the plotted range.
contrast_midpoint = (heatmap_df['Importance'].min() + heatmap_df['Importance'].max()) / 2
for record in heatmap_data:
    score = record['Importance']
    fig_feature_heatmap.add_annotation(
        x=record['Exploration Stage'],
        y=record['Feature'],
        text=str(score),
        showarrow=False,
        font=dict(color="white" if score <= contrast_midpoint else "black")
    )

fig_feature_heatmap.show()

# Radar chart: compare the features across five aspects. Every inner dict
# lists the aspects in the same order, which is used as the axis order.
feature_strengths = {
    '📊 Charts/Graphs': {
        'Data Visualization': 10,
        'User Engagement': 8,
        'Data Comprehension': 9,
        'Exploration Support': 7,
        'Decision Support': 8,
    },
    '🎛️ Filters': {
        'Data Visualization': 4,
        'User Engagement': 7,
        'Data Comprehension': 6,
        'Exploration Support': 9,
        'Decision Support': 8,
    },
    '🔽 Dropdowns/Slicers': {
        'Data Visualization': 3,
        'User Engagement': 6,
        'Data Comprehension': 5,
        'Exploration Support': 7,
        'Decision Support': 6,
    },
    '🔍 Drill-down/Drill-through': {
        'Data Visualization': 6,
        'User Engagement': 8,
        'Data Comprehension': 9,
        'Exploration Support': 10,
        'Decision Support': 9,
    },
    '🎯 Scorecards/KPIs': {
        'Data Visualization': 7,
        'User Engagement': 6,
        'Data Comprehension': 8,
        'Exploration Support': 4,
        'Decision Support': 7,
    },
}

# Axis names are taken from the first feature's aspect keys.
first_feature_strengths = next(iter(feature_strengths.values()))
strengths_categories = list(first_feature_strengths)

fig_feature_radar = go.Figure()

# One filled polygon per feature, in the colour stored for it in features_df.
for feature, strengths in feature_strengths.items():
    feature_color = features_df.loc[features_df['Feature'] == feature, 'Color'].iloc[0]

    feature_trace = go.Scatterpolar(
        r=list(strengths.values()),
        theta=strengths_categories,
        fill='toself',
        name=feature,
        line={'color': feature_color},
    )
    fig_feature_radar.add_trace(feature_trace)

# Radial axis fixed to 0-10; legend shown.
fig_feature_radar.update_layout(
    polar={'radialaxis': {'visible': True, 'range': [0, 10]}},
    showlegend=True,
    title='Dashboard Feature Strengths Across Different Aspects',
    height=600,
)

fig_feature_radar.show()

# Scatter plot: how often a feature is mentioned versus how well it supports
# deep exploration.
# "Exploration depth support" score per feature (0-10).
exploration_depth_scores = {
    '📊 Charts/Graphs': 7,
    '🎛️ Filters': 8,
    '🔽 Dropdowns/Slicers': 6,
    '🔍 Drill-down/Drill-through': 10,
    '🎯 Scorecards/KPIs': 3,
}

# Attach the score to features_df as a new column, matched on feature name.
# This mutates the shared features_df in place.
features_df['Exploration Depth Support'] = features_df['Feature'].map(exploration_depth_scores)

scatter_axis_labels = {
    'Mentions': 'Number of Mentions',
    'Exploration Depth Support': 'Support for Deep Exploration (0-10)',
}

# Marker size and x position both encode the mention count.
fig_feature_scatter = px.scatter(
    data_frame=features_df,
    x='Mentions',
    y='Exploration Depth Support',
    size='Mentions',
    text='Feature',
    color='Feature',
    color_discrete_map=dict(zip(features_df['Feature'], features_df['Color'])),
    labels=scatter_axis_labels,
    title='Dashboard Features: Mention Frequency vs. Exploration Depth Support',
    height=600,
)

# Labels above the markers; semi-transparent markers with a dark outline.
fig_feature_scatter.update_traces(
    textposition='top center',
    marker={
        'opacity': 0.8,
        'line': {'width': 2, 'color': 'DarkSlateGrey'},
    },
)

# Final axis titles (these override the ones derived from `labels`).
fig_feature_scatter.update_layout(
    xaxis={'title': 'Number of Mentions (Frequency)'},
    yaxis={'title': 'Support for Deep Exploration (0-10)'},
    plot_bgcolor='white',
    font={'size': 14},
)

fig_feature_scatter.show()

## Summary of Most Relied-Upon Dashboard Features

Our analysis of dashboard features reveals users' strong preference for specific tools that enable effective data exploration and analysis. The distribution of mentions across features provides valuable insights into user priorities and workflows.

### Key Findings

1. **Charts and Graphs (36.3% of mentions)**  
   Visualizations are the backbone of dashboard usage, with users relying heavily on visual representations to quickly interpret data patterns and trends. Their dominance in mentions reflects how critical they are for transforming raw data into actionable insights.

2. **Filters (30.8% of mentions)**  
   Nearly as important as visualizations, filters empower users to focus their analysis on relevant subsets of data. This high reliance suggests that users value the ability to narrow scope and customize their view according to specific parameters.

3. **Dropdowns and Slicers (22.0% of mentions)**  
   These interactive controls help manage visual complexity by letting users toggle between different dimensions. Their significant mention frequency indicates the importance of flexible, on-demand data views.

4. **Drill-down Capabilities (6.6% of mentions)**  
   While mentioned less frequently, drill-down functionality plays a critical role in deeper exploration. The relative scarcity of mentions may reflect that these features are used less often but are highly valuable when needed.

5. **Scorecards and KPIs (4.4% of mentions)**  
   These at-a-glance metrics serve as entry points to broader analysis. Their lower mention rate suggests they often function as starting points rather than tools for extended exploration.

### Feature Roles Across the Exploration Journey

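The importance scores behind this section are estimates on a 0-10 scale, not values measured from the survey responses. Based on those estimates, different features serve distinct roles throughout the data exploration process: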

- **Initial Overview**: Scorecards/KPIs and Charts/Graphs dominate the early exploration phase, providing quick status checks and overall patterns.
  
- **Pattern Recognition**: Charts/Graphs excel here, helping users identify trends, clusters, and distributions in the data.
  
- **Comparative Analysis**: Filters and Dropdowns become more crucial during this phase, allowing users to create meaningful comparisons between data segments.
  
- **Anomaly Detection**: As exploration deepens, Drill-down capabilities gain importance, letting users investigate unusual patterns.
  
- **Root Cause Analysis**: In the deepest exploration phase, Drill-down features become essential, allowing users to trace issues to their source.

### Feature Strengths and Exploration Support

When comparing mention frequency with support for deep exploration, we observe an interesting pattern:

- **High Mention, Moderate Exploration Support**: Charts/Graphs and Filters are frequently mentioned and offer solid exploration support, making them versatile workhorses.
  
- **High Mention, Lower Exploration Support**: Dropdowns/Slicers are commonly mentioned but offer more limited support for deep exploration.
  
- **Low Mention, High Exploration Support**: Drill-down features are mentioned less often but provide excellent support for deep exploration, indicating their specialist role.
  
- **Low Mention, Low Exploration Support**: Scorecards/KPIs serve specific purposes but aren't designed for extended exploration.

### Implications for Dashboard Design

These findings suggest several principles for effective dashboard design:

1. **Prioritize Visual Elements**: Given users' heavy reliance on charts and graphs, dashboards should lead with clear, informative visualizations.

2. **Provide Robust Filtering**: Strong filtering capabilities are a core expectation and should be intuitive and flexible.

3. **Balance Feature Prominence**: Design should reflect natural usage patterns, making common features (charts, filters) prominent while ensuring specialist tools (drill-downs) remain accessible.

4. **Support Exploration Flow**: Dashboard layouts should follow the natural progression from overview (scorecards, charts) to detail (filters, drill-downs), supporting the entire exploration journey.

5. **Consider Feature Interactions**: Since features are often used together (e.g., filters with charts), their interactions should be intuitive and complementary.

By aligning dashboard designs with these user preferences and exploration behaviors, designers can create more effective tools that better support data-driven decision-making.

## User-Requested Dashboard Features

In this section, we'll analyze the features that users wish dashboards had to better support their workflows. This analysis provides valuable insights into user needs, pain points, and feature gaps in current dashboard tools. By identifying these requested features, we can better understand how to enhance exploration capabilities in future dashboard designs.

We'll categorize the responses, analyze the frequency of each feature request type, and visualize the results to identify patterns and priorities.

In [178]:
# Create a bar plot for User Wanted Features
import plotly.express as px
import pandas as pd

# Number of requests recorded for each category of user-wanted feature.
user_wanted_features = {
    'Category': [
        'Workflow-Specific Feature Requests',
        'AI Integration & Natural Language Interaction',
        'Advanced Analytics & Predictive Tools',
        'Collaboration, Feedback & Annotation',
        'Customization & Drill-Down Control'
    ],
    'Count': [21, 9, 7, 7, 6]
}

user_features_df = pd.DataFrame(user_wanted_features)

# Sort by count in descending order
user_features_df = user_features_df.sort_values('Count', ascending=False)

# Qualitative palette used for the bars.
feature_colors = px.colors.qualitative.Bold

# Bar chart: one bar per category, labelled with its count.
fig_user_features = px.bar(
    user_features_df,
    x='Category',
    y='Count',
    color='Category',
    color_discrete_sequence=feature_colors,
    title='User Wanted Features',
    text='Count'
)

# Layout: tilted category labels need the larger bottom margin.
fig_user_features.update_layout(
    xaxis_title="Feature Category",
    yaxis_title="Number of Requests",
    xaxis_tickangle=-45,
    title_x=0.5,  # Center title
    plot_bgcolor='rgba(240,240,240,0.95)',  # Light grey plot area
    paper_bgcolor='white',
    font=dict(family="Arial, sans-serif", size=12),
    margin=dict(l=80, r=40, t=80, b=150),  # Increased bottom margin for labels
    showlegend=False
)

# Data labels above the bars and a compact hover box.
fig_user_features.update_traces(
    textposition='outside',
    texttemplate='%{text}',
    hovertemplate='<b>%{x}</b><br>Count: %{y}<extra></extra>'
)

# Show the plot
fig_user_features.show()

# Text summary. The counts are read from the DataFrame instead of being typed
# into the sentences, so the numbers cannot drift from the data. The wording
# itself (e.g. "most requested", "second largest") is still fixed prose and
# must be revisited if the ranking changes.
request_counts = user_features_df.set_index('Category')['Count']
workflow_count = request_counts['Workflow-Specific Feature Requests']
ai_count = request_counts['AI Integration & Natural Language Interaction']
analytics_count = request_counts['Advanced Analytics & Predictive Tools']
collaboration_count = request_counts['Collaboration, Feedback & Annotation']
customization_count = request_counts['Customization & Drill-Down Control']

print("\nAnalysis of User-Requested Features:")
print(f"1. Workflow-specific features are the most requested ({workflow_count} mentions), highlighting users' desire for tools that fit seamlessly into their existing processes.")
print(f"2. AI and Natural Language features ({ai_count} mentions) form the second largest category, showing increasing interest in more intuitive dashboard interactions.")
print(f"3. Advanced Analytics, Collaboration tools, and Customization features are also important to users with {analytics_count}, {collaboration_count}, and {customization_count} mentions respectively.")
print("4. Overall, users are primarily seeking features that make their workflows more efficient while adding intelligence and collaborative capabilities.")


Analysis of User-Requested Features:
1. Workflow-specific features are the most requested (21 mentions), highlighting users' desire for tools that fit seamlessly into their existing processes.
2. AI and Natural Language features (9 mentions) form the second largest category, showing increasing interest in more intuitive dashboard interactions.
3. Advanced Analytics, Collaboration tools, and Customization features are also important to users with 7, 7, and 6 mentions respectively.
4. Overall, users are primarily seeking features that make their workflows more efficient while adding intelligence and collaborative capabilities.


In [179]:
# Create a pie chart for hypothesis formation question responses
import plotly.express as px
import pandas as pd
import numpy as np

# Yes/No answer counts for the hypothesis-formation question.
hypothesis_data = {'Response': ['Yes', 'No'], 'Count': [35, 13]}
hypothesis_df = pd.DataFrame(hypothesis_data)

# Each answer's share of all responses, as a percentage to one decimal.
total = hypothesis_df['Count'].sum()
hypothesis_df['Percentage'] = hypothesis_df['Count'].div(total).mul(100).round(1)

# Slice colours, applied in row order (blue for yes, red for no).
colors = ['#2E86C1', '#E74C3C']

# Donut chart of the two answers.
fig_hypothesis = px.pie(
    data_frame=hypothesis_df,
    names='Response',
    values='Count',
    color='Response',
    color_discrete_sequence=colors,
    hole=0.3,  # donut hole
    title='Do you ever form hypotheses while using a dashboard?',
)

# Slice text shows percent and label; hover adds the raw count.
fig_hypothesis.update_traces(
    textinfo='percent+label',
    hovertemplate='<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percent}<extra></extra>',
)

# The first and second rows are reused for the centre label and the printed
# analysis below.
first_row = hypothesis_df.iloc[0]
second_row = hypothesis_df.iloc[1]
center_label = (
    f'Total: {total}'
    f'<br>{first_row["Response"]}: {first_row["Count"]} ({first_row["Percentage"]}%)'
    f'<br>{second_row["Response"]}: {second_row["Count"]} ({second_row["Percentage"]}%)'
)

# Layout, with the total and per-answer breakdown placed at the figure centre.
fig_hypothesis.update_layout(
    title_x=0.5,  # Center title
    plot_bgcolor='rgba(240,240,240,0.95)',
    paper_bgcolor='white',
    font={'family': "Arial, sans-serif", 'size': 12},
    legend_title_text='Response',
    annotations=[
        {
            'text': center_label,
            'x': 0.5,
            'y': 0.5,
            'font': {'size': 12},
            'showarrow': False,
        }
    ],
)

# Show the plot
fig_hypothesis.show()

# Printed interpretation; the figures come from the first and second rows.
print("\nAnalysis of Hypothesis Formation:")
print(f"- A strong majority of respondents ({first_row['Percentage']}%, {first_row['Count']} people) report forming hypotheses while using dashboards.")
print(f"- Only {second_row['Percentage']}% ({second_row['Count']} people) say they do not form hypotheses during dashboard use.")
print("- This indicates that dashboards are frequently used for exploratory analysis and not just for viewing predefined metrics.")
print("- The high prevalence of hypothesis formation suggests users are actively engaging with data rather than passively consuming it.")


Analysis of Hypothesis Formation:
- A strong majority of respondents (72.9%, 35 people) report forming hypotheses while using dashboards.
- Only 27.1% (13 people) say they do not form hypotheses during dashboard use.
- This indicates that dashboards are frequently used for exploratory analysis and not just for viewing predefined metrics.
- The high prevalence of hypothesis formation suggests users are actively engaging with data rather than passively consuming it.


In [181]:
# Create visualization for insight annotation/validation question
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np

# Tally of answers to the annotation/validation question (counts are hard-coded here)
annotation_data = dict(
    Response=['Yes', 'No'],
    Count=[45, 3],
)
annotation_df = pd.DataFrame(annotation_data)

# Each answer's share of all responses, in percent, rounded to one decimal place
total_responses = annotation_df['Count'].sum()
_annotation_share = annotation_df['Count'] / total_responses * 100
annotation_df['Percentage'] = _annotation_share.round(1)

# Two views of the same Yes/No split are built in this cell: a donut chart
# (pie with a hole) here, and a gauge chart afterwards.

# 1. Donut chart; colours are pinned to the answer labels through the colour map
fig_annotation_pie = px.pie(
    annotation_df,
    names='Response',
    values='Count',
    color='Response',
    color_discrete_map={'Yes': '#2ECC71', 'No': '#E74C3C'},
    hole=0.4,
    title="Do you think it's important to annotate or validate the insights you gain?",
)

# Slices are labelled with percent + label; the hover box adds the raw count
fig_annotation_pie.update_traces(
    hovertemplate='<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percent}<extra></extra>',
    textinfo='percent+label',
)

# Centre the title and put the respondent total at (0.5, 0.5) of the figure
fig_annotation_pie.update_layout(
    title_x=0.5,
    legend_title_text='Response',
    font={'family': "Arial, sans-serif", 'size': 12},
    paper_bgcolor='white',
    plot_bgcolor='rgba(240,240,240,0.95)',
    annotations=[
        {
            'text': f'Total: {total_responses}',
            'x': 0.5,
            'y': 0.5,
            'font_size': 12,
            'showarrow': False,
        }
    ],
)

# 2. Gauge chart: the needle value is the percentage stored in the first row
#    of annotation_df (the 'Yes' row as the frame is built in this cell).
_gauge_band_colors = ['#F8D0C8', '#F5CBA7', '#D5F5E3', '#ABEBC6']
_gauge_indicator = go.Indicator(
    mode="gauge+number",
    value=annotation_df.iloc[0]['Percentage'],
    title={'text': "Importance of Annotation/Validation", 'font': {'size': 16}},
    domain={'x': [0, 1], 'y': [0, 1]},
    gauge=dict(
        axis=dict(range=[None, 100], tickwidth=1, tickcolor="darkblue"),
        bar=dict(color="#2ECC71"),
        bgcolor="white",
        borderwidth=2,
        bordercolor="gray",
        # Four equal 25-point background bands covering 0-100
        steps=[
            dict(range=[_band_start, _band_start + 25], color=_band_color)
            for _band_start, _band_color in zip(range(0, 100, 25), _gauge_band_colors)
        ],
    ),
)
fig_annotation_gauge = go.Figure(_gauge_indicator)

# Gauge layout: centred title plus a one-line Yes/No breakdown under the dial
_yes_row = annotation_df.iloc[0]
_no_row = annotation_df.iloc[1]
fig_annotation_gauge.update_layout(
    title="Percentage of Respondents Who Value Annotation/Validation of Insights",
    title_x=0.5,
    height=400,
    font={'family': "Arial, sans-serif", 'size': 12},
    paper_bgcolor='white',
    plot_bgcolor='rgba(240,240,240,0.95)',
    annotations=[
        {
            'text': f"Yes: {_yes_row['Count']} ({_yes_row['Percentage']}%) | No: {_no_row['Count']} ({_no_row['Percentage']}%)",
            'x': 0.5,
            'y': 0.2,
            'xanchor': 'center',
            'showarrow': False,
            'font': {'size': 14},
        }
    ],
)

# Display the visualizations.
# Both figures built in this cell are rendered; previously the gauge was
# constructed but never shown.
print("Pie Chart Visualization:")
fig_annotation_pie.show()

print("\nGauge Chart Visualization:")
fig_annotation_gauge.show()

# Provide analysis of the findings.
# Row 0 of annotation_df is the 'Yes' answer and row 1 the 'No' answer.
print("\nAnalysis of Annotation/Validation Importance:")
print(f"- An overwhelming majority ({annotation_df.iloc[0]['Percentage']}%, {annotation_df.iloc[0]['Count']} people) believe it's important to annotate or validate insights gained from dashboards.")
print(f"- Only {annotation_df.iloc[1]['Percentage']}% ({annotation_df.iloc[1]['Count']} people) don't consider annotation/validation important.")
print("- This strong consensus highlights the critical role of validation in the data analysis workflow.")
# NOTE(review): the reasons below are interpretation, not something measured by this Yes/No question.
print("- Key reasons participants likely value validation include:")
print("  * Ensuring accuracy and reliability of insights")
print("  * Creating an audit trail for decisions")
print("  * Facilitating knowledge sharing within teams")
print("  * Building institutional memory around data interpretations")
print("- This finding suggests dashboard designers should prioritize annotation and validation features to support this critical workflow need.")

Pie Chart Visualization:



Analysis of Annotation/Validation Importance:
- An overwhelming majority (93.8%, 45 people) believe it's important to annotate or validate insights gained from dashboards.
- Only 6.2% (3 people) don't consider annotation/validation important.
- This strong consensus highlights the critical role of validation in the data analysis workflow.
- Key reasons participants likely value validation include:
  * Ensuring accuracy and reliability of insights
  * Creating an audit trail for decisions
  * Facilitating knowledge sharing within teams
  * Building institutional memory around data interpretations
- This finding suggests dashboard designers should prioritize annotation and validation features to support this critical workflow need.


In [182]:
# Create a pie chart for externalized thinking responses
import plotly.express as px
import pandas as pd
import numpy as np

# Tally of answers to the externalized-thinking question (counts are hard-coded here)
externalize_data = dict(
    Response=['Yes', 'No'],
    Count=[47, 1],
)
externalize_df = pd.DataFrame(externalize_data)

# Each answer's share of all responses, in percent, rounded to one decimal place
total_externalize = externalize_df['Count'].sum()
_externalize_share = externalize_df['Count'] / total_externalize * 100
externalize_df['Percentage'] = _externalize_share.round(1)

# Create a color scheme that highlights the extreme imbalance
colors = ['#3498DB', '#E74C3C']  # Blue for Yes, Red for No

# Key the colours by answer label (as the annotation/validation pie does with
# color_discrete_map) so the Yes/No colours cannot swap if the rows of
# externalize_df are ever reordered.
_externalize_colors = {'Yes': colors[0], 'No': colors[1]}


def _externalize_value(response, column):
    """Return `column` from the externalize_df row whose Response equals `response`."""
    return externalize_df.loc[externalize_df['Response'] == response, column].iloc[0]


# Create the pie chart
fig_externalize = px.pie(
    externalize_df,
    values='Count',
    names='Response',
    title='Do you externalize your thinking?',
    color='Response',
    color_discrete_map=_externalize_colors,
    hole=0.4  # Creates a donut chart effect
)

# Format the text on the pie slices: percent + label drawn inside each slice in white
fig_externalize.update_traces(
    textposition='inside',
    textinfo='percent+label',
    insidetextfont=dict(color='white', size=14, family='Arial, sans-serif'),
    hovertemplate='<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percent}<extra></extra>'
)

# Improve layout with a summary annotation at the centre of the donut.
# The Yes/No figures are looked up by label so they always match their caption.
fig_externalize.update_layout(
    title_x=0.5,  # Center title
    plot_bgcolor='rgba(240,240,240,0.95)',
    paper_bgcolor='white',
    font=dict(family="Arial, sans-serif", size=12),
    legend_title_text='Response',
    annotations=[
        dict(
            text=(
                f"Total: {total_externalize}"
                f"<br>Yes: {_externalize_value('Yes', 'Count')} ({_externalize_value('Yes', 'Percentage')}%)"
                f"<br>No: {_externalize_value('No', 'Count')} ({_externalize_value('No', 'Percentage')}%)"
            ),
            x=0.5, y=0.5,
            font_size=12,
            showarrow=False
        )
    ]
)

# Show the plot
fig_externalize.show()

# Provide analysis of the findings.
# Figures are looked up by answer label (not row position) so each sentence
# always quotes the answer it talks about.
_ext_yes = externalize_df.loc[externalize_df['Response'] == 'Yes'].iloc[0]
_ext_no = externalize_df.loc[externalize_df['Response'] == 'No'].iloc[0]

# Pick the noun from the actual count instead of hard-coding the singular
_ext_no_noun = 'person' if _ext_no['Count'] == 1 else 'people'

print("\nAnalysis of Externalized Thinking:")
print(f"- Nearly unanimous response: {_ext_yes['Percentage']}% of respondents ({_ext_yes['Count']} people) report externalizing their thinking while using dashboards.")
print(f"- Only {_ext_no['Count']} {_ext_no_noun} ({_ext_no['Percentage']}%) indicated they do not externalize their thinking.")
print("- This overwhelming consensus suggests that externalization is an essential aspect of the dashboard analysis process.")
# NOTE(review): the forms listed below are interpretation, not something measured by this Yes/No question.
print("- Externalization likely takes forms such as:")
print("  * Taking notes during analysis")
print("  * Creating screenshots or annotations")
print("  * Sharing insights with colleagues")
print("  * Building secondary artifacts from dashboard data")
print("- This finding strongly indicates that dashboard tools should provide robust support for externalization methods to match users' natural workflow.")


Analysis of Externalized Thinking:
- Nearly unanimous response: 97.9% of respondents (47 people) report externalizing their thinking while using dashboards.
- Only 1 person (2.1%) indicated they do not externalize their thinking.
- This overwhelming consensus suggests that externalization is an essential aspect of the dashboard analysis process.
- Externalization likely takes forms such as:
  * Taking notes during analysis
  * Creating screenshots or annotations
  * Sharing insights with colleagues
  * Building secondary artifacts from dashboard data
- This finding strongly indicates that dashboard tools should provide robust support for externalization methods to match users' natural workflow.


In [183]:
# Create a histogram for comfort levels with AI-integrated dashboards
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np

# Tally of comfort-level answers (counts are hard-coded here).
# 'Rating' is the 1-5 numeric code of each level, used for the statistics below.
ai_comfort_data = {
    'Comfort Level': [
        'Very Uncomfortable',
        'Uncomfortable',
        'Neutral',
        'Comfortable',
        'Very Comfortable',
    ],
    'Count': [1, 1, 4, 25, 17],
    'Rating': [1, 2, 3, 4, 5],
}
ai_comfort_df = pd.DataFrame(ai_comfort_data)

# Share of each level in percent, rounded to one decimal place
total_responses = ai_comfort_df['Count'].sum()
_comfort_share = ai_comfort_df['Count'] / total_responses * 100
ai_comfort_df['Percentage'] = _comfort_share.round(1)

# Mean rating, weighting every rating by the number of respondents who chose it
_rating_points = ai_comfort_df['Rating'] * ai_comfort_df['Count']
weighted_mean = sum(_rating_points) / total_responses

# Median rating: expand the tally into one rating per respondent, then take the median
_expanded_ratings = []
for _rating, _rating_count in zip(ai_comfort_df['Rating'], ai_comfort_df['Count']):
    _expanded_ratings.extend([_rating] * _rating_count)
median_comfort = np.median(_expanded_ratings)

# Create a color gradient from red to green (one colour per comfort level, in row order)
colors = ['#E74C3C', '#F39C12', '#F1C40F', '#2ECC71', '#27AE60']

# Create the bar chart of response counts per comfort level
fig_ai_comfort = go.Figure()

# Add the bars. Percentages travel in `customdata` so the hover box can show
# them; `text` holds the counts, so using %{text} for the percentage (as
# before) displayed the count with a percent sign.
fig_ai_comfort.add_trace(go.Bar(
    x=ai_comfort_df['Comfort Level'],
    y=ai_comfort_df['Count'],
    text=ai_comfort_df['Count'],
    customdata=ai_comfort_df['Percentage'],
    textposition='outside',
    marker_color=colors,
    width=0.6,  # Make bars slightly narrower
    hovertemplate='<b>%{x}</b><br>Count: %{y}<br>Percentage: %{customdata:.1f}%<extra></extra>',
    texttemplate='%{y}'
))

# Add a vertical dashed line at the weighted mean rating.
# On a category axis, shape coordinates are category positions counted from 0,
# so the mean is shifted by the first rating code to land between the bars.
# Assumes ratings are consecutive integers in row order (1..5 here).
_mean_position = float(weighted_mean) - float(ai_comfort_df['Rating'].iloc[0])
fig_ai_comfort.add_shape(
    type="line",
    xref="x", yref="paper",  # x in category positions, y spans the full plot height
    x0=_mean_position, y0=0,
    x1=_mean_position, y1=1,
    line=dict(color="rgba(0,0,0,0.6)", width=2, dash="dash"),
)

# Update the layout
fig_ai_comfort.update_layout(
    title="Comfort Level with AI Integration in Dashboards",
    title_x=0.5,
    xaxis_title="Comfort Level",
    yaxis_title="Number of Respondents",
    plot_bgcolor='rgba(240,240,240,0.95)',
    paper_bgcolor='white',
    font=dict(family="Arial, sans-serif", size=12),
    bargap=0.2,
    annotations=[
        dict(
            x=2,  # Position in middle of chart (third category)
            y=max(ai_comfort_df['Count']) * 1.1,  # Slightly above the highest bar
            text=f"Mean: {weighted_mean:.1f} | Median: {median_comfort:.1f}<br>Total Respondents: {total_responses}",
            showarrow=False,
            font_size=12
        )
    ]
)

# Show the plot
fig_ai_comfort.show()

# Provide analysis of the findings.
# Group shares are computed from the raw counts (not by adding already-rounded
# percentages, which can drift by 0.1), and groups are selected by rating code
# rather than by row position.
_comfort_counts = ai_comfort_df['Count']
_comfort_ratings = ai_comfort_df['Rating']
_comfort_total = _comfort_counts.sum()

_comfortable_n = _comfort_counts[_comfort_ratings >= 4].sum()
_neutral_n = _comfort_counts[_comfort_ratings == 3].sum()
_uncomfortable_n = _comfort_counts[_comfort_ratings <= 2].sum()

# Most frequent answer, derived from the data (first one wins on a tie)
_modal_level, _modal_count = max(
    zip(ai_comfort_df['Comfort Level'], _comfort_counts),
    key=lambda level_and_count: level_and_count[1],
)

print("\nAnalysis of Comfort with AI-Integrated Dashboards:")
print(f"- Strong positive sentiment: {_comfortable_n / _comfort_total * 100:.1f}% of respondents ({_comfortable_n} people) feel comfortable or very comfortable with AI integration.")
print(f"- Neutral stance: {_neutral_n / _comfort_total * 100:.1f}% ({_neutral_n} people) feel neither comfortable nor uncomfortable.")
print(f"- Minimal resistance: Only {_uncomfortable_n / _comfort_total * 100:.1f}% ({_uncomfortable_n} people) express discomfort with AI integration.")
print(f"- The weighted average comfort level is {weighted_mean:.1f} (out of 5), indicating strong overall acceptance.")
print(f"- The most common response is '{_modal_level}' with {_modal_count} respondents ({_modal_count / _comfort_total * 100:.1f}%).")
# NOTE(review): the implications below are interpretation of the distribution, not computed results.
print("\nImplications:")
print("- The high comfort level suggests dashboard users are ready for AI-based features like automated insights and recommendations.")
print("- The small minority with concerns may benefit from transparency features showing how AI generates insights.")
print("- Dashboard designers can confidently pursue AI integration while providing optional controls for the small percentage of uncomfortable users.")
print("- Organizations can likely expect positive reception to AI-enhanced analytics tools from most users.")


Analysis of Comfort with AI-Integrated Dashboards:
- Strong positive sentiment: 87.5% of respondents (42 people) feel comfortable or very comfortable with AI integration.
- Neutral stance: 8.3% (4 people) feel neither comfortable nor uncomfortable.
- Minimal resistance: Only 4.2% (2 people) express discomfort with AI integration.
- The weighted average comfort level is 4.2 (out of 5), indicating strong overall acceptance.
- The most common response is 'Comfortable' with 25 respondents (52.1%).

Implications:
- The high comfort level suggests dashboard users are ready for AI-based features like automated insights and recommendations.
- The small minority with concerns may benefit from transparency features showing how AI generates insights.
- Dashboard designers can confidently pursue AI integration while providing optional controls for the small percentage of uncomfortable users.
- Organizations can likely expect positive reception to AI-enhanced analytics tools from most users.


In [2]:
# Create two funnel plots with the specified level values
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

# Raw tallies for the two panels: six levels each, values are hard-coded here
funnel1_data = dict(
    Level=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5', 'Level 6'],
    Value=[12, 9, 21, 3, 1, 1],
)
funnel2_data = dict(
    Level=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5', 'Level 6'],
    Value=[0, 1, 6, 23, 7, 11],
)

# One DataFrame per panel
df1, df2 = pd.DataFrame(funnel1_data), pd.DataFrame(funnel2_data)

# Panel totals, then each level's share of its own panel in percent (one decimal)
total1, total2 = sum(df1['Value']), sum(df2['Value'])
_share1 = df1['Value'] / total1 * 100
_share2 = df2['Value'] / total2 * 100
df1['Percentage'] = _share1.round(1)
df2['Percentage'] = _share2.round(1)

# Create subplots: 1 row, 2 columns.
# Each panel's total is part of its subplot title. Funnel traces are drawn on
# x/y axes, so a free annotation added without xref/yref="paper" is positioned
# in the first panel's data coordinates rather than above each panel.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=[f'Funnel Plot 1 (Total: {total1})', f'Funnel Plot 2 (Total: {total2})'],
    specs=[[{'type': 'funnel'}, {'type': 'funnel'}]],
)

# One entry per panel: (trace name, data, bar colours, connector colour, subplot column)
_funnel_panels = [
    ('Funnel 1', df1,
     ['#004c6d', '#1d6996', '#4682b4', '#6b9ebf', '#95b8bf', '#c1d1e1'],
     'royalblue', 1),
    ('Funnel 2', df2,
     ['#6e3b3b', '#ac4b4b', '#ca5c5c', '#da7a7a', '#e8a0a0', '#f5c8c8'],
     'firebrick', 2),
]

# Add one funnel trace per panel
for _panel_name, _panel_df, _panel_palette, _panel_connector, _panel_col in _funnel_panels:
    fig.add_trace(
        go.Funnel(
            name=_panel_name,
            y=_panel_df['Level'],
            x=_panel_df['Value'],
            textposition='inside',
            # Show each level as a share of the panel total. "percent initial"
            # is relative to the first level, which is 0 in the second panel
            # and is not the largest level in the first, so it does not
            # describe these distributions.
            textinfo='value+percent total',
            opacity=0.65,
            marker=dict(color=_panel_palette, line=dict(width=2)),
            connector=dict(line=dict(color=_panel_connector, width=2)),
        ),
        row=1, col=_panel_col,
    )

# Update layout
fig.update_layout(
    title_text='Comparison of Two Funnel Distributions',
    title_x=0.5,
    showlegend=False,
    height=600,
    width=1000,
    margin=dict(t=80, b=20, l=120, r=120),
    paper_bgcolor='white',
    plot_bgcolor='rgba(240,240,240,0.95)',
    font=dict(family="Arial, sans-serif", size=12)
)

# Show the figure
fig.show()

# Print summary statistics
print("\nFunnel 1 Summary:")
print(df1[['Level', 'Value', 'Percentage']])
print(f"\nTotal for Funnel 1: {total1}")
print("\nFunnel 2 Summary:")
print(df2[['Level', 'Value', 'Percentage']])
print(f"\nTotal for Funnel 2: {total2}")


def _peak_level(frame):
    """Return the 'Level' label with the largest 'Value' (first one wins on a tie)."""
    return max(zip(frame['Level'], frame['Value']), key=lambda level_and_value: level_and_value[1])[0]


def _more_entries_line(span_label, value1, value2):
    """Return the sentence naming which funnel has more entries in `span_label`."""
    if value1 == value2:
        return f"- Both funnels have the same number of entries in {span_label}"
    leader = 'Funnel 1' if value1 > value2 else 'Funnel 2'
    return f"- {leader} has more entries in {span_label}"


# Compare the distributions. Every statement is derived from df1/df2 so the
# text cannot contradict the numbers if the tallies above are edited.
print("\nDistribution Comparison:")
print(f"- Funnel 1 has highest concentration in {_peak_level(df1)}")
print(f"- Funnel 2 has highest concentration in {_peak_level(df2)}")
# Rows 0-1 are Levels 1-2; rows 3-5 are Levels 4-6
print(_more_entries_line("lower levels (Levels 1-2)", df1['Value'].iloc[:2].sum(), df2['Value'].iloc[:2].sum()))
print(_more_entries_line("higher levels (Levels 4-6)", df1['Value'].iloc[3:].sum(), df2['Value'].iloc[3:].sum()))
if total1 != total2:
    print(f"- The total number of entries differs: Funnel 1 ({total1}) vs Funnel 2 ({total2})")
else:
    print(f"- Both funnels have the same total number of entries ({total1})")


Funnel 1 Summary:
     Level  Value
0  Level 1     12
1  Level 2      9
2  Level 3     21
3  Level 4      3
4  Level 5      1
5  Level 6      1

Total for Funnel 1: 47

Funnel 2 Summary:
     Level  Value
0  Level 1      0
1  Level 2      1
2  Level 3      6
3  Level 4     23
4  Level 5      7
5  Level 6     11

Total for Funnel 2: 48

Distribution Comparison:
- Funnel 1 has highest value in Level 3
- Funnel 2 has highest value in Level 4
- Funnel 1 has more entries in lower levels (Levels 1-2)
- Funnel 2 has more entries in higher levels (Levels 4-6)
- The total number of entries differs: Funnel 1 (47) vs Funnel 2 (48)
