In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

# Breakfast Buffet Example


In [2]:
# Seed NumPy's global RNG so every run draws the same sample
np.random.seed(10010)

foods = ["Pastry", "Granola", "Yogurt", "Potatoes", "Eggs"]
N_obs = 5000
# Sequence lengths 1-4: the two middle lengths carry most of the probability mass
probabilities = [0.1, 0.4, 0.4, 0.1]
num_events = np.random.choice(np.arange(1, 5), size=N_obs, p=probabilities)
# Report how many sequences of each length were drawn
print(pd.Series(num_events).value_counts().sort_index())


1     498
2    1973
3    1983
4     546
Name: count, dtype: int64


In [3]:
def one_event(length, data, sep='-', weights=None):
    """Draw a random sequence of items and join it into a single string.

    Items are sampled with replacement from ``data`` using NumPy's global
    random state, so results are reproducible after ``np.random.seed``.

    Parameters
    ----------
    length : int
        Number of items to draw.
    data : sequence of str
        Items to sample from.
    sep : str, default '-'
        Separator placed between the drawn items.
    weights : sequence of float, optional
        Relative (unnormalised) weight of each item in ``data``. When
        omitted, weights decrease linearly from ``len(data)`` down to 1,
        which for five items is the 5:4:3:2:1 split this function
        previously hard-coded.

    Returns
    -------
    str
        The drawn items joined by ``sep``.
    """
    if weights is None:
        # Linearly decreasing preference: first item most likely, last least.
        weights = np.arange(len(data), 0, -1)
    prob = np.asarray(weights, dtype=float)
    prob = prob / prob.sum()  # normalise so the probabilities sum to 1
    event = np.random.choice(data, length, p=prob, replace=True)
    return sep.join(event)

# Smoke-test the generator with a 3-item and a 6-item sequence (one per line)
print(*(one_event(n_items, foods) for n_items in (3, 6)), sep='\n')


Granola-Granola-Granola
Potatoes-Potatoes-Pastry-Granola-Potatoes-Pastry


In [4]:
# Draw one food sequence for every sampled sequence length
events = [one_event(n_items, foods) for n_items in num_events]
events_series = pd.Series(events)

# Tally how often each distinct sequence occurs (columns: path, count)
events_freq = (
    events_series
    .value_counts()
    .rename_axis('path')
    .reset_index(name='count')
)

# Fixed palette, paired positionally with `foods` to give each item its own color
breakfast_palette = ['#E41A1C', '#377EB8', '#4DAF4A', '#984EA3', '#FF7F00']
color_map = dict(zip(foods, breakfast_palette))


In [5]:
def build_buffet_sunburst_data(df, colors=None):
    """Expand '-'-separated paths into the node table a sunburst trace needs.

    Every prefix of every path becomes one node. A node's ``value`` is the
    count of the row whose path equals that node's id; prefixes that never
    occur as a complete path keep the value 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``path`` column ('-'-joined labels) and a ``count`` column.
    colors : mapping of str to str, optional
        Label -> color lookup. Defaults to the module-level ``color_map`` as
        it is bound at call time. Labels missing from the mapping get the
        grey ``'#CCCCCC'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``parent``, ``label``, ``value``, ``color``, one row
        per node in first-seen order. The columns are present even when
        ``df`` has no rows.
    """
    palette = color_map if colors is None else colors
    columns = ['id', 'parent', 'label', 'value', 'color']

    # Keyed by node id so a leaf can be updated without scanning every node;
    # dict insertion order keeps the first-seen ordering of the rows.
    nodes_by_id = {}

    for path, count in zip(df['path'], df['count']):
        labels = path.split('-')

        for depth in range(1, len(labels) + 1):
            node_id = '-'.join(labels[:depth])
            if node_id in nodes_by_id:
                continue
            label = labels[depth - 1]
            nodes_by_id[node_id] = {
                'id': node_id,
                # Joining an empty prefix yields '', which marks a root node
                'parent': '-'.join(labels[:depth - 1]),
                'label': label,
                'value': 0,  # stays 0 unless this prefix is itself a full path
                'color': palette.get(label, '#CCCCCC'),
            }

        # The complete path is the last prefix created (or found) above
        nodes_by_id[path]['value'] = count

    return pd.DataFrame(list(nodes_by_id.values()), columns=columns)

# Expand the breakfast sequence frequencies into the node table used by the sunburst trace
sunburst_df = build_buffet_sunburst_data(events_freq)

In [6]:
# Render the breakfast sequences as a sunburst chart
fig = go.Figure(
    data=go.Sunburst(
        ids=sunburst_df['id'],
        labels=sunburst_df['label'],
        parents=sunburst_df['parent'],
        values=sunburst_df['value'],
        marker=dict(colors=sunburst_df['color']),  # per-node color column
        insidetextorientation='radial',
        domain=dict(x=[0, 1], y=[0, 1]),
    )
)

fig.update_layout(
    title_text='Sunburst Chart of Example Food Sequences',
    uniformtext_minsize=12,
    uniformtext_mode='hide',
)

fig.show()

# EPA Example
This example assumes that you started `jupyter notebook` from the `notebooks` directory of the repository. If you started it from a different folder, adjust the relative path passed to `pd.read_csv` below (`../data/epa_log.csv`) so that the data file can be found.

In [8]:
# Load the raw EPA web log from CSV (the path is relative to the notebook's working directory)
epa_df = pd.read_csv('../data/epa_log.csv')


In [9]:
# Parse the timestamps; values that cannot be parsed become NaT instead of raising
epa_df['datetime'] = pd.to_datetime(epa_df['datetime'], format='mixed', errors='coerce')
# Order the data by host and datetime (the original row labels are kept)
epa_ordered = epa_df.sort_values(by=['host', 'datetime']).copy()

# Minutes since the same host's previous request (NaN for each host's first row)
epa_ordered['time_diff'] = epa_ordered.groupby('host')['datetime'].diff().dt.total_seconds() / 60

# A row opens a new session when the host differs from the previous row
# or when the gap to the host's previous request reaches the cutoff (minutes)
session_time_cutoff = 15
epa_ordered['new_session'] = (epa_ordered['host'] != epa_ordered['host'].shift(1)) | \
                             (epa_ordered['time_diff'] >= session_time_cutoff)
# No explicit fix-up of the first row is needed: shift(1) puts NaN in front of
# it, so the host comparison above is already True there. A label-based
# `.loc[0, ...]` assignment would be wrong after sorting, because label 0 is
# the first row of the CSV, not the first row of the sorted frame, and flagging
# it would split whatever session that row belongs to.
epa_ordered['session'] = epa_ordered['new_session'].cumsum()

# Filter for HTML pages and prepare for sunburst processing
epa_html = epa_ordered[epa_ordered['pagetype'] == 'html'].copy()
# '-' is reserved as the path separator, so replace it (literally) inside page names
epa_html['page'] = epa_html['page'].str.replace('-', '_', regex=False)

def create_sequence(pages, max_pages=5, sep='-'):
    """Join the first pages of a session into a single path string.

    Parameters
    ----------
    pages : pandas.Series of str
        Page names of one session, in visit order.
    max_pages : int, default 5
        Only the first ``max_pages`` entries are kept.
    sep : str, default '-'
        Separator placed between page names.

    Returns
    -------
    str
        The kept page names joined by ``sep`` (empty string for an empty
        series).
    """
    return sep.join(pages.head(max_pages))

In [10]:
# Collapse each session's HTML page views into one '-'-joined path
epa_sequences = (
    epa_html
    .groupby('session')['page']
    .apply(create_sequence)
    .reset_index(name='sequence')
)

# Count how many sessions share each path (columns: path, count)
epa_sequences_freq = (
    epa_sequences['sequence']
    .value_counts()
    .rename_axis('path')
    .reset_index(name='count')
)

# Keep only the paths that were followed by more than one session
epa_sequences_freq_filtered = epa_sequences_freq.loc[epa_sequences_freq['count'] > 1].copy()

# Assign a color to every distinct page appearing in the kept paths,
# cycling through the palette when there are more pages than colors.
# NOTE: this rebinds the module-level `color_map` that the breakfast example also uses.
unique_pages = epa_sequences_freq_filtered['path'].str.split('-').explode().unique()
colors = px.colors.qualitative.Plotly
color_map = {
    page_name: colors[position % len(colors)]
    for position, page_name in enumerate(unique_pages)
}

In [11]:
def build_epa_sunburst_data(df, colors=None):
    """Expand '-'-separated page paths into the node table a sunburst trace needs.

    Every prefix of every path becomes one node. A node's ``value`` is the
    count of the row whose path equals that node's id; prefixes that never
    occur as a complete path keep the value 0.

    NOTE(review): the algorithm is the same as ``build_buffet_sunburst_data``
    in this notebook; consider consolidating the two.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``path`` column ('-'-joined labels) and a ``count`` column.
    colors : mapping of str to str, optional
        Label -> color lookup. Defaults to the module-level ``color_map`` as
        it is bound at call time. Labels missing from the mapping get the
        grey ``'#CCCCCC'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``parent``, ``label``, ``value``, ``color``, one row
        per node in first-seen order. The columns are present even when
        ``df`` has no rows.
    """
    palette = color_map if colors is None else colors
    columns = ['id', 'parent', 'label', 'value', 'color']

    # Keyed by node id so a leaf can be updated without scanning every node;
    # dict insertion order keeps the first-seen ordering of the rows.
    nodes_by_id = {}

    for path, count in zip(df['path'], df['count']):
        labels = path.split('-')

        for depth in range(1, len(labels) + 1):
            node_id = '-'.join(labels[:depth])
            if node_id in nodes_by_id:
                continue
            label = labels[depth - 1]
            nodes_by_id[node_id] = {
                'id': node_id,
                # Joining an empty prefix yields '', which marks a root node
                'parent': '-'.join(labels[:depth - 1]),
                'label': label,
                'value': 0,  # stays 0 unless this prefix is itself a full path
                'color': palette.get(label, '#CCCCCC'),
            }

        # The complete path is the last prefix created (or found) above
        nodes_by_id[path]['value'] = count

    return pd.DataFrame(list(nodes_by_id.values()), columns=columns)

# Expand the repeated EPA page paths into the node table used by the sunburst trace
sunburst_df_epa = build_epa_sunburst_data(epa_sequences_freq_filtered)

In [12]:
# Render the EPA page paths as a sunburst chart
fig_final = go.Figure(
    data=go.Sunburst(
        ids=sunburst_df_epa['id'],
        labels=sunburst_df_epa['label'],
        parents=sunburst_df_epa['parent'],
        values=sunburst_df_epa['value'],
        marker=dict(colors=sunburst_df_epa['color']),  # per-node color column
        insidetextorientation='radial',
        domain=dict(x=[0, 1], y=[0, 1]),
    )
)

fig_final.update_layout(
    title_text='Web Log Sunburst Chart',
    uniformtext_minsize=12,
    uniformtext_mode='hide',
)

fig_final.show()