In [8]:
import pandas as pd
import plotly.graph_objs as go
import ipywidgets as widgets
from ipywidgets import HBox
import numpy as np
import html  # to escape tooltip text

# Parameters
bin_width = 5  # width of each repeat-length bin, in the units of repeatLength

# Load CSV (make sure this is correct in your notebook)
df = pd.read_csv('SAMD12_RO.csv')

# Label alleles without an assigned haplotype as 'Other'.
# read_csv parses blank cells as NaN by default, so comparing against ''
# alone never matches; NaN rows would then be silently dropped by
# value_counts() and isin() below. Handle both NaN and literal ''.
df['assigned_haplotype'] = (
    df['assigned_haplotype'].fillna('Other').replace('', 'Other')
)

# Keep only the 10 most frequent haplotypes. The .copy() makes the result an
# independent frame so the column assignments below do not write to a slice
# of the loaded data (avoids SettingWithCopyWarning).
top_haps = df['assigned_haplotype'].value_counts().head(10).index.tolist()
df = df[df['assigned_haplotype'].isin(top_haps)].copy()

# Bin repeat lengths: repeat_bin is the lower edge, repeat_bin_center the midpoint
df['repeat_bin'] = (df['repeatLength'] // bin_width) * bin_width
df['repeat_bin_center'] = df['repeat_bin'] + bin_width / 2

# Count how often each encodedSeq occurs within every (bin, haplotype) pair
seq_counts = (
    df.groupby(['repeat_bin_center', 'assigned_haplotype', 'encodedSeq'])
    .size()
    .reset_index(name='encodedSeq_count')
)

def format_tooltip(group):
    """Build the hover text for one group of sequence counts.

    ``group`` must expose ``encodedSeq`` and ``encodedSeq_count`` columns.
    Each row becomes ``"<escaped sequence> (n=<count>)"`` and the rows are
    joined with ``<br>``. Sequences are HTML-escaped because they contain
    ``<`` and ``>`` characters that would otherwise be read as markup.
    """
    lines = []
    for sequence, occurrences in zip(group['encodedSeq'], group['encodedSeq_count']):
        escaped = html.escape(sequence)
        lines.append(f"{escaped} (n={int(occurrences)})")
    return "<br>".join(lines)

# Both aggregations below are keyed on the same (bin, haplotype) pair.
bin_hap_keys = ['repeat_bin_center', 'assigned_haplotype']

# One tooltip string per (bin, haplotype), built from the per-sequence counts.
tooltip_series = seq_counts.groupby(bin_hap_keys).apply(
    format_tooltip, include_groups=False
)
tooltip_df = tooltip_series.reset_index(name='encodedSeq_summary')

# Number of rows (alleles) per (bin, haplotype): the height of each bar segment.
bar_sizes = df.groupby(bin_hap_keys).size()
bar_counts = bar_sizes.reset_index(name='count')

# Left join keeps every bar even if it has no tooltip entry.
merged = bar_counts.merge(tooltip_df, on=bin_hap_keys, how='left')

def shorten_label(haplotype, max_len=50):
    """Return ``haplotype`` unchanged if it has at most ``max_len`` characters.

    Longer labels are cut to their first ``max_len`` characters and an
    ellipsis character is appended.
    """
    if len(haplotype) > max_len:
        return haplotype[:max_len] + "…"
    return haplotype

# Legend text: haplotype names shortened so the legend stays readable.
merged['legend_label'] = merged['assigned_haplotype'].map(shorten_label)

# === FigureWidget setup ===
fig = go.FigureWidget()
haplotypes = merged['assigned_haplotype'].unique()

# Same hover layout for every trace; customdata carries the per-bar tooltip text.
hover_template = (
    "Repeat Length Bin Center: %{x}<br>"
    "Count: %{y}<br>"
    "EncodedSeqs:<br>%{customdata}<extra></extra>"
)

# One stacked bar trace per haplotype.
for hap_name in haplotypes:
    hap_rows = merged.loc[merged['assigned_haplotype'] == hap_name]
    fig.add_trace(go.Bar(
        x=hap_rows['repeat_bin_center'],
        y=hap_rows['count'],
        name=hap_rows['legend_label'].iloc[0],
        customdata=hap_rows['encodedSeq_summary'],
        hovertemplate=hover_template,
    ))

fig.update_layout(
    barmode='stack',
    width=1000,
    height=600,
    xaxis={
        'title': 'Repeat Length (binned, bp)',
        'type': 'linear',
        'rangeslider': {'visible': True},
    },
    yaxis={'title': 'Allele Count'},
    legend={
        'title': "Haplotype",
        'orientation': "v",
        'x': 1.02,
        'y': 1,
        'font': {'size': 10},
    },
)

# === Slider ===
# Upper bound of the slider: 1.5x the tallest stacked bar.
stack_totals = merged.groupby('repeat_bin_center')['count'].sum()
max_y = int(stack_totals.max() * 1.5)

y_slider = widgets.IntRangeSlider(
    min=0,
    max=max_y,
    value=[0, int(max_y * 0.9)],
    step=10,
    orientation='vertical',
    continuous_update=False,
    description='Y-axis',
    style={'description_width': '0px'},
    layout=widgets.Layout(width='70px', height='600px'),
)

def update_y_range(change):
    """Observer callback: set the figure's y-axis range to the slider's new value."""
    selected_range = change['new']
    fig.update_yaxes(range=selected_range)

y_slider.observe(update_y_range, names='value')

# observe() only reports later changes, so the slider's starting range would
# never reach the figure: the plot would open auto-ranged while the slider
# shows a different window, and the axis would jump on the first drag.
# Apply the initial value once so both start in sync.
fig.update_yaxes(range=list(y_slider.value))

# === Voilà output ===
HBox([fig, y_slider])


HBox(children=(FigureWidget({
    'data': [{'customdata': array(['&lt;TAAAA&gt;2&lt;TAA&gt;1&lt;TAAAA&gt;3 (n=…

In [23]:
import plotly.graph_objs as go
import re
from collections import Counter
import html
import ipywidgets as widgets
from IPython.display import display

# === Color Palette ===
# Qualitative colors assigned to motifs in order. Every entry must be unique:
# a repeated color would make two different motifs indistinguishable in the
# plot and its legend. (The final entry used to repeat '#fb8072'.)
qPalette = [
    '#b2df8a', '#a6cee3', '#fdbf6f', '#cab2d6', '#ccebc5',
    '#ffffb3', '#d9d9d9', '#e5c494', '#fccde5', '#b3b3b3',
    '#66c2a5', '#80b1d3', '#fdb462', '#fb8072', '#fb9a99',
    '#e78ac3', '#ffd92f', '#bc80bd', '#a65628', '#b3de69',
    '#33a02c', '#1f78b4', '#ff7f00', '#e41a1c', '#999999',
    '#984ea3', '#4daf4a', '#377eb8', '#ffed6f', '#6a3d9a',
    '#b15928', '#8da0cb', '#f781bf', '#8dd3c7'
]

# === Functions ===
def generate_motif_color_map(encoded_seqs, min_count=1, palette=None):
    """Map each repeat motif to a display color.

    Every string in ``encoded_seqs`` is scanned for ``<motif>n`` runs and the
    ``n`` values are summed per motif. Motifs whose total reaches
    ``min_count`` receive a color; non-string entries (e.g. NaN) are skipped.

    Colors are handed out in order of descending total, so when there are
    more motifs than palette entries it is the rarest motifs that fall back
    to ``'gray'`` rather than whichever motifs happened to be seen last.

    Parameters
    ----------
    encoded_seqs : iterable
        Encoded sequences such as ``'<TAAAA>2<TAA>1'``.
    min_count : int, default 1
        Minimum summed repeat count for a motif to be included.
    palette : sequence of str, optional
        Colors to assign in order. Defaults to the module-level ``qPalette``.

    Returns
    -------
    dict
        ``{motif: color}`` for every motif that met ``min_count``.
    """
    if palette is None:
        palette = qPalette

    motif_counter = Counter()
    for seq in encoded_seqs:
        if not isinstance(seq, str):
            continue
        for motif, count in re.findall(r'<([^<>]+)>(\d+)', seq):
            motif_counter[motif] += int(count)

    # most_common() sorts by descending total; ties keep first-seen order.
    ranked = [
        motif for motif, total in motif_counter.most_common()
        if total >= min_count
    ]
    return {
        motif: palette[i] if i < len(palette) else 'gray'
        for i, motif in enumerate(ranked)
    }

def compute_repeat_length(encoded):
    """Return the total length described by an encoded sequence.

    The string is scanned for ``<motif>n`` runs; each contributes
    ``len(motif) * n``. Text that matches no run contributes nothing.
    """
    total_length = 0
    for motif, repeats in re.findall(r'<([^<>]+)>(\d+)', encoded):
        total_length += len(motif) * int(repeats)
    return total_length

def plot_segmented_waterfall_with_freq(df, motif_to_color, min_count):
    """Show a waterfall plot with one row per distinct ``encodedSeq``.

    Each row is a horizontal bar split into one segment per ``<motif>n`` run
    of the encoded sequence (segment length ``len(motif) * n``), followed by
    a black bar whose width scales with how often that allele occurs. Rows
    are ordered by descending repeat length. The figure is displayed with
    ``fig.show()``; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``encodedSeq`` column.
    motif_to_color : dict
        Motif -> color. Motifs missing from the map, or mapped to ``'gray'``,
        are drawn gray and share a single "Other" legend entry.
    min_count : int
        Not used by this function; kept so existing callers keep working.
    """
    motif_pattern = r'<([^<>]+)>(\d+)'
    freq_scale = 0.1     # most frequent allele's bar spans 10% of the longest allele
    min_freq_width = 5   # floor so rare alleles still get a visible bar
    freq_gap = 3         # space between an allele and its frequency bar

    allele_counts = df['encodedSeq'].value_counts().reset_index()
    # Name the columns explicitly instead of relying on value_counts' defaults.
    allele_counts.columns = ['encodedSeq', 'count']

    if allele_counts.empty:
        # max() of an empty column is NaN, which would produce an invalid axis range.
        print("No alleles to plot.")
        return

    allele_counts['repeat_length'] = allele_counts['encodedSeq'].apply(compute_repeat_length)
    total = allele_counts['count'].sum()

    allele_counts = allele_counts.sort_values('repeat_length', ascending=False).reset_index(drop=True)

    fig = go.Figure()
    max_freq = allele_counts['count'].max()
    max_length = allele_counts['repeat_length'].max()

    colored_motifs = set()   # motifs drawn with their own color
    has_other = False        # True once any segment was drawn gray
    x_extent = 0             # right-most edge of anything drawn so far

    rows = zip(allele_counts['encodedSeq'], allele_counts['count'])
    for y_val, (encoded, count) in enumerate(rows):
        freq = count / total
        x_start = 0

        for motif, n in re.findall(motif_pattern, encoded):
            n = int(n)
            seg_len = len(motif) * n
            color = motif_to_color.get(motif, 'gray')
            if color == 'gray':
                has_other = True
            else:
                colored_motifs.add(motif)
            label = f"&lt;{motif}&gt;{n}"

            fig.add_trace(go.Bar(
                x=[seg_len],
                y=[y_val],
                base=x_start,
                orientation='h',
                marker=dict(color=color),
                hovertemplate=label + "<extra></extra>",
                showlegend=False,
                width=1
            ))
            x_start += seg_len

        freq_width = max(min_freq_width, freq_scale * max_length * (count / max_freq))

        fig.add_trace(go.Bar(
            x=[freq_width],
            y=[y_val],
            base=x_start + freq_gap,
            orientation='h',
            marker=dict(color='black'),
            hovertemplate=(
                f"EncodedSeq: {html.escape(encoded)}<br>"
                f"Allele count: {count}<br>"
                f"Frequency: {freq:.2%}<extra></extra>"
            ),
            showlegend=False,
            width=0.6
        ))
        x_extent = max(x_extent, x_start + freq_gap + freq_width)

    def add_legend_entry(name, color):
        # Invisible marker trace whose only job is to create a legend item.
        fig.add_trace(go.Scatter(
            x=[None], y=[None],
            mode='markers',
            marker=dict(size=10, color=color),
            name=name,
            hoverinfo='skip'
        ))

    for motif, color in motif_to_color.items():
        if color != 'gray' and motif in colored_motifs:
            add_legend_entry(motif, color)
    # Exactly one "Other" entry covers every gray segment, including motifs
    # that are absent from motif_to_color altogether.
    if has_other:
        add_legend_entry('Other', 'gray')

    num_alleles = len(allele_counts)
    row_height = 25
    plot_height = min(max(num_alleles * row_height, 400), 1500)

    # The frequency bar of a long allele can reach past max_length + 25;
    # widen the axis when needed so that bar is never clipped.
    x_upper = float(max(max_length + 25, x_extent + 5))

    fig.update_layout(
        title="",
        plot_bgcolor='white',
        paper_bgcolor='snow',
        xaxis=dict(
            title='Repeat Length (bp)',
            range=[0, x_upper]
        ),
        yaxis=dict(
            title='Alleles',
            showticklabels=False,
            autorange='reversed'
        ),
        barmode='stack',
        height=plot_height,
        width=1400,
        legend=dict(
            x=1.01,
            y=0,
            yanchor='bottom',
            font=dict(size=10),
            title='Motifs'
        ),
        margin=dict(t=40, l=40, r=180, b=40)
    )

    fig.show()

# === Interactive Voilà Widget Setup ===
# Highest number of occurrences of any single encodedSeq; caps the slider.
# NOTE(review): `df` is not created in this cell — it must already exist in
# the kernel (presumably from the earlier data-loading cell; confirm).
max_count = df['encodedSeq'].value_counts().max()

# Minimum-occurrence threshold; only reports a value once the handle is released.
allele_slider = widgets.IntSlider(
    description='Min Count:',
    min=1,
    max=max_count,
    value=2,
    step=1,
    continuous_update=False,
)

# Text summary shown above the plot, and the output area the plot is drawn into.
plot_out = widgets.Output()
summary_label = widgets.Label()

def plot_filtered_alleles(min_count):
    """Redraw the waterfall plot for alleles seen at least ``min_count`` times.

    Clears ``plot_out``, keeps the rows of ``df`` whose ``encodedSeq`` occurs
    at least ``min_count`` times, updates ``summary_label`` with how many
    distinct alleles remain, and draws the plot inside ``plot_out``.
    """
    plot_out.clear_output(wait=True)
    with plot_out:
        counts_per_allele = df['encodedSeq'].value_counts()
        kept_alleles = counts_per_allele.loc[counts_per_allele >= min_count].index
        keep_mask = df['encodedSeq'].isin(kept_alleles)
        df_kept = df.loc[keep_mask]

        shown = len(kept_alleles)
        available = counts_per_allele.shape[0]
        # Threshold expressed as a fraction of all rows in df.
        threshold_af = min_count / df.shape[0]

        summary_label.value = (
            f"Showing {shown} / {available} alleles (\u2265 {threshold_af:.2%} AF)"
        )

        motif_colors = generate_motif_color_map(df_kept['encodedSeq'], min_count=2)
        plot_segmented_waterfall_with_freq(df_kept, motif_colors, min_count)

def _on_min_count_change(change):
    """Redraw the plot with the slider's new value."""
    plot_filtered_alleles(change['new'])


allele_slider.observe(_on_min_count_change, names='value')

# === Voilà-compatible rendering ===
def build_voila_app():
    """Draw the initial plot and return the widget layout.

    The plot is rendered once for the slider's current value; the returned
    VBox stacks the slider, the summary label and the plot output.
    """
    initial_min_count = allele_slider.value
    plot_filtered_alleles(initial_min_count)
    app_children = [allele_slider, summary_label, plot_out]
    return widgets.VBox(app_children)

# Last expression of the cell, so the returned VBox is what gets displayed.
build_voila_app()


VBox(children=(IntSlider(value=2, continuous_update=False, description='Min Count:', max=161, min=1), Label(va…