In [26]:
import pandas as pd
import plotly.graph_objs as go
import ipywidgets as widgets
from ipywidgets import HBox, Output
import html

# Parameters
bin_width = 3  # width of each bin applied to `repeatLength`
csv_file = "FGF14_RO.csv"

# Load data
df = pd.read_csv(csv_file)

# Sanitize haplotype group.
# pd.read_csv parses empty fields as NaN by default, so a comparison against
# '' never matches. Relabel both missing values and literal empty strings as
# 'Other' so those alleles are counted instead of being silently dropped by
# value_counts() / isin() below.
df['assigned_haplotype'] = (
    df['assigned_haplotype'].fillna('Other').replace('', 'Other')
)

# Keep only the 10 most frequent haplotype groups. The explicit .copy() makes
# the filtered frame independent of the original, so the column assignments
# below do not trigger SettingWithCopyWarning.
top_haps = df['assigned_haplotype'].value_counts().head(10).index.tolist()
df = df[df['assigned_haplotype'].isin(top_haps)].copy()

# Bin repeat lengths: floor to the bin's lower edge, then shift to its center
df['repeat_bin'] = (df['repeatLength'] // bin_width) * bin_width
df['repeat_bin_center'] = df['repeat_bin'] + bin_width / 2

# Count occurrences of each encodedSeq per bin × haplotype
seq_counts = (
    df.groupby(['repeat_bin_center', 'assigned_haplotype', 'encodedSeq'])
    .size()
    .reset_index(name='encodedSeq_count')
)

# Format tooltip text safely
def format_tooltip(group):
    """Render one hover-text block for a group of encodedSeq counts.

    Args:
        group: Mapping-like object (e.g. a DataFrame) exposing the
            ``'encodedSeq'`` and ``'encodedSeq_count'`` columns.

    Returns:
        A single string with one ``"<escaped seq> (n=<count>)"`` entry per
        row, joined by ``<br>``.
    """
    entries = []
    for raw_seq, n_obs in zip(group['encodedSeq'], group['encodedSeq_count']):
        # Escape the sequence so characters such as < and > show up literally
        entries.append(f"{html.escape(raw_seq)} (n={int(n_obs)})")
    return "<br>".join(entries)

# Build one hover string per bin × haplotype. The two payload columns are
# selected explicitly so the grouping keys are not part of the frame handed to
# format_tooltip; this avoids the pandas >= 2.2 DeprecationWarning about
# GroupBy.apply operating on the grouping columns.
tooltip_df = (
    seq_counts
    .groupby(['repeat_bin_center', 'assigned_haplotype'])[['encodedSeq', 'encodedSeq_count']]
    .apply(format_tooltip)
    .reset_index(name='encodedSeq_summary')
)

# Bar heights: number of rows (alleles) per bin × haplotype
bar_counts = (
    df.groupby(['repeat_bin_center', 'assigned_haplotype'])
    .size()
    .reset_index(name='count')
)

# Merge for plotting; a left join keeps every bar even when it has no
# tooltip summary.
merged = pd.merge(bar_counts, tooltip_df, on=['repeat_bin_center', 'assigned_haplotype'], how='left')

# Shorten legend labels
def shorten_label(haplotype, max_len=50):
    """Truncate a haplotype name for use as a legend entry.

    Args:
        haplotype: The label text.
        max_len: Longest label returned unchanged.

    Returns:
        ``haplotype`` itself when it has at most ``max_len`` characters,
        otherwise its first ``max_len`` characters followed by "…".
    """
    if len(haplotype) > max_len:
        return haplotype[:max_len] + "…"
    return haplotype

# Legend entries use the truncated haplotype names
merged['legend_label'] = merged['assigned_haplotype'].map(shorten_label)

# Output widget that hosts the rendered figure (for Voilà)
fig_out = Output()

# Build initial figure
def create_figure(y_range=None):
    """Assemble the stacked bar chart of allele counts per repeat-length bin.

    One bar trace is added for each distinct haplotype in the module-level
    ``merged`` frame. Each trace carries the pre-formatted encodedSeq summary
    as ``customdata`` so it can be shown on hover.

    Args:
        y_range: Value forwarded unchanged as the y-axis ``range``.

    Returns:
        The populated ``go.Figure``.
    """
    hover_template = (
        "Repeat Length Bin Center: %{x}<br>"
        "Count: %{y}<br>"
        "EncodedSeqs:<br>%{customdata}<extra></extra>"
    )

    figure = go.Figure()
    for hap_name in merged['assigned_haplotype'].unique():
        hap_rows = merged[merged['assigned_haplotype'] == hap_name]
        figure.add_bar(
            x=hap_rows['repeat_bin_center'],
            y=hap_rows['count'],
            # every row of a haplotype shares the same shortened label
            name=hap_rows['legend_label'].iloc[0],
            customdata=hap_rows['encodedSeq_summary'],
            hovertemplate=hover_template,
        )

    x_axis = dict(
        title='Repeat Length (binned, bp)',
        rangeslider=dict(visible=True),
        type='linear',
    )
    y_axis = dict(title='Allele Count', range=y_range)
    legend_cfg = dict(
        orientation="v",
        x=1.02,
        y=1,
        title="Haplotype",
        font=dict(size=10),
    )

    figure.update_layout(
        barmode='stack',
        height=600,
        width=1000,
        xaxis=x_axis,
        yaxis=y_axis,
        legend=legend_cfg,
    )
    return figure

# Y-axis slider: the upper bound is 1.5x the tallest stacked bar, and the
# initial selection spans 0 to 90% of that bound.
max_y = int(1.5 * merged.groupby('repeat_bin_center')['count'].sum().max())
y_slider = widgets.IntRangeSlider(
    value=[0, int(max_y * 0.9)],
    min=0,
    max=max_y,
    step=10,
    description='Y-axis',
    # tall and narrow so the vertical slider lines up with the 600px figure
    layout=widgets.Layout(height='600px', width='70px'),
    continuous_update=False,
    orientation='vertical',
    style=dict(description_width='0px'),
)

# Update function
def update_y_range(change):
    """Redraw the figure inside ``fig_out`` with a new y-axis range.

    Args:
        change: Change notification from the observed widget; only its
            ``'new'`` entry is read and forwarded as the y-axis range.
    """
    with fig_out:
        # wait=True: hold the old output until the replacement is available
        fig_out.clear_output(wait=True)
        create_figure(y_range=change['new']).show()

# Re-render whenever the slider's `value` trait changes
y_slider.observe(update_y_range, names='value')

# First draw, using the slider's starting range
with fig_out:
    fig = create_figure(y_range=y_slider.value)
    fig.show()

# Lay the figure and the slider side by side (last expression = cell output)
HBox(children=[fig_out, y_slider])






HBox(children=(Output(), IntRangeSlider(value=(0, 246), continuous_update=False, description='Y-axis', layout=…

In [31]:
# import pandas as pd
# import plotly.graph_objs as go
# import re
# from collections import Counter
# import matplotlib.pyplot as plt
# import matplotlib.cm as cm
# import matplotlib.colors as mcolors
# import html

# def generate_motif_color_map(encoded_seqs, min_count=1, palette='tab20'):
#     """
#     Auto-generate motif-to-color mapping based on motif frequency.
#     """
#     motif_counter = Counter()
#     for seq in encoded_seqs:
#         if not isinstance(seq, str): continue
#         for motif, count in re.findall(r'<([^<>]+)>(\d+)', seq):
#             motif_counter[motif] += int(count)

#     filtered = [motif for motif, count in motif_counter.items() if count >= min_count]

#     cmap = cm.get_cmap(palette, len(filtered))
#     motif_to_color = {
#         motif: mcolors.to_hex(cmap(i)) for i, motif in enumerate(filtered)
#     }

#     motif_to_color['Other'] = 'gray'
#     return motif_to_color

# def get_primary_motif(encoded):
#     """Returns the motif that contributes the most base pairs"""
#     matches = re.findall(r'<([^<>]+)>(\d+)', encoded)
#     if not matches:
#         return 'Other'
#     weighted = [(motif, len(motif) * int(count)) for motif, count in matches]
#     return max(weighted, key=lambda x: x[1])[0]

# def compute_repeat_length(encoded):
#     matches = re.findall(r'<([^<>]+)>(\d+)', encoded)
#     return sum(len(motif) * int(count) for motif, count in matches)

# def plot_interactive_waterfall_grouped(df, motif_to_color, top_n=None):
#     """
#     Interactive waterfall plot (1 bar per encodedSeq) with length + frequency info
#     """
#     # Count alleles
#     allele_counts = df['encodedSeq'].value_counts().reset_index()
#     allele_counts.columns = ['encodedSeq', 'count']
#     total = allele_counts['count'].sum()

#     # Compute repeat length and dominant motif
#     allele_counts['repeat_length'] = allele_counts['encodedSeq'].apply(compute_repeat_length)
#     allele_counts['primary_motif'] = allele_counts['encodedSeq'].apply(get_primary_motif)

#     # Filter top N by frequency if specified
#     if top_n:
#         allele_counts = allele_counts.head(top_n)

#     # Assign colors
#     allele_counts['color'] = allele_counts['primary_motif'].map(motif_to_color).fillna(motif_to_color.get('Other', 'gray'))

#     # Sort by repeat length
#     allele_counts = allele_counts.sort_values('repeat_length').reset_index(drop=True)

#     # Create hover text safely
#     allele_counts['hover'] = allele_counts.apply(
#         lambda row: (
#             f"EncodedSeq: {html.escape(row['encodedSeq'])}<br>"
#             f"Repeat length: {row['repeat_length']} bp<br>"
#             f"Allele count: {row['count']}<br>"
#             f"Frequency: {row['count'] / total:.2%}"
#         ), axis=1
#     )

#     # Plotly bar chart
#     fig = go.Figure(go.Bar(
#         x=allele_counts['repeat_length'],
#         y=allele_counts.index,
#         orientation='h',
#         marker=dict(color=allele_counts['color']),
#         hovertext=allele_counts['hover'],
#         hoverinfo='text'
#     ))

#     fig.update_layout(
#         title='Motif-Resolved Tandem Repeat Alleles',
#         xaxis_title='Repeat Length (bp)',
#         yaxis=dict(title='Alleles (ranked by length)', showticklabels=False),
#         height=25 * len(allele_counts) + 200,
#         margin=dict(t=40, l=40, r=20, b=40)
#     )

#     fig.show()



# motif_to_color = generate_motif_color_map(df['encodedSeq'], min_count=10)
# plot_interactive_waterfall_grouped(df, motif_to_color, top_n=50)
