In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [None]:
# Per-experiment ("unmerged") peak tables, read by the cells that reference
# `atac_unmerged` / `cage_unmerged`.
atac_unmerged = pd.read_parquet("encode_data/experiments/atac_gc.parquet")
cage_unmerged = pd.read_parquet("encode_data/experiments/cage_gc.parquet")

# Cross-experiment ("merged") peak tables. Later cells reference
# `all_atac_merged` and `all_cage_merged`, so both must be loaded here for the
# notebook to survive Restart Kernel -> Run All.
all_atac_merged = pd.read_parquet("encode_data/all_ATAC-seq_merged.parquet")
all_cage_merged = pd.read_parquet("encode_data/all_CAGE_merged.parquet")

## Figures from Unmerged ENCODE Experiment Data
The following figures were generated by extending (10 kbp) and then merging overlapping or adjacent peaks for each individual experiment downloaded from ENCODE.

In [None]:
# Pie chart of system slim terms across the unmerged ATAC-seq and CAGE tables.
# Each `system_slims` cell is treated as a list-like of terms, and every term
# occurrence is counted once.
combined_slims = pd.concat(
    [atac_unmerged['system_slims'], cage_unmerged['system_slims']],
    ignore_index=True,
)

# `explode` yields one row per term. Empty list-likes and null cells come out
# as missing values and are removed by `dropna`, so a null cell is skipped
# instead of raising `TypeError` from `len(None)`.
all_system_slims = combined_slims.explode().dropna()

slim_counts = (
    all_system_slims.value_counts()
    .rename_axis('System')
    .reset_index(name='Count')
)

fig = px.pie(
    slim_counts,
    values='Count',
    names='System',
    title='Distribution of Systems',
    color_discrete_sequence=px.colors.qualitative.G10,
)

fig.update_layout(
    title_font=dict(size=22),
    legend_title_font=dict(size=16),
    legend_font=dict(size=14),
    margin=dict(t=80, b=20, l=20, r=20)
)

# Put "<percent> <label>" inside each slice in white text.
fig.update_traces(
    textposition='inside',
    textinfo='percent+label',
    textfont=dict(size=14, color='white')
)

# fig.write_image("figures/all_exps/system_slims_pie_chart.png", width=1200, height=800, scale=2)
fig.show()

In [None]:
# Pie chart of organ slim terms across the unmerged ATAC-seq and CAGE tables.
# Each `organ_slims` cell is treated as a list-like of terms, and every term
# occurrence is counted once.
combined_slims = pd.concat(
    [atac_unmerged['organ_slims'], cage_unmerged['organ_slims']],
    ignore_index=True,
)

# `explode` yields one row per term. Empty list-likes and null cells come out
# as missing values and are removed by `dropna`, so a null cell is skipped
# instead of raising `TypeError` from `len(None)`.
all_organ_slims = combined_slims.explode().dropna()

slim_counts = (
    all_organ_slims.value_counts()
    .rename_axis('Organ')
    .reset_index(name='Count')
)

fig = px.pie(
    slim_counts,
    values='Count',
    names='Organ',
    title='Distribution of Organs',
    color_discrete_sequence=px.colors.qualitative.G10,
)

fig.update_layout(
    title_font=dict(size=22),
    legend_title_font=dict(size=16),
    legend_font=dict(size=14),
    margin=dict(t=80, b=20, l=20, r=20)
)

# Put "<percent> <label>" inside each slice in white text.
fig.update_traces(
    textposition='inside',
    textinfo='percent+label',
    textfont=dict(size=14, color='white')
)

# fig.write_image("figures/all_exps/organ_slims_pie_chart.png", width=1200, height=800, scale=2)
fig.show()

In [None]:
# Pie chart of cell slim terms across the unmerged ATAC-seq and CAGE tables.
# Each `cell_slims` cell is treated as a list-like of terms, and every term
# occurrence is counted once.
combined_slims = pd.concat(
    [atac_unmerged['cell_slims'], cage_unmerged['cell_slims']],
    ignore_index=True,
)

# `explode` yields one row per term. Empty list-likes and null cells come out
# as missing values and are removed by `dropna`, so a null cell is skipped
# instead of raising `TypeError` from `len(None)`.
all_cell_slims = combined_slims.explode().dropna()

slim_counts = (
    all_cell_slims.value_counts()
    .rename_axis('Cell')
    .reset_index(name='Count')
)

fig = px.pie(
    slim_counts,
    values='Count',
    names='Cell',
    title='Distribution of Cells',
    color_discrete_sequence=px.colors.qualitative.G10,
)

fig.update_layout(
    title_font=dict(size=22),
    legend_title_font=dict(size=16),
    legend_font=dict(size=14),
    margin=dict(t=80, b=20, l=20, r=20)
)

# Put "<percent> <label>" inside each slice in white text.
fig.update_traces(
    textposition='inside',
    textinfo='percent+label',
    textfont=dict(size=14, color='white')
)

# fig.write_image("figures/all_exps/cell_slims_pie_chart.png", width=1200, height=800, scale=2)
fig.show()

In [None]:
# NOTE(review): this cell draws the same 'Distribution of Systems' pie chart
# as the first figure cell of this section; only the commented-out export
# filename below differs. Consider keeping just one of the two.
combined_slims = pd.concat(
    [atac_unmerged['system_slims'], cage_unmerged['system_slims']],
    ignore_index=True,
)

# `explode` yields one row per term. Empty list-likes and null cells come out
# as missing values and are removed by `dropna`, so a null cell is skipped
# instead of raising `TypeError` from `len(None)`.
all_system_slims = combined_slims.explode().dropna()

slim_counts = (
    all_system_slims.value_counts()
    .rename_axis('System')
    .reset_index(name='Count')
)

fig = px.pie(
    slim_counts,
    values='Count',
    names='System',
    title='Distribution of Systems',
    color_discrete_sequence=px.colors.qualitative.G10,
)

fig.update_layout(
    title_font=dict(size=22),
    legend_title_font=dict(size=16),
    legend_font=dict(size=14),
    margin=dict(t=80, b=20, l=20, r=20)
)

# Put "<percent> <label>" inside each slice in white text.
fig.update_traces(
    textposition='inside',
    textinfo='percent+label',
    textfont=dict(size=14, color='white')
)

# fig.write_image("figures/all_exps/systems_pie_chart.png", width=1200, height=800, scale=2)
fig.show()

In [None]:
# Pie chart of biosample classification across the unmerged ATAC-seq and CAGE
# tables; each row contributes one count for its `biosample_class` value.
combined_biosample_class = pd.concat(
    [atac_unmerged['biosample_class'], cage_unmerged['biosample_class']]
)

biosample_class_counts = (
    combined_biosample_class.value_counts()
    .rename_axis('Biosample Classification')
    .reset_index(name='Count')
)

fig = px.pie(
    biosample_class_counts,
    values='Count',
    names='Biosample Classification',
    title='Distribution of Biosample Classification',
    color_discrete_sequence=px.colors.qualitative.G10,
)

fig.update_layout(
    title_font=dict(size=22),
    legend_title_font=dict(size=16),
    legend_font=dict(size=14),
    margin=dict(t=80, b=20, l=20, r=20)
)

# Put "<percent> <label>" inside each slice in white text.
fig.update_traces(
    textposition='inside',
    textinfo='percent+label',
    textfont=dict(size=14, color='white')
)

# fig.write_image("figures/all_exps/biosample_class_pie_chart.png", width=1200, height=800, scale=2)
fig.show()

In [None]:
# Histogram of region lengths (end - start) over the unmerged ATAC-seq and
# CAGE tables, annotated with summary statistics.
#
# The `length` column is written onto both frames (re-running the cell just
# recomputes the same values); the nucleotide-count pie chart cell that
# follows reads it.
atac_unmerged["length"] = atac_unmerged["end"] - atac_unmerged["start"]
cage_unmerged["length"] = cage_unmerged["end"] - cage_unmerged["start"]
combined_lengths = pd.concat([atac_unmerged["length"], cage_unmerged["length"]])

mean_length = combined_lengths.mean()
median_length = combined_lengths.median()
q1 = combined_lengths.quantile(0.25)
q3 = combined_lengths.quantile(0.75)
# Computed once and reused for both the bin range and the bin width.
max_length = combined_lengths.max()

fig = go.Figure()
fig.add_trace(
    go.Histogram(
        x=combined_lengths,
        name="Sequence Length",
        marker=dict(
            color="#6236FF",
            line=dict(color="white", width=0.5)
        ),
        opacity=0.8,
        # Bins start at 0, are 1/50 of the maximum wide, and the binned range
        # extends 5% past the maximum.
        xbins=dict(
            start=0,
            end=max_length * 1.05,
            size=max_length / 50
        )
    )
)

fig.update_layout(
    title={
        'text': "Sequence Length Distribution",
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=22)
    },
    xaxis_title="Length (bp)",
    yaxis_title="Count",
    bargap=0.1,
    margin=dict(t=100, b=50, l=50, r=50),
    height=600,
    width=1000,
    template="plotly_white"
)

# Dashed red line at the median, with an arrow label pointing at it.
fig.add_vline(x=median_length, line_width=2, line_dash="dash", line_color="red")

fig.add_annotation(
    x=median_length,
    y=0.95,
    xref="x",
    yref="paper",
    text=f"Median: {median_length:.0f} bp",
    showarrow=True,
    arrowhead=1,
    ax=0,
    ay=-40
)

# Summary box, positioned in paper coordinates near the top-right corner.
fig.add_annotation(
    x=0.95,
    y=0.95,
    xref="paper",
    yref="paper",
    text=f"Mean: {mean_length:.0f} bp<br>Median: {median_length:.0f} bp<br>Q1: {q1:.0f} bp<br>Q3: {q3:.0f} bp",
    showarrow=False,
    bgcolor="rgba(255, 255, 255, 0.8)",
    bordercolor="black",
    borderwidth=1,
    borderpad=4
)

# fig.write_image("figures/all_exps/seq_length_histogram.png", width=1000, height=600, scale=2)
fig.show()

In [None]:
# Pie chart of total base pairs per assay for the unmerged tables.
# Reads the `length` column, so it must already exist on both frames.
all_atac_with_length = atac_unmerged.loc[:, ['assay', 'length']]
all_cage_with_length = cage_unmerged.loc[:, ['assay', 'length']]
combined_df = pd.concat((all_atac_with_length, all_cage_with_length))

# One row per assay holding the summed region length.
length_by_assay = (
    combined_df.groupby('assay')['length']
    .sum()
    .rename_axis('Assay Type')
    .reset_index(name='Total Length')
)

# Slice/legend label: assay name plus its thousands-separated total.
length_by_assay['Custom Label'] = length_by_assay.apply(
    lambda rec: "{} ({:,} bp)".format(rec['Assay Type'], rec['Total Length']),
    axis=1,
)

total_length = length_by_assay['Total Length'].sum()

fig = px.pie(
    data_frame=length_by_assay,
    names='Custom Label',
    values='Total Length',
    title=f'Number of Nucleotides by Assay Type (Total: {total_length:,} bp)',
    color_discrete_sequence=px.colors.qualitative.G10,
)

fig.update_layout(
    title_font={'size': 22},
    legend_title_font={'size': 16},
    legend_font={'size': 14},
    margin={'t': 80, 'b': 20, 'l': 20, 'r': 20},
)

# Percentage and label are drawn inside each slice in white text.
fig.update_traces(
    textinfo='percent+label',
    textposition='inside',
    textfont={'size': 14, 'color': 'white'},
)

# fig.write_image("figures/all_exps/nucleotides_pie_chart.png", width=1200, height=800, scale=2)
fig.show()

In [None]:
# Histogram of `gc_content` over the unmerged ATAC-seq and CAGE tables,
# annotated with mean, median and quartiles.
# NOTE(review): the axis title and the "%" labels assume `gc_content` is stored
# on a 0-100 scale - confirm against the parquet schema.
combined_gc = pd.concat([atac_unmerged["gc_content"], cage_unmerged["gc_content"]])

mean_gc, median_gc = combined_gc.mean(), combined_gc.median()
q1, q3 = (combined_gc.quantile(p) for p in (0.25, 0.75))
gc_max = combined_gc.max()

gc_histogram = go.Histogram(
    x=combined_gc,
    name="GC Content",
    marker={"color": "#6236FF", "line": {"color": "white", "width": 0.5}},
    opacity=0.8,
    # Bins start at 0, are 1/50 of the maximum wide, and the binned range
    # extends 5% past the maximum.
    xbins={"start": 0, "end": gc_max * 1.05, "size": gc_max / 50},
)
fig = go.Figure(data=[gc_histogram])

fig.update_layout(
    title=dict(
        text="GC Content Distribution",
        x=0.5,
        y=0.95,
        xanchor='center',
        yanchor='top',
        font={"size": 22},
    ),
    xaxis_title="GC Content (%)",
    yaxis_title="Count",
    bargap=0.1,
    margin={"t": 100, "b": 50, "l": 50, "r": 50},
    width=1000,
    height=600,
    template="plotly_white",
)

# Dashed red line at the median, with an arrow label pointing at it.
fig.add_vline(x=median_gc, line_width=2, line_dash="dash", line_color="red")
fig.add_annotation(
    x=median_gc,
    y=0.95,
    xref="x",
    yref="paper",
    text=f"Median: {median_gc:.0f}%",
    showarrow=True,
    arrowhead=1,
    ax=0,
    ay=-40,
)

# Summary box, positioned in paper coordinates near the top-right corner.
stats_lines = [
    f"Mean: {mean_gc:.0f}%",
    f"Median: {median_gc:.0f}%",
    f"Q1: {q1:.0f}%",
    f"Q3: {q3:.0f}%",
]
fig.add_annotation(
    x=0.95,
    y=0.95,
    xref="paper",
    yref="paper",
    text="<br>".join(stats_lines),
    showarrow=False,
    bgcolor="rgba(255, 255, 255, 0.8)",
    bordercolor="black",
    borderwidth=1,
    borderpad=4,
)

# fig.write_image("figures/all_exps/gc_content_histogram.png", width=1000, height=600, scale=2)
fig.show()

## Figures from Merged ENCODE Experiment Data
The following figures were generated by first extending (10 kbp) and then merging overlapping or adjacent peaks from all downloaded ENCODE experiments for each assay.

In [None]:
# Pie chart of total base pairs per assay for the merged tables.
# Requires `all_atac_merged` and `all_cage_merged` to be loaded beforehand,
# each with `assay` and `length` columns.
all_atac_with_length = all_atac_merged.loc[:, ['assay', 'length']]
all_cage_with_length = all_cage_merged.loc[:, ['assay', 'length']]
combined_df = pd.concat((all_atac_with_length, all_cage_with_length))

# One row per assay holding the summed region length.
length_by_assay = (
    combined_df.groupby('assay')['length']
    .sum()
    .rename_axis('Assay Type')
    .reset_index(name='Total Length')
)

# Slice/legend label: assay name plus its thousands-separated total.
length_by_assay['Custom Label'] = length_by_assay.apply(
    lambda rec: "{} ({:,} bp)".format(rec['Assay Type'], rec['Total Length']),
    axis=1,
)

total_length = length_by_assay['Total Length'].sum()

fig = px.pie(
    data_frame=length_by_assay,
    names='Custom Label',
    values='Total Length',
    title=f'Number of Nucleotides by Assay Type (Total: {total_length:,} bp)',
    color_discrete_sequence=px.colors.qualitative.G10,
)

fig.update_layout(
    title_font={'size': 22},
    legend_title_font={'size': 16},
    legend_font={'size': 14},
    margin={'t': 80, 'b': 20, 'l': 20, 'r': 20},
)

# Percentage and label are drawn inside each slice in white text.
fig.update_traces(
    textinfo='percent+label',
    textposition='inside',
    textfont={'size': 14, 'color': 'white'},
)

# fig.write_image("figures/merged_exps/nucleotides_pie_chart.png", width=1200, height=800, scale=2)
fig.show()

In [None]:
# Pie chart of how many merged rows belong to each assay; the title reports
# the summed count over all assays.
combined_assay = pd.concat((all_atac_merged['assay'], all_cage_merged['assay']))

assay_counts = (
    combined_assay.value_counts()
    .rename_axis('Assay Type')
    .reset_index(name='Count')
)
total_count = assay_counts['Count'].sum()

fig = px.pie(
    data_frame=assay_counts,
    names='Assay Type',
    values='Count',
    title=f'Distribution of Assay Types (Total: {total_count} sequences)',
    color_discrete_sequence=px.colors.qualitative.G10,
)

fig.update_layout(
    title_font={'size': 22},
    legend_title_font={'size': 16},
    legend_font={'size': 14},
    margin={'t': 80, 'b': 20, 'l': 20, 'r': 20},
)

# Percentage and label are drawn inside each slice in white text.
fig.update_traces(
    textinfo='percent+label',
    textposition='inside',
    textfont={'size': 14, 'color': 'white'},
)

# fig.write_image("figures/merged_exps/assay_types_pie_chart.png", width=1200, height=800, scale=2)
fig.show()

In [None]:
# Histogram of the `length` column over the merged ATAC-seq and CAGE tables,
# annotated with summary statistics.
combined_lengths = pd.concat([all_atac_merged["length"], all_cage_merged["length"]])

mean_length = combined_lengths.mean()
median_length = combined_lengths.median()
q1 = combined_lengths.quantile(0.25)
q3 = combined_lengths.quantile(0.75)
# Computed once and reused for both the bin range and the bin width.
max_length = combined_lengths.max()

fig = go.Figure()
fig.add_trace(
    go.Histogram(
        x=combined_lengths,
        name="Sequence Length",
        marker=dict(
            color="#6236FF",
            line=dict(color="white", width=0.5)
        ),
        opacity=0.8,
        # Bins start at 0, are 1/50 of the maximum wide, and the binned range
        # extends 5% past the maximum.
        xbins=dict(
            start=0,
            end=max_length * 1.05,
            size=max_length / 50
        )
    )
)

fig.update_layout(
    title={
        'text': "Sequence Length Distribution",
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=22)
    },
    xaxis_title="Length (bp)",
    yaxis_title="Count",
    bargap=0.1,
    margin=dict(t=100, b=50, l=50, r=50),
    height=600,
    width=1000,
    template="plotly_white"
)

# Dashed red line at the median, with an arrow label pointing at it.
fig.add_vline(x=median_length, line_width=2, line_dash="dash", line_color="red")

fig.add_annotation(
    x=median_length,
    y=0.95,
    xref="x",
    yref="paper",
    text=f"Median: {median_length:.0f} bp",
    showarrow=True,
    arrowhead=1,
    ax=0,
    ay=-40
)

# Summary box, positioned in paper coordinates near the top-right corner.
fig.add_annotation(
    x=0.95,
    y=0.95,
    xref="paper",
    yref="paper",
    text=f"Mean: {mean_length:.0f} bp<br>Median: {median_length:.0f} bp<br>Q1: {q1:.0f} bp<br>Q3: {q3:.0f} bp",
    showarrow=False,
    bgcolor="rgba(255, 255, 255, 0.8)",
    bordercolor="black",
    borderwidth=1,
    borderpad=4
)

# fig.write_image("figures/merged_exps/seq_length_histogram.png", width=1000, height=600, scale=2)
fig.show()

In [None]:
# Histogram of `gc_content` over the merged ATAC-seq and CAGE tables,
# annotated with mean, median and quartiles.
# NOTE(review): the axis title and the "%" labels assume `gc_content` is stored
# on a 0-100 scale - confirm against the parquet schema.
combined_gc = pd.concat([all_atac_merged["gc_content"], all_cage_merged["gc_content"]])

mean_gc, median_gc = combined_gc.mean(), combined_gc.median()
q1, q3 = (combined_gc.quantile(p) for p in (0.25, 0.75))
gc_max = combined_gc.max()

gc_histogram = go.Histogram(
    x=combined_gc,
    name="GC Content",
    marker={"color": "#6236FF", "line": {"color": "white", "width": 0.5}},
    opacity=0.8,
    # Bins start at 0, are 1/50 of the maximum wide, and the binned range
    # extends 5% past the maximum.
    xbins={"start": 0, "end": gc_max * 1.05, "size": gc_max / 50},
)
fig = go.Figure(data=[gc_histogram])

fig.update_layout(
    title=dict(
        text="GC Content Distribution",
        x=0.5,
        y=0.95,
        xanchor='center',
        yanchor='top',
        font={"size": 22},
    ),
    xaxis_title="GC Content (%)",
    yaxis_title="Count",
    bargap=0.1,
    margin={"t": 100, "b": 50, "l": 50, "r": 50},
    width=1000,
    height=600,
    template="plotly_white",
)

# Dashed red line at the median, with an arrow label pointing at it.
fig.add_vline(x=median_gc, line_width=2, line_dash="dash", line_color="red")
fig.add_annotation(
    x=median_gc,
    y=0.95,
    xref="x",
    yref="paper",
    text=f"Median: {median_gc:.0f}%",
    showarrow=True,
    arrowhead=1,
    ax=0,
    ay=-40,
)

# Summary box, positioned in paper coordinates near the top-right corner.
stats_lines = [
    f"Mean: {mean_gc:.0f}%",
    f"Median: {median_gc:.0f}%",
    f"Q1: {q1:.0f}%",
    f"Q3: {q3:.0f}%",
]
fig.add_annotation(
    x=0.95,
    y=0.95,
    xref="paper",
    yref="paper",
    text="<br>".join(stats_lines),
    showarrow=False,
    bgcolor="rgba(255, 255, 255, 0.8)",
    bordercolor="black",
    borderwidth=1,
    borderpad=4,
)

# fig.write_image("figures/merged_exps/gc_content_histogram.png", width=1000, height=600, scale=2)
fig.show()

In [None]:
# Histogram of how many UniProt IDs each merged region carries. Only values at
# or below the 95th percentile are plotted; the stats box reports figures for
# both the plotted subset and the full dataset.


def _count_uniprot_ids(ids):
    """Return the number of IDs in `ids`, or 0 if it is not a non-empty NumPy array."""
    return len(ids) if isinstance(ids, np.ndarray) and ids.size > 0 else 0


all_atac_counts = all_atac_merged["uniprot_ids"].apply(_count_uniprot_ids)
all_cage_counts = all_cage_merged["uniprot_ids"].apply(_count_uniprot_ids)

combined_counts = pd.concat([all_atac_counts, all_cage_counts])

# Full-dataset statistics.
mean_counts = combined_counts.mean()
median_counts = combined_counts.median()
p95 = combined_counts.quantile(0.95)
max_count = combined_counts.max()

# Statistics of the plotted subset (values <= 95th percentile). Q1/Q3 are taken
# from this subset so that every number listed under "Displayed Data" describes
# the same data.
filtered_counts = combined_counts[combined_counts <= p95]

filtered_mean = filtered_counts.mean()
filtered_median = filtered_counts.median()
filtered_q1 = filtered_counts.quantile(0.25)
filtered_q3 = filtered_counts.quantile(0.75)

fig = go.Figure()
fig.add_trace(
    go.Histogram(
        x=filtered_counts,
        name="UniProt ID Counts",
        marker=dict(
            color="#6236FF",
            line=dict(color="white", width=0.5)
        ),
        opacity=0.8,
        # Integer bin width of at least 1; the binned range extends 5% past
        # the 95th percentile.
        xbins=dict(
            start=0,
            end=p95 * 1.05,
            size=max(1, int(p95 / 50))
        )
    )
)

fig.update_layout(
    title={
        'text': "Distribution of UniProt IDs per Region",
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(size=22)
    },
    xaxis_title="Number of UniProt IDs",
    yaxis_title="Count",
    bargap=0.1,
    margin=dict(t=100, b=50, l=50, r=50),
    height=600,
    width=1000,
    template="plotly_white"
)

# The dashed line and its arrow label mark the median of the full dataset.
fig.add_vline(x=median_counts, line_width=2, line_dash="dash", line_color="red")
fig.add_annotation(
    x=median_counts,
    y=0.95,
    xref="x",
    yref="paper",
    text=f"Median: {median_counts:.1f}",
    showarrow=True,
    arrowhead=1,
    ax=0,
    ay=-40
)

# Summary box, positioned in paper coordinates near the top-right corner.
fig.add_annotation(
    x=0.95,
    y=0.95,
    xref="paper",
    yref="paper",
    text="Displayed Data (≤ 95th percentile):<br>" +
         f"Mean: {filtered_mean:.1f}<br>" +
         f"Median: {filtered_median:.1f}<br>" +
         f"Q1: {filtered_q1:.1f}<br>Q3: {filtered_q3:.1f}<br>" +
         "<br>Full Dataset:<br>" +
         f"95th percentile: {p95:.1f}<br>" +
         f"Max value: {max_count}<br>" +
         f"Mean: {mean_counts:.1f}",
    showarrow=False,
    bgcolor="rgba(255, 255, 255, 0.8)",
    bordercolor="black",
    borderwidth=1,
    borderpad=4
)


# fig.write_image("figures/merged_exps/uniprot_ids_count_histogram.png", width=1000, height=600, scale=2)
fig.show()