In [None]:
import pandas as pd
import numpy as np
import plotly.express as plt
import seaborn as sns

In [None]:
matched_spectra_df = pd.read_csv("results/matched_spectra_df.csv")
matched_spectra_df['scoring_function'] = matched_spectra_df['scoring_function'].replace('optimize_q_wrapper', 'Andromeda')
matched_spectra_df['scoring_function'] = matched_spectra_df['scoring_function'].replace('simple_scoring_function', 'Simple Scoring')
matched_spectra_df['experiment_name'] = matched_spectra_df['experiment_name'].str.replace('new_', '')
matched_spectra_df.head()


# FDR calculation

In [None]:
from utils import calculate_fdr
fdr_df = calculate_fdr(matched_spectra_df)
fdr_df.head()

# Pie chart for target database showing dropped identifications for FDR < 0.01

In [24]:
# across all three experiments: two pie charts comparing the simple scoring with andromeda
# only consider the target database and use plotly

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Assuming fdr_df is already loaded
# If not, uncomment the following line and provide the correct path
# fdr_df = pd.read_csv("results/fdr_results.csv")

# 1. Filter for target identifications only
target_df = fdr_df[fdr_df['is_target'] == True].copy()

# 2. Create a new column for FDR threshold
target_df['passes_fdr'] = target_df['fdr'] <= 0.01

# 3. Get unique scoring functions and experiments
scoring_functions = target_df['scoring_function'].unique()
experiments = target_df['experiment_name'].unique()

# =====================================================
# First set of charts: Overall comparison by scoring function
# =====================================================

# Create a figure with subplots
fig1 = make_subplots(rows=1, cols=len(scoring_functions),
                    specs=[[{'type':'domain'} for _ in scoring_functions]],
                    subplot_titles=[f"Scoring Function: {func}" for func in scoring_functions])

# Add a pie chart for each scoring function
for i, func in enumerate(scoring_functions):
    func_data = target_df[target_df['scoring_function'] == func]

    # Count passes vs. fails
    counts = func_data['passes_fdr'].value_counts()
    labels = ["FDR <= 0.01", "FDR > 0.01"]

    # Handle case where all pass or all fail
    if len(counts) < 2:
        if True in counts:
            values = [counts[True], 0]
        else:
            values = [0, counts[False]]
    else:
        values = [counts[True], counts[False]]

    fig1.add_trace(
        go.Pie(labels=labels, values=values, name=func),
        row=1, col=i+1
    )

    # Customize pie chart appearance
    fig1.update_traces(
        hoverinfo='label+percent+value',
        textinfo='value+percent',
        textfont_size=14,
        marker=dict(colors=['#2ca02c', '#d62728'], line=dict(color='#000000', width=2)),
        row=1, col=i+1
    )

# Update layout with more space for titles
fig1.update_layout(
    title_text="Peptide Identifications by Scoring Function (Target Database, FDR <= 0.01)",
    height=500,
    width=800 * len(scoring_functions) // 2,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.1,
        xanchor="center",
        x=0.5
    ),
    margin=dict(t=120)  # Add top margin
)

# Adjust annotations (subplot titles) to be higher above the pies
for annotation in fig1.layout.annotations:
    annotation.y = annotation.y + 0.1  # Move title up


# Display the figures
fig1.show()

fig1.write_image("results/target_database_fdr_pie_charts_total.png")


In [28]:
# =====================================================
# Second set of charts: 2x3 grid with scoring functions as rows and experiments as columns
# =====================================================

# Prepare data for pie charts using cumulative threshold logic
results = []
for (experiment, scoring_func), group in fdr_df.groupby(['experiment_name', 'scoring_function']):
    # Find the threshold where FDR <= 0.01
    valid_fdr = group[group['fdr'] <= 0.01]

    if valid_fdr.empty:
        passing_targets = 0
    else:
        threshold_idx = valid_fdr.index.max()
        passing_targets = int(group.loc[threshold_idx, 'cumulative_targets'])

    total_targets = group['is_target'].sum()
    failing_targets = total_targets - passing_targets

    results.append({
        'experiment': experiment,
        'scoring_function': scoring_func,
        'passing': passing_targets,
        'failing': failing_targets
    })

pie_data = pd.DataFrame(results)
# Create subplot grid
scoring_functions = pie_data['scoring_function'].unique()
experiments = pie_data['experiment'].unique()

rows = len(scoring_functions)
cols = len(experiments)

fig2 = make_subplots(
    rows=rows,
    cols=cols,
    specs=[[{'type':'domain'} for _ in range(cols)] for _ in range(rows)],
    vertical_spacing=0.2,
    horizontal_spacing=0.05
)

# Add pie charts to grid
for i, scoring_func in enumerate(scoring_functions):
    for j, exp in enumerate(experiments):
        # Get data for this combination
        data = pie_data[(pie_data['scoring_function'] == scoring_func) &
                        (pie_data['experiment'] == exp)]

        if data.empty:
            continue

        passing = data['passing'].values[0]
        failing = data['failing'].values[0]

        fig2.add_trace(
            go.Pie(
                labels=['Passing (FDR â‰¤ 0.01)', 'Failing'],
                values=[passing, failing],
                textinfo='value+percent',
                marker=dict(colors=['#2ca02c', '#d62728']),
            ),
            row=i+1, col=j+1
        )

# Update layout with more space for titles
fig2.update_layout(
    title_text="Peptide Identifications by Scoring Function and Experiment (Target Database, FDR Threshold 0.01)",
    height=400 * rows,
    width=400 * cols,
    showlegend=True,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.1,
        xanchor="center",
        x=0.5
    ),
    margin=dict(t=50, b=50, l=50, r=50),  # Add margins
    grid=dict(rows=rows, columns=cols, pattern="independent")  # Ensure independent subplots
)

# Add annotations for subplot titles
annotations = []
for i, scoring_func in enumerate(scoring_functions):
    for j, exp in enumerate(experiments):
        annotations.append(dict(
            text=f"{scoring_func}<br>{exp}",
            x=0.5,
            y=1.1,
            xref=f"x{j+1} domain",
            yref=f"y{i+1} domain",
            showarrow=False,
            font=dict(size=12)
        ))

fig2.update_layout(annotations=annotations)

# Display the figure
fig2.show()

ValueError: 
    Invalid value of type 'builtins.str' received for the 'xref' property of layout.annotation
        Received value: 'x1 domain'

    The 'xref' property is an enumeration that may be specified as:
      - One of the following enumeration values:
            ['paper']
      - A string that matches one of the following regular expressions:
            ['^x([2-9]|[1-9][0-9]+)?( domain)?$']

# Density plot for match scores

In [None]:
# Density plot for match scores
from visualization import SpectrumVisualizer
total_sv = SpectrumVisualizer(matched_spectra_df)
kde_plot = total_sv.plot_score_distributions("Match Score Score Distributions by Scoring Function and Experiment", custom_xlims={'Andromeda': (-40, 10), 'Simple Scoring': (-5, 0.5)}, custom_ylims={'Andromeda': (0, 0.2), 'Simple Scoring': (0, 1)})

In [20]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Define constants
FDR_THRESHOLDS = [0.01, 0.1, 0.25]
SCORING_FUNCTIONS = ['Andromeda', 'Simple Scoring']
EXPERIMENTS = list(fdr_df['experiment_name'].unique())

# Initialize subplot grid
fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=[f"{exp}<br>{sf}" for sf in SCORING_FUNCTIONS for exp in EXPERIMENTS],
    vertical_spacing=0.15,
    horizontal_spacing=0.1,
    specs=[[{"type": "bar"}, {"type": "bar"}, {"type": "bar"}],
           [{"type": "bar"}, {"type": "bar"}, {"type": "bar"}]]
)

# Create mapping for subplot positions
subplot_positions = {
    (sf, exp): (row, col)
    for row, sf in enumerate(SCORING_FUNCTIONS, 1)
    for col, exp in enumerate(EXPERIMENTS, 1)
}

# Process data and create plots
for (scoring_func, experiment), group_df in fdr_df[fdr_df['is_target'] == True].groupby(['scoring_function', 'experiment_name']):
    print(len(group_df))
    # Sort by achieved_fdr ascending (best thresholds first)
    sorted_df = group_df.sort_values('fdr', ascending=True)

    # Calculate hits at each FDR threshold
    hit_counts = []
    for threshold in FDR_THRESHOLDS:
        valid = sorted_df[sorted_df['fdr'] <= threshold]
        hit_counts.append(valid['cumulative_targets'].max() if not valid.empty else 0)

    # Get subplot position
    row, col = subplot_positions.get((scoring_func, experiment), (1, 1))

    # Create bar trace
    fig.add_trace(
        go.Bar(
            x=[f'{t*100}%' for t in FDR_THRESHOLDS],
            y=hit_counts,
            name=f'{experiment} - {scoring_func}',
            text=hit_counts,
            textposition='auto'
        ),
        row=row,
        col=col
    )

# Update layout
fig.update_layout(
    height=800,
    width=1200,
    title_text="FDR-Calibrated Hit Counts by Experiment and Scoring Function",
    showlegend=False,
    margin=dict(t=100)
)

# Add axis labels
for row in [1, 2]:
    for col in [1, 2, 3]:
        fig.update_xaxes(title_text="FDR Threshold", row=row, col=col)
        fig.update_yaxes(title_text="Number of Identifications", row=row, col=col)

fig.show()

40217
47211
46431
47211
