# In [None]:
import os
import re
import pandas as pd
import plotly.express as px
import kaleido

# Load the IPA reference table. The first column is taken as the pathway name
# and the last column as a comma-separated molecule list.
ipa_df = pd.read_csv('IPA.csv')
name_col = ipa_df.columns[0]
molecules_col = ipa_df.columns[-1]

ipa_pathways = []
for _, record in ipa_df.iterrows():
    raw_molecules = record[molecules_col]
    # A missing molecule cell yields an empty gene set rather than {'NAN'}.
    gene_set = (
        set()
        if pd.isna(raw_molecules)
        else {
            token.strip().upper()
            for token in str(raw_molecules).split(',')
            if token.strip()
        }
    )
    ipa_pathways.append({'IPA_pathway': record[name_col], 'IPA_genes': gene_set})

def parse_answer_block(block):
    """Parse one blank-line-delimited answer block.

    The first line is the pathway name (a single trailing ':' is dropped).
    The second line is a comma-separated gene list, optionally prefixed with
    'Genes involved:' (matched case-insensitively). Further lines are ignored.

    Returns:
        ``(pathway_name, gene_set)`` with genes stripped and upper-cased, or
        ``None`` when the block has fewer than two lines.
    """
    lines = block.strip().splitlines()
    if len(lines) < 2:
        return None

    name = lines[0].strip()
    if name.endswith(':'):
        name = name[:-1].strip()

    gene_line = lines[1].strip()
    prefix = 'genes involved:'
    if gene_line.lower().startswith(prefix):
        gene_line = gene_line[len(prefix):].strip()

    genes = {g.strip().upper() for g in gene_line.split(',') if g.strip()}
    return name, genes


def score_gene_set(answer_genes, ipa_pathways, threshold=50):
    """Return the ``(IPA pathway, overlap %)`` pairs to report for a gene set.

    Overlap % is the share of ``answer_genes`` that also appear in an IPA
    pathway's gene set. Every pathway scoring at least ``threshold`` is
    returned, in ``ipa_pathways`` order. If none qualifies, only the single
    best-scoring pathway is returned (the first one on ties).

    With no IPA pathways to compare against, ``[(None, 0.0)]`` is returned
    instead of letting ``max()`` raise on an empty sequence.
    """
    total = len(answer_genes)
    overlaps = []
    for ipa in ipa_pathways:
        if total == 0:
            percent = 0
        else:
            shared = answer_genes.intersection(ipa['IPA_genes'])
            percent = (len(shared) / total) * 100
        overlaps.append((ipa['IPA_pathway'], percent))

    hits = [item for item in overlaps if item[1] >= threshold]
    if hits:
        return hits
    return [max(overlaps, key=lambda item: item[1], default=(None, 0.0))]


# Per answer file: a table of pathway/IPA hits, and a count of the pathways
# that were skipped because they listed fewer than 4 genes.
dfs = {}
low_counts = {}
answers_dir = 'answers'
os.makedirs(answers_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting keeps the order of
# `dfs` (and everything derived from it downstream) reproducible.
for filename in sorted(os.listdir(answers_dir)):
    if not filename.endswith('.txt'):
        continue
    filepath = os.path.join(answers_dir, filename)
    with open(filepath, 'r') as file:
        content = file.read()

    table_data = []
    low_count = 0
    # Blocks are separated by one or more blank (whitespace-only) lines.
    for block in re.split(r'\n\s*\n', content):
        parsed = parse_answer_block(block)
        if parsed is None:
            continue
        answer_pathway, answer_genes = parsed

        # Pathways with fewer than 4 genes are only counted, not scored.
        if len(answer_genes) < 4:
            low_count += 1
            continue

        genes_text = ", ".join(sorted(answer_genes))
        for hit, percent in score_gene_set(answer_genes, ipa_pathways):
            table_data.append({
                "Pathway": answer_pathway,
                "Hit": hit,
                "Overlap %": f"{percent:.2f}",
                "Original Genes": genes_text,
            })

    dfs[filename] = pd.DataFrame(table_data)
    low_counts[filename] = low_count

# Write each per-file hit table to ./validated as <answer file name>.csv.
validated_dir = './validated'
os.makedirs(validated_dir, exist_ok=True)
for filename, df in dfs.items():
    # Swap only the trailing '.txt'. str.replace would also rewrite any
    # earlier '.txt' that happens to appear inside the file name.
    stem = filename[:-len('.txt')] if filename.endswith('.txt') else filename
    output_filename = os.path.join(validated_dir, stem + '.csv')
    df.to_csv(output_filename, index=False)

def extract_config(filename):
    """Derive a configuration label from an answer file name.

    Names are matched against ``o3-<config>.txt``. The ``<config>`` part is
    split on '-', a final all-digit token is dropped, and the remaining
    tokens are joined with single spaces. Names that do not match the
    pattern give ``"unknown"``.
    """
    found = re.match(r"o3-(.*)\.txt", filename)
    if found is None:
        return "unknown"

    parts = found.group(1).split("-")
    if parts[-1].isdigit():
        parts = parts[:-1]
    return " ".join(parts)

# Pool overlap percentages per configuration and total the number of answer
# pathways that were skipped for having fewer than 4 genes.
config_dict = {}
config_low = {}
for filename, df in dfs.items():
    # Use the same key as the plots: extract_config() drops the trailing run
    # number, so repeated runs of one configuration are averaged together
    # instead of each run being reported as its own "configuration".
    config = extract_config(filename)
    values = config_dict.setdefault(config, [])
    if df.empty or "Overlap %" not in df.columns:
        # A file without any scored pathway contributes a single 0.0.
        values.append(0.0)
    else:
        for val in df["Overlap %"]:
            try:
                values.append(float(val))
            except (TypeError, ValueError):
                # Unparseable cells count as zero overlap.
                values.append(0.0)
    config_low[config] = config_low.get(config, 0) + low_counts.get(filename, 0)

config_averages = []
for config, overlaps in config_dict.items():
    avg_overlap = sum(overlaps) / len(overlaps) if overlaps else 0.0
    config_averages.append({
        "Configuration": config,
        "Average Overlap %": f"{avg_overlap:.2f}",
        "Pathways with <4 genes": config_low.get(config, 0),
    })
config_df = pd.DataFrame(config_averages)
config_df.to_csv(os.path.join(validated_dir, "average_overlap_config.csv"), index=False)

# Flatten all per-file tables into one long frame with a row per overlap
# value, labelled with the file's configuration.
data_for_box = []
for filename, df in dfs.items():
    config = extract_config(filename)
    if df.empty or "Overlap %" not in df.columns:
        # A file without any scored pathway is represented by a single 0.0.
        data_for_box.append({"Configuration": config, "Overlap %": 0.0})
        continue
    for val in df["Overlap %"]:
        try:
            num_val = float(val)
        except (TypeError, ValueError):
            # Unparseable cells count as zero overlap.
            num_val = 0.0
        data_for_box.append({"Configuration": config, "Overlap %": num_val})

# Name the columns explicitly so they exist even when no answer files were
# found; otherwise the frame has no columns for later code to reference.
df_box = pd.DataFrame(data_for_box, columns=["Configuration", "Overlap %"])

# Box plot of every individual overlap value, one box per configuration.
# The figure-update methods are chained; the result is bound to `fig`.
fig = (
    px.box(
        data_frame=df_box,
        x="Configuration",
        y="Overlap %",
        color="Configuration",
        points="all",  # draw every observation, not only outliers
        template="plotly_white",
    )
    .update_traces(boxmean=True, pointpos=0, jitter=0.3)
    .update_layout(
        autosize=False,
        width=1200,
        height=600,
        title=dict(
            text="Distribution of Overlap % per Configuration",
            x=0.5,
            xanchor='center',
        ),
        xaxis_title="Configuration",
        yaxis_title="Overlap %",
        showlegend=True,
        font=dict(family="Arial", size=15, color="Black"),
        margin=dict(b=120),
    )
    # NOTE(review): upper bound 101 presumably leaves headroom so points at
    # exactly 100 are not clipped — confirm.
    .update_yaxes(range=[0, 101], tick0=0, dtick=10)
    .update_xaxes(tickangle=0, automargin=True)
)

# Compute the average overlap per run (one row per answer file).
run_data = []
for filename, df in dfs.items():
    config = extract_config(filename)
    if df.empty or "Overlap %" not in df.columns:
        # A file without any scored pathway averages to 0.0.
        avg_overlap = 0.0
    else:
        try:
            avg_overlap = df["Overlap %"].astype(float).mean()
        except (TypeError, ValueError):
            # A column that cannot be converted to float counts as 0.0.
            avg_overlap = 0.0
    run_data.append({"Run": filename, "Configuration": config, "Average Overlap %": avg_overlap})

# Name the columns explicitly so they exist even when no answer files were
# found; otherwise the frame has no columns for later code to reference.
run_df = pd.DataFrame(run_data, columns=["Run", "Configuration", "Average Overlap %"])

# Box plot of the per-run average overlap, one box per configuration.
# The figure-update methods are chained; the result is bound to `fig_run`.
fig_run = (
    px.box(
        data_frame=run_df,
        x="Configuration",
        y="Average Overlap %",
        color="Configuration",
        points="all",  # draw every run, not only outliers
        template="plotly_white",
    )
    .update_traces(boxmean=True, pointpos=0, jitter=0.3)
    .update_layout(
        autosize=False,
        width=1200,
        height=600,
        title=dict(
            text="Average Overlap % per Run",
            x=0.5,
            xanchor='center',
        ),
        xaxis_title="Configuration",
        yaxis_title="Average Overlap %",
        showlegend=True,
        font=dict(family="Arial", size=15, color="Black"),
        margin=dict(b=120),
    )
    # NOTE(review): upper bound 101 presumably leaves headroom so points at
    # exactly 100 are not clipped — confirm.
    .update_yaxes(range=[0, 101], tick0=0, dtick=10)
    .update_xaxes(tickangle=0, automargin=True)
)

# Save both figures as PNG under ./PNG, then display them.
png_dir = "./PNG"
os.makedirs(png_dir, exist_ok=True)

# Write both images before showing anything, in the same order as the figures
# were built.
for figure, png_name in (
    (fig, "overlap_distribution_config.png"),
    (fig_run, "average_overlap_per_run.png"),
):
    figure.write_image(os.path.join(png_dir, png_name))

for figure in (fig, fig_run):
    figure.show()
