In [32]:
import os
import re
import pandas as pd

# -------------------------
# Step 1: Load and prepare IPA data from IPA.csv
# -------------------------
# Reads the IPA export and builds `ipa_pathways`: one dict per CSV row with
#   'IPA_pathway' -> value of the first column (pathway name)
#   'IPA_genes'   -> set of upper-cased, comma-separated symbols from the last column
# NOTE(review): the recorded output of this cell lists only 'Unnamed: N' column
# names, which suggests the first line of IPA.csv is not the real header row —
# confirm whether read_csv needs `skiprows=`/`header=` here.
ipa_df = pd.read_csv('IPA.csv')
print("Columns in IPA.csv:", ipa_df.columns.tolist())

ipa_pathways = []
for _, record in ipa_df.iterrows():
    # Columns are addressed by position (first = name, last = molecules),
    # so the actual header labels do not matter.
    raw_molecules = record.iloc[-1]
    if pd.isna(raw_molecules):
        # A missing molecules cell means the pathway has no genes to match.
        gene_set = set()
    else:
        stripped = (token.strip() for token in str(raw_molecules).split(','))
        # Upper-case so later comparisons are case-insensitive; drop empty tokens.
        gene_set = {token.upper() for token in stripped if token}
    ipa_pathways.append({'IPA_pathway': record.iloc[0], 'IPA_genes': gene_set})

# -------------------------
# Step 2: Process each .txt file in the "answers" directory and create a DataFrame per file.
# -------------------------
# Expected layout of an answer file: blocks separated by blank lines, where
#   line 1 = pathway name (optionally ending in ':')
#   line 2 = comma-separated genes (optionally prefixed by "Genes involved:")
# Any further lines in a block are ignored.


def _parse_answer_block(block):
    """Return ``(pathway_name, gene_set)`` for one block, or ``None`` if it has < 2 lines."""
    lines = block.strip().splitlines()
    if len(lines) < 2:
        return None

    pathway_name = lines[0].strip()
    if pathway_name.endswith(':'):
        pathway_name = pathway_name[:-1].strip()

    gene_line = lines[1].strip()
    prefix = 'genes involved:'
    # The prefix is matched case-insensitively; its length is the same in any case.
    if gene_line.lower().startswith(prefix):
        gene_line = gene_line[len(prefix):]

    # Upper-case so the comparison with the IPA gene sets is case-insensitive.
    genes = {g.strip().upper() for g in gene_line.split(',') if g.strip()}
    return pathway_name, genes


def _overlap_percentages(answer_genes, pathways):
    """Return ``[(ipa_pathway, percent)]``: share of ``answer_genes`` found in each IPA pathway."""
    if not answer_genes:
        # No genes to match: every pathway scores 0 (also avoids dividing by zero).
        return [(entry['IPA_pathway'], 0) for entry in pathways]
    total = len(answer_genes)
    return [
        (entry['IPA_pathway'], len(answer_genes & entry['IPA_genes']) / total * 100)
        for entry in pathways
    ]


def _select_hits(overlaps, threshold=50):
    """Return every ``(pathway, percent)`` at or above ``threshold``, else the single best one.

    If ``overlaps`` is empty (no IPA pathways were loaded) a placeholder
    ``(None, 0)`` is returned so the answer pathway still appears in the table
    instead of ``max()`` raising ``ValueError`` on an empty sequence.
    """
    hits = [item for item in overlaps if item[1] >= threshold]
    if hits:
        return hits
    if not overlaps:
        return [(None, 0)]
    # max() keeps the first of several equally-scored pathways.
    return [max(overlaps, key=lambda item: item[1])]


dfs = {}  # Dictionary to hold a DataFrame for each file, keyed by the .txt file name

answers_dir = 'answers'
result_columns = ["Pathway", "Hit", "Overlap %", "Original Genes"]

# sorted(): os.listdir() returns entries in arbitrary order; sorting makes runs reproducible.
for filename in sorted(os.listdir(answers_dir)):
    if not filename.endswith('.txt'):
        continue
    filepath = os.path.join(answers_dir, filename)
    # Explicit encoding: the platform default (e.g. cp1252 on Windows) would garble
    # non-ASCII pathway names, and the CSV export writes UTF-8 anyway.
    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read()

    table_data = []  # List to hold rows for this file

    # Split file content into blocks based on blank lines.
    for block in re.split(r'\n\s*\n', content):
        parsed = _parse_answer_block(block)
        if parsed is None:
            continue  # Empty block or block without a gene line
        answer_pathway, answer_genes = parsed
        original_genes = ", ".join(sorted(answer_genes))

        # One output row per IPA hit (all hits >= 50 %, otherwise the best match).
        overlaps = _overlap_percentages(answer_genes, ipa_pathways)
        for hit, percent in _select_hits(overlaps):
            table_data.append({
                "Pathway": answer_pathway,
                "Hit": hit,
                "Overlap %": f"{percent:.2f}",
                "Original Genes": original_genes,
            })

    # Create a DataFrame for this file and store it in the dictionary.
    # Passing the columns explicitly keeps the header even when no block was parsed.
    df = pd.DataFrame(table_data, columns=result_columns)
    dfs[filename] = df

# -------------------------
# Step 3: Create a directory for validated files if it doesn't exist and export each DataFrame as CSV.
# -------------------------
validated_dir = './validated'
os.makedirs(validated_dir, exist_ok=True)  # Create the directory if it doesn't exist

txt_suffix = '.txt'
for filename, df in dfs.items():
    # Swap only the trailing ".txt" for ".csv". str.replace() would rewrite every
    # occurrence, so "run.txt.v2.txt" would become "run.csv.v2.csv".
    stem = filename[:-len(txt_suffix)] if filename.endswith(txt_suffix) else filename
    output_filename = os.path.join(validated_dir, stem + '.csv')
    df.to_csv(output_filename, index=False)
    print(f"Exported {output_filename}")


Columns in IPA.csv: ['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
Exported ./validated\o3-with-with-1.csv
Exported ./validated\o3-without-without-1.csv
