In [1]:
import pandas as pd

# Workbook locations: point input_workbook at your own combined results file;
# the processed summary is written to output_workbook
input_workbook = 'combined_phyloglm_results.xlsx'
output_workbook = 'processed_combined_phyloglm_results.xlsx'

# Open the input workbook only long enough to list the sheets it contains;
# the with-block closes the file again once the names have been collected
with pd.ExcelFile(input_workbook, engine='openpyxl') as xls:
    sheet_names = xls.sheet_names

# Summarise a single sheet of results into per-gene significance counts
def process_sheet(sheet_name, input_workbook, *, alpha=0.05):
    """Read one sheet and count significant enriched/depleted hits per gene.

    Rows with more than two missing cells are discarded. Each remaining row
    is flagged (0/1) in four columns -- ``raw_enriched``, ``raw_depleted``,
    ``binary_enriched`` and ``binary_depleted`` -- and the flags are summed
    per (gene, niche, stats_method) group.

    Args:
        sheet_name: Sheet to read; forwarded to ``pd.read_excel``.
        input_workbook: Workbook to read from; forwarded unchanged as the
            first argument of ``pd.read_excel``.
        alpha: Significance cut-off. A row is counted only when its
            ``adj_p.value`` is strictly below this value. Defaults to 0.05.

    Returns:
        A DataFrame with the columns ``gene``, ``niche``, ``stats_method``,
        ``raw_enriched``, ``raw_depleted``, ``binary_enriched`` and
        ``binary_depleted``, one row per (gene, niche, stats_method) group.

    Raises:
        KeyError: If the sheet lacks one of the columns the summary needs.
    """
    required_columns = (
        'orthogroup.id', 'niche', 'adj_p.value', 'Estimate', 'comparison_type',
    )

    df = pd.read_excel(input_workbook, sheet_name=sheet_name, engine='openpyxl')

    # Fail early with the sheet name, so a malformed sheet in a multi-sheet
    # workbook is easy to locate
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(
            f"Sheet '{sheet_name}' is missing required column(s): {missing}"
        )

    # Keep rows with at most 2 missing cells. The explicit copy makes the
    # column assignments below operate on an independent frame instead of a
    # filtered selection (which triggers SettingWithCopyWarning).
    df = df[df.isna().sum(axis=1) <= 2].copy()

    # Create new columns
    df['gene'] = df['orthogroup.id']
    df['niche'] = df['niche'].replace(
        {'rhizo': 'rhizo & soil', 'phyllo': 'phyllo & soil'}
    )
    df['stats_method'] = 'phyloglm'

    # A row is a hit when it is significant, its Estimate has the right sign,
    # and it belongs to the comparison type in question
    significant = df['adj_p.value'] < alpha
    directions = {
        'enriched': df['Estimate'] > 0,
        'depleted': df['Estimate'] < 0,
    }
    flag_columns = []
    for comparison in ('raw', 'binary'):
        of_type = df['comparison_type'] == comparison
        for label, direction in directions.items():
            column = f'{comparison}_{label}'
            df[column] = (significant & direction & of_type).astype(int)
            flag_columns.append(column)

    key_columns = ['gene', 'niche', 'stats_method']
    new_df = df[key_columns + flag_columns]

    # Sum the four flag columns within each (gene, niche, stats_method) group
    return new_df.groupby(key_columns, as_index=False).sum()

# Process all sheets and save to a new workbook.
# The input workbook is opened once and the open handle is passed on for every
# sheet; passing the path instead would make pd.read_excel re-open and
# re-parse the whole workbook once per sheet.
with pd.ExcelFile(input_workbook, engine='openpyxl') as source:
    with pd.ExcelWriter(output_workbook, engine='openpyxl') as writer:
        for sheet_name in sheet_names:
            new_df = process_sheet(sheet_name, source)
            # Each summary goes to a sheet with the same name as its source
            new_df.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Processed workbook saved to '{output_workbook}'")


Processed workbook saved to 'processed_combined_phyloglm_results.xlsx'
