In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import re
from scipy.sparse import issparse
from pathlib import Path

# --- Configuration ---
# NOTE: Please ensure the input file path is correct for your environment.
# Relative path used previously, kept for reference:
# adata_file = '../processed_data/merged_v9/Annotated_phenotype_hypr_seq_MERGED_with_donor_clusters.h5ad'
adata_file = '/mnt/data/project/25_04_29_Figure3_reanalysis/processed_data/merged_v9/Annotated_phenotype_hypr_seq_MERGED_with_donor_clusters.h5ad'

# Fail fast when the input is missing: no placeholder/dummy data is generated.
# Only the read itself is guarded, and the original error is chained so the
# underlying cause stays visible in the traceback.
try:
    adata_full = sc.read_h5ad(adata_file)
except FileNotFoundError as err:
    raise ValueError("The specified AnnData file does not exist. Please provide a valid file path.") from err

# Names of the `.obs` columns used to select cells, and the labels within them.
genotype_col = 'genotype_annotation'
condition_col = 'condition'
stimulated_condition_label = 'IFNG' # Focus on IFN-g condition
wt_genotype_label = 'AAV_control'  # Used as the reference group in the DE tests

# Specific variant genotypes for the plot
gof1_variant_label = 'STAT1-C324R_hom_pure'
gof2_variant_label = 'STAT1-D165G_hom_pure'

# DE Analysis Parameters
de_pval_cutoff = 0.05 # Adjusted p-value cutoff for significance coloring

# Genes whose names are written to the 'label' column so the R plot can annotate them.
# Earlier, longer selection kept for reference:
# genes_to_label_manual = [
#     "SOCS3", "IFIT2", "RSAD2", "STING1", "PIM1", "SOCS1", "IFIT3", "CCL8",
#     "MX2", "CXCL9", "OAS3", "IL1B", "IL18", "TNFRSF11A", "C5AR1", "MYC", "THBD",
#     "IER3", "IRF1", "OASL", "IFITM2", "IFITM3", "ISG15", "IFI44L", "IFI35",
#     "EPSTI1", "IFITM1", "STAT2", "IFI44", "CSNK1G2", "MMP9", "EIF4B", "LTA4H",
#     "CIITA", "HLA-DPA1", "HLA-DRB5", "HLA-DRB1", "HLA-DRA", "HLA-DQB1",
#     "HLA-DQB2", "APOE", "FABP5", "SORT1", "CYP27A1", "NPC1", "CDIPT", "UGCG",
#     "SPHK1", "PTGR1"
# ]

# Active selection (same order as before, several names per row).
genes_to_label_manual = [
    "IRF1", "OASL", "IFITM2", "IFITM3", "ISG15", "IFI44L", "IFI35", "EPSTI1",
    "IFITM1", "STAT2", "IFI44", "CSNK1G2", "MMP9", "EIF4B", "LTA4H", "CIITA",
    "HLA-DPA1", "HLA-DRB5", "HLA-DRB1", "HLA-DRA", "HLA-DQB1", "HLA-DQB2",
    "APOE", "FABP5", "SORT1", "CYP27A1", "NPC1", "CDIPT", "UGCG", "SPHK1",
    "PTGR1",
]

# Output folder (created on demand) and the CSV path handed to the R plotting step.
# The file name is built from both variant labels with the "_hom_pure" part removed.
output_dir = Path('./Figure3k')
output_dir.mkdir(parents=True, exist_ok=True)
output_csv_file = output_dir.joinpath(
    f'{gof1_variant_label.replace("_hom_pure", "")}_vs_{gof2_variant_label.replace("_hom_pure", "")}_LogFC_data_for_R.csv'
)

# --- 1. Load Data and Prepare ---
# Builds `adata_processed`: a normalized, log1p-transformed copy of the data
# with the genotype and condition columns stored as categoricals.
print("Loading full annotated data...")
try:
    # `adata_full` is normally loaded already; reload from disk only if it is
    # not an AnnData object.
    if not isinstance(adata_full, sc.AnnData):
        if isinstance(adata_file, (str, Path)):
            adata_full = sc.read_h5ad(adata_file)
            # use donor 0 only
            # adata_full = adata_full[adata_full.obs['inferred_donor_cluster'] == '0'].copy()
        else:
            raise TypeError("adata_file must be a path string or an AnnData object.")
    print("Full data loaded successfully.")

    # Prepare expression data: prefer `.raw` when present, otherwise copy `.X`.
    if adata_full.raw is not None:
        print("Using adata_full.raw for processing...")
        adata_processed = adata_full.raw.to_adata()
    else:
        adata_processed = adata_full.copy()
        print("Using adata_full.X for processing...")

    # NOTE(review): this assumes the selected matrix holds un-normalized counts.
    # If `.X` was already normalized/log-transformed upstream, these two calls
    # transform it a second time -- confirm against the upstream pipeline.
    sc.pp.normalize_total(adata_processed, target_sum=1e4)
    sc.pp.log1p(adata_processed)
    print("  Normalization and log1p applied.")

    adata_processed.obs[genotype_col] = adata_processed.obs[genotype_col].astype('category')
    adata_processed.obs[condition_col] = adata_processed.obs[condition_col].astype('category')

except Exception as err:
    # Stop the run with a non-zero status and keep the original traceback
    # (chained), instead of printing and calling exit(), which reports success
    # and hides where the failure happened.
    raise RuntimeError(f"An error occurred loading/preparing the AnnData file: {err}") from err


# --- 2. Perform DE Analysis for each GoF variant vs WT ---
# For each variant, restrict to stimulated cells of that variant plus the WT
# reference, run a Wilcoxon rank-sum test against WT, and store the per-gene
# results (indexed by gene name) in `de_results_dict[variant_label]`.
print("\nStep 2: Performing DE analysis...")

de_results_dict = {}

for gof_label in (gof1_variant_label, gof2_variant_label):
    print(f"  Running DE for '{gof_label}' vs '{wt_genotype_label}'...")

    de_target_genotypes = [gof_label, wt_genotype_label]
    subset_mask = (
        adata_processed.obs[genotype_col].isin(de_target_genotypes)
        & (adata_processed.obs[condition_col] == stimulated_condition_label)
    )
    adata_de_subset = adata_processed[subset_mask].copy()

    # Check the genotypes actually observed in the subset. The declared
    # categories of the column can include groups with no cells left, so
    # counting categories alone is not a reliable test; both the variant and
    # the reference must be present for the comparison to be defined.
    observed_genotypes = set(adata_de_subset.obs[genotype_col].unique())
    if not set(de_target_genotypes) <= observed_genotypes:
        print(f"  Warning: Not enough groups for DE analysis for '{gof_label}'. Skipping.")
        continue

    # Drop empty categories so the test only sees the two populated groups.
    adata_de_subset.obs[genotype_col] = (
        adata_de_subset.obs[genotype_col].cat.remove_unused_categories()
    )

    sc.tl.rank_genes_groups(adata_de_subset, groupby=genotype_col, reference=wt_genotype_label,
                            method='wilcoxon', use_raw=False, corr_method='benjamini-hochberg',
                            n_genes=adata_de_subset.n_vars)

    de_results_dict[gof_label] = sc.get.rank_genes_groups_df(adata_de_subset, group=gof_label).set_index('names')
    print(f"  DE results for '{gof_label}' extracted.")

if gof1_variant_label not in de_results_dict or gof2_variant_label not in de_results_dict:
    # Both comparisons are required downstream; stop with a non-zero exit status.
    raise SystemExit("Error: DE results for one or both GoF variants not available. Exiting.")


# --- 3. Prepare Data for Export ---
print("\nStep 3: Preparing data for export...")

# Combine logFC and adjusted p-values from both DE analyses, matched by gene.
#
# The two DE tables are indexed by gene name but ranked independently, so their
# row orders differ. Joining on the index keeps every value attached to its own
# gene. (Passing the gene names as a plain Index next to the Series in a dict
# constructor would place the names by position while aligning the Series by
# label, which mislabels the rows.)
_de_columns = ['logfoldchanges', 'pvals_adj']
_gof1_de = de_results_dict[gof1_variant_label][_de_columns].rename(
    columns={'logfoldchanges': 'logfc_gof1', 'pvals_adj': 'p_adj_gof1'}
)
_gof2_de = de_results_dict[gof2_variant_label][_de_columns].rename(
    columns={'logfoldchanges': 'logfc_gof2', 'pvals_adj': 'p_adj_gof2'}
)
export_df = (
    _gof1_de.join(_gof2_de, how='left')  # rows follow the first variant's ranking
    .dropna()                            # keep genes with complete values in both tables
    .rename_axis('gene')
    .reset_index()                       # gene names become the first column, 'gene'
)

# Add a column to flag which genes should be labeled
export_df['label'] = np.where(export_df['gene'].isin(genes_to_label_manual), export_df['gene'], '')

# Add a color category column for easy plotting in R
def get_color_category(row):
    """Return the significance category used to color one gene in the plot.

    `row` must support lookup of 'p_adj_gof1' and 'p_adj_gof2'. Each value is
    compared (strictly less than) against the module-level `de_pval_cutoff`.
    The result is 'Significant in Both', 'Significant in <variant> Only'
    (variant label with "_hom_pure" removed), or 'Not Significant'.
    """
    gof1_hit = row['p_adj_gof1'] < de_pval_cutoff
    gof2_hit = row['p_adj_gof2'] < de_pval_cutoff

    if not (gof1_hit or gof2_hit):
        return 'Not Significant'
    if gof1_hit and gof2_hit:
        return 'Significant in Both'

    # Exactly one comparison passed the cutoff; name that variant.
    only_variant = gof1_variant_label if gof1_hit else gof2_variant_label
    return f'Significant in {only_variant.replace("_hom_pure", "")} Only'

# Classify every gene row by significance in the two comparisons.
export_df['color_category'] = export_df.apply(get_color_category, axis=1)

# Point-size column: -log10 of the smaller of the two adjusted p-values.
# Exact zeros are swapped for machine epsilon first so the log stays finite.
smallest_p_adj = export_df[['p_adj_gof1', 'p_adj_gof2']].min(axis=1)
export_df['-log10_p_adj_max'] = -np.log10(smallest_p_adj.replace(0, np.finfo(float).eps))


# --- 4. Save to CSV ---
print(f"\nStep 4: Saving data to CSV file at: {output_csv_file}")
export_df.to_csv(output_csv_file, index=False)  # row index is not written

print("\n--- Data export complete. You can now use the CSV file in R. ---")


Loading full annotated data...
Full data loaded successfully.
Using adata_full.X for processing...
  Normalization and log1p applied.

Step 2: Performing DE analysis...
  Running DE for 'STAT1-C324R_hom_pure' vs 'AAV_control'...
  DE results for 'STAT1-C324R_hom_pure' extracted.
  Running DE for 'STAT1-D165G_hom_pure' vs 'AAV_control'...
  DE results for 'STAT1-D165G_hom_pure' extracted.

Step 3: Preparing data for export...

Step 4: Saving data to CSV file at: Figure3k/STAT1-C324R_vs_STAT1-D165G_LogFC_data_for_R.csv

--- Data export complete. You can now use the CSV file in R. ---
