In [None]:
import scanpy as sc
import pandas
import anndata as ad
import scvi
import numpy
import subprocess
import sys
import importlib
import rpy2.robjects
readRDS = rpy2.robjects.r['readRDS']
saveRDS= rpy2.robjects.r["saveRDS"]
import os
from rpy2.robjects import pandas2ri
import scipy
from scipy.sparse import csr_matrix
pandas2ri.activate()

In [None]:
# Load the per-dataset cell metadata and count tables, then wrap each pair
# in an AnnData object (peng, baron, raghavan).

metadata_path = "./attachement"

# Each metadata CSV is ';'-separated and uses its first column as the index.
meta_peng, meta_baron, meta_raghavan = (
    pandas.read_csv(os.path.join(metadata_path, csv_name), index_col=0, sep=";")
    for csv_name in ("peng_metadata.csv", "baron_metadata.csv", "raghavan_metadata.csv")
)

# Baron cell identifiers: swap every "-" for ".".
# Presumably this aligns them with the column labels of the baron count
# table -- TODO confirm against get_ref_counts_df().
meta_baron.index = meta_baron.index.str.replace("-", ".")

# NOTE(review): get_ref_counts_df is not defined in the visible cells of this
# notebook; it must already exist in the kernel before this cell is run.
df_counts_peng, df_counts_baron, df_counts_raghavan = get_ref_counts_df()

# The count tables are transposed before being handed to AnnData as X
# (assumes they come back as genes x cells -- verify against the loader).
sc_pen = ad.AnnData(X=df_counts_peng.T, obs=meta_peng)
sc_baron = ad.AnnData(X=df_counts_baron.T, obs=meta_baron)
sc_raghavan = ad.AnnData(X=df_counts_raghavan.T, obs=meta_raghavan)


In [None]:
def create_pseudo_bulk(anndata, groupby, method='sum'):
    """
    Create pseudo-bulk data from a single-cell AnnData object.

    Cells are grouped by the labels found in ``anndata.obs[groupby]`` and the
    rows of ``anndata.X`` belonging to each group are collapsed into a single
    row, either by summing or by averaging.

    Parameters:
        anndata: AnnData
            Single-cell AnnData object. Only ``.X``, ``.obs`` and
            ``.var_names`` are read; the object is not modified.
        groupby: str
            Column in `anndata.obs` used to group cells.
        method: str
            Aggregation method ('sum' or 'mean'). Default is 'sum'.

    Returns:
        pseudo_bulk_matrix: pd.DataFrame
            Pseudo-bulk expression matrix with groups as rows (in order of
            first appearance) and genes as columns. Cells whose group label
            is missing (NaN/None) are left out. If no labelled cell exists,
            an empty frame with the gene columns is returned.

    Raises:
        ValueError: if `groupby` is not a column of `anndata.obs`, or if
            `method` is neither 'sum' nor 'mean'.
    """

    # Validate both arguments up front so a bad call fails immediately,
    # even when there turn out to be no groups to aggregate.
    if groupby not in anndata.obs.columns:
        raise ValueError(f"'{groupby}' is not a column in anndata.obs")
    if method not in ('sum', 'mean'):
        raise ValueError(f"Unsupported method '{method}'. Use 'sum' or 'mean'.")

    # Densify any scipy sparse container (CSR, CSC, COO, sparse arrays, ...),
    # not only csr_matrix, so the row indexing below always sees an ndarray.
    X = anndata.X
    if scipy.sparse.issparse(X):
        X = X.toarray()

    # A missing label never compares equal to anything, so it would only
    # yield an empty, meaningless group: drop it before enumerating groups.
    groups = anndata.obs[groupby]
    unique_groups = groups.dropna().unique()

    # numpy.vstack cannot stack an empty list; return an empty result instead.
    if len(unique_groups) == 0:
        return pandas.DataFrame(
            numpy.empty((0, len(anndata.var_names))),
            columns=anndata.var_names
        )

    pseudo_bulk_data = []
    group_names = []

    for group in unique_groups:
        # Plain boolean ndarray mask; fillna guards nullable dtypes where a
        # comparison against a missing value yields <NA> instead of False.
        group_mask = (groups == group).fillna(False).to_numpy(dtype=bool)
        group_data = X[group_mask, :]

        # Collapse the selected cells into one row per gene.
        if method == 'sum':
            aggregated = group_data.sum(axis=0)
        else:
            aggregated = group_data.mean(axis=0)

        pseudo_bulk_data.append(aggregated)
        group_names.append(group)

    # One row per group, one column per gene.
    pseudo_bulk_matrix = pandas.DataFrame(
        numpy.vstack(pseudo_bulk_data),
        index=group_names,
        columns=anndata.var_names
    )

    return pseudo_bulk_matrix


    

In [None]:
# Build the summed pseudo-bulk table for each dataset (grouped by the
# "cell_type" column, then transposed) and write each one to a CSV file
# in the current working directory.

pseudo_bulk_pen_sum, pseudo_bulk_baron_sum, pseudo_bulk_raghavan_sum = (
    create_pseudo_bulk(dataset, groupby="cell_type", method='sum').T
    for dataset in (sc_pen, sc_baron, sc_raghavan)
)

# All three tables are built before any file is written.
for table, out_name in (
    (pseudo_bulk_pen_sum, "peng_pseudo_bulk_sum.csv"),
    (pseudo_bulk_baron_sum, "baron_pseudo_bulk_sum.csv"),
    (pseudo_bulk_raghavan_sum, "raghavan_pseudo_bulk_sum.csv"),
):
    table.to_csv(out_name)

