In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
import scanpy as sc

In [3]:
import os
import pandas as pd
import numpy as np

def generate_ground_truth_gex_for_samples(st_path, sc_path, output_base_dir, num_samples=5, cell_types=None):
    """
    Generate per-cell-type ground-truth gene expression matrices for each sample.

    For every sample index ``i`` in ``range(num_samples)`` this reads
    ``Wu_ST_{i}_index.csv`` from ``st_path`` (the ``Cell``, ``Cell_type`` and
    ``spot`` columns are used) and ``Wu_SC_{i}_data.csv`` from ``sc_path``
    (read with the first column as the row index; rows are treated as genes and
    columns as cells). Expression values are inverted with ``expm1`` and rounded,
    then summed over the cells assigned to each spot, separately per cell type.

    One ``{cell_type}_GT.csv`` file (spots x genes) is written per cell type to
    ``{output_base_dir}/sample_{i}/layers``. Spots are ordered by the number in
    their ``spot_<n>`` label, genes are sorted by label, and a cell type with no
    cells in a sample yields an all-zero matrix. A cell listed several times for
    the same spot contributes once per listing.

    Parameters:
    - st_path (str): Base path to the spot index files.
    - sc_path (str): Base path to the single-cell data files.
    - output_base_dir (str): Base directory to save ground-truth matrices.
    - num_samples (int): Number of samples (default: 5).
    - cell_types (iterable of str, optional): Cell types to write a matrix for.
      Defaults to the nine cell types used by the original analysis.

    Raises:
    - ValueError: if a spot label does not contain ``spot_<number>``.
    - KeyError: if a cell of a requested cell type is listed in the index file
      but has no column in the expression file.
    """
    if cell_types is None:
        cell_types = [
            "B-cells",
            "Cancer Epithelial",
            "Myeloid",
            "PVL",
            "T-cells",
            "CAFs",
            "Endothelial",
            "Normal Epithelial",
            "Plasmablasts"
        ]
    else:
        # Materialise once so generators/iterators can be reused for every sample
        cell_types = list(cell_types)

    for i in range(num_samples):
        # Construct file paths for the current sample
        index_path = os.path.join(st_path, f"Wu_ST_{i}_index.csv")
        data_path = os.path.join(sc_path, f"Wu_SC_{i}_data.csv")

        # Load the data
        index_df = pd.read_csv(index_path)
        data_df = pd.read_csv(data_path, index_col=0)

        # Convert log-normalized counts to regular counts and sort genes alphabetically
        counts_df = np.expm1(data_df).round().sort_index()

        # Cells x genes view, so that cells can be looked up as row labels
        cells_by_genes = counts_df.T

        # Sort spots numerically based on the number in the 'spot' column
        spot_num = index_df['spot'].str.extract(r'spot_(\d+)', expand=False)
        unparsed = spot_num.isna()
        if unparsed.any():
            examples = index_df.loc[unparsed, 'spot'].unique()[:5].tolist()
            raise ValueError(
                f"Sample {i}: spot labels without a 'spot_<number>' pattern in '{index_path}': {examples}"
            )
        index_df = index_df.assign(spot_num=spot_num.astype(int)).sort_values(by='spot_num')

        # Get the sorted list of all unique spots
        all_spots = index_df['spot'].unique()

        # Fail early, with a readable message, if requested cells have no expression column
        requested = index_df[index_df['Cell_type'].isin(cell_types)]
        missing_cells = pd.Index(requested['Cell'].unique()).difference(cells_by_genes.index)
        if len(missing_cells) > 0:
            raise KeyError(
                f"Sample {i}: {len(missing_cells)} cell(s) listed in '{index_path}' are missing from "
                f"'{data_path}', e.g. {list(missing_cells[:5])}"
            )

        # Dictionary to store the spot x gene matrix of each cell type
        ground_truth_matrices = {}

        for cell_type in cell_types:
            members = index_df.loc[index_df['Cell_type'] == cell_type, ['spot', 'Cell']]

            if members.empty:
                # If the cell type is not present, create a zero matrix
                layer = pd.DataFrame(0, index=all_spots, columns=counts_df.index)
            else:
                # One row per (cell, spot) assignment; repeated cells are repeated here
                per_assignment = cells_by_genes.loc[members['Cell'].to_numpy()]

                # Sum the assignments of each spot, then add the spots without this cell type as zeros
                layer = (
                    per_assignment
                    .groupby(members['spot'].to_numpy(), sort=False)
                    .sum()
                    .reindex(all_spots, fill_value=0)
                )

                # Rounded counts are whole numbers: store them as integers, like the zero matrices.
                # Non-finite sums (e.g. inf in the input) cannot be cast and stay floating point.
                if np.isfinite(layer.to_numpy()).all():
                    layer = layer.astype(np.int64)

            ground_truth_matrices[cell_type] = layer

        # Define output directory for the current sample
        sample_output_dir = os.path.join(output_base_dir, f"sample_{i}", "layers")
        os.makedirs(sample_output_dir, exist_ok=True)

        # Write each cell type's DataFrame to a CSV file
        for cell_type, df in ground_truth_matrices.items():
            filename = f"{cell_type}_GT.csv"
            filepath = os.path.join(sample_output_dir, filename)
            df.to_csv(filepath)

        print(f"Ground-truth matrices for sample {i} saved in '{sample_output_dir}' directory.")

In [4]:
# Write the per-cell-type ground-truth layers for the "mixed" replicates
# (num_samples is left at its default).
generate_ground_truth_gex_for_samples(
    st_path="/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/mixed/ST_sim",
    sc_path="/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/mixed/SC_sim",
    output_base_dir="/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/mixed/ST_GEX_sim",
)

Ground-truth matrices for sample 0 saved in '/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/mixed/ST_GEX_sim/sample_0/layers' directory.
Ground-truth matrices for sample 1 saved in '/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/mixed/ST_GEX_sim/sample_1/layers' directory.
Ground-truth matrices for sample 2 saved in '/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/mixed/ST_GEX_sim/sample_2/layers' directory.
Ground-truth matrices for sample 3 saved in '/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/mixed/ST_GEX_sim/sample_3/layers' directory.
Ground-truth matrices for sample 4 saved in '/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/mixed/ST_GEX_sim/sample_4/layers' directory.


In [5]:
# Write the per-cell-type ground-truth layers for the "high_seg" replicates
# (num_samples is left at its default).
generate_ground_truth_gex_for_samples(
    st_path="/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/high_seg/ST_sim",
    sc_path="/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/high_seg/SC_sim",
    output_base_dir="/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/high_seg/ST_GEX_sim",
)

Ground-truth matrices for sample 0 saved in '/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/high_seg/ST_GEX_sim/sample_0/layers' directory.
Ground-truth matrices for sample 1 saved in '/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/high_seg/ST_GEX_sim/sample_1/layers' directory.
Ground-truth matrices for sample 2 saved in '/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/high_seg/ST_GEX_sim/sample_2/layers' directory.
Ground-truth matrices for sample 3 saved in '/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/high_seg/ST_GEX_sim/sample_3/layers' directory.
Ground-truth matrices for sample 4 saved in '/bgfs/alee/LO_LAB/Personal/Brent_Schlegel/Projects/Wu_Visium/Simulations/scCube_12k/replicates/high_seg/ST_GEX_sim/sample_4/layers' directory.
