In [1]:
import scipy.io
import numpy as np
import pandas as pd
import gzip

def get_pseudobulk_from_mtx_gz(mtx_gz_path, features_tsv_gz_path):
    """
    Compute pseudobulk counts from a single gzip-compressed Matrix Market file.

    Every matrix row is summed across all columns (axis=1) and the row totals
    are paired, by position, with the second column of the features table.

    Parameters
    ----------
    mtx_gz_path : str or path-like
        Gzip-compressed Matrix Market (.mtx.gz) file.
    features_tsv_gz_path : str or path-like
        Gzip-compressed, headerless, tab-separated features table. Its second
        column is used as the gene name and it must contain exactly one row
        per matrix row.

    Returns
    -------
    pandas.DataFrame
        Columns 'gene_name' and 'pseudobulk_count', one row per matrix row,
        in matrix row order.

    Raises
    ------
    ValueError
        If the number of feature rows differs from the number of matrix rows.
    """
    # Load the matrix and sum each row across all columns.
    with gzip.open(mtx_gz_path, 'rt') as f:
        mtx = scipy.io.mmread(f)
    pseudobulk_counts = np.asarray(mtx.sum(axis=1)).ravel()

    # Load gene names (second column of features.tsv.gz).
    features = pd.read_csv(features_tsv_gz_path, sep='\t', header=None, compression='gzip')
    gene_names = features[1].values

    # Names are matched to counts purely by position, so a length mismatch
    # means the two files do not belong together. Fail loudly rather than
    # truncating the names and returning a plausible-looking table.
    if len(gene_names) != len(pseudobulk_counts):
        raise ValueError(
            f"Feature/matrix row mismatch: {features_tsv_gz_path!r} has "
            f"{len(gene_names)} rows but {mtx_gz_path!r} has "
            f"{len(pseudobulk_counts)} rows"
        )

    return pd.DataFrame({
        'gene_name': gene_names,
        'pseudobulk_count': pseudobulk_counts
    })

def combine_pseudobulk_shared_genes(df_rep1, df_rep2):
    """
    Combine pseudobulk for shared genes only, summing counts.

    Gene names repeated within a replicate are first collapsed into a single
    row (their counts summed). Without this step the inner join is
    many-to-many: a name occurring a times in one replicate and b times in the
    other would yield a*b output rows with cross-paired sums.

    Parameters
    ----------
    df_rep1, df_rep2 : pandas.DataFrame
        Per-replicate tables with columns 'gene_name' and 'pseudobulk_count'.

    Returns
    -------
    pandas.DataFrame
        Columns 'gene_name' and 'pseudobulk_count', with exactly one row per
        gene name present in both replicates, ordered by first appearance in
        df_rep1.
    """
    def _sum_per_gene(df):
        # sort=False keeps first-appearance order; dropna=False keeps rows
        # whose gene name is missing instead of silently discarding them.
        return df.groupby('gene_name', sort=False, dropna=False, as_index=False)['pseudobulk_count'].sum()

    # Inner join to keep only common genes (now one row per gene on each side).
    combined_df = pd.merge(
        _sum_per_gene(df_rep1),
        _sum_per_gene(df_rep2),
        on='gene_name',
        how='inner',
        suffixes=('_rep1', '_rep2'),
    )

    # Sum counts across the two replicates.
    combined_df['pseudobulk_count'] = combined_df['pseudobulk_count_rep1'] + combined_df['pseudobulk_count_rep2']

    # Select relevant columns.
    return combined_df[['gene_name', 'pseudobulk_count']]

def combined_pseudobulk_from_mtx(mtx1_path, feat1_path, mtx2_path, feat2_path):
    """
    Entry point: build a pseudobulk table for each of two replicates and merge them.

    Each (matrix, features) path pair is passed to get_pseudobulk_from_mtx_gz,
    replicate 1 first, and the two resulting tables are handed to
    combine_pseudobulk_shared_genes, whose result is returned unchanged.
    """
    replicate_inputs = (
        (mtx1_path, feat1_path),
        (mtx2_path, feat2_path),
    )
    # The generator is consumed in order, so replicate 1 is loaded before replicate 2.
    first_rep, second_rep = (
        get_pseudobulk_from_mtx_gz(matrix_path, features_path)
        for matrix_path, features_path in replicate_inputs
    )
    return combine_pseudobulk_shared_genes(first_rep, second_rep)
# Data provenance (GEO sample pages):
#   replicate 1: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM5687481
#   replicate 2: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM5687482
# Example usage; the paths are relative to the working directory, so replace
# them with the actual locations of the downloaded files.
combined_df = combined_pseudobulk_from_mtx(
    mtx1_path='GSM5687481_k562_rep1_matrix.mtx.gz',
    feat1_path='GSM5687481_k562_rep1_features.tsv.gz',
    mtx2_path='GSM5687482_k562_rep2_matrix.mtx.gz',
    feat2_path='GSM5687482_k562_rep2_features.tsv.gz',
)

In [2]:
# Preview the first five rows as a quick sanity check.
print(combined_df.head(n=5))

# Save as tab-separated text with neither a header line nor the index column.
combined_df.to_csv(
    '10x_combined_pseudobulk_both_replicates.txt',
    sep='\t',
    header=False,
    index=False,
)

     gene_name  pseudobulk_count
0  MIR1302-2HG                 0
1      FAM138A                 0
2        OR4F5                16
3   AL627309.1               311
4   AL627309.3                40
