In [None]:
import anndata as ad
import scanpy as sc
import pandas as pd


In [None]:
# Load the Podd et al. (GSE158403) expression table and orient it for analysis.
# NOTE(review): the file name says "raw_counts", not log2(TPM+1) — confirm which
# quantity this matrix actually holds before relying on any normalisation.
podd_et_al = (
    pd.read_csv(
        filepath_or_buffer="/Users/z5155527/Desktop/Benchmark-2025-Sep/phase_1_datasets/bulk_RNA-seq/podd_et_al/GSE158403_raw_counts_GRCh38.p13_NCBI.tsv",
        sep="\t",
    )
    .set_index("GeneID")  # GeneID moves out of the columns and becomes the row labels
    .rename_axis(None)  # leave the axis unnamed
    .T  # transpose: each remaining input column becomes a row, each GeneID a column
)
podd_et_al


In [None]:
# Clinical annotations for the Podd et al. cohort, read from the Excel workbook
# (no sheet is specified, so pandas' default sheet selection applies).
podd_et_al_clinical = pd.read_excel(
    io="/Users/z5155527/Desktop/Benchmark-2025-Sep/phase_1_datasets/bulk_RNA-seq/podd_et_al/podd_et_al_clinical.xlsx"
)
podd_et_al_clinical


In [None]:
# Cross-check the sample identifiers in the clinical table against the sample
# titles listed in the GEO sample-mapping workbook.
clinical_sample_list = podd_et_al_clinical["Sample.ID"].tolist()
sample_list_mapping = pd.read_excel(
    io="/Users/z5155527/Desktop/Benchmark-2025-Sep/phase_1_datasets/bulk_RNA-seq/podd_et_al/podd_et_al_sample_mapping.xlsx"
)
# The mapping workbook is expected to provide the columns "Sample_title" and
# "Sample_geo_accession"; only "Sample_title" is needed for this comparison.
mapped_sample_list = sample_list_mapping["Sample_title"].tolist()

# De-duplicate both sides so they can be compared as sets.
clinical_sample_set = set(clinical_sample_list)
mapped_sample_set = set(mapped_sample_list)

# Shared identifiers, plus the identifiers unique to each source.
overlap_samples = clinical_sample_set.intersection(mapped_sample_set)
only_in_clinical = clinical_sample_set.difference(mapped_sample_set)
only_in_mapping = mapped_sample_set.difference(clinical_sample_set)

print(f"Number of overlapping samples: {len(overlap_samples)}")
print(f"Samples only in clinical: {only_in_clinical}")
print(f"Samples only in mapping: {only_in_mapping}")


In [None]:
# Attach a sample title to every row of the expression matrix.
# The mapping sheet pairs "Sample_title" with "Sample_geo_accession"; the lookup
# below assumes podd_et_al.index holds GEO accessions (GSM...) — TODO confirm.

# Mapping sheet keyed by sample title.
sample_list_mapping_indexed = sample_list_mapping.set_index("Sample_title")

# Forward lookup: sample title -> GEO accession.
sample_title_to_geo = sample_list_mapping_indexed["Sample_geo_accession"].to_dict()

# Reverse lookup: GEO accession -> sample title (the direction used below).
geo_to_sample_title = (
    sample_list_mapping
    .set_index("Sample_geo_accession")["Sample_title"]
    .to_dict()
)

# Translate each index label through the reverse lookup and store the result
# as a new "sample_title" column on the expression matrix.
podd_et_al["sample_title"] = podd_et_al.index.map(geo_to_sample_title)

# Quick look at the first few assignments to check the alignment.
print(podd_et_al[["sample_title"]].head(5))


In [None]:
# Re-index the expression matrix by sample title, producing the frame to save.
# Rows whose index label was not found in the sample mapping carry a missing
# "sample_title"; promoting that column to the index would silently create NaN
# row labels, so stop here and report the offending labels instead.
unmapped_samples = podd_et_al.index[podd_et_al["sample_title"].isna()].tolist()
if unmapped_samples:
    raise ValueError(
        f"{len(unmapped_samples)} expression sample(s) have no sample_title "
        f"in the sample mapping: {unmapped_samples}"
    )

# set_index returns a new frame with "sample_title" moved from the columns into
# the index, so podd_et_al itself is left unchanged (no copy / inplace drop needed).
podd_et_al_tosave = podd_et_al.set_index("sample_title")
podd_et_al_tosave


In [None]:
# Gene annotation table for GRCh38.p13 (tab-separated); it supplies the
# GeneID -> Symbol pairs used to translate gene identifiers.
human_annotation_table = pd.read_csv(
    filepath_or_buffer="/Users/z5155527/Desktop/Benchmark-2025-Sep/phase_1_datasets/bulk_RNA-seq/sup/Human.GRCh38.p13.annot.tsv",
    sep="\t",
)
human_annotation_table


In [None]:
# Build the GeneID -> Symbol lookup from the annotation table.
# Keys keep whatever dtype pandas parsed for "GeneID"; no string cast happens here.
# If a GeneID occurs more than once, the last Symbol seen wins.
geneid_to_symbol = {
    gene_id: symbol
    for gene_id, symbol in zip(
        human_annotation_table["GeneID"], human_annotation_table["Symbol"]
    )
}
# NOTE(review): this cell only builds the lookup; the columns of
# podd_et_al_tosave are not renamed here.
#podd_et_al_tosave
