# PyHarmony Integration Pipeline

In [3]:
# Load libraries
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd

## Prepare the Parse Biosciences dataset

In [4]:
data_path = "/Users/ianschrack/Desktop/parse_analysis_10882-IS/all-sample/DGE_filtered/"

AllTissue = sc.read_mtx(data_path + 'count_matrix.mtx')

# Read in gene & cell data
gene_data = pd.read_csv(data_path + "all_genes.csv")
cell_meta = pd.read_csv(data_path + "cell_metadata.csv")

# Find genes with nan values and filter
gene_data = gene_data[gene_data.gene_name.notnull()]
notNA = gene_data.index
notNA = notNA.to_list()

# Remove genes with nan values and assign gene names
AllTissue = AllTissue[:,notNA]
AllTissue.var = gene_data
AllTissue.var.set_index('gene_name', inplace = True)
AllTissue.var_names_make_unique()

# Add cell meta data to anndata object
AllTissue.obs = cell_meta
AllTissue.obs.set_index('bc_wells', inplace=True)
AllTissue.obs.index.name = None
AllTissue.obs_names_make_unique()

sc.pp.filter_cells(AllTissue, min_genes=300)
sc.pp.filter_genes(AllTissue, min_cells=5)

## Add meta data to Parse dataset

In [5]:
sample_annotations = {
    "10882-IS-1": {"tissue": "lung", "Day": "D0", "replicate": "M1"}, 
    "10882-IS-2": {"tissue": "lung", "Day": "D0", "replicate": "M2"},
    "10882-IS-3": {"tissue": "lung", "Day": "D0", "replicate": "M3"},
    "10882-IS-4": {"tissue": "lung", "Day": "D0", "replicate": "M4"},
    "10882-IS-5": {"tissue": "scaffold", "Day": "D0", "replicate": "M1"},
    "10882-IS-6": {"tissue": "scaffold", "Day": "D0", "replicate": "M2"},
    "10882-IS-7": {"tissue": "scaffold", "Day": "D0", "replicate": "M3"},
    "10882-IS-8": {"tissue": "scaffold", "Day": "D0", "replicate": "M4"},
    "10882-IS-9": {"tissue": "blood", "Day": "D0", "replicate": "M1"},
    "10882-IS-10": {"tissue": "blood", "Day": "D0", "replicate": "M2"},
    "10882-IS-11": {"tissue": "blood", "Day": "D0", "replicate": "M3"},
    "10882-IS-12": {"tissue": "blood", "Day": "D0", "replicate": "M4"},
    "10882-IS-13": {"tissue": "lung", "Day": "D7", "replicate": "M1"},
    "10882-IS-14": {"tissue": "lung", "Day": "D7", "replicate": "M2"},
    "10882-IS-15": {"tissue": "lung", "Day": "D7", "replicate": "M3"},
    "10882-IS-16": {"tissue": "lung", "Day": "D7", "replicate": "M4"},
    "10882-IS-17": {"tissue": "scaffold", "Day": "D7", "replicate": "M1"},
    "10882-IS-18": {"tissue": "scaffold", "Day": "D7", "replicate": "M2"},
    "10882-IS-19": {"tissue": "scaffold", "Day": "D7", "replicate": "M3"},
    "10882-IS-20": {"tissue": "scaffold", "Day": "D7", "replicate": "M4"},
    "10882-IS-21": {"tissue": "blood", "Day": "D7", "replicate": "M1"},
    "10882-IS-22": {"tissue": "blood", "Day": "D7", "replicate": "M2"},
    "10882-IS-23": {"tissue": "blood", "Day": "D7", "replicate": "M3"},
    "10882-IS-24": {"tissue": "blood", "Day": "D7", "replicate": "M4"},
    "10882-IS-25": {"tissue": "lung", "Day": "D14", "replicate": "M1"},
    "10882-IS-26": {"tissue": "lung", "Day": "D14", "replicate": "M2"},
    "10882-IS-27": {"tissue": "lung", "Day": "D14", "replicate": "M3"},
    "10882-IS-28": {"tissue": "lung", "Day": "D14", "replicate": "M4"},
    "10882-IS-29": {"tissue": "scaffold", "Day": "D14", "replicate": "M1"},
    "10882-IS-30": {"tissue": "scaffold", "Day": "D14", "replicate": "M2"},
    "10882-IS-31": {"tissue": "scaffold", "Day": "D14", "replicate": "M3"},
    "10882-IS-32": {"tissue": "scaffold", "Day": "D14", "replicate": "M4"},
    "10882-IS-33": {"tissue": "blood", "Day": "D14", "replicate": "M1"},
    "10882-IS-34": {"tissue": "blood", "Day": "D14", "replicate": "M2"},
    "10882-IS-35": {"tissue": "blood", "Day": "D14", "replicate": "M3"},
    "10882-IS-36": {"tissue": "blood", "Day": "D14", "replicate": "M4"},
    "10882-IS-37": {"tissue": "lung", "Day": "D21", "replicate": "M1"},
    "10882-IS-38": {"tissue": "lung", "Day": "D21", "replicate": "M2"},
    "10882-IS-39": {"tissue": "lung", "Day": "D21", "replicate": "M3"},
    "10882-IS-40": {"tissue": "lung", "Day": "D21", "replicate": "M4"},
    "10882-IS-41": {"tissue": "scaffold", "Day": "D21", "replicate": "M1"},
    "10882-IS-42": {"tissue": "scaffold", "Day": "D21", "replicate": "M2"},
    "10882-IS-43": {"tissue": "scaffold", "Day": "D21", "replicate": "M3"},
    "10882-IS-44": {"tissue": "scaffold", "Day": "D21", "replicate": "M4"},
    "10882-IS-45": {"tissue": "blood", "Day": "D21", "replicate": "M1"},
    "10882-IS-46": {"tissue": "blood", "Day": "D21", "replicate": "M2"},
    "10882-IS-47": {"tissue": "blood", "Day": "D21", "replicate": "M3"},
    "10882-IS-48": {"tissue": "blood", "Day": "D21", "replicate": "M4"},
}

for sample, info in sample_annotations.items():
    mask = AllTissue.obs["sample"] == sample
    for key, value in info.items():
        AllTissue.obs.loc[mask, key] = value

In [7]:
AllTissue.obs['dataset'] = "Parse"

In [16]:
AllTissue.obs

Unnamed: 0,sample,species,gene_count,tscp_count,mread_count,bc1_wind,bc2_wind,bc3_wind,bc1_well,bc2_well,bc3_well,n_genes,tissue,Day,replicate,dataset
01_01_30__s1,10882-IS-1,GRCm39-tdTomato,1048,5775,33480,1,1,30,A1,A1,C6,1048,lung,D0,M1,Parse
01_01_40__s1,10882-IS-1,GRCm39-tdTomato,452,2002,10297,1,1,40,A1,A1,D4,452,lung,D0,M1,Parse
01_01_64__s1,10882-IS-1,GRCm39-tdTomato,1224,3139,19892,1,1,64,A1,A1,F4,1224,lung,D0,M1,Parse
01_02_38__s1,10882-IS-1,GRCm39-tdTomato,1468,7766,45401,1,2,38,A1,A2,D2,1468,lung,D0,M1,Parse
01_03_33__s1,10882-IS-1,GRCm39-tdTomato,1067,4959,28643,1,3,33,A1,A3,C9,1067,lung,D0,M1,Parse
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
09_92_40__s8,10882-IS-9,GRCm39-tdTomato,833,2174,13369,9,92,40,A9,H8,D4,833,blood,D0,M1,Parse
09_95_57__s8,10882-IS-9,GRCm39-tdTomato,1146,2721,17384,9,95,57,A9,H11,E9,1146,blood,D0,M1,Parse
09_95_65__s8,10882-IS-9,GRCm39-tdTomato,604,916,6503,9,95,65,A9,H11,F5,604,blood,D0,M1,Parse
09_96_43__s8,10882-IS-9,GRCm39-tdTomato,1894,5688,34481,9,96,43,A9,H12,D7,1894,blood,D0,M1,Parse


# Prepare DropSeq data

scaf.h.d7.data <- read.table("scaf.h.d7.txt", header = TRUE, row.names = 1, sep = "\t", as.is = TRUE)

In [12]:
d7_h_scaf.head(5)

Unnamed: 0_level_0,CTGCCGGTCATG,CGTAAAACAGGG,GCAGAAGTTAGC,GGGGAGTCTGAT,AAAGACTCAGCG,GAACGGTTTGCC,AAGTGCCCTTGC,ATAGCCACATCA,CACGCACAGATA,GTTGTAAGGCGG,...,TTCCGTACATCC,CGGTGTATTGTC,GTGTAGTGTCCG,GTGTAGTGTCCC,GTGTAGTGTCCA,GTGTAGTGTCCT,GGAATCCTAATT,AAGTGCCTTCCG,CCGCAGTCGCAT,CGCTTCACTTTC
GENE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0610007N19Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610007P14Rik,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
0610009B22Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009D07Rik,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,1,0
0610009E02Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
d7_h_scaf = pd.read_csv(filepath_or_buffer="/Users/ianschrack/Desktop/Sophia_scRNA/geo_sub_4T1scRNAseq_LungScaffoldSpleen_d0d7d14d21_DropSeq/scaf.h.d7.txt", 
                        header=0, sep="\t", index_col=0, dtype=str)
d7_h_scaf = d7_h_scaf.T
d7_h_scaf_adata = sc.AnnData(X=d7_h_scaf)
d7_h_scaf_adata.var_names = d7_h_scaf.columns.to_list()

# Add meta data
d7_h_scaf_adata.obs['dataset'] = "dropseq"
d7_h_scaf_adata.obs['sample'] = "d7_h_scaf"
d7_h_scaf_adata.obs['Tissue'] = "scaffold"
d7_h_scaf_adata.obs['Day'] = "D0" # We are treating healthy samples as if they are day 0
d7_h_scaf_adata.obs['replicate'] = "M5"

In [32]:
d7_d_scaf = pd.read_csv(filepath_or_buffer="/Users/ianschrack/Desktop/Sophia_scRNA/geo_sub_4T1scRNAseq_LungScaffoldSpleen_d0d7d14d21_DropSeq/scaf.h.d7.txt", 
                        header=0, sep="\t", index_col=0, dtype=str)
d7_d_scaf = d7_h_scaf.T
d7_d_scaf_adata = sc.AnnData(X=d7_h_scaf)
d7_d_scaf_adata.var_names = d7_h_scaf.columns.to_list()

# Add meta data
d7_d_scaf_adata.obs['dataset'] = "dropseq"
d7_d_scaf_adata.obs['sample'] = "d7_h_scaf"
d7_d_scaf_adata.obs['Tissue'] = "scaffold"
d7_d_scaf_adata.obs['Day'] = "D7"
d7_d_scaf_adata.obs['replicate'] = "M5"

In [None]:
import pandas as pd
import scanpy as sc
import os

# --- Define Path to Data Directory ---
data_path = "/Users/ianschrack/Desktop/Sophia_scRNA/geo_sub_4T1scRNAseq_LungScaffoldSpleen_d0d7d14d21_DropSeq/"

# --- Sample Metadata ---
sample_info = [
    ('scaf', 'h', 'd7', 0),
    ('scaf', 'd', 'd7', 7),
    ('scaf', 'h', 'd14', 0),
    ('scaf', 'd', 'd14', 14),
    ('scaf', 'h', 'd21', 0),
    ('scaf', 'd', 'd21', 21),
    ('lung', 'h', 'd7', 0),
    ('lung', 'd', 'd7', 7),
    ('lung', 'h', 'd14', 0),
    ('lung', 'd', 'd14', 14),
    ('lung', 'h', 'd21', 0),
    ('lung', 'd', 'd21', 21),
    ('spleen', 'h', 'd7', 0),
    ('spleen', 'd', 'd7', 7),
    ('spleen', 'h', 'd14', 0),
    ('spleen', 'd', 'd14', 14),
    ('spleen', 'h', 'd21', 0),
    ('spleen', 'd', 'd21', 21)
]

# --- Load, Process, and Annotate Each Dataset ---
adatas = []
sample_names = []

for tissue, state, day, time in sample_info:
    sample = f"{tissue}.{state}.{day}"
    file_path = os.path.join(data_path, f"{sample}.txt")

    # Load data
    df = pd.read_csv(file_path, sep="\t", index_col=0)

    # Transpose to cells x genes for AnnData
    adata = sc.AnnData(df.T)
    adata.var_names_make_unique()

    # Filter genes
    sc.pp.filter_genes(adata, min_cells=3)

    # Annotate metadata
    adata.obs['sample'] = sample
    adata.obs['Tissue'] = tissue
    adata.obs['Experiment'] = day.upper()
    adata.obs['Time'] = time

    # Normalize and scale
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)
    sc.pp.scale(adata)

    # Store for concatenation
    adatas.append(adata)
    sample_names.append(sample)

# --- Merge All Samples ---
merged = sc.concat(adatas, label='sample', keys=sample_names, merge='same')

# --- Final Metadata ---
merged.obs['basic'] = merged.obs['Tissue'] + "_" + merged.obs['Time'].astype(str)

  utils.warn_names_duplicates("obs")


In [35]:
merged.obs

Unnamed: 0,sample,Tissue,Experiment,Time,basic
CTGCCGGTCATG,scaf.h.d7,scaf,D7,0,scaf_0
CGTAAAACAGGG,scaf.h.d7,scaf,D7,0,scaf_0
GCAGAAGTTAGC,scaf.h.d7,scaf,D7,0,scaf_0
GGGGAGTCTGAT,scaf.h.d7,scaf,D7,0,scaf_0
AAAGACTCAGCG,scaf.h.d7,scaf,D7,0,scaf_0
...,...,...,...,...,...
TTACACAAAAAC,spleen.d.d21,spleen,D21,21,spleen_21
ACACCATCCCTA,spleen.d.d21,spleen,D21,21,spleen_21
TATGATGGTCAG,spleen.d.d21,spleen,D21,21,spleen_21
AAATAAGTCGCC,spleen.d.d21,spleen,D21,21,spleen_21
