

This pipeline processes the mouse brain snRNA-seq data: it selects the microglia barcodes, maps mouse genes to their human orthologs, log-normalizes each sample, integrates and concatenates the samples, and updates the metadata.


In [None]:
#import library
import os
import time
import scanpy as sc
import pandas as pd
import numpy as np
import scvi
import anndata as ad
import torch
import scanorama
import copy
import scvelo as scv
import numba
import scipy.sparse
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from mousipy import translate


In [None]:
import os

# Work from the project root of the aggregated WT mice data; files written
# later with a relative path (e.g. the ortholog CSV) are created here.
project_root = '/media/drive_c/Project_Brain_snRNAseq'
os.chdir(project_root)

In [None]:
import os
import pandas as pd


# Build a table with one row per sample: the sample name (its folder name)
# and the path to that sample's filtered feature-barcode matrix (.h5).
# Root directory that holds one sub-folder per sample
root_dir = "/media/drive_c/Project_Brain_snRNAseq/per_sample_outs"

# Collected {'Sample': ..., 'Path': ...} records
data = []

# os.listdir() returns entries in arbitrary order; sorting makes the sample
# order (and every downstream step that depends on it) reproducible.
for sample_folder in sorted(os.listdir(root_dir)):
    sample_path = os.path.join(root_dir, sample_folder, 'count', 'sample_filtered_feature_bc_matrix.h5')

    # Folders that do not contain a filtered matrix are skipped
    if os.path.isfile(sample_path):
        data.append({'Sample': sample_folder, 'Path': sample_path})

# Explicit columns keep the Sample/Path schema even when nothing was found
df = pd.DataFrame(data, columns=['Sample', 'Path'])

# Stop here with a clear message instead of failing later in the pipeline
if df.empty:
    raise FileNotFoundError(
        f"No 'count/sample_filtered_feature_bc_matrix.h5' files found under {root_dir}"
    )

# Display the DataFrame
print(df)

# Microglia Subsetting

In [None]:
# Load the list of microglia barcodes from the CSV file (read without a header row)

barcode_file = "/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/Microglia_analysis/microglia_cell_barcodes.csv"

# squeeze("columns") always yields a Series for a single-column file; a bare
# squeeze() would collapse a one-row file to a scalar, which has no .tolist().
barcodes_to_keep = pd.read_csv(barcode_file, header=None).squeeze("columns").tolist()
barcodes_to_keep = set(barcodes_to_keep)  # Convert to set for faster lookup




In [None]:
# Ingest: loops through the mouse single-cell datasets listed in `df`
#     - filters cells by barcode
#     - translates mouse genes to human orthologs
#     - attaches sample metadata
#     - normalizes and log-transforms the data
#     - stores each processed dataset in a dictionary
#     - also saves the mouse-human gene map to CSV for later use

import pandas as pd
import scanpy as sc

# Dictionary to store processed AnnData objects, keyed by sample name
filtered_adata_dict = {}

# Per-sample ortholog tables. They are combined once after the loop instead of
# re-concatenating a growing DataFrame on every iteration.
ortholog_maps = []

# Loop through each row in the input DataFrame
for _, row in df.iterrows():
    sample_name = row["Sample"]
    file_path = row["Path"]

    # Load data
    adata = sc.read_10x_h5(file_path, genome=None, gex_only=True)

    # Subset to valid barcodes
    filtered_adata = adata[adata.obs.index.isin(barcodes_to_keep)].copy()

    # Ensure unique gene names
    if filtered_adata.var_names.duplicated().any():
        filtered_adata.var_names_make_unique()

    # Translate mouse gene names to human
    humanized_adata = translate(filtered_adata)

    # Extract ortholog mapping (index -> Human_Gene, original_gene_symbol -> Mouse_Gene)
    ortholog_map = humanized_adata.var[["original_gene_symbol"]].reset_index()
    ortholog_map.columns = ["Human_Gene", "Mouse_Gene"]
    ortholog_maps.append(ortholog_map)

    # Drop genes without valid names
    humanized_adata = humanized_adata[:, humanized_adata.var_names.notna()].copy()
    humanized_adata.var_names = humanized_adata.var_names.astype(str)

    # Keep the original gene symbol as the only .var column, named 'features'
    humanized_adata.var['features'] = humanized_adata.var['original_gene_symbol'].copy()
    humanized_adata.var = humanized_adata.var[['features']]

    if humanized_adata.n_obs == 0:
        # Samples without any matching barcode are left out of the dictionary
        print(f"Skipping {sample_name}: no barcodes matched the microglia list.")
        continue

    # Drop all columns from .obs and add metadata
    humanized_adata.obs = pd.DataFrame(index=humanized_adata.obs.index)
    humanized_adata.obs["Sample"] = sample_name
    humanized_adata.obs["Study_Designation"] = "Naive" if "Mock" in sample_name else "Infected"
    humanized_adata.obs["Dataset"] = "mouse"

    # Save raw before normalization
    humanized_adata.raw = humanized_adata.copy()

    # Normalize, log-transform
    sc.pp.normalize_total(humanized_adata, target_sum=1e4)
    sc.pp.log1p(humanized_adata)

    # Scaling is intentionally not done per sample; it is better applied
    # after the samples are merged/concatenated.

    # Store in dictionary
    filtered_adata_dict[sample_name] = humanized_adata

# Combine the per-sample ortholog tables into one de-duplicated table
if ortholog_maps:
    df_ortholog = pd.concat(ortholog_maps).drop_duplicates().reset_index(drop=True)
else:
    df_ortholog = pd.DataFrame(columns=["Human_Gene", "Mouse_Gene"])

# Save ortholog table (relative path: written to the current working directory)
df_ortholog.to_csv("mouse_human_orthologs.csv", index=False)

# Summary
print(f"✅ Processed {len(filtered_adata_dict)} samples.")
print(f"🧬 Saved {df_ortholog.shape[0]} unique mouse-human orthologs to 'mouse_human_orthologs.csv'.")



In [None]:
# Inspect the processed per-sample AnnData objects (sample name -> AnnData)
filtered_adata_dict

In [None]:
import scanorama

# Make a list of the datasets (the same objects that are stored in filtered_adata_dict)
datas = list(filtered_adata_dict.values())

# Perform Scanorama integration.
# integrate_scanpy() works in place: it adds the integrated embedding to each
# AnnData as .obsm['X_scanorama'] and returns None, so there is no result to
# bind to a variable.
scanorama.integrate_scanpy(datas)

# Fail early if the embedding did not end up on every sample
missing_embedding = [
    name
    for name, sample_adata in filtered_adata_dict.items()
    if "X_scanorama" not in sample_adata.obsm
]
if missing_embedding:
    raise RuntimeError(
        f"Scanorama embedding 'X_scanorama' missing for samples: {missing_embedding}"
    )



In [None]:
# Re-inspect the per-sample AnnData objects after the Scanorama step
filtered_adata_dict

In [None]:


# Concatenate all mouse datasets on their shared genes (join='inner').
# merge='same' keeps .var columns whose values are identical in every sample
# (e.g. the 'features' column built during ingest); with the default
# merge=None, concat drops all .var annotations.
# NOTE(review): obs names are kept as-is (index_unique is not set) - confirm
# that barcodes cannot repeat across samples.
integrated_mouse_adata = sc.concat(
    datas,
    join='inner',
    label='Sample',
    keys=list(filtered_adata_dict.keys()),
    merge='same',
)



In [None]:
# Only the last expression of a cell is displayed, so the AnnData summary is
# printed explicitly before the gene names are shown.
print(integrated_mouse_adata)
integrated_mouse_adata.var_names


In [None]:
# Persist the concatenated microglia object to disk in h5ad format
output_h5ad = "/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/Microglia_analysis/integrated_mouseMG_data.h5ad"
integrated_mouse_adata.write(output_h5ad)


In [None]:
# Record the scanpy version used for this run
print(sc.__version__)