# Lynn's Brain snRNAseq Experiment

### Description
Lynn took fixed brains from mice that were either infected with Candida albicans or left untreated, and had nuclei isolated for snRNAseq.

path for files: /media/jmk/drive_a/Project_Brain_snRNAseq
snRNAseq 10x configuration: /media/jmk/drive_a/Project_Brain_snRNAseq/config.csv

In [None]:
import os
import pandas as pd

# Build a table with one row per sample: the sample name and the path to its
# filtered feature-barcode matrix (.h5).
# Root directory that contains one sub-folder per sample
root_dir = "/media/drive_c/Project_Brain_snRNAseq/per_sample_outs"

# Collect {'Sample': ..., 'Path': ...} records for every sample that has a matrix file
data = []

# os.listdir() returns entries in arbitrary order; sorting keeps the sample order
# (and therefore every later per-sample loop) reproducible between runs.
for sample_folder in sorted(os.listdir(root_dir)):
    sample_path = os.path.join(root_dir, sample_folder, 'count', 'sample_filtered_feature_bc_matrix.h5')

    # Entries without the expected matrix file are skipped silently
    if os.path.isfile(sample_path):
        data.append({'Sample': sample_folder, 'Path': sample_path})

# Explicit columns keep 'Sample' and 'Path' present even when nothing was found
df = pd.DataFrame(data, columns=['Sample', 'Path'])

# Written relative to the current working directory
df.to_csv('sample_paths.csv', index=False)

# Display the DataFrame
print(df)



# Preparing AnnData for analysis

## Function for doublet detection and filtering cells

In [None]:
import scanpy as sc
import scvi
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


def pp(h5_path, sample, *, dl_num_workers=60):
    """
    Load one 10x sample, remove SOLO-predicted doublets, QC-filter and normalize it.

    The file is read twice: a first, HVG-reduced copy is only used to train
    scVI + SOLO and predict doublets; the second, full copy is the one that is
    filtered, normalized and returned.

    Parameters
    ----------
    h5_path:
        Path to the sample's filtered feature-barcode matrix (.h5), passed to
        ``sc.read_10x_h5``.
    sample:
        Sample name. Stored in ``adata.obs['Sample']`` and used in the file name
        of the doublet-score plot.
    dl_num_workers:
        Value assigned to ``scvi.settings.dl_num_workers`` before training.

    Returns
    -------
    AnnData
        Doublet-free, QC-filtered object subset to 2000 highly variable genes.
        ``X`` and ``layers['log_norm']`` hold the log-normalized values,
        ``layers['counts']`` the untransformed counts, and ``.raw`` is set from
        this same (already subset and normalized) object.
    """
    # ---------- Pass 1: doublet detection on an HVG-reduced copy ----------
    adata = sc.read_10x_h5(h5_path, genome=None, gex_only=True)
    adata.var_names_make_unique()

    sc.pp.filter_genes(adata, min_cells=10)
    # X still holds counts here, which is what flavor='seurat_v3' expects
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True, flavor='seurat_v3')

    # Assign on the global settings object; constructing a new ScviConfig
    # instance (as was done before) leaves scvi.settings untouched.
    scvi.settings.dl_num_workers = dl_num_workers
    scvi.model.SCVI.setup_anndata(adata)
    vae = scvi.model.SCVI(adata)
    vae.train()

    # Train the SOLO doublet classifier on top of the scVI model
    solo = scvi.external.SOLO.from_scvi_model(vae)
    solo.train()

    # Per-cell scores plus the hard call; 'dif' is the doublet-minus-singlet margin
    dfp = solo.predict()
    dfp['prediction'] = solo.predict(soft=False)
    dfp['dif'] = dfp.doublet - dfp.singlet
    sns.displot(dfp[dfp.prediction == 'doublet'], x='dif')
    # Save the plot as a PDF file (saves the current matplotlib figure)
    plt.savefig("/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/Injest/{}_doubletPlot.pdf".format(sample))

    # ---------- Pass 2: full matrix, filtered and normalized ----------
    adata = sc.read_10x_h5(h5_path, genome=None, gex_only=True)
    adata.var_names_make_unique()

    # Remove the predicted doublets.
    # NOTE(review): this matches on barcode strings and assumes the index of
    # solo.predict() equals the raw obs names - confirm that the number of
    # removed cells equals the number of predicted doublets.
    doublet_barcodes = dfp.index[dfp['prediction'] == 'doublet']
    adata = adata[~adata.obs.index.isin(doublet_barcodes)].copy()  # copy: avoid mutating a view

    # Add sample_id to adata.obs
    adata.obs['Sample'] = sample

    # Keep the untransformed counts in their own layer. The explicit copy keeps
    # the layer independent of X, which is normalized in place further down.
    adata.layers["counts"] = adata.X.copy()

    # Filter cells on detected genes: at least 200 genes, and below the
    # 98th percentile of genes per cell
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
    upper_lim = np.quantile(adata.obs.n_genes_by_counts.values, .98)
    adata = adata[adata.obs.n_genes_by_counts < upper_lim].copy()
    sc.pp.filter_genes(adata, min_cells=3)

    # Mitochondrial QC: mouse mitochondrial gene symbols carry the 'mt-' prefix.
    # Using the full prefix (with the dash) avoids flagging unrelated symbols
    # that merely start with 'mt'.
    adata.var['mt'] = adata.var_names.str.startswith('mt-')
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    adata = adata[adata.obs.pct_counts_mt < 5].copy()

    # Normalize to 1e4 counts per cell, log-transform, and keep the result in a layer
    sc.pp.normalize_total(adata, inplace=True, target_sum=1e4)
    sc.pp.log1p(adata)
    adata.layers['log_norm'] = adata.X.copy()

    # flavor='seurat_v3' expects count data, so rank genes on the 'counts' layer
    # rather than on the log-normalized X.
    sc.pp.highly_variable_genes(
        adata,
        n_top_genes=2000,
        subset=True,
        flavor='seurat_v3',
        layer='counts',
    )

    # .raw is a snapshot of the final object (HVG subset, log-normalized X)
    adata.raw = adata

    return adata

In [None]:
# Create an empty list to store the preprocessed AnnData objects (one per sample).
# The processing loop appends to this list, so re-run this cell before re-running
# the loop; otherwise every sample ends up in the list twice.
datas = []

## Import, Filter, Concat, and Var/groups addition

In [None]:
# Preprocess every sample listed in `df` and collect the results in `datas`.
# `time` is imported here because no earlier cell imports it.
import time

for index, row in df.iterrows():
    # Get the file path and sample ID from the current row
    h5_path = row['Path']
    sample_id = row['Sample']
    print(f"Processing row {index}: h5_path='{h5_path}', sample_id='{sample_id}'")

    # perf_counter() is a monotonic clock, so the measured duration cannot be
    # distorted by system clock adjustments during a long run
    start_time = time.perf_counter()

    # Run the preprocessing function on the current sample
    adata = pp(h5_path, sample_id)

    elapsed_time = time.perf_counter() - start_time
    print(f"Processing time for row {index}: {elapsed_time:.2f} seconds")

    # Append the preprocessed AnnData object to the list
    datas.append(adata)


In [None]:
datas

In [None]:
# Concatenate the list of AnnData objects into a single AnnData object.
# Cells are stacked in the order of `datas`; the Scanorama cell further down
# relies on that same order when it stacks the per-sample embeddings.
# join='outer' keeps the union of genes. Each sample was subset to its own
# highly variable genes in pp(), so the gene sets differ between samples.
# NOTE(review): no index_unique is given - presumably cell barcodes are already
# unique across samples; verify with adata.obs_names.is_unique.
adata = sc.concat(datas, join ='outer', uns_merge = 'same')
adata

In [None]:

'''

### SKIP THIS AFTER INITIAL ANALYSIS ###

# Adding back the var that were lost during concatenation
# grab all var DataFrames from our dictionary
all_var = [x.var for x in datas]
# concatenate them
all_var = pd.concat(all_var, join="outer")
# remove duplicates
all_var = all_var[~all_var.index.duplicated()]
# add var to adata
adata.var = all_var.loc[adata.var_names]

'''
# Note: 'gene_ids', 'feature_types', 'genome', 'n_cells_by_counts', 'mean_counts', /
#       'pct_dropout_by_counts', 'total_counts', 'n_cells', 'mt', 'highly_variable', /
#       'highly_variable_rank', 'means', 'variances', 'variances_norm'

# Columns of .var that should be carried over to the concatenated object
keep_cols = ["gene_ids"]

# Per sample: the .var table restricted to the wanted columns that it actually has
per_sample_var = []
for sample_adata in datas:
    wanted = [col for col in keep_cols if col in sample_adata.var.columns]
    per_sample_var.append(sample_adata.var.loc[:, wanted])

# Stack all tables and keep the first occurrence of every gene name
all_var = pd.concat(per_sample_var, join="outer")
first_occurrence = ~all_var.index.duplicated()
all_var = all_var[first_occurrence]

# Align to the genes (and gene order) of the concatenated object
adata.var = all_var.loc[adata.var_names]



In [None]:
# Add a categorical 'treatment' column to adata.obs for downstream analysis

# Sample groups
naive_samples = ['Mock-1', 'Mock-2', 'Mock-3']
infected_samples = ['OG-1', 'OG-2', 'OG-3']

# Lookup table sample name -> treatment. Naive entries are written last so they
# win if a name were ever listed in both groups.
treatment_by_sample = {name: 'Infected' for name in infected_samples}
treatment_by_sample.update({name: 'Naive' for name in naive_samples})

# Samples that are in neither group are labelled 'Unknown'
adata.obs['treatment'] = (
    adata.obs['Sample']
    .apply(lambda sample_name: treatment_by_sample.get(sample_name, 'Unknown'))
    .astype('category')
)



In [None]:
adata

# Integrating Samples with Scanorama

In [None]:
import scanorama

# Called for its side effect on the objects in `datas`: the next cell reads the
# per-sample embedding from each object's .obsm['X_scanorama'].
# NOTE(review): each sample in `datas` was subset to its own highly variable
# genes in pp() - confirm that enough genes are shared between samples for the
# integration to be meaningful.
scanorama.integrate_scanpy(datas)

In [None]:
# Collect the per-sample Scanorama embeddings (same order as `datas`) ...
scanorama_int = [sample_adata.obsm['X_scanorama'] for sample_adata in datas]

# ... and stack them row-wise into one matrix stored on the combined object
adata.obsm["Scanorama"] = np.concatenate(scanorama_int, axis=0)

In [None]:
adata

# Save Load Point:       Brain_snRNAseq_adata_injest.h5ad

In [None]:
# Checkpoint: store the raw counts (layer 'counts') as adata.X and write to disk
injest_checkpoint = '/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/adatas/Brain_snRNAseq_adata_injest.h5ad'
adata.X = adata.layers['counts']
adata.write_h5ad(injest_checkpoint)

In [None]:
import anndata as ad

# Reading in the adata from h5ad (replaces the in-memory `adata` with the saved
# checkpoint, so the notebook can be resumed from here)
adata = ad.read_h5ad('/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/adatas/Brain_snRNAseq_adata_injest.h5ad')

In [None]:
adata

# MapMyCells

Publication: A high-resolution transcriptomic and spatial atlas of cell types in the whole mouse brain (PMID: 38092916)
Source: https://github.com/AllenInstitute/cell_type_mapper


In [None]:
# Keep the gene symbols (the current index) in a 'gene_symbol' column, then
# make the 'gene_ids' column the new index of adata.var
adata.var = (
    adata.var
    .assign(gene_symbol=adata.var.index)
    .set_index('gene_ids')
)


In [None]:
adata.var.index.values


In [None]:
# Rewriting the h5ad file so gene_ids are the var index, which is what the
# cell type mapper commands below read via --query_path
adata.write_h5ad('/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/adatas/Brain_snRNAseq_index.gene_ids.h5ad')

In [None]:
# Reload the gene_id-indexed object (same file that is passed to the mapper)
adata = ad.read_h5ad('/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/adatas/Brain_snRNAseq_index.gene_ids.h5ad')

In [None]:
adata


### Preparing to run cell type mapper...

Note:
Cells are mapped to the original whole mouse brain atlas taxonomy using MapMyCells

(https://portal.brain-map.org/atlases-and-data/bkp/mapmycells)




In [None]:
# Run this through terminal

!python -m cell_type_mapper.cli.from_specified_markers \
--query_path /media/drive_c/Project_Brain_snRNAseq/Analysis/Results/adatas/Brain_snRNAseq_index.gene_ids.h5ad \
--extended_result_path /media/drive_c/Project_Brain_snRNAseq/Analysis/Results/Mapping/mapping_output.json \
--csv_result_path /media/drive_c/Project_Brain_snRNAseq/Analysis/Results/Mapping/mapping_output.csv \
--drop_level CCN20230722_SUPT \
--cloud_safe False \
--query_markers.serialized_lookup /home/jmk/cell_type_mapper/Taxonomies/WMB/mouse_markers_230821.json \
--precomputed_stats.path /home/jmk/cell_type_mapper/Taxonomies/WMB/precomputed_stats_ABC_revision_230821.h5 \
--type_assignment.normalization raw \
--type_assignment.n_processors 60

In [None]:
#Note: The following command is for mapping the cells without using the GPU

!CUDA_VISIBLE_DEVICES="" python -m cell_type_mapper.cli.from_specified_markers \
--query_path /media/drive_c/Project_Brain_snRNAseq/Analysis/Results/adatas/Brain_snRNAseq_index.gene_ids.h5ad \
--extended_result_path /media/drive_c/Project_Brain_snRNAseq/Analysis/Results/Mapping/mapping_output.json \
--csv_result_path /media/drive_c/Project_Brain_snRNAseq/Analysis/Results/Mapping/mapping_output.csv \
--drop_level CCN20230722_SUPT \
--cloud_safe False \
--query_markers.serialized_lookup /home/jmk/cell_type_mapper/Taxonomies/WMB/mouse_markers_230821.json \
--precomputed_stats.path /home/jmk/cell_type_mapper/Taxonomies/WMB/precomputed_stats_ABC_revision_230821.h5 \
--type_assignment.normalization raw \
--type_assignment.n_processors 60


### CSV file

In [None]:
# Load the per-cell mapping table written by the cell type mapper
# (--csv_result_path). Lines starting with '#' are skipped (comment='#').
# No index_col is given, so csv_results gets a default integer index and the
# cell identifier stays a regular column (presumably 'cell_id' - check the
# column list in the next cell).
csv_results = pd.read_csv("/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/Mapping/mapping_output.csv", comment='#')
csv_results

In [None]:
list(csv_results.columns)

## Add Cell Type Mapping to AnnData

In [None]:
# Merge the mapping results into the AnnData object's obs.
# adata.obs is indexed by cell barcode, while csv_results is read with a default
# integer index. An index-on-index merge would therefore match nothing, so the
# mapping table is re-indexed by its 'cell_id' column first.
if 'cell_id' in csv_results.columns:
    mapping_by_cell = csv_results.set_index('cell_id')
else:
    # presumably already indexed by cell id (e.g. read with index_col) - verify
    mapping_by_cell = csv_results.copy()
mapping_by_cell.index = mapping_by_cell.index.astype(str)

# Drop mapping columns left over from a previous run of this cell, so that
# re-running it does not create '_x' / '_y' duplicates
stale_cols = adata.obs.columns.intersection(mapping_by_cell.columns)
obs_without_mapping = adata.obs.drop(columns=stale_cols)

adata.obs = obs_without_mapping.merge(
    mapping_by_cell,
    how='left',
    left_index=True,
    right_index=True,
)

# Verify the merge: report how many cells actually received a mapping row
n_mapped = int(adata.obs.index.isin(mapping_by_cell.index).sum())
print(f"{n_mapped} of {adata.n_obs} cells matched a row of the mapping table")
print(adata.obs.head())

In [None]:
adata.obs

## Json file:  taxonomy_tree

In [None]:
import json

# Extended mapper output (written via --extended_result_path)
json_path = "/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/Mapping/mapping_output.json"

print(f"=======READING {json_path}=======")
with open(json_path, mode='r') as handle:  # text mode
    json_results = json.loads(handle.read())

# Top-level keys of the result file (includes 'taxonomy_tree')
print(json_results.keys())

# The taxonomy tree is used below to translate node ids into readable names
taxonomy_tree = json_results['taxonomy_tree']
print(taxonomy_tree.keys())


In [None]:
# Index the per-cell mapping records by their 'cell_id' (the cell barcode);
# if an id occurs more than once, the last record wins
mapping_result = {}
for record in json_results['results']:
    mapping_result[record['cell_id']] = record

# Celltype UMAP

In [None]:
# For the function
import numpy as np
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# The function to plot UMAP embeddings
def plot_embedding(
        barcode_to_label,
        label_order,
        fontsize=15,
        raw_data=None,
        save_path=None,
        filtered_adata=None,  # Path prefix (no extension) for the saved .h5ad file
        palette_name='tab10'):
    """
    Plot a labelled UMAP of the selected cells and save the filtered AnnData.

    The cells listed in ``barcode_to_label`` are copied out of ``raw_data``,
    a neighbor graph and UMAP are computed from ``.obsm['Scanorama']``, the
    embedding is rotated by 90 degrees, and the plot is colored by label.
    The filtered object (with the new ``obs['labels']`` column and the rotated
    ``obsm['X_umap']``) is written to ``f"{filtered_adata}.h5ad"``.

    Parameters
    ----------
    barcode_to_label:
        A dict mapping cell barcodes to labels (only includes
        cells that we want included in the embedding).
    label_order:
        Order in which labels appear in the legend and receive colors.
        Labels that occur in the data but are missing from this list are
        appended in sorted order.
    fontsize:
        Size of font to be used in the legend.
    raw_data:
        AnnData object to take the cells from. It must provide
        ``.obsm['Scanorama']``, which is used as the representation for the
        neighbor graph.
    save_path:
        Where to save the plot. The suffix is replaced by ``.svg``.
        If None, the plot is not saved.
    filtered_adata:
        Path prefix for the saved filtered AnnData object; ``.h5ad`` is appended.
    palette_name:
        Name of a seaborn palette, or a list of colors.

    Returns
    -------
    str
        The path to the saved filtered AnnData object.

    Raises
    ------
    ValueError
        If ``raw_data`` or ``filtered_adata`` is None, if ``palette_name`` is
        neither a string nor a list, or if no barcode of ``raw_data`` is in
        ``barcode_to_label``.
    """
    # Validate everything up front so bad arguments fail before the slow UMAP
    if raw_data is None:
        raise ValueError("raw_data must be provided as an AnnData object.")

    if filtered_adata is None:
        raise ValueError("You must provide a name for the filtered AnnData object.")

    if not isinstance(palette_name, (str, list)):
        raise ValueError("palette_name must be a string or a list of colors.")

    # Mask of the cells whose barcode has a label
    barcodes = raw_data.obs.index.values
    filter_mask = np.array([b in barcode_to_label for b in barcodes], dtype=bool)
    if not filter_mask.any():
        raise ValueError(
            "None of the barcodes in raw_data.obs.index are present in barcode_to_label."
        )

    # Filter the AnnData object to include only the cells of interest
    filtered_adata_obj = raw_data[filter_mask, :].copy()

    # Full label order: the requested order (duplicates removed) followed by any
    # label that occurs in the data but was not listed
    labels = [barcode_to_label[barcode] for barcode in filtered_adata_obj.obs.index]
    present = set(labels)
    full_order = list(dict.fromkeys(label_order))
    full_order.extend(sorted(present.difference(full_order), key=str))
    shown_order = [label for label in full_order if label in present]

    # Store the labels as a categorical whose category order follows label_order;
    # the plot assigns palette colors and legend entries in category order.
    filtered_adata_obj.obs['labels'] = labels
    filtered_adata_obj.obs['labels'] = (
        filtered_adata_obj.obs['labels']
        .astype('category')
        .cat.reorder_categories(shown_order)
    )

    # Neighbor graph on the Scanorama representation, then UMAP.
    # NOTE(review): n_pcs=50 together with use_rep presumably restricts the
    # representation to its first 50 columns - confirm this is intended.
    sc.pp.neighbors(filtered_adata_obj, n_pcs=50, n_neighbors=100, use_rep="Scanorama")
    sc.tl.umap(filtered_adata_obj, min_dist=1, spread=.8)

    # Rotate the UMAP: (x, y) -> (-y, x), i.e. 90 degrees counter-clockwise
    umap_coords = filtered_adata_obj.obsm['X_umap']
    rotated_umap_coords = np.zeros_like(umap_coords)
    rotated_umap_coords[:, 0] = -umap_coords[:, 1]  # New x = -old y
    rotated_umap_coords[:, 1] = umap_coords[:, 0]   # New y = old x
    filtered_adata_obj.obsm['X_umap'] = rotated_umap_coords

    # Build the palette. When there is a color for every label of full_order,
    # each label gets the color of its position in that order, so a label keeps
    # its color even if other labels are absent from the data.
    if isinstance(palette_name, str):
        base_colors = list(sns.color_palette(palette_name, n_colors=len(full_order)))
    else:
        base_colors = list(palette_name)
    if len(base_colors) >= len(full_order):
        color_by_label = dict(zip(full_order, base_colors))
        palette = [color_by_label[label] for label in shown_order]
    else:
        # Too few colors for a positional lookup: hand the list over unchanged
        palette = base_colors

    # Plot the UMAP
    fig, ax = plt.subplots(figsize=(10, 8))
    sc.pl.umap(
        filtered_adata_obj,
        color='labels',
        legend_loc='right margin',
        title="Cell Class",
        legend_fontsize=fontsize,
        palette=palette,
        ax=ax,
        show=False,
    )

    # Save the plot if save_path is provided
    if save_path:
        svg_path = Path(save_path).with_suffix(".svg")
        fig.savefig(svg_path, dpi=300, bbox_inches='tight', format='svg')

    # Make obs safe to write: string index, and object columns (which can hold
    # a mix of strings and missing values) cast to str. Numeric and categorical
    # columns are left as they are so they keep their type in the saved file.
    filtered_adata_obj.obs.index = filtered_adata_obj.obs.index.map(str)
    for col in filtered_adata_obj.obs.columns:
        if filtered_adata_obj.obs[col].dtype == object:
            filtered_adata_obj.obs[col] = filtered_adata_obj.obs[col].astype(str)

    # Save the filtered AnnData object as an .h5ad file
    adata_save_path = f"{filtered_adata}.h5ad"
    filtered_adata_obj.write(adata_save_path)

    # Display the plot
    plt.show()

    return adata_save_path


In [None]:
# General categorization of cell types for plotting

level = 'CCN20230722_CLAS'   # taxonomy level whose assignments are used
corr_cut = 0.4               # cells with avg_correlation below this are left out

# A class name is collapsed to the first keyword (in this order) that it
# contains; names containing none of them keep their full taxonomy name.
class_keywords = ('Glut', 'GABA', 'Dopa', 'Sero', 'Astro', 'OPC', 'OEC', 'Vascular', 'Immune')

# Taxonomy node id -> plotting label
node_to_label = dict()
for node in taxonomy_tree[level]:
    name = taxonomy_tree['name_mapper'][level][node]['name']
    node_to_label[node] = next((kw for kw in class_keywords if kw in name), name)

# Cell barcode -> plotting label, for cells that pass the correlation cut
barcode_to_label = dict()
for barcode, cell in mapping_result.items():
    if cell[level]['avg_correlation'] < corr_cut:
        continue
    barcode_to_label[barcode] = node_to_label[cell[level]['assignment']]

# Keywords first, then any other label that is in use. The extra labels are
# sorted: iterating a set directly yields an arbitrary order, which would make
# the label (and color) order differ from run to run.
label_order = list(class_keywords)
label_order.extend(sorted(set(barcode_to_label.values()).difference(label_order)))


In [None]:
# Plotting cell classes

%%time
plot_embedding(barcode_to_label=barcode_to_label, save_path='/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/UMAP/UMAP_Cell_Class', filtered_adata="/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/adatas/adata_filtered", label_order=label_order, raw_data=adata, palette_name='tab10')

## Celltype UMAP Count Plot

In [None]:
# Load the filtered object written by plot_embedding (a plain string is
# enough here - the path has no placeholders)
adata = sc.read_h5ad("/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/adatas/adata_filtered.h5ad")

In [None]:
adata.obs


In [None]:
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

# ========= i/o =========
adata_path = "/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/adatas/adata_filtered.h5ad"
save_path  = "/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/UMAP/UMAP_Cell_Class_counts.svg"
# =================================


def sample_group(sample_name):
    """Return 'Mock' for sample names containing 'Mock', otherwise 'OG'."""
    if 'Mock' in sample_name:
        return 'Mock'
    return 'OG'


# Load AnnData
adata = sc.read_h5ad(adata_path)

# Per-cell table: label, sample and experimental group
df = adata.obs[['labels', 'Sample']].copy()
df['group'] = df['Sample'].apply(sample_group)

# Number of cells per (sample, label)
cells_per_group = df.groupby(['Sample', 'labels']).size()
count_df = cells_per_group.reset_index(name='count')

# Wide table: one row per sample, one column per label
label_count_matrix = count_df.pivot_table(
    index='Sample',
    columns='labels',
    values='count',
    fill_value=0,
)
label_count_matrix = label_count_matrix.reset_index()
label_count_matrix['group'] = label_count_matrix['Sample'].apply(sample_group)

# Back to long format for seaborn: one row per (sample, label)
long_df = label_count_matrix.melt(
    id_vars=['Sample', 'group'],
    var_name='label',
    value_name='count',
)

# Welch t-test (equal_var=False) on the per-sample counts, one test per label
stats_results = []
for label in long_df['label'].unique():
    is_label = long_df['label'] == label
    mock_vals = long_df.loc[is_label & (long_df['group'] == 'Mock'), 'count']
    og_vals = long_df.loc[is_label & (long_df['group'] == 'OG'), 'count']
    _, p_val = ttest_ind(mock_vals, og_vals, equal_var=False)
    stats_results.append({'label': label, 'p_value': p_val})

stats_df = pd.DataFrame(stats_results)
print(stats_df.sort_values('p_value'))

# Plot: bars (mean with standard error) with the individual samples on top
shared_args = dict(data=long_df, x='label', y='count', hue='group')
plt.figure(figsize=(10, 6))
sns.barplot(**shared_args, errorbar='se', capsize=0.1, errwidth=1.5, palette='Set2')
sns.stripplot(**shared_args, dodge=True, color='black', alpha=0.7)

# Both layers add legend entries; keep only the first two
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[:2], labels[:2], title="Group")

plt.ylabel("Cell Count")
plt.xlabel("Cell Type")
plt.title("Cell Counts per Type (Mock vs OG) ± SEM")

plt.tight_layout()

# Save as SVG
plt.savefig(save_path, format='svg')
plt.show()


## Subsetting Immune Cells for Subtype

In [None]:
# Cells whose label contains "Immune"
is_immune = adata.obs["labels"].str.contains("Immune")
immune_cells = adata[is_immune]

# Distinct cluster_name values found among those cells
immune_clusters = immune_cells.obs["cluster_name"].unique().tolist()

print(immune_clusters)


In [None]:
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

# ========= i/o =========
adata_path = "/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/adatas/adata_filtered.h5ad"
save_path  = "/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/UMAP/UMAP_Immune_Cluster_counts.svg"
# =================================


def sample_group(sample_name):
    """Return 'Mock' for sample names containing 'Mock', otherwise 'OG'."""
    if 'Mock' in sample_name:
        return 'Mock'
    return 'OG'


# Load AnnData
adata = sc.read_h5ad(adata_path)

# ===== Subset to Immune cells =====
is_immune = adata.obs['labels'] == 'Immune'
immune_cells = adata[is_immune].copy()

# Per-cell table: cluster name, sample and experimental group
df = immune_cells.obs[['cluster_name', 'Sample']].copy()
df['group'] = df['Sample'].apply(sample_group)

# Number of cells per (sample, cluster_name)
cells_per_group = df.groupby(['Sample', 'cluster_name']).size()
count_df = cells_per_group.reset_index(name='count')

# Wide table: one row per sample, one column per cluster_name
cluster_count_matrix = count_df.pivot_table(
    index='Sample',
    columns='cluster_name',
    values='count',
    fill_value=0,
)
cluster_count_matrix = cluster_count_matrix.reset_index()
cluster_count_matrix['group'] = cluster_count_matrix['Sample'].apply(sample_group)

# Back to long format for seaborn: one row per (sample, cluster_name)
long_df = cluster_count_matrix.melt(
    id_vars=['Sample', 'group'],
    var_name='cluster_name',
    value_name='count',
)

# Welch t-test (equal_var=False) on the per-sample counts, one test per cluster_name
stats_results = []
for cname in long_df['cluster_name'].unique():
    is_cluster = long_df['cluster_name'] == cname
    mock_vals = long_df.loc[is_cluster & (long_df['group'] == 'Mock'), 'count']
    og_vals = long_df.loc[is_cluster & (long_df['group'] == 'OG'), 'count']
    _, p_val = ttest_ind(mock_vals, og_vals, equal_var=False)
    stats_results.append({'cluster_name': cname, 'p_value': p_val})

stats_df = pd.DataFrame(stats_results)
print(stats_df.sort_values('p_value'))

# Plot: bars (mean with standard error) with the individual samples on top
shared_args = dict(data=long_df, x='cluster_name', y='count', hue='group')
plt.figure(figsize=(12, 6))
sns.barplot(**shared_args, errorbar='se', capsize=0.1, errwidth=1.5, palette='Set2')
sns.stripplot(**shared_args, dodge=True, color='black', alpha=0.7)

# Both layers add legend entries; keep only the first two
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[:2], labels[:2], title="Group")

plt.ylabel("Cell Count")
plt.xlabel("Immune Cluster Name")
plt.title("Immune Cluster Counts per Sample (Mock vs OG) ± SEM")

plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Save as SVG
plt.savefig(save_path, format='svg')
plt.show()


# Extracting Cell Subsets

## Microglia barcodes


Path to: /media/drive_c/Project_Brain_snRNAseq/Analysis/Results/Microglia_analysis/

Note: Exported for Microglia mouse to human comparison

In [None]:
import pandas as pd

# 1. Select the cells whose cluster_name mentions "microglia" (case-insensitive;
#    cells without a cluster_name are excluded via na=False)
is_microglia = adata.obs['cluster_name'].str.contains('Microglia', case=False, na=False)
microglia_cells = adata[is_microglia].copy()

# 2. One-column table of their barcodes (the obs index)
barcode_df = pd.DataFrame({'cell_barcode': microglia_cells.obs.index.to_numpy()})

# 3. Write the table without the row index
microglia_csv = "/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/Microglia_analysis/microglia_cell_barcodes.csv"
barcode_df.to_csv(microglia_csv, index=False)



## Astrocyte Barcodes

Note: Exported for Path Analysis

In [None]:
import pandas as pd

# 1. Select the cells whose cluster_name mentions "astro" (case-insensitive;
#    cells without a cluster_name are excluded via na=False)
is_astro = adata.obs['cluster_name'].str.contains('astro', case=False, na=False)
astro_cells = adata[is_astro].copy()

# 2. One-column table of their barcodes (the obs index)
barcode_df = pd.DataFrame({'cell_barcode': astro_cells.obs.index.to_numpy()})

# 3. Write the table without the row index
astrocyte_csv = "/media/drive_c/Project_Brain_snRNAseq/Analysis/Results/Astrocyte_analysis/astrocyte_cell_barcodes.csv"
barcode_df.to_csv(astrocyte_csv, index=False)
