In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import squidpy as sq
from matplotlib import rcParams

# Default size for every figure in this notebook, as a (width, height) tuple
# written to matplotlib's "figure.figsize" setting.
FIGSIZE = (3, 3)
rcParams.update({"figure.figsize": FIGSIZE})

In [None]:
# Directory that must contain exactly one `.h5ad` file and exactly one `.csv`
# gene annotation file.
path = "./"
filelist_all = os.listdir(path)

h5ad_file = [x for x in filelist_all if x.endswith(".h5ad")]
assert len(h5ad_file) == 1, (
    f"expected exactly one .h5ad file in {path!r}, found {len(h5ad_file)}: {h5ad_file}"
)
# os.listdir() returns bare file names, so they are joined with `path` before
# reading; otherwise the load only works when `path` is the working directory.
adata = sc.read_h5ad(os.path.join(path, h5ad_file[0]))

gene_annotation_file = [x for x in filelist_all if x.endswith(".csv")]
assert len(gene_annotation_file) == 1, (
    f"expected exactly one .csv file in {path!r}, "
    f"found {len(gene_annotation_file)}: {gene_annotation_file}"
)
# Later cells read the `gene` and `annotation` columns of this table.
gene_info = pd.read_csv(os.path.join(path, gene_annotation_file[0]))

## QC metrics

In [None]:
# One histogram per QC metric, drawn side by side.
# Each entry is (column of `adata.obs`, panel title).
qc_panels = [
    ("total_counts", "Total transcripts per cell"),
    ("n_genes_by_counts", "Unique transcripts per cell"),
    ("nucleus_area", "Nucleus area"),
]

fig, axs = plt.subplots(1, len(qc_panels), figsize=(10, 3))

# A single loop replaces three copy-pasted histplot calls; because the cell no
# longer ends on an expression, it also stops echoing the Axes repr.
for ax, (column, title) in zip(axs, qc_panels):
    ax.set_title(title)
    sns.histplot(adata.obs[column], kde=False, ax=ax)


## Clustering

In [None]:
# UMAP panels coloured by `total_counts`, `n_genes_by_counts` and `leiden`;
# `wspace` widens the horizontal gap between the panels.
sc.pl.umap(adata, color=["total_counts", "n_genes_by_counts", "leiden"], wspace=0.4)

In [None]:
# Spatial scatter plot of the dataset, coloured by the `leiden` key.
sq.pl.spatial_scatter(adata, library_id="spatial", shape=None, color=["leiden"], wspace=0.4)

## Marker genes

In [None]:
# Number of top-ranked genes shown per group; the marker-gene cells below
# reuse this value when building their per-group gene lists.
n_genes = 10

sc.pl.rank_genes_groups(adata, n_genes=n_genes)

In [None]:
# Stored ranking results: `names` and `scores` are indexed by group name, and
# the group names are the field names of the `names` record array.
ranked_genes = adata.uns['rank_genes_groups']
gene_names = ranked_genes['names']
scores = ranked_genes['scores']

# For every group, order the (gene, score) pairs by score, highest first,
# and keep the names of the first `n_genes` entries.
genes_dict = {
    cluster: [
        name
        for name, _score in sorted(
            zip(gene_names[cluster], scores[cluster]),
            key=lambda pair: pair[1],
            reverse=True,
        )[:n_genes]
    ]
    for cluster in gene_names.dtype.names
}

In [None]:
# Lookup table from gene name to annotation, built row by row from `gene_info`
# (if a gene appears more than once, the last row wins).
gene_annot_map = {
    gene: annotation
    for gene, annotation in zip(gene_info["gene"], gene_info["annotation"])
}

def find_keys_for_all_values(gene_annot_map, genes_dict):
    """Flatten per-group gene lists into annotated table rows.

    Parameters
    ----------
    gene_annot_map : dict
        Maps a gene name to its annotation.
    genes_dict : dict
        Maps a group name to an iterable of gene names.

    Returns
    -------
    list of dict
        One ``{'group', 'gene', 'annotation'}`` record per (group, gene) pair,
        in the iteration order of ``genes_dict``. ``annotation`` is ``None``
        for genes missing from ``gene_annot_map``.
    """
    return [
        {'group': cluster, 'gene': name, 'annotation': gene_annot_map.get(name)}
        for cluster, members in genes_dict.items()
        for name in members
    ]
# Raise the pandas row limit so the table below can render up to 500 rows.
pd.set_option('display.max_rows', 500)

# One row per (group, gene) pair with its annotation; exact duplicate rows are dropped.
table_data = find_keys_for_all_values(gene_annot_map, genes_dict)
df = pd.DataFrame(table_data).drop_duplicates()
df

In [None]:
# Number of top-ranked genes taken from each group; also used as the number of
# UMAP panels per row.
n_top_markers = 3

# Gene names starting with any of these prefixes are excluded from the plots
# (presumably blank / negative-control probes -- confirm against the panel).
control_prefixes = ("BLANK", "NegControl")

top_markers = []
for group in gene_names.dtype.names:
    # extend(), not append(): keep a flat list of gene-name strings rather
    # than a list of per-group lists.
    top_markers.extend(genes_dict[group][:n_top_markers])

# Filter with `not` (bitwise `~` on a bool yields -1/-2, which are both truthy,
# so it never filters anything). dict.fromkeys() removes duplicates while
# keeping first-seen order, so the panel order is stable between runs.
top_markers = list(
    dict.fromkeys(
        gene for gene in top_markers if not gene.startswith(control_prefixes)
    )
)
sc.pl.umap(adata, color=top_markers, cmap='plasma', ncols=n_top_markers)