In [None]:
!date

# Comparison between control and experiment datasets
#### Unfiltered data obtained from using kb with multimapping function. Ref genome includes WRE.

___

In [None]:
%config InlineBackend.figure_format = 'retina'
%load_ext blackcellmagic

In [None]:
import sys
import anndata
# import scvi

import scanpy as sc
import numpy as np
from scipy import stats

from upsetplot import UpSet

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import cm

# Custom violinplot code
from plotting_funcs import violinplot

import gget

import pandas as pd

sc.set_figure_params(figsize=(6, 6), frameon=False)
sc.settings.n_jobs=2

In [None]:
# Seed NumPy's legacy global random number generator with a fixed value
np.random.seed(926)

In [None]:
def nd(arr):
    """
    Flatten any array-like (e.g. a numpy matrix) into a 1-D ndarray.
    """
    as_array = np.asarray(arr)
    return as_array.ravel()

___

# Load AnnData object

In [None]:
# Read the AnnData object from the .h5ad file (relative path) and display its summary
adata = anndata.read_h5ad("../../finchseq_data/all_celltype.h5ad")
adata

Define masks to separate control and experiment datasets:

In [None]:
# Boolean masks over the cells: True where the batch label belongs to the given condition
control_mask = adata.obs["batch"].isin(["control1", "control2"])
experiment_mask = adata.obs["batch"].isin(["experiment1", "experiment2"])

Add new obs column to separate between control and experiment in general, without separating between batches:

In [None]:
# Condition label ("control" / "experiment") that ignores the replicate number.
# Cells matched by neither mask keep the empty string.
adata.obs["batch_g"] = ""

adata.obs.loc[control_mask, "batch_g"] = "control"
adata.obs.loc[experiment_mask, "batch_g"] = "experiment"

# Create columns containing general celltype assignment - ignoring cluster separation.
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the pattern as a
# literal string by default, which would leave the cluster numbers in place. The raw
# string avoids the invalid "\d" escape-sequence warning.
# NOTE: only the digits are removed, so numbered labels keep their trailing space
# (e.g. "astrocytes 1" -> "astrocytes ").
adata.obs['celltype_g'] = adata.obs['celltype'].str.replace(r'\d+', '', regex=True)

adata.obs.head()

Split experiment and control data into separate AnnData objects for violin plots:

In [None]:
# Subset the cells by the condition label stored in obs["batch_g"]
adata_exp = adata[adata.obs.batch_g=="experiment"]
adata_ctrl = adata[adata.obs.batch_g=="control"]

Define standard set of celltypes to plot (excluding clusters with < 100 cells):

In [None]:
# Cluster labels passed as the `celltypes` argument of the violin plots below
celltypes_standard = [
    "GABAergic neurons 1",
    "GABAergic neurons 2",
    "astrocytes 1",
    "astrocytes 2",
    "glutamatergic neurons 1",
    "glutamatergic neurons 2",
    "glutamatergic neurons 3",
    "glutamatergic neurons 4",
    "microglia 1",
    "microglia 2",
    "migrating neuroblasts",
    "mural / vascular endothelial cells 1",
    "mural cells 2",
    "oligodendrocyte precursor cells",
    "oligodendrocytes 1",
    "oligodendrocytes 2",
    "radial glia 1",
    "radial glia 2",
    "red blood cells",
]

___

# Load marker genes

In [None]:
# Read the "matrix_v2" sheet of the marker gene spreadsheet into a DataFrame
marker_gene_mat = pd.read_excel('marker_genes.xlsx', sheet_name="matrix_v2")

Find gene ID for each gene:

In [None]:
# Placeholder column (0..n-1); its values are replaced by the gene lookup in the next cell
marker_gene_mat["gene_name_id"] = np.arange(len(marker_gene_mat))

In [None]:
# Look up the adata.var index entry of every marker gene.
# The column is built as a list and assigned once instead of using chained assignment
# (`df[col][i] = ...`), which raises SettingWithCopyWarning and silently does nothing
# under pandas copy-on-write. This also avoids writing strings into the integer
# placeholder column.
var_names = adata.var.index
gene_name_ids = []

for gene in marker_gene_mat["Gene"].values:
    if gene.startswith("ENS"):
        # Ensembl IDs may appear anywhere in the var name; match them literally so
        # that the "." of a versioned ID is not interpreted as a regex wildcard
        matches = var_names[var_names.str.contains(gene, regex=False)]
    else:
        # Gene symbols are matched as a prefix of the var name
        # NOTE(review): a symbol that is a prefix of another symbol can pick up the
        # wrong entry - confirm the first match is the intended gene
        matches = var_names[var_names.str.startswith(gene)]

    # Use the first match, or NaN if the gene is not present in the dataset
    gene_name_ids.append(matches[0] if len(matches) > 0 else np.nan)

marker_gene_mat["gene_name_id"] = gene_name_ids

Notes on the marker genes:  
FNDC9 is a synonym for FNTM2 (HVC-X marker).  
PDGFRA (ENSTGUG00000007756.2) not annotated.  
or107-1 is a synonym for ZF1A (neither, not even ensembl ID, can be found in ref).  
Ki67 (ENSTGUG00000021193) not annotated. 

SOX4 and PDGFRA are not in the reference genome GTF. This is surprising because Colquitt et al. report using them as markers, and they use the same reference genome.

ZF1A, EJZER1, and AR46s also not in ref gtf (not used in Colquitt paper).

In [None]:
# Drop rows with genes not found in dataset
marker_gene_mat = marker_gene_mat.dropna(axis=0)

# Drop columns of celltypes containing only 0s
marker_gene_mat = marker_gene_mat.loc[:, (marker_gene_mat != 0).any(axis=0)]
 
#Set gene_name_id as index and drop "gene" coplumn
marker_gene_mat = marker_gene_mat.set_index("gene_name_id")
marker_gene_mat = marker_gene_mat.drop("Gene", axis=1)

marker_gene_mat.head()

___

# Plot WRE expression

In [None]:
# Violin plot of "WRE_WRE" in experiment vs. control for the standard clusters
genes = ["WRE_WRE"]
labels = genes
celltypes = celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="WRE", fold_change_min=1.5, figsize=(15, 1),
)

___

# Where are neurogenesis markers, immediate early genes, and proliferation markers expressed?

In [None]:
# Index entries (gene_name_id) of the marker genes with a non-zero entry in the respective column
neurogenesis = marker_gene_mat.index[marker_gene_mat['mammalian neurogenesis'].ne(0).to_numpy()].values
activity = marker_gene_mat.index[marker_gene_mat['immediate early genes'].ne(0).to_numpy()].values
# proliferation = marker_gene_mat.loc[marker_gene_mat['proliferation'] != 0].index.values

In [None]:
# Violin plots of the neurogenesis markers; the var names double as plot labels
genes = labels = neurogenesis
celltypes = celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="neurogenesis", fold_change_min=1.5, figsize=None,
)

In [None]:
# Violin plots of the immediate early genes; the var names double as plot labels
genes = labels = activity
celltypes = celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="IEG", fold_change_min=1.5, figsize=None,
)

___

## Does each celltype cluster contain an equal number of cells from each batch/dataset?

Let's look at the actual number of cells from each batch for each celltype:

In [None]:
# Normalize each celltype count to the total number of cells in that batch by dividing.
# Result: one row per celltype cluster, one column per batch.
# NOTE(review): the batch totals count non-null "species" entries - assumes every cell has one.
cells_per_batch = adata.obs.groupby("batch")["species"].count()
df_normalized = (
    (adata.obs.groupby("celltype")["batch"].value_counts() / cells_per_batch)
    .unstack()
    .fillna(0)
)
df_normalized.columns = df_normalized.columns.astype(str)
df_normalized["total normalized count"] = df_normalized.sum(axis=1).values

# Share of each batch in the normalized total of a cluster
for batch_name in ["control1", "control2", "experiment1", "experiment2"]:
    df_normalized[f"{batch_name}_fraction"] = (
        df_normalized[batch_name] / df_normalized["total normalized count"]
    ).values

# Raw number of cells per cluster, aligned on the celltype index instead of by position,
# so that a differing group order or length fails loudly (NaN -> int cast error)
# instead of silently attaching counts to the wrong cluster
cells_per_celltype = adata.obs.groupby("celltype").size()
df_normalized["total cellcount"] = cells_per_celltype.reindex(df_normalized.index).astype(int)

df_normalized

In [None]:
fig, ax = plt.subplots(figsize=(20, 5))
width = 0.75

clusters = df_normalized.index.values
cellcounts = df_normalized["total cellcount"].values

# Stacked bars: one segment per batch, each stacked on top of the previous ones
# (fraction column, bar color, legend label)
batch_styles = [
    ("control1_fraction", "navy", "Control 1"),
    ("control2_fraction", "mediumblue", "Control 2"),
    ("experiment1_fraction", "orange", "TetX 1"),
    ("experiment2_fraction", "darkorange", "TetX 2"),
]
bottom = np.zeros(len(clusters))
for fraction_column, bar_color, legend_label in batch_styles:
    heights = df_normalized[fraction_column].values
    ax.bar(clusters, heights, width, bottom=bottom, color=bar_color, label=legend_label)
    bottom = bottom + heights

# Add the total number of cells above each bar
for index, value in enumerate(cellcounts):
    ax.text(x = index, y = 1.01, s = value, size = 10, ha='center')

# Pin the tick positions before replacing the labels; set_xticklabels alone relies on
# whatever ticks the locator currently produces
ax.set_xticks(np.arange(len(clusters)))
ax.set_xticklabels(clusters, rotation=45, ha="right")

ax.legend(bbox_to_anchor=(1.001, 1.025), loc="upper left")

ax.set(
    **{
        "title": "Cell count distribution (normalized to total number of cells in batch)",
        "ylabel": "Fraction of cells",
        "xlabel": "Celltypes"
    }
)

# Reference line at a fraction of 0.5
ax.axhline(y=0.5, color="r", linestyle="-")

ax.margins(x=0.01, y=0.06)
ax.grid(False)

plt.savefig("figures/4_cellcounts_perc_joint_clustered.png", dpi=300, bbox_inches="tight")
plt.savefig("figures/4_cellcounts_perc_joint_clustered.pdf", dpi=300, bbox_inches="tight")

# plt.show() instead of fig.show(): the latter only warns on non-GUI (inline) backends
plt.show()

# Plot ln(FC)

In [None]:
fig, ax = plt.subplots(figsize=(20, 5))

# Batch-normalized counts summed over both replicates of each condition.
# Converted to NumPy arrays so that all indexing below is purely positional
# (integer lookups on a label-indexed Series are deprecated in pandas).
exp = df_normalized[["experiment1", "experiment2"]].sum(axis=1).to_numpy()
ctrl = df_normalized[["control1", "control2"]].sum(axis=1).to_numpy()

cell_counts = df_normalized["total cellcount"].to_numpy()
cell_color = "navy"

# Only clusters with at least 100 cells are shown
keep = cell_counts >= 100
x = np.asarray(df_normalized.index)[keep]

# Calculate ln(fold change)
# NOTE: a cluster without any cells in one condition yields +/-inf here
y = np.log(exp[keep] / ctrl[keep])

# Annotate each point with the total number of cells in the cluster: above the marker
# for y >= 0, below it otherwise (exactly one label per point, also when exp == ctrl)
for position, (log_fc, n_cells) in enumerate(zip(y, cell_counts[keep])):
    y_offset = 0.1 if log_fc >= 0 else -0.2
    ax.text(x = position, y = log_fc + y_offset, s = n_cells, size = 10, ha='center', c=cell_color)

# Plot stem/scatter plot
ax.vlines(x, 0, y, color="black", linestyle="-", lw=1, zorder=-1)
ax.scatter(x , y, c="crimson", s=75)

ax.set(
    **{
        "title": "ln(Fold Change) in cellcount between experiment and control",
        "ylabel": "ln(cellcount$_{tetx}$ / cellcount$_{control}$)",
        "ylim": (-1.7, 1.7)
        #     "xlabel": "Celltypes"
    }
)

# ax.set_yscale('log')

# labels = ["", "3", "2", "1", "0", "1", "2", "3"]
# ax.set_yticklabels(labels)

ax.axhline(y=0, color="black", linestyle="-", lw=1, zorder=-1)
# Pin the tick positions before replacing the labels
ax.set_xticks(np.arange(len(x)))
ax.set_xticklabels(x, rotation=45, ha="right")

# Dashed reference lines at a fold change of 3 in either direction
ax.text(0, np.log(3)+0.05, "Fold Change = 3", ha="left", size=10, c='red')
ax.axhline(y=np.log(3), color="red", linestyle="--", lw=1, zorder=-1)
ax.axhline(y=-np.log(3), color="red", linestyle="--", lw=1, zorder=-1)

# Add blocks and text separating each comparison category
ax.axhspan(-1.7, 0, facecolor='aliceblue', alpha=0.5, zorder=-1)
ax.axhspan(0, 1.7, facecolor='moccasin', alpha=0.5, zorder=-1)
ax.text(0, -1.55, "cellcount$_{control}$ > cellcount$_{tetx}$", ha="left", size=13)
ax.text(0, 1.45, "cellcount$_{tetx}$ > cellcount$_{control}$", ha="left", size=13)
# Right-align the legend text at the last plotted cluster instead of a hard-coded position
ax.text(len(x) - 1, 1.45, "Total number of cells in cluster", ha="right", c=cell_color, size=13)

ax.margins(x=0.01, y=0.01)
ax.grid(False)
ax.xaxis.grid(color='gray', ls='--', lw=0.1)
ax.yaxis.grid(color='gray', ls='--', lw=0.1)
ax.set_axisbelow(True)

plt.savefig("figures/4_cellcounts_joint_clustered.png", dpi=300, bbox_inches="tight")
plt.savefig("figures/4_cellcounts_joint_clustered.pdf", dpi=300, bbox_inches="tight")

# plt.show() instead of fig.show(): the latter only warns on non-GUI (inline) backends
plt.show()

It looks like there are about twice as many microglia in the experiment data (fold change of ~2). This is consistent across all microglia clusters. There is also one cluster of glutamatergic neurons that has about 2x more cells in the control data.

### Same for connectivity inside glutamatergic neurons

In [None]:
# Normalize each connectivity-group count to the total number of cells in that batch by dividing.
# Result: one row per connectivity group, one column per batch.
# NOTE(review): the batch totals count non-null "species" entries - assumes every cell has one.
cells_per_batch = adata.obs.groupby("batch")["species"].count()
df_conn_norm = (
    (adata.obs.groupby("connectivity")["batch"].value_counts() / cells_per_batch)
    .unstack()
    .fillna(0)
)
df_conn_norm.columns = df_conn_norm.columns.astype(str)
df_conn_norm["total normalized count"] = df_conn_norm.sum(axis=1).values

# Share of each batch in the normalized total of a group
for batch_name in ["control1", "control2", "experiment1", "experiment2"]:
    df_conn_norm[f"{batch_name}_fraction"] = (
        df_conn_norm[batch_name] / df_conn_norm["total normalized count"]
    ).values

# Raw number of cells per group, aligned on the connectivity index instead of by position,
# so that a differing group order or length fails loudly instead of mislabelling counts
cells_per_connectivity = adata.obs.groupby("connectivity").size()
df_conn_norm["total cell count"] = cells_per_connectivity.reindex(df_conn_norm.index).astype(int)

# Drop rows that do not correspond to glutamatergic neurons
# NOTE(review): purely positional - assumes the glutamatergic groups are the last four rows
df_conn_norm = df_conn_norm.iloc[-4:]

df_conn_norm

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))

# Batch-normalized counts summed over both replicates of each condition.
# NumPy arrays keep all indexing positional (integer lookups on a label-indexed
# Series are deprecated in pandas).
exp = df_conn_norm[["experiment1", "experiment2"]].sum(axis=1).to_numpy()
ctrl = df_conn_norm[["control1", "control2"]].sum(axis=1).to_numpy()

x = np.asarray(df_conn_norm.index)
y = []
cell_counts = df_conn_norm["total cell count"].to_numpy()
cell_color = "navy"

# Calculate fold change in both directions (exp > ctrl and ctrl > exp):
# positive values = relative excess in experiment, negative values = relative excess in control
for position, (exp_value, ctrl_value, n_cells) in enumerate(zip(exp, ctrl, cell_counts)):
    if exp_value > ctrl_value:
        fold_change = (exp_value - ctrl_value) / ctrl_value
        text_offset = 0.2
    else:
        fold_change = np.negative((ctrl_value - exp_value) / exp_value)
        text_offset = -0.45
    y.append(fold_change)
    # Total number of cells in the group, placed away from the zero line
    ax.text(x = position, y = fold_change + text_offset, s = n_cells, size = 10, ha='center', c=cell_color)

# Plot stem/scatter plot
ax.vlines(x, 0, y, color="black", linestyle="-", lw=1, zorder=-1)
ax.scatter(x , y, c="crimson", s=75)

ax.set(
    **{
        "title": "Fold change in cellcount between experiment and control",
        "ylabel": "$Δcellcount / cellcount_0$ ",
        "ylim": (-3, 3)
        #     "xlabel": "Celltypes"
    }
)

# Show the magnitude on both sides of zero. The tick positions are pinned explicitly so
# that the seven labels cannot end up on differently placed automatic ticks.
ytick_labels = ["3", "2", "1", "0", "1", "2", "3"]
ax.set_yticks(np.arange(-3, 4))
ax.set_yticklabels(ytick_labels)

ax.axhline(y=0, color="black", linestyle="-", lw=1, zorder=-1)
ax.set_xticks(np.arange(len(x)))
ax.set_xticklabels(x, rotation=45, ha="right")

# Add blocks and text separating each comparison category
ax.axhspan(-3, 0, facecolor='aliceblue', alpha=0.5, zorder=-1)
ax.axhspan(0, 3, facecolor='moccasin', alpha=0.5, zorder=-1)
ax.text(-0.1, -2.75, "Control > experiment", ha="left", size=13)
ax.text(-0.1, 2.5, "Experiment > control", ha="left", size=13)
# Right-align at the last plotted group instead of a hard-coded position
ax.text(len(x) - 1 + 0.1, 2.5, "Total number of cells in cluster", ha="right", c=cell_color, size=13)

ax.margins(x=0.05, y=0.05)
ax.grid(False)
ax.xaxis.grid(color='gray', ls='--', lw=0.1)
ax.yaxis.grid(color='gray', ls='--', lw=0.1)
ax.set_axisbelow(True)

# plt.show() instead of fig.show(): the latter only warns on non-GUI (inline) backends
plt.show()

___

# DE genes in microglia between control and exp

In [None]:
# All cells whose celltype label starts with "microglia".
# .copy() turns the AnnData view into an independent object: rank_genes_groups writes its
# results to .uns, which on a view triggers an implicit copy with a warning.
adata_mglia = adata[np.char.startswith(nd(adata.obs.celltype.values).astype(str), "microglia")].copy()
adata_mglia

In [None]:
# Rank genes between the batch_g groups within the microglia subset (use_raw=False)
sc.tl.rank_genes_groups(adata_mglia, groupby="batch_g", use_raw=False)

In [None]:
# Plot the 10 top-ranked genes per group (shared y-axis, two panels per row, not saved to disk)
sc.pl.rank_genes_groups(adata_mglia, n_genes=10, sharey=True, save=False, ncols=2)

In [None]:
# First 20 rows of the ranked gene names table stored in .uns by rank_genes_groups
mglia_markers = pd.DataFrame(adata_mglia.uns['rank_genes_groups']['names']).head(20)

Perform pathway enrichment analysis on genes upregulated in tetX microglia:

In [None]:
# Extract the part after the last "_" of every var name (e.g. "CD3E_ENSTGUG00000022317.1"
# -> "ENSTGUG00000022317.1"). rsplit with maxsplit=1 is used instead of split("_")[1] so
# that gene names which themselves contain "_" still yield the trailing ID, and names
# without "_" do not raise an IndexError.
mglia_de_genes = [gene.rsplit("_", 1)[-1] for gene in mglia_markers["experiment"].values]

# Enrichment analysis of the extracted IDs (passed as Ensembl IDs, with plot)
enrichr_df = gget.enrichr(mglia_de_genes, database="ontology", ensembl=True, plot=True)

In [None]:
# Manually written annotations for the 20 rows of mglia_markers (name and function).
# NOTE(review): the lists are assigned by position - presumably they describe the
# "experiment" ranking of an earlier run; re-check them whenever the ranking changes.
mglia_markers["experiment_gene"] = [
    "class I histocompatibility antigen, F10 alpha chain-like",
    "interferon, alpha-inducible protein 6",
    "C-C motif chemokine 3-like (CCL3L3)",
    "ferritin, higher subunit",
    "ribosomal protein S18-like (RPS18)",
    "novel gene",
    "ribosomal protein L17",
    "ribosomal protein S7",
    "HRAS-like suppressor-like",
    "ribosomal protein S20",
    "novel gene",
    "Ribosomal Protein L6",
    "novel gene",
    "small nuclear ribonucleoprotein G",
    "class I histocompatibility antigen, F10 alpha chain-like",
    "Ribosomal Protein L23",
    "novel gene",
    "novel gene",
    "novel gene",
    "Serglycin"
]

mglia_markers["experiment_gene_function"] = [
    "Antigen processing and presentation of peptide antigen via MHC class I (UniProtKB - P15979 (HA1F_CHICK))",
    "Plays a role in apoptosis, negatively regulating the intrinsinc apoptotic signaling pathway and TNFSF10-induced apoptosis (PubMed:15685448, PubMed:17823654, PubMed:26244642)",
    "Cytokines are a family of secreted proteins that function in inflammatory and immunoregulatory processes. The protein encoded by this gene binds to several chemokine receptors, including chemokine binding protein 2 and chemokine (C-C motif) receptor 5 (CCR5). CCR5 is a co-receptor for HIV, and binding of this protein to CCR5 inhibits HIV entry.",
    "Ferritin is the major intracellular iron storage protein in prokaryotes and eukaryotes. ",
    "This gene encodes a ribosomal protein that is a component of the 40S subunit.",
    "novel gene",
    "Ribosomal protein that is a component of the 60S subunit. Among its related pathways are MAPK Signaling: Mitogens and Viral mRNA Translation.",
    "This gene encodes a ribosomal protein that is a component of the 40S subunit. Among its related pathways are Viral mRNA Translation.",
    "The protein encoded by this gene has both phospholipase and acyltransferase activities and acts as a tumor suppressor. ",
    "This gene encodes a ribosomal protein that is a component of the 40S subunit.",
    "novel gene",
    "component of the 60S ribosomal subunit",
    "novel gene",
    "component of the U1, U2, U4, and U5 small nuclear ribonucleoprotein complexes",
    "Antigen processing and presentation of peptide antigen via MHC class I",
    "component of the 60S subunit",
    "novel gene",
    "novel gene",
    "novel gene",
    "encodes a protein best known as a hematopoietic cell granule proteoglycan. Proteoglycans stored in the secretory granules of many hematopoietic cells also contain a protease-resistant peptide core, which may be important for neutralizing hydrolytic enzymes. This encoded protein was found to be associated with the macromolecular complex of granzymes and perforin, which may serve as a mediator of granule-mediated apoptosis. Might activate NMDAR https://jneuroinflammation.biomedcentral.com/articles/10.1186/s12974-019-1504-6"
]

# Show the full text of the annotation columns instead of truncating it
pd.set_option('display.max_colwidth', None)
mglia_markers

___

# DE genes in glutamatergic neurons between control and exp

In [None]:
# All cells whose celltype label starts with "glut".
# .copy() turns the AnnData view into an independent object: rank_genes_groups writes its
# results to .uns, which on a view triggers an implicit copy with a warning.
adata_glut = adata[np.char.startswith(nd(adata.obs.celltype.values).astype(str), "glut")].copy()
adata_glut

In [None]:
# Rank genes between the batch_g groups within the glutamatergic subset (use_raw=False)
sc.tl.rank_genes_groups(adata_glut, groupby="batch_g", use_raw=False)

In [None]:
# Plot the 10 top-ranked genes per group (shared y-axis, two panels per row, not saved to disk)
sc.pl.rank_genes_groups(adata_glut, n_genes=10, sharey=True, save=False, ncols=2)

In [None]:
# First 20 rows of the ranked gene names table stored in .uns by rank_genes_groups
mglut_markers = pd.DataFrame(adata_glut.uns['rank_genes_groups']['names']).head(20)

In [None]:
# Manually written annotations for the 20 rows of mglut_markers (name and function);
# only the first 10 entries are filled in, the rest are empty strings.
# NOTE(review): the lists are assigned by position - presumably they describe the
# "experiment" ranking of an earlier run; re-check them whenever the ranking changes.
mglut_markers["experiment_gene"] = [
    "class I histocompatibility antigen, F10 alpha chain-like",
    "interferon, alpha-inducible protein 6",
    "Beta-2-Microglobulin (B2M)",
    "ectonucleotide pyrophosphatase/phosphodiesterase 2 ",
    "Tubulin Beta 4B Class IVb",
    "Casein Kinase 1 Delta",
    "ferritin, higher subunit",
    "ribonuclease kappa (RBCK)",
    "histone deacetylase 7-like (HDAC7)",
    "novel gene",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    ""
]

mglut_markers["experiment_gene_function"] = [
    "Antigen processing and presentation of peptide antigen via MHC class I (UniProtKB - P15979 (HA1F_CHICK))",
    "Plays a role in apoptosis, negatively regulating the intrinsinc apoptotic signaling pathway and TNFSF10-induced apoptosis (PubMed:15685448, PubMed:17823654, PubMed:26244642",
    "Component of the class I major histocompatibility complex (MHC). Involved in the presentation of peptide antigens to the immune system. ",
    "The protein encoded by this gene functions as both a phosphodiesterase, which cleaves phosphodiester bonds at the 5' end of oligonucleotides, and a phospholipase, which catalyzes production of lysophosphatidic acid (LPA) in extracellular fluids. LPA evokes growth factor-like responses including stimulation of cell proliferation and chemotaxis.",
    "Tubulin is the major constituent of microtubules.",
    "This gene is a member of the casein kinase I (CKI) gene family whose members have been implicated in the control of cytoplasmic and nuclear processes, including DNA replication and repair. The encoded protein may also be involved in the regulation of apoptosis, circadian rhythm, microtubule dynamics, chromosome segregation, and p53-mediated effects on growth.",
    "Ferritin is the major intracellular iron storage protein in prokaryotes and eukaryotes.",
    "Endoribonuclease which preferentially cleaves ApU and ApG phosphodiester bonds. Hydrolyzes UpU bonds at a lower rate (RNK_HUMAN,Q6P5S7). Required for the initial stages of clathrin-mediated endocytic uptake of a diverse set of viruses, including dengue, West Nile, Sindbis, Rift Valley Fever, and influenza viruses (PubMed:26056282). Not required for clathrin-mediated endocytosis and macropinocytosis (PubMed:26056282).",
    "Responsible for the deacetylation of lysine residues on the N-terminal part of the core histones (H2A, H2B, H3 and H4). Histone deacetylation gives a tag for epigenetic repression and plays an important role in transcriptional regulation, cell cycle progression and developmental events.",
    "novel gene",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    ""
]

# Show the full text of the annotation columns instead of truncating it
pd.set_option('display.max_colwidth', None)
mglut_markers

___

# DE genes in GABAergic neurons between control and exp

In [None]:
# All cells whose celltype label starts with "GABA".
# .copy() turns the AnnData view into an independent object: rank_genes_groups writes its
# results to .uns, which on a view triggers an implicit copy with a warning.
adata_gaba = adata[np.char.startswith(nd(adata.obs.celltype.values).astype(str), "GABA")].copy()
adata_gaba

In [None]:
# Rank genes between the batch_g groups within the GABAergic subset (use_raw=False)
sc.tl.rank_genes_groups(adata_gaba, groupby="batch_g", use_raw=False)

In [None]:
# Plot the 10 top-ranked genes per group (shared y-axis, two panels per row, not saved to disk)
sc.pl.rank_genes_groups(adata_gaba, n_genes=10, sharey=True, save=False, ncols=2)

In [None]:
# First 20 rows of the ranked gene names table stored in .uns by rank_genes_groups
mgaba_markers = pd.DataFrame(adata_gaba.uns['rank_genes_groups']['names']).head(20)

In [None]:
# Manually written annotations for the 20 rows of mgaba_markers (name and function);
# only the first 10 entries are filled in, the rest are empty strings.
# NOTE(review): the lists are assigned by position - presumably they describe the
# "experiment" ranking of an earlier run; re-check them whenever the ranking changes.
mgaba_markers["experiment_gene"] = [
    "Class I histocompatibility antigen, F10 alpha chain-like",
    "Interferon Alpha Inducible Protein 6",
    "Casein Kinase 1 Delta",
    "novel gene",
    "Extracellular fatty acid-binding protein-like",
    "Clusterin",
    "Apolipoprotein A1",
    "Ferritin, higher subunit",
    "Histone deacetylase 7-like (HDAC7)",
    "Sodium voltage-gated channel alpha subunit 2 (SCN2A)",
        "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    ""
]

mgaba_markers["experiment_gene_function"] = [
    "Antigen processing and presentation of peptide antigen via MHC class I (UniProtKB - P15979 (HA1F_CHICK))",
    "Plays a role in apoptosis, negatively regulating the intrinsinc apoptotic signaling pathway and TNFSF10-induced apoptosis (PubMed:15685448, PubMed:17823654, PubMed:26244642). However, it has also been shown to have a pro-apoptotic activity (PubMed:27673746). Has an antiviral activity towards hepatitis C virus/HCV by inhibiting the EGFR signaling pathway, which activation is required for entry of the virus into cells (PubMed:25757571).",
    "Essential serine/threonine-protein kinase that regulates diverse cellular growth and survival processes including Wnt signaling, DNA repair and circadian rhythms.",
    "novel gene",
    "**This gene was also DE in the hvcxra undefined cluster",
    "The protein encoded by this gene is a secreted chaperone that can under some stress conditions also be found in the cell cytosol. It has been suggested to be involved in several basic biological events such as cell death, tumor progression, and neurodegenerative disorders.",
    "This gene encodes apolipoprotein A-I, which is the major protein component of high density lipoprotein (HDL) in plasma. The encoded preproprotein is proteolytically processed to generate the mature protein, which promotes cholesterol efflux from tissues to the liver for excretion, and is a cofactor for lecithin cholesterolacyltransferase (LCAT), an enzyme responsible for the formation of most plasma cholesteryl esters. This gene is closely linked with two other apolipoprotein genes on chromosome 11. Defects in this gene are associated with HDL deficiencies, including Tangier disease, and with systemic non-neuropathic amyloidosis.",
    "Ferritin is the major intracellular iron storage protein in prokaryotes and eukaryotes.",
    "Responsible for the deacetylation of lysine residues on the N-terminal part of the core histones (H2A, H2B, H3 and H4). Histone deacetylation gives a tag for epigenetic repression and plays an important role in transcriptional regulation, cell cycle progression and developmental events. Histone deacetylases act via the formation of large multiprotein complexes. Involved in muscle maturation by repressing transcription of myocyte enhancer factors such as MEF2A, MEF2B and MEF2C. During muscle differentiation, it shuttles into the cytoplasm, allowing the expression of myocyte enhancer factors (By similarity). May be involved in Epstein-Barr virus (EBV) latency, possibly by repressing the viral BZLF1 gene. Positively regulates the transcriptional repressor activity of FOXP3 (PubMed:17360565). Serves as a corepressor of RARA, causing its deacetylation and inhibition of RARE DNA element binding (PubMed:28167758). In association with RARA, plays a role in the repression of microRNA-10a and thereby in the inflammatory response (PubMed:28167758).",
    "Mediates the voltage-dependent sodium ion permeability of excitable membranes. Assuming opened or closed conformations in response to the voltage difference across the membrane, the protein forms a sodium-selective channel through which Na(+) ions may pass in accordance with their electrochemical gradient (PubMed:1325650, PubMed:17021166, PubMed:28256214, PubMed:29844171). Implicated in the regulation of hippocampal replay occurring within sharp wave ripples (SPW-R) important for memory (By similarity). SCN2A_HUMAN,Q99250",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    ""
]

# Show the full text of the annotation columns instead of truncating it
pd.set_option('display.max_colwidth', None)
mgaba_markers

___

# DE genes in astrocytes between control and exp

In [None]:
# All cells whose celltype label starts with "astro".
# .copy() turns the AnnData view into an independent object: rank_genes_groups writes its
# results to .uns, which on a view triggers an implicit copy with a warning.
adata_astro = adata[np.char.startswith(nd(adata.obs.celltype.values).astype(str), "astro")].copy()
adata_astro

In [None]:
# Rank genes between the batch_g groups within the astrocyte subset (use_raw=False)
sc.tl.rank_genes_groups(adata_astro, groupby="batch_g", use_raw=False)

In [None]:
# Plot the 10 top-ranked genes per group (shared y-axis, two panels per row, not saved to disk)
sc.pl.rank_genes_groups(adata_astro, n_genes=10, sharey=True, save=False, ncols=2)

In [None]:
# First 20 rows of the ranked gene names table of the astrocyte subset.
# Reads from adata_astro (the original read adata_gaba.uns, i.e. it showed the
# GABAergic result under the astrocyte heading).
astro_markers = pd.DataFrame(adata_astro.uns['rank_genes_groups']['names']).head(20)

In [None]:
# Show full column contents instead of truncating them, then display the table
pd.set_option('display.max_colwidth', None)
astro_markers

___

# Violin plots and heat maps of genes that came up during DE gene analysis

# Expression of DCX

In [None]:
# List the var entries whose name contains "DCX"
adata.var.loc[adata.var.index.str.contains("DCX")]

In [None]:
# Violin plots of selected genes; the var names double as plot labels
genes = labels = [
    "DCX_ENSTGUG00000006180.2",
    'TBR1_ENSTGUG00000006709.2',
    'FABP7_ENSTGUG00000011826.2',
    'NOTCH2_ENSTGUG00000017231.2',
    "NELL1_ENSTGUG00000004458.2",
]
celltypes = celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="neurogenesis_markers", fold_change_min=1.5,
)

# Expression of ADORA2A

In [None]:
# Query gget.search for "adenosine A2a receptor" with "guttata" as the species argument
gget.search("adenosine A2a receptor", "guttata")

In [None]:
# List the var entries whose name contains the Ensembl ID ENSTGUG00000010054
adata.var.loc[adata.var.index.str.contains("ENSTGUG00000010054")]

In [None]:
# Violin plot for the var entry with Ensembl ID ENSTGUG00000010054.
# NOTE(review): the section and figure are named ADORA2A, but the var name and plot label
# say UPB1 - confirm that this ID is the intended gene and that the label is correct.
genes = ["UPB1_ENSTGUG00000010054.2"]
labels = ["UPB1"]
celltypes = celltypes_standard

violinplot(
    adata_exp,
    adata_ctrl,
    genes,
    labels,
    celltypes,
    fig_name="ADORA2A",
    fold_change_min=2,
)

# Expression of synaptogenesis genes 
DE from CTRL undefined glut neurons

In [None]:
# var names of the genes plotted in the next cell
# (the variable name is a misspelling of "synapto"; kept because the next cell uses it)
syanpto = [
    "NPTXR_ENSTGUG00000024678.1",
    "NPTX1_ENSTGUG00000025297.1", 
    "NPTX2_ENSTGUG00000008623.2",  
    "NRXN1_ENSTGUG00000005731.2",
    "CNTN4_ENSTGUG00000010102.2",
    "CACNA2D1_ENSTGUG00000002536.2",
]

In [None]:
# Violin plots of the synaptogenesis gene list; the var names double as plot labels
genes = labels = syanpto
celltypes = celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="synapto", fold_change_min=2,
)

# Genes related to MHC Class 1
Regulation of CNS synapses by neuronal MHC class I https://pubmed.ncbi.nlm.nih.gov/17420446/

In [None]:
# Parallel lists (same order): var names of the genes and the labels used for them in the plots below
mhc_genes = ["_ENSTGUG00000017273.2", "_ENSTGUG00000004607.2", "CD3E_ENSTGUG00000022317.1", "_ENSTGUG00000010325.2"]
mhc_gene_names = ["MHC1", "B2M", "CD3E", "CCL3L3"]

Abbreviations:  
MHC1 = Major Histocompatibility Complex, Class I (alpha chain-like)
B2M = Beta-2-Microglobulin

In [None]:
# Violin plots of the short MHC gene list per cluster, labelled with the short gene names
genes, labels = mhc_genes, mhc_gene_names
celltypes = celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="MHC_short", fold_change_min=1.5,
)

Per celltype instead of per cluster:

In [None]:
# Same genes and labels as in the previous cell (`genes` / `labels` are not reassigned here),
# but grouped by the general celltype label instead of by cluster.
# `Series.unique()` works for both object and categorical columns, whereas
# `.values.unique()` raises AttributeError when celltype_g is a plain object column
# (a NumPy array has no `.unique` method).
violinplot(
    adata_exp,
    adata_ctrl,
    genes,
    labels,
    np.sort(adata.obs["celltype_g"].unique()),
    fig_name="MHC_short_fake_bulk",
    fold_change_min=1.5,
)

In [None]:
# vmin = 0
# vmax = [60, 50, 30, 90]
# size = 5

# for i, gene in enumerate(mhc_genes):
#     fig, axs = plt.subplots(figsize=(15,4), ncols=3)

#     condition = ["control", "experiment"]
#     color_map = "Reds"

#     sc.pl.umap(adata[np.char.startswith(nd(adata.obs.batch.values).astype(str), condition[0])], size=size, color=gene, title="{} expression in {}".format(mhc_gene_names[i], condition[0]), vmax=vmax[i], vmin=vmin, color_map=color_map, use_raw=True, show=False, ax=axs[0])
#     sc.pl.umap(adata[np.char.startswith(nd(adata.obs.batch.values).astype(str), condition[1])], size=size, color=gene, title="{} expression in {}".format(mhc_gene_names[i], condition[1]), vmax=vmax[i], vmin=vmin, color_map=color_map, use_raw=True, show=False, ax=axs[1])
#     sc.pl.umap(adata, color="celltype", show=False, ax=axs[2])

# Plot the expression of all MHC1-associated genes from ENSEMBL

In [None]:
# ## MHCII associated genes
# mhc2_genes = [
#     "_ENSTGUG00000014649.2",
#     "ENSTGUG00000024543",
#     "_ENSTGUG00000029364.1",
#     "FGL2_ENSTGUG00000002746.2",
# ]
# mhc2_gene_names = [
#     "MHC2-like1",
#     "MHC2-like2",
#     "RFX1-like",
#     "Fibrinogen like 2",
# ]

In [None]:
# MHCI associated genes
# Parallel lists (same order): var names and display names of the genes.
# NOTE: this overwrites the shorter mhc_genes / mhc_gene_names lists defined earlier in the notebook.
mhc_genes = [
    "_ENSTGUG00000017273.2",
    "_ENSTGUG00000028417.1",
    "_ENSTGUG00000004607.2",
    "_ENSTGUG00000010325.2",
    "CX3CR1_ENSTGUG00000018365.2",
    "_ENSTGUG00000024320.1",
#     "_ENSTGUG00000010847.2",
    "CD3E_ENSTGUG00000022317.1",
    "IFI30_ENSTGUG00000000388.2",
    "FGL2_ENSTGUG00000002746.2",
    "TAF7_ENSTGUG00000006593.2",
    "CALR_ENSTGUG00000014982.2",
    "_ENSTGUG00000015337.2",
    "CTSH_ENSTGUG00000009976.2",
    "PDIA3_ENSTGUG00000010220.2",
#     "CD8A_ENSTGUG00000010841.2",
    "IDE_ENSTGUG00000008951.2",
    "ATP5F1A_ENSTGUG00000001582.2",
    "ATP5F1B_ENSTGUG00000015620.2",
]
mhc_gene_names = [
    "MHC1 F10 alpha chain-like 1",
    "MHC1 F10 alpha chain like 2",
    "Beta-2-microglobulin",
    "CCL3L3",
    "CX3CR1",
    "CX3CL1",
#     "MHC1 protein binding like",
    "CD3E",
    "IFI30",
    "Fibrinogen like 2",
    "TAF7",
    "Calreticulin",
    "Antigen peptide transporter like",
    "Cathepsin H",
    "PDIA3",
#     "CD8A",
    "IDE",
    "ATP5F1A",
    "ATP5F1B",
]

#### Notes on MHC1 associated genes listed above:
novel gene (Zebra finch Gene)
ENSTGUG00000017273 RRCB01000109.1:1545781-1566436:1
Class I histocompatibility antigen, F10 alpha chain-like [Source:NCBI gene;Acc:115493126]
LOC100231469 (NCBI gene (formerly Entrezgene) record; description: class I histocompatibility antigen, F10 alpha chain-like,) is an external reference matched to Gene ENSTGUG00000017273

novel gene (Zebra finch Gene)
ENSTGUG00000028417 RRCB01000109.1:254090-279387:1
Class I histocompatibility antigen, F10 alpha chain-like [Source:NCBI gene;Acc:115493083]
LOC115493083 (NCBI gene (formerly Entrezgene) record; description: class I histocompatibility antigen, F10 alpha chain-like,) is an external reference matched to Gene ENSTGUG00000028417

novel gene (Zebra finch Gene)
ENSTGUG00000004607 10:4098707-4102774:-1
GO:0042612 (GO record; description: MHC class I protein complex,) is an external reference matched to Transcript ENSTGUT00000004794  
-- UniProtKB match:Beta-2-microglobulin

CD3E (Zebra finch Gene)
ENSTGUG00000022317 24:6524361-6530450:-1
CD3e molecule [Source:NCBI gene;Acc:115498349].

IFI30 (Zebra finch Gene)
ENSTGUG00000000388 28:3544686-3547884:1
IFI30 lysosomal thiol reductase [Source:HGNC Symbol;Acc:HGNC:5398]
GO:0042590 (GO record; description: antigen processing and presentation of exogenous peptide antigen via MHC class I,) is an external reference matched to Transcript ENSTGUT00000000403

TAF7 (Zebra finch Gene)
ENSTGUG00000006593 4A:18660736-18666314:1
TATA-box binding protein associated factor 7 like [Source:NCBI gene;Acc:100228159]
GO:0045344 (GO record; description: negative regulation of MHC class I biosynthetic process,) is an external reference matched to Transcript ENSTGUT00000006864

CALR (Zebra finch Gene)
ENSTGUG00000014982 30:3757974-3761873:1
Calreticulin [Source:NCBI gene;Acc:100190479]
GO:0042824 (GO record; description: MHC class I peptide loading complex,) is an external reference matched to Transcript ENSTGUT00000015593

novel gene (Zebra finch Gene)
ENSTGUG00000015337 RRCB01000090.1:2677018-2687645:1
Antigen peptide transporter 1-like [Source:NCBI gene;Acc:105760759]
GO:0002479 (GO record; description: antigen processing and presentation of exogenous peptide antigen via MHC class I, TAP-dependent,) is an external reference matched to Transcript ENSTGUT00000015946

CTSH (Zebra finch Gene)
ENSTGUG00000009976 10:21449104-21547116:-1
Cathepsin H [Source:HGNC Symbol;Acc:HGNC:2535]
GO:0030108 (GO record; description: HLA-A specific activating MHC class I receptor activity,) is an external reference matched to Transcript ENSTGUT00000010401

PDIA3 (Zebra finch Gene)
ENSTGUG00000010220 10:21775416-21783116:-1
Protein disulfide isomerase family A member 3 [Source:NCBI gene;Acc:101234078]
GO:0042824 (GO record; description: MHC class I peptide loading complex,) is an external reference matched to Transcript ENSTGUT00000010675

IDE (Zebra finch Gene)
ENSTGUG00000008951 6:19632451-19686491:1
Insulin degrading enzyme [Source:NCBI gene;Acc:100232369]
GO:0019885 (GO record; description: antigen processing and presentation of endogenous peptide antigen via MHC class I,) is an external reference matched to Transcript ENSTGUT00000036510

ATP5F1A (Zebra finch Gene)
ENSTGUG00000001582 Z:40984572-41030045:1
ATP synthase F1 subunit alpha [Source:NCBI gene;Acc:751975]
GO:0042288 (GO record; description: MHC class I protein binding,) is an external reference matched to Transcript ENSTGUT00000032148

ATP5F1B (Zebra finch Gene)
ENSTGUG00000015620 29:1469203-1601380:-1
ATP synthase F1 subunit beta [Source:NCBI gene;Acc:100221153]
GO:0042288 (GO record; description: MHC class I protein binding,) is an external reference matched to Transcript ENSTGUT00000016243

In [None]:
# Hand the MHC1-associated genes and their labels to the custom violinplot helper
# (imported from plotting_funcs) for the standard set of cell types.
genes, labels, celltypes = mhc_genes, mhc_gene_names, celltypes_standard

violinplot(
    adata_exp,
    adata_ctrl,
    genes,
    labels,
    celltypes,
    "MHC1-associated",
)

# Genes related to iron metabolism

In [None]:
# Iron-metabolism genes as (gene ID, display label) pairs, so each ID sits next
# to the label it is plotted with. IDs starting with "_" have no gene symbol.
_iron_gene_table = [
    ("TFRC_ENSTGUG00000009510.2", "TFRC"),
    ("LTF_ENSTGUG00000006576.2", "LTF"),
    ("MELTF_ENSTGUG00000008871.2", "MELTF"),
    ("_ENSTGUG00000009830.2", "MELTF-like"),
    ("_ENSTGUG00000015659.2", "ferritin, hs"),
    ("_ENSTGUG00000008813.2", "ferritin, lc"),
    ("FTH1_ENSTGUG00000005913.2", "ferritin, hc"),
]
iron_genes = [gene_id for gene_id, _label in _iron_gene_table]
iron_gene_names = [label for _gene_id, label in _iron_gene_table]

Abbreviations:  
Transferrin receptor (TFRC)  
lactotransferrin (LTF)    
melanotransferrin (MELTF)  
ferritin, hs (ferritin, higher subunit)  
ferritin, lc (ferritin, light chain)  
ferritin, hc (ferritin, heavy chain)  

In [None]:
# Iron-metabolism genes, plotted with the custom violinplot helper.
genes, labels, celltypes = iron_genes, iron_gene_names, celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="iron_metabolism", fold_change_min=2,
)

In [None]:
# One figure per iron-metabolism gene with three UMAP panels:
# control cells, experiment cells, and the cell-type annotation of all cells.
vmin = 0
vmax = [4.5, 5.5, 4, 3.5, 7, 3.5, 5.5]  # upper colour limit per gene, same order as iron_genes
size = 5
condition = ["control", "experiment"]
color_map = "Reds"

# Batch labels as plain strings so they can be prefix-matched against the condition names
batch_labels = nd(adata.obs.batch.values).astype(str)

for i, gene in enumerate(iron_genes):
    fig, axs = plt.subplots(figsize=(15, 4), ncols=3)

    # Panels 0 and 1: the same gene in the cells of each condition
    for panel, cond in enumerate(condition):
        sc.pl.umap(
            adata[np.char.startswith(batch_labels, cond)],
            size=size,
            color=gene,
            title="{} expression in {}".format(iron_gene_names[i], cond),
            vmin=vmin,
            vmax=vmax[i],
            color_map=color_map,
            use_raw=True,
            show=False,
            ax=axs[panel],
        )

    # Panel 2: cell-type annotation for orientation
    sc.pl.umap(adata, color="celltype", show=False, ax=axs[2])

# Synapse formation

In [None]:
# Synapse-formation genes as (gene ID, display label) pairs.
_syn_gene_table = [
    ("NPTXR_ENSTGUG00000024678.1", "NPTXR"),
    ("NPTX1_ENSTGUG00000025297.1", "NPTX1"),
    ("NPTX2_ENSTGUG00000008623.2", "NPTX2"),
    ("SERPINI1_ENSTGUG00000011067.2", "SERPINI1"),
]
syn_genes = [gene_id for gene_id, _label in _syn_gene_table]
syn_gene_names = [label for _gene_id, label in _syn_gene_table]

Abbreviations:  
NPTXR = Neuronal Pentraxin Receptor  
NPTX1 = Neuronal Pentraxin 1  
NPTX2 = Neuronal Pentraxin 2  
SERPINI1 = Serpin Family I Member 1  

In [None]:
# Synapse-formation genes, plotted with the custom violinplot helper.
genes, labels, celltypes = syn_genes, syn_gene_names, celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="synapse_formation", fold_change_min=2,
)

In [None]:
# One figure per synapse-formation gene with three UMAP panels:
# control cells, experiment cells, and the cell-type annotation of all cells.
vmin = 0
vmax = [6.5, 5.5, 5.5, 6.5]  # upper colour limit per gene, same order as syn_genes
size = 5
condition = ["control", "experiment"]
color_map = "Reds"

# Batch labels as plain strings so they can be prefix-matched against the condition names
batch_labels = nd(adata.obs.batch.values).astype(str)

for i, gene in enumerate(syn_genes):
    fig, axs = plt.subplots(figsize=(15, 4), ncols=3)

    # Panels 0 and 1: the same gene in the cells of each condition
    for panel, cond in enumerate(condition):
        sc.pl.umap(
            adata[np.char.startswith(batch_labels, cond)],
            size=size,
            color=gene,
            title="{} expression in {}".format(syn_gene_names[i], cond),
            vmin=vmin,
            vmax=vmax[i],
            color_map=color_map,
            use_raw=True,
            show=False,
            ax=axs[panel],
        )

    # Panel 2: cell-type annotation for orientation
    sc.pl.umap(adata, color="celltype", show=False, ax=axs[2])

# Synaptic transmission

In [None]:
# Synaptic-transmission genes as (gene ID, display label) pairs.
# The last entry has no gene symbol in its ID and is labelled by hand.
_ion_gene_table = [
    ("GRIA4_ENSTGUG00000012676.2", "GRIA4"),
    ("GRIA2_ENSTGUG00000005484.2", "GRIA2"),
    ("RAB39B_ENSTGUG00000019747.1", "RAB39B"),
    ("_ENSTGUG00000007152.2", "SCN2A"),
]
ion_genes = [gene_id for gene_id, _label in _ion_gene_table]
ion_gene_names = [label for _gene_id, label in _ion_gene_table]

Abbreviations:  
GRIA4 = Glutamate Ionotropic Receptor AMPA Type Subunit 4  
GRIA2 = Glutamate Ionotropic Receptor AMPA Type Subunit 2  
RAB39B = RAB39B, Member RAS Oncogene Family  
SCN2A = Sodium Voltage-Gated Channel Alpha Subunit 2

In [None]:
# Synaptic-transmission genes, plotted with the custom violinplot helper.
genes, labels, celltypes = ion_genes, ion_gene_names, celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="DE_ion_channels", fold_change_min=2,
)

In [None]:
# One figure per synaptic-transmission gene with three UMAP panels:
# control cells, experiment cells, and the cell-type annotation of all cells.
vmin = 0
vmax = [10, 6, 5, 5.5]  # upper colour limit per gene, same order as ion_genes
size = 5
condition = ["control", "experiment"]
color_map = "Reds"

# Batch labels as plain strings so they can be prefix-matched against the condition names
batch_labels = nd(adata.obs.batch.values).astype(str)

for i, gene in enumerate(ion_genes):
    fig, axs = plt.subplots(figsize=(15, 4), ncols=3)

    # Panels 0 and 1: the same gene in the cells of each condition
    for panel, cond in enumerate(condition):
        sc.pl.umap(
            adata[np.char.startswith(batch_labels, cond)],
            size=size,
            color=gene,
            title="{} expression in {}".format(ion_gene_names[i], cond),
            vmin=vmin,
            vmax=vmax[i],
            color_map=color_map,
            use_raw=True,
            show=False,
            ax=axs[panel],
        )

    # Panel 2: cell-type annotation for orientation
    sc.pl.umap(adata, color="celltype", show=False, ax=axs[2])

# Expression of AMPA/NMDA receptor associated genes (from Ensembl)

In [None]:
# AMPA/NMDA receptor associated genes, grouped by the "##" / "#" comments below.
# Each entry has the form "<gene symbol>_<Ensembl gene ID>.<version>"; the list is
# used both as the genes and as the labels of the NMDA_AMPA violin plot.
nmda = [
    ## AMPA
    # cornichon family AMPA receptor auxiliary proteins
    "CNIH1_ENSTGUG00000013093.2",
    "CNIH3_ENSTGUG00000004070.2",
    "CNIH4_ENSTGUG00000025113.1",
    # Glutamate ionotropic receptor AMPA type subunits
    "GRIA1_ENSTGUG00000000726.2",
    "GRIA2_ENSTGUG00000005484.2",
    "GRIA3_ENSTGUG00000003563.2",
    "GRIA4_ENSTGUG00000012676.2",
    # Part of AMPA glutamate receptor complex
    "OLFM3_ENSTGUG00000029430.1",
    "SHISA9_ENSTGUG00000005074.2",
    "VWC2_ENSTGUG00000007906.2",
    "VWC2L_ENSTGUG00000002893.2",
    "CACNG1_ENSTGUG00000004397.2",
    "CACNG2_ENSTGUG00000010725.2",
    "CACNG3_ENSTGUG00000006208.2",
    "CACNG4_ENSTGUG00000004385.2",
    "CACNG5_ENSTGUG00000026853.1",
    "ABHD6_ENSTGUG00000007463.2",
    "ABHD12_ENSTGUG00000004985.2",
    # AMPA receptor activity modulators
    "ADRB1_ENSTGUG00000010890.2",   # Adrenoceptors
    "ADRB2_ENSTGUG00000000286.2",
    #
    ## Possibly involved in receptor clustering on the cell membrane
    "SLC6A6_ENSTGUG00000008337.2",  # From DE genes
    "SLC7A11_ENSTGUG00000001504.2",
    "DLG3_ENSTGUG00000007374.2",
    ## General receptor activity regulators
    "NLGN1_ENSTGUG00000021670.1",
    "RELN_ENSTGUG00000002836.2",
    #
    ## NMDA
    # NMDA receptor synaptonuclear signaling and neuronal migration factor
    "NSMF_ENSTGUG00000002736.2",
    # Receptor subunits
    "GRIN1_ENSTGUG00000002568.2",
    "GRINA_ENSTGUG00000021030.1",
    "GRIN2A_ENSTGUG00000004747.2",
    "GRIN2B_ENSTGUG00000009456.2",
    "GRIN2C_ENSTGUG00000008858.2",
    "GRIN2D_ENSTGUG00000021251.1",
    "GRIN3A_ENSTGUG00000000661.2",
    "GRIN3B_ENSTGUG00000000668.2",
    # Part of NMDA selective glutamate receptor complex
    "EPS8_ENSTGUG00000012399.2",
    # NMDA receptor activity modulators
    "DAPK1_ENSTGUG00000000606.2",
    "RASGRF2_ENSTGUG00000007037.2",
    "DRD1_ENSTGUG00000000340.2",
    "CRH_ENSTGUG00000011277.2",
]

In [None]:
# AMPA/NMDA receptor associated genes; the full IDs double as plot labels.
genes = labels = nmda

# Cell types shown in this figure
celltypes = [
    "GABAergic neurons 1",
    "GABAergic neurons 2",
    "astrocytes 1",
    "astrocytes 2",
    "glutamatergic neurons 1",
    "glutamatergic neurons 2",
    "glutamatergic neurons 3",
    "glutamatergic neurons 4",
    "microglia 1",
    "microglia 2",
]

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="NMDA_AMPA", fold_change_min=1.5,
)

Heatmap of genes that showed the most difference in glutamatergic neurons:

In [None]:
# vmin = 0
# vmax = [15, 3, 11, 25, 12]
# size = 5

# titles = ["DAPK1", "VWC2L", "CACNG4", "CACNG5", "RASGRF2"]

# for i, gene in enumerate(["DAPK1_ENSTGUG00000000606.2", "VWC2L_ENSTGUG00000002893.2", "CACNG4_ENSTGUG00000004385.2", "CACNG5_ENSTGUG00000026853.1", "RASGRF2_ENSTGUG00000007037.2"]):
#     fig, axs = plt.subplots(figsize=(15,4), ncols=3)

#     condition = ["control", "experiment"]
#     color_map = "Reds"

#     sc.pl.umap(adata_glut[np.char.startswith(nd(adata_glut.obs.batch.values).astype(str), condition[0])], size=size, color=gene, title="{} expression in {}".format(titles[i], condition[0]), vmax=vmax[i], vmin=vmin, color_map=color_map, use_raw=True, show=False, ax=axs[0])
#     sc.pl.umap(adata_glut[np.char.startswith(nd(adata_glut.obs.batch.values).astype(str), condition[1])], size=size, color=gene, title="{} expression in {}".format(titles[i], condition[1]), vmax=vmax[i], vmin=vmin, color_map=color_map, use_raw=True, show=False, ax=axs[1])
#     sc.pl.umap(adata_glut, color="celltype", show=False, ax=axs[2])

# Expression of pre-synaptic proteins

In [None]:
# Find SNARE-associated genes
def _var_names_containing(pattern):
    """Return the sorted adata.var index entries that contain `pattern`."""
    var_names = adata.var.index
    return np.sort(var_names[var_names.str.contains(pattern)])


# Find SNARE-associated genes
synaptotagmin = _var_names_containing("SYT")
syntaxin = _var_names_containing("STX")
ankyrin = _var_names_containing("ANKRD")
SNAP = _var_names_containing("SNAP")
SNARE = _var_names_containing("SNARE")
VAMP = _var_names_containing("VAMP")
nexin = _var_names_containing("SNX")
complexin = _var_names_containing("CPLX")
vps = _var_names_containing("VPS")
# Other proteins possibly involved in receptor transport to the membrane:
PSD = _var_names_containing("PSD")
NSF = _var_names_containing("NSF")

In [None]:
# Stack all gene families found above into one array (family order preserved)
pre_synaptic = np.concatenate(
    (synaptotagmin, syntaxin, ankyrin, SNAP, SNARE, VAMP, nexin, complexin, vps, PSD, NSF)
)

# Get gene names: the part of each index entry before the first "_"
pre_synaptic_labels = [var_name.split("_")[0] for var_name in pre_synaptic]

In [None]:
# Pre-synaptic / vesicle genes, plotted with the custom violinplot helper.
genes, labels, celltypes = pre_synaptic, pre_synaptic_labels, celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="pre-synaptic_vesicle", fold_change_min=1.5,
)

# GABA receptor-associated genes from Ensembl

Use gget to find GABA-associated genes:

In [None]:
# Search terms and species for the gget (Ensembl) query.
# NOTE(review): presumably matched against the Ensembl gene descriptions - confirm in the gget docs.
searchwords = ["gaba", "gamma-aminobutyric acid"]
species = "taeniopygia_guttata"

# limit=None is passed explicitly; presumably "no cap on the number of hits" - TODO confirm
df1 = gget.search(searchwords, species, limit=None)
# Preview the first rows of the search result
df1.head()

In [None]:
# Find index IDs of genes
# Map each Ensembl ID returned by the gget search to its adata.var index entry.
# IDs with no match in adata.var are skipped; indexing [0] on an empty selection
# would otherwise raise IndexError and abort the cell.
GABA = []
for ID in df1["Ensembl_ID"].values:
    matches = adata.var.index[adata.var.index.str.contains(ID)]
    if len(matches) > 0:
        GABA.append(matches[0])

In [None]:
# Genes with the searchwords in the Ensembl description
# (the adata.var index entries collected into GABA); bare expression so the list is displayed
GABA

Violin plot of the genes with the searchwords in the Ensembl description:

In [None]:
# GABA-associated genes; the full index entries double as plot labels.
genes = labels = GABA
celltypes = celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="GABA", fold_change_min=1.5,
)

Same plot but showing only two genes, in the neuronal and astrocyte celltypes:

In [None]:
# Two unnamed Ensembl genes and the hand-written labels they are plotted with
genes, labels = (
    ["_ENSTGUG00000026007.1", "_ENSTGUG00000009131.2"],
    ["GABARAP_ENSTGUG00000026007.1", "GABARR_typeA_ENSTGUG00000009131.2"],
)

# Cell types shown in this figure (no microglia)
celltypes = [
    "GABAergic neurons 1",
    "GABAergic neurons 2",
    "astrocytes 1",
    "astrocytes 2",
    "glutamatergic neurons 1",
    "glutamatergic neurons 2",
    "glutamatergic neurons 3",
    "glutamatergic neurons 4",
]

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="GABA_short", fold_change_min=1.5,
)

In [None]:
# gab_markers = [
#     "GAD1_ENSTGUG00000008060.2",
#     "GAD2_ENSTGUG00000001148.2",
#     "SST_ENSTGUG00000027783.1",
#     "PVALB_ENSTGUG00000010713.2",
#     "CALB1_ENSTGUG00000011825.2",
#     "CALB2_ENSTGUG00000005987.2",
# ]

In [None]:
# genes = gab_markers
# labels = gab_markers
# celltypes = celltypes_standard

# violinplot(
#     adata_exp,
#     adata_ctrl,
#     genes,
#     labels,
#     celltypes,
#     fig_name="GABA_cellmarkers",
#     fold_change_min=1.5,
# )

# Gephyrin
"Recent studies clarified a dynamic regulation of the intracellular trafficking of GABA(A) receptors and its involvement in the pathophysiology of epilepsy. GABA(A) synaptic inhibition decreased in the hippocampal CA1 area of patients with intractable temporal lobe epilepsy (TLE). The reduction of GABAergic inhibition was accompanied by a decrease in the expression of gephyrin, a scaffolding protein, and GABA(A) receptor gamma2 subunit. These findings indicate that the reduction of gephyrin impairs the clustering and fixation of GABA(A) receptors in postsynaptic membranes, leading to a decrease in number of GABA(A) receptor subunits and GABA(A) synaptic inhibition. In contrast, the GABA(A) synaptic inhibition was lastingly potentiated in the dentate gyrus of kindled animals and the expression of GABA(A) receptor subunits(especially alpha2) was significantly increased in TLE patients. It is plausible that the potentiation of dentate GABAergic inhibition counteracts a hyperexcitability of granule cells as a defense mechanism in epilepsy. In status epilepticus, furthermore, the hippocampal GABA(A) receptor beta3 subunits were significantly disphosphorylated, resulting in a facilitation of the endocytosis of GABA(A) receptors and reduced benzodiazepine sensitivity."  
https://reader.elsevier.com/reader/sd/pii/S0306452205004367?token=C076C3C79DF94D756BE63C0A22F4CAC10D216E059BBE9C76A5BD6F318E1372FA9CCCC22C51B44132A529CC586E0344FD&originRegion=us-east-1&originCreation=20220306072456

In [None]:
# Search Ensembl for gephyrin in zebra finch. Called through the gget module,
# which is what the notebook's import cell provides (a bare `search` is not imported
# there), and with the full species name used for the GABA search.
gget.search("gephyrin", "taeniopygia_guttata")

In [None]:
# Show the adata.var row(s) whose index contains this Ensembl gene ID
adata.var[adata.var.index.str.contains("ENSTGUG00000011577")]

In [None]:
# Genes plotted in the gephyrin section (gene ID doubles as label)
gep_markers = [
    "GPHN_ENSTGUG00000011577.2",
    "ZDHHC12_ENSTGUG00000019700.1",
    "NRXN1_ENSTGUG00000005731.2",
]

In [None]:
# Gephyrin-section genes; the full IDs double as plot labels.
genes = labels = gep_markers
celltypes = celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="gephyrin", fold_change_min=1.5,
)

# Zebra finch sodium, calcium and chloride channels from https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-019-5871-2

In [None]:
# Channel gene list: space-separated file without a header row; column 0 is
# used as the search pattern against adata.var.index.
df_temp = pd.read_csv("zebrafinch_Na_Ca_Cl_channels.txt", sep=" ", header=None)

Na_Ca_Cl = []
for ID in df_temp[0].values:
    hits = adata.var.index[adata.var.index.str.contains(ID)]
    # Ignore genes that cannot be found based on the display label
    if len(hits) == 0:
        continue
    # NOTE(review): substring match; only the first hit is kept - confirm it is the intended gene
    Na_Ca_Cl.append(hits[0])

In [None]:
# Sodium, calcium and chloride channel genes; the full IDs double as plot labels.
genes = labels = Na_Ca_Cl
celltypes = celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="Na_Ca_Cl_Mello", fold_change_min=1.5,
)

Select the genes that are up- or down-regulated in glutamatergic and GABAergic neurons:

In [None]:
# Hand-picked subset of the sodium/calcium/chloride channel genes: those up- or
# down-regulated in glutamatergic and GABAergic neurons (see the markdown above).
# Entries starting with "_" are Ensembl genes without a gene symbol.
Na_Ca_Cl_GABA_glut = [
    "SCN3B_ENSTGUG00000000607.2",
    "SCN4B_ENSTGUG00000000230.2",
    "ASIC2_ENSTGUG00000003215.2",
    "ASIC4_ENSTGUG00000006157.2",
    "SCNN1B_ENSTGUG00000005936.2",
    "CACNA2D3_ENSTGUG00000006966.2",
    "CACNG3_ENSTGUG00000006208.2",
    "CACNB2_ENSTGUG00000001247.2",
    "ITPR3_ENSTGUG00000001798.2",
    "_ENSTGUG00000011653.2",
    "_ENSTGUG00000007338.2",
    "CLCN2_ENSTGUG00000010323.2",
    "_ENSTGUG00000002023.2",
    "ANO1_ENSTGUG00000005385.2",
    "ANO4_ENSTGUG00000009008.2",
    "ANO5_ENSTGUG00000004532.2",
    "ANO6_ENSTGUG00000006024.2",
    "ANO10_ENSTGUG00000003830.2",
    "BEST1_ENSTGUG00000005934.2",
    "LRRC8A_ENSTGUG00000004233.2",
    "LRRC8B_ENSTGUG00000006230.2",
    "CFTR_ENSTGUG00000004828.2",
    "CLNS1A_ENSTGUG00000013030.2",
]

In [None]:
# Selected Na/Ca/Cl channel genes; the full IDs double as plot labels.
genes = labels = Na_Ca_Cl_GABA_glut
celltypes = celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="Na_Ca_Cl_GABA_glut_Mello", fold_change_min=1.5,
)

# Zebra finch potassium channels from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3711925/

In [None]:
# Potassium channel gene list: space-separated file without a header row;
# column 0 is used as the search pattern against adata.var.index.
df_temp = pd.read_csv("zebrafinch_K_channels.txt", sep=" ", header=None)

K = []
for ID in df_temp[0].values:
    hits = adata.var.index[adata.var.index.str.contains(ID)]
    # Ignore genes that cannot be found based on the display label
    if len(hits) == 0:
        continue
    # NOTE(review): substring match; only the first hit is kept - confirm it is the intended gene
    K.append(hits[0])

In [None]:
# Potassium channel genes; the full IDs double as plot labels.
genes = labels = K
celltypes = celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="K_Mello", fold_change_min=1.5,
)

Select only those genes that are up- or down-regulated in GABA-/glutamatergic neurons:

In [None]:
# Hand-picked subset of the potassium channel genes: those up- or down-regulated
# in GABA-/glutamatergic neurons. Every gene is listed exactly once (KCNAB1 and
# KCNB2 used to be repeated near the end of the list and would have been passed
# to the plot twice). Entries starting with "_" are Ensembl genes without a symbol.
K_GABA_glut = [
    "KCNAB1_ENSTGUG00000011253.2",
    "KCNB2_ENSTGUG00000011511.2",
    "KCNS1_ENSTGUG00000005049.2",
    "KCND2_ENSTGUG00000004691.2",
    "KCNIP1_ENSTGUG00000014900.2",
    "KCNIP4_ENSTGUG00000009548.2",
    "_ENSTGUG00000003202.2",
    "KCNH5_ENSTGUG00000012967.2",
    "KCNH6_ENSTGUG00000001965.2",
    "KCNH7_ENSTGUG00000006950.2",
    "KCNQ2_ENSTGUG00000007434.2",
    "KCNQ5_ENSTGUG00000012688.2",
    "KCNN1_ENSTGUG00000014658.2",
    "KCNT2_ENSTGUG00000004186.2",
    "KCNJ2_ENSTGUG00000002878.2",
    "KCNJ9_ENSTGUG00000000586.2",
    "KCNJ15_ENSTGUG00000004997.2",
    "KCNK1_ENSTGUG00000010202.2",
    "KCNK2_ENSTGUG00000002899.2",
    "KCNK10_ENSTGUG00000012403.2",
    "KCNK12_ENSTGUG00000005557.2",
    "HCN2_ENSTGUG00000000617.2",
    "KCTD6_ENSTGUG00000009191.2",
    "KCTD17_ENSTGUG00000010672.2",
    "_ENSTGUG00000012696.2",
    "_ENSTGUG00000005525.2",
]

In [None]:
# Selected potassium channel genes; the full IDs double as plot labels.
genes = labels = K_GABA_glut
celltypes = celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="K_GABA_glut_Mello", fold_change_min=1.5,
)

# General cell cycle progression and developmental events

In [None]:
# Cell-cycle / developmental genes as (gene ID, display label) pairs.
# The first entry has no gene symbol in its ID and is labelled by hand.
_other_gene_table = [
    ("_ENSTGUG00000029762.1", "HDAC7"),
    ("CSNK1D_ENSTGUG00000003904.2", "CSNK1D"),
    ("TUBB4B_ENSTGUG00000019371.1", "TUBB4B"),
    ("IFI6_ENSTGUG00000021692.1", "IFI6"),
]
other_genes = [gene_id for gene_id, _label in _other_gene_table]
other_gene_names = [label for _gene_id, label in _other_gene_table]

Abbreviations:  
HDAC7 = Histone deacetylase 7-like  
CSNK1D = Casein Kinase 1 Delta  
TUBB4B = Tubulin Beta 4B Class IVb  
IFI6 = Interferon Alpha Inducible Protein 6

In [None]:
# Cell-cycle / developmental genes, plotted with the custom violinplot helper.
genes, labels, celltypes = other_genes, other_gene_names, celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="cellcycle_dev", fold_change_min=2,
)

In [None]:
# One figure per cell-cycle / developmental gene with three UMAP panels:
# control cells, experiment cells, and the cell-type annotation of all cells.
vmin = 0
vmax = [5.5, 5.2, 4.2, 6]  # upper colour limit per gene, same order as other_genes
size = 5
condition = ["control", "experiment"]
color_map = "Reds"

# Batch labels as plain strings so they can be prefix-matched against the condition names
batch_labels = nd(adata.obs.batch.values).astype(str)

for i, gene in enumerate(other_genes):
    fig, axs = plt.subplots(figsize=(15, 4), ncols=3)

    # Panels 0 and 1: the same gene in the cells of each condition
    for panel, cond in enumerate(condition):
        sc.pl.umap(
            adata[np.char.startswith(batch_labels, cond)],
            size=size,
            color=gene,
            title="{} expression in {}".format(other_gene_names[i], cond),
            vmin=vmin,
            vmax=vmax[i],
            color_map=color_map,
            use_raw=True,
            show=False,
            ax=axs[panel],
        )

    # Panel 2: cell-type annotation for orientation
    sc.pl.umap(adata, color="celltype", show=False, ax=axs[2])

# Are immediate early genes (IEGs) upregulated in experiment?

In [None]:
# Get IEGs
# Get IEGs: index entries of marker_gene_mat with a non-zero value in the
# "immediate early genes" column
IEG = marker_gene_mat.loc[marker_gene_mat["immediate early genes"] != 0].index.values

# Get gene names: the part of the index entry before the first "_".
# An entry without a gene symbol (of the form "_ENSTGUG...") would give an
# empty label, so fall back to the full index entry in that case.
IEG_labels = [gene_id.split("_")[0] or gene_id for gene_id in IEG]

In [None]:
# Immediate early genes, plotted with the custom violinplot helper.
genes, labels, celltypes = IEG, IEG_labels, celltypes_standard

violinplot(
    adata_exp,
    adata_ctrl,
    genes,
    labels,
    celltypes,
    fig_name="IEGs",
    fold_change_min=2,
)

# Are immune response genes upregulated in experiment?
Mitochondrial: MCU  
Apoptosis: AIFM1, CASP3  

From https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-019-6016-3  
Metabolic process: APOD, LRAT, RBP4    
Immunity-related transcripts (in hippocampus): IL-1β, RSAD2, SOCS3, CTGF, GPR75, IRF1, EIF4EBP1    
Red blood cells: CCL5, TLR3, LRRK1, SAAL1, MAP3K8, IFIH1  
Significantly up in all tissues: STEAP4, TOR1B, ZNFX1  

In [None]:
# Immune response genes: index entries of marker_gene_mat with a non-zero value
# in the "immune response" column
immune_response = marker_gene_mat.loc[marker_gene_mat['immune response'] != 0].index.values

# Labels: the part of the index entry before the first "_".
# An entry without a gene symbol (of the form "_ENSTGUG...") would give an
# empty label, so fall back to the full index entry in that case.
immune_response_labels = [gene_id.split("_")[0] or gene_id for gene_id in immune_response]

In [None]:
# Immune response genes, plotted with the custom violinplot helper.
genes, labels, celltypes = immune_response, immune_response_labels, celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="immune_response", fold_change_min=2,
)

# Plot the expression of GABAergic neurons, glutamatergic neurons, microglia and astrocytes DE genes (control vs. exp) across all celltypes
DE genes here are the top 10 highest z-score genes for each celltype (control versus experiment) (including all clusters for that celltype). 

In [None]:
# Union of the "experiment" marker genes of the four cell-type marker tables.
# np.unique de-duplicates and sorts the IDs.
_exp_marker_tables = (mglut_markers, mgaba_markers, mglia_markers, astro_markers)
all_DE_exp = np.unique(
    [
        gene_id
        for marker_table in _exp_marker_tables
        for gene_id in marker_table["experiment"].values.tolist()
    ]
)

# Hand-written display names, one per entry of all_DE_exp.
# NOTE(review): these are matched to all_DE_exp purely by position (the sorted
# np.unique order) - re-check the list whenever the marker tables change.
all_DE_exp_names = [
    "Apolipoprotein A1 (APOA1)",
    "Bestrophin 1 (BEST1)",
    "Clusterin (CLU)",
    "Casein Kinase 1 Delta (CSNK1D)",
    # Fixed label: the "2" belongs to the gene name, not inside the symbol ("(2ENPP2)")
    "Ectonucleotide Pyrophosphatase/Phosphodiesterase 2\n (ENPP2)",
    "Heat Shock Protein 90 Beta Family Member 1\n (HSP90B1)",
    "Heat Shock Protein Family A (Hsp70) Member 5\n (HSPA5)",
    "Interferon Alpha Inducible Protein 6 (IFI6)",
    "Ribosomal Protein L17 (RPL17)",
    "Ribosomal Protein L23 (RPL23)",
    "Ribosomal Protein L6 (RPL6)",
    "Ribosomal Protein S20 (RPS20)",
    "Ribosomal Protein S7 (RPS7)",
    "Serglycin (SRGN)",
    "Tubulin Beta 4B (TUBB4B)",
    "Small Nuclear Ribonucleoprotein G (SNRPG)",
    "Beta-2-microglobulin (B2M)",
    "Sodium Voltage-gated Channel α Subunit 2\n (SCN2A)",
    "HRAS suppressor-like",
    "C-C Motif Chemokine 3-like (CCL3)",
    "Ferritin (higher subunit)",
    "Class I histocompatibility antigen,\n F10 α chain-like 1",
    "ENSTGUG00000019671\n (novel protein-coding gene)",
    "ENSTGUG00000021104\n (novel protein-coding gene)",
    "Extracellular fatty acid-binding protein-like",
    "ENSTGUG00000022493\n (lncRNA)",
    "ENSTGUG00000023828\n (lncRNA)",
    "ENSTGUG00000024196\n (lncRNA)",
    "Ribosomal Protein S18-like",
    "ENSTGUG00000026858\n (lncRNA)",
    "Ribonuclease Kappa 1",
    "ENSTGUG00000028243\n (lncRNA)",
    "Class I histocompatibility antigen,\n F10 α chain-like 2",
    "ENSTGUG00000028479\n (novel protein-coding gene)",
    "ENSTGUG00000028531\n (novel protein-coding gene)",
    "Ribonuclease Kappa 2",
    "ENSTGUG00000029343\n (novel protein-coding gene)",
    "Histone Deacetylase 7-like (HDAC7)",
]

# ID/name table, ordered alphabetically by display name for plotting
# (raises ValueError if the two columns differ in length)
df_DE_exp = pd.DataFrame(
    {"gene_ID": all_DE_exp, "gene_name": all_DE_exp_names}
).sort_values("gene_name")
df_DE_exp

In [None]:
# Experiment DE genes (IDs) and their display names, in the table's sorted order
genes, labels = df_DE_exp["gene_ID"].values, df_DE_exp["gene_name"].values
celltypes = celltypes_standard

violinplot(
    adata_exp,
    adata_ctrl,
    genes,
    labels,
    celltypes,
    fig_name="DE_EXP",
)

In [None]:
# Union of the "control" marker genes of the four cell-type marker tables.
# np.unique de-duplicates and sorts the IDs.
_ctrl_marker_tables = (mglut_markers, mgaba_markers, mglia_markers, astro_markers)
all_DE_ctrl = np.unique(
    [
        gene_id
        for marker_table in _ctrl_marker_tables
        for gene_id in marker_table["control"].values.tolist()
    ]
)

# Hand-written display names, one per entry of all_DE_ctrl.
# NOTE(review): these are matched to all_DE_ctrl purely by position (the sorted
# np.unique order) - re-check the list whenever the marker tables change.
all_DE_ctrl_names = [
    "Adenylosuccinate Synthase 1 (ADSS1)",
    "Arachidonate 5-Lipoxygenase (ALOX5)",
    "Basic, Immunoglobulin-Like Variable Motif Containing\n (BIVM)",
    "Fatty Acid Binding Protein 7 (FABP7)",
    "Filamin C (FLNC)",
    "Fms Related Receptor Tyrosine Kinase 3 (FLT3)",
    "Hyaluronan And Proteoglycan Link Protein 4 (HAPLN4)",
    "Heme Oxygenase 1 (HMOX1)",
    "Insulin Like Growth Factor 2 (IGF2)",
    "Microsomal Glutathione S-Transferase 1 (MGST1)",
    "Neurotensin (NTS)",
    "Purinergic Receptor P2Y12 (P2RY12)",
    "Proenkephalin (PENK)",
    "Prosaposin (PSAP)",
    "Ribosomal Protein L17 (RPL17)",
    "Semaphorin 3E (SEMA3E)",
    "Solute Carrier Family 6 Member 6 (SLC6A6)",
    "Synaptotagmin 2 (SYT2)",
    "Synaptotagmin 4 (SYT4)",
    "Urotensin 2B (UTS2B)",
    "WRE",
    "ENSTGUG00000004224\n (novel protein-coding gene)",
    "Beta-1,4-Galactosyltransferase 6 (B4GALT6)",
    "ENSTGUG00000019084\n (novel protein-coding gene)",
    "ENSTGUG00000019157\n (novel protein-coding gene)",
    "ENSTGUG00000019418\n (novel protein-coding gene)",
    "ENSTGUG00000020332\n (lncRNA)",
    "ENSTGUG00000021421\n (lncRNA)",
    "ENSTGUG00000022636\n (novel protein-coding gene)",
    "ENSTGUG00000023435\n (novel protein-coding gene)",
    "ENSTGUG00000023451\n (lncRNA)",
    "ENSTGUG00000026378\n (novel protein-coding gene)",
    "ENSTGUG00000027801\n (novel protein-coding gene)",
    "ENSTGUG00000027913\n (novel protein-coding gene)",
    "ENSTGUG00000028256\n (novel protein-coding gene)",
    "ENSTGUG00000029050\n (lncRNA)",
    "Tubulin Beta-7 chain",
    "All-Trans Retinoic Acid Induced\n Differentiation Factor (ATRAID)",
    "ENSTGUG00000029616\n (novel protein-coding gene)",
    "ENSTGUG00000029645\n (novel protein-coding gene)",
]

# ID/name table, ordered alphabetically by display name for plotting
df_DE_ctrl = pd.DataFrame(
    {"gene_ID": all_DE_ctrl, "gene_name": all_DE_ctrl_names}
).sort_values("gene_name")
df_DE_ctrl

In [None]:
# Control DE genes (IDs) and their display names, in the table's sorted order
genes, labels = df_DE_ctrl["gene_ID"].values, df_DE_ctrl["gene_name"].values
celltypes = celltypes_standard

violinplot(adata_exp, adata_ctrl, genes, labels, celltypes, fig_name="DE_CTRL")

Follow-up on all-trans retinoic acid induced differentiation factor (ATRAID) based on https://pubmed.ncbi.nlm.nih.gov/10985355/

In [None]:
# ALDH1A genes as (gene ID, display label) pairs; the ALDH1L2 entry is
# kept commented out.
_aldh_table = [
    ("ALDH1A1_ENSTGUG00000000800.2", "ALDH1A1"),
    ("ALDH1A2_ENSTGUG00000006178.2", "ALDH1A2"),
    ("ALDH1A3_ENSTGUG00000008854.2", "ALDH1A3"),
    #     ("ALDH1L2_ENSTGUG00000011095.2", "ALDH1L2"),
]
aldh = [gene_id for gene_id, _label in _aldh_table]
aldh_names = [label for _gene_id, label in _aldh_table]

genes, labels, celltypes = aldh, aldh_names, celltypes_standard

violinplot(
    adata_exp, adata_ctrl, genes, labels, celltypes,
    fig_name="ALDHs", fold_change_min=1.5,
)

# DE genes from analysis of separately clustered exp and ctrl data
DE genes here are a selection of the top 10 highest z-score genes for each celltype (control versus experiment) (including all clusters for that celltype) from the separately clustered experiment and control data.

In [None]:
# Selected DE genes from the separately clustered experiment and control data.
# Bare "#" lines separate the original groups of entries.
# NOTE(review): NTS_ENSTGUG00000027877.1 and NPTX2_ENSTGUG00000008623.2 each
# appear in two groups, so they occur twice in this list - confirm this is intended.
all_markers = [
    "SLC17A6_ENSTGUG00000004570.2",
    "UTS2B_ENSTGUG00000009278.2",
    "HPCAL1_ENSTGUG00000013044.2",
    "ALDH1A2_ENSTGUG00000006178.2",
    "DCN_ENSTGUG00000008178.2",
    "FNDC9_ENSTGUG00000000654.2",
    "NTS_ENSTGUG00000027877.1",
    #
    "_ENSTGUG00000008926.2",
    "PENK_ENSTGUG00000011112.2",
    "CXCL14_ENSTGUG00000017226.2",
    "_ENSTGUG00000001973.2",
    "NPTX2_ENSTGUG00000008623.2",
    #
    "CCK_ENSTGUG00000004682.2",
    "NTS_ENSTGUG00000027877.1",
    "CHL1_ENSTGUG00000009997.2",
    "PTN_ENSTGUG00000025134.1",
    "VIP_ENSTGUG00000020465.1",
    #
    "GFRA1_ENSTGUG00000018882.1",
    "SCUBE1_ENSTGUG00000012016.2",
    "CACNA1G_ENSTGUG00000009049.2",
    "GRIA2_ENSTGUG00000005484.2",
    "GRIA4_ENSTGUG00000012676.2",
    #
    "BDNF_ENSTGUG00000004743.2",
    "FOSL2_ENSTGUG00000024611.1",
    "HOMER1_ENSTGUG00000003711.2",
    "ARC_ENSTGUG00000012727.2",
    "TNFAIP8L3_ENSTGUG00000028388.1",
    "EGR1_ENSTGUG00000000003.2",
    #
    "TBR1_ENSTGUG00000006709.2",
    "FABP7_ENSTGUG00000011826.2",
    "NOTCH2_ENSTGUG00000017231.2",
    "NELL1_ENSTGUG00000004458.2",
    #
    "_ENSTGUG00000017273.2",
    "_ENSTGUG00000028417.1",
    "_ENSTGUG00000004607.2",
    #
    "NPTXR_ENSTGUG00000024678.1",
    "NPTX1_ENSTGUG00000025297.1",
    "NPTX2_ENSTGUG00000008623.2",
    "NRXN1_ENSTGUG00000005731.2",
    "CNTN4_ENSTGUG00000010102.2",
    "CACNA2D1_ENSTGUG00000002536.2",
]

In [None]:
# DE genes from the separately clustered data; the full IDs double as plot labels.
genes = labels = all_markers
celltypes = celltypes_standard

violinplot(
    adata_exp,
    adata_ctrl,
    genes,
    labels,
    celltypes,
    "sep_DE_genes",
)

# Marker for injury-induced transcriptional reprogramming
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7590250/  
Abstract: Primary somatosensory neurons are specialized to transmit specific types of sensory information through differences in cell size, myelination, and the expression of distinct receptors and ion channels, which together define their transcriptional and functional identity. By profiling sensory ganglia at single-cell resolution, we find that all somatosensory neuronal subtypes undergo a similar transcriptional response to peripheral nerve injury that both promotes axonal regeneration and suppresses cell identity. This transcriptional reprogramming, which is not observed in non-neuronal cells, resolves over a similar time course as target reinnervation and is associated with the restoration of original cell identity. Injury-induced transcriptional reprogramming requires ATF3, a transcription factor which is induced rapidly after injury and necessary for axonal regeneration and functional recovery. Our findings suggest that transcription factors induced early after peripheral nerve injury likely confer the cellular plasticity required for sensory neurons to transform into a regenerative state.

In [None]:
# ATF3 (injury marker discussed above) and WRE, with short display labels
genes, labels = ["ATF3_ENSTGUG00000003050.2", "WRE_WRE"], ["ATF3", "WRE"]
celltypes = celltypes_standard

violinplot(
    adata_exp,
    adata_ctrl,
    genes,
    labels,
    celltypes,
    "injury",
)

# Markers for cell identity maintenance
https://www.nature.com/articles/nn.2623  
Abstract: Transcriptional cascades are required for the specification of serotonin (5-HT) neurons and behaviors modulated by 5-HT. Several cascade factors are expressed throughout the lifespan, which suggests that their control of behavior might not be temporally restricted to programming normal numbers of 5-HT neurons. We used new mouse conditional targeting approaches to investigate the ongoing requirements for Pet-1 (also called Fev), a cascade factor that is required for the initiation of 5-HT synthesis, but whose expression persists into adulthood. We found that Pet-1 was required after the generation of 5-HT neurons for multiple steps in 5-HT neuron maturation, including axonal innervation of the somatosensory cortex, expression of appropriate firing properties, and the expression of the Htr1a and Htr1b autoreceptors. Pet-1 was still required in adult 5-HT neurons to preserve normal anxiety-related behaviors through direct autoregulated control of serotonergic gene expression. These findings indicate that Pet-1 is required across the lifespan of the mouse and that behavioral pathogenesis can result from both developmental and adult-onset alterations in serotonergic transcription.

https://pubmed.ncbi.nlm.nih.gov/2576011/  
Abstract: The homeo-box-containing gene mec-3 of the nematode Caenorhabditis elegans, is expressed in several sensory neurons, as assayed by expression of a mec-3-lacZ fusion. These cells are the touch receptors, which mediate the response to gentle touch, and the FLP and PVD neurons. PVD mediates a response to harsh mechanical stimuli, and FLP has an ultrastructure suggestive of a mechanoreceptor, but its function is unknown. mec-3 is necessary for the differentiation of the touch receptors, because in mec-3 mutants, the touch receptors do not function and have none of their distinguishing features. mec-3 is also needed for PVD function: The PVD neurons no longer mediate a response to harsh mechanical stimuli in the mutants. The expression of the mec-3-lacZ fusion, and presumably mec-3 itself, is altered by mutations in several genes originally identified by their effects on touch cell development. unc-86, another homeo-box-containing gene, is necessary for all mec-3-lacZ expression, but also affects several other lineages and cells in which mec-3 is not expressed. mec-3 activity appears to be required for maintained expression of the mec-3-lacZ fusion in all cells in which it is expressed. In a mec-17 mutant, mec-3-lacZ expression is not maintained in the touch receptors, but is not affected in the FLP and PVD neurons. These findings suggest that combinatorial mechanisms of gene regulation control both the expression of mec-3 itself and its action in promoting the terminal differentiation of various cell types.

In [None]:
# Gene IDs to plot; labels[i] is the display name for genes[i]
genes = [
    "FEV_ENSTGUG00000027918.1",
    "_ENSTGUG00000002776.2",
    "HTR1B_ENSTGUG00000012613.2",
    "_ENSTGUG00000007091.2",
]
labels = [
    "Pet-1 / Fev",
    "Htr1a",
    "Htr1b",
    "HUS1 / mec-3?",
]
celltypes = celltypes_standard

violinplot(
    adata_exp,
    adata_ctrl,
    genes,
    labels,
    celltypes,
    "cell_identity_maintenance",
)

___

# Plot correlation R2 between genes per celltype comparing control and exp
### !!! Is chromosomal location of correlated genes close???

___

# "Fake" bulk RNA seq disregarding celltypes
Compare to findings from https://www.nature.com/articles/s41593-019-0419-y.pdf

___

# Distance calculations

Calculate distance between PCA centroids within clusters

DE on highest distance clusters

___

## Within celltype clusters, does the expression of (marker) genes differ between batches?

In [None]:
# gene_list = df_leiden["celltype"].values
# gene_list.sort()
# gene_list

In [None]:
# for celltype in gene_list:
#     sc.pl.stacked_violin(adata[adata.obs["{}".format("celltype")]=="{}".format(celltype)], 
#                          marker_gene_mat.index.values, 
#                          groupby='batch', 
#                          title="{}".format(celltype), 
#                          rotation=90, 
#                          sharey=True,
#                          use_raw=False
#                         )

___

## Pairwise differential expression - Volcano plots

### Which genes are significantly different between the control and experiment batches?

Compute arrays with mean number of counts for each gene for each dataset (containing two batches each):

In [None]:
# Per-gene mean over all cells of each condition (control1+control2 and
# experiment1+experiment2). The `[0]` takes the first row of the 2D result
# returned by `.X.mean(axis=0)`, leaving one value per gene.
control, experiment = (
    np.array(adata[condition_mask].X.mean(axis=0))[0]
    for condition_mask in (control_mask, experiment_mask)
)

Compute dataframe comparing the two datasets:

In [None]:
# Densified expression matrices of the control and experiment cells
matrix_1 = adata[control_mask].X.todense()
matrix_2 = adata[experiment_mask].X.todense()

# Welch's t-test (unequal variances), control vs. experiment; ttest_ind's
# default axis=0 compares the two groups column by column
welch_result = stats.ttest_ind(matrix_1, matrix_2, equal_var=False)

# One row per gene: name and mean value in each condition
df_volcano = pd.DataFrame(
    {
        "GeneNames": adata.var.index,
        "control": control,
        "experiment": experiment,
    }
)

# Since we already normalized and logged the values, the log fold change is
# simply the difference of the means (control minus experiment); undefined
# differences are set to 0.
df_volcano["logFC"] = (df_volcano["control"] - df_volcano["experiment"]).fillna(0)
df_volcano["p-value"] = welch_result.pvalue

df_volcano.head()

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))

# Significance thresholds drawn as dashed lines
alpha = 0.05
xline = np.log(1.5)  # Set cutoff such that fold change cutoff is 1.5x
yline = -np.log10(alpha)

x = df_volcano["logFC"].values
# A p-value of exactly 0 maps to +inf; that is expected here, so silence the
# divide-by-zero warning instead of letting it clutter the output.
with np.errstate(divide="ignore"):
    y = -np.log10(df_volcano["p-value"].values.astype(float))

s = 10
a = 0.5

# All genes in grey ...
ax.scatter(x, y, color="grey", s=s, alpha=a)

# ... and genes passing both the fold change and the p-value cutoff in red
mask1 = np.logical_and(x > xline, y > yline)
mask2 = np.logical_and(x < -xline, y > yline)
mask = np.logical_or(mask1, mask2)
ax.scatter(x[mask], y[mask], color="r", s=s, alpha=1)

ax.axvline(x=-xline, color="grey", linestyle="--", zorder=-1)
ax.axhline(y=yline, color="grey", linestyle="--", zorder=-1)
ax.axvline(x=xline, color="grey", linestyle="--", zorder=-1)

ax.set(**{
    "xlabel": "$log_e$ Fold Change",
    "ylabel": "$-log_{10}$ p-value",
    "title": "Volcano plot control vs. experiment across entire datasets"
})

# Must be set before saving, otherwise the saved files do not include it
ax.set_axisbelow(True)

fig.savefig("figures/4_volcano_exp-vs-ctrl_all.png", dpi=300, bbox_inches="tight")
fig.savefig("figures/4_volcano_exp-vs-ctrl_all.pdf", dpi=300, bbox_inches="tight")

# plt.show() instead of fig.show(): the latter warns on non-GUI (inline) backends
plt.show()

# !To do: Restrict this to genes with a minimum number of UMIs to remove outliers?

# Repeat per celltype - glutamatergic neurons

Compute arrays with mean number of counts for each gene for each dataset (containing two batches each):

In [None]:
# True for every cell whose celltype label starts with "glutamatergic neurons",
# i.e. all glutamatergic clusters together
is_glut = np.char.startswith(
    nd(adata.obs.celltype.values).astype(str), "glutamatergic neurons"
)

# Per-gene mean over the glutamatergic cells of each condition
control_glut = np.array(
    adata[np.logical_and(control_mask, is_glut)].X.mean(axis=0)
)[0]
experiment_glut = np.array(
    adata[np.logical_and(experiment_mask, is_glut)].X.mean(axis=0)
)[0]

Compute dataframe comparing the two datasets:

In [None]:
# True for every cell belonging to any glutamatergic neuron cluster
glut_cells = np.char.startswith(
    nd(adata.obs.celltype.values).astype(str), "glutamatergic neurons"
)

# Densified expression matrices of the glutamatergic cells of each condition
matrix_1_glut = adata[np.logical_and(control_mask, glut_cells)].X.todense()
matrix_2_glut = adata[np.logical_and(experiment_mask, glut_cells)].X.todense()

# Welch's t-test (unequal variances), control vs. experiment
welch_result_glut = stats.ttest_ind(matrix_1_glut, matrix_2_glut, equal_var=False)

# One row per gene: name and mean value in each condition
df_volcano_glut = pd.DataFrame(
    {
        "GeneNames": adata.var.index,
        "control": control_glut,
        "experiment": experiment_glut,
    }
)

# Since we already normalized and logged the values, the log fold change is
# simply the difference of the means (control minus experiment); undefined
# differences are set to 0.
df_volcano_glut["logFC"] = (
    df_volcano_glut["control"] - df_volcano_glut["experiment"]
).fillna(0)
df_volcano_glut["p-value"] = welch_result_glut.pvalue

df_volcano_glut.head()

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))

df_volc = df_volcano_glut

# Significance thresholds drawn as dashed lines
alpha = 0.05
xline = np.log(1.5)  # Set cutoff such that fold change cutoff is 1.5x
yline = -np.log10(alpha)

x = df_volc["logFC"].values
# A p-value of exactly 0 maps to +inf; that is expected here, so silence the
# divide-by-zero warning instead of letting it clutter the output.
with np.errstate(divide="ignore"):
    y = -np.log10(df_volc["p-value"].values.astype(float))

s = 10
a = 0.5

# All genes in grey ...
ax.scatter(x, y, color="grey", s=s, alpha=a)

# ... and genes passing both the fold change and the p-value cutoff in red
mask1 = np.logical_and(x > xline, y > yline)
mask2 = np.logical_and(x < -xline, y > yline)
mask = np.logical_or(mask1, mask2)
ax.scatter(x[mask], y[mask], color="r", s=s, alpha=1)

ax.axvline(x=-xline, color="grey", linestyle="--", zorder=-1)
ax.axhline(y=yline, color="grey", linestyle="--", zorder=-1)
ax.axvline(x=xline, color="grey", linestyle="--", zorder=-1)

ax.set(**{
    "xlabel": "$log_e$ Fold Change",
    "ylabel": "$-log_{10}$ p-value",
    "title": "Volcano plot control vs. experiment within glutamatergic neurons"
})

# Must be set before saving, otherwise the saved files do not include it
ax.set_axisbelow(True)

fig.savefig("figures/4_volcano_exp-vs-ctrl_glut.png", dpi=300, bbox_inches="tight")
fig.savefig("figures/4_volcano_exp-vs-ctrl_glut.pdf", dpi=300, bbox_inches="tight")

# plt.show() instead of fig.show(): the latter warns on non-GUI (inline) backends
plt.show()

___

# UpSet plot of perturbed genes per celltype

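Create data frame with data for UpSet plot. True values define perturbed genes. A gene counts as perturbed in a celltype if its p-value (Welch's t-test, control vs. experiment) is below the Bonferroni-corrected threshold (0.05 divided by the number of celltypes × number of genes) and its absolute log fold change exceeds log(1.5), i.e. a fold change > 1.5 in either direction.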

In [None]:
# Define alpha with Bonferroni correction (one test per celltype per gene)
k = len(celltypes_standard) * len(adata.var.index.values)
alpha = 0.05 / k

# Define minimum (natural log) fold change
min_fold_change = np.log(1.5)

# Celltype label of every cell as plain strings, computed once for all iterations
celltype_labels = nd(adata.obs.celltype.values).astype(str)
control_cells = np.asarray(control_mask)
experiment_cells = np.asarray(experiment_mask)

df_upset = pd.DataFrame()
df_upset["gene"] = adata.var.index.values

for celltype in celltypes_standard:
    # for celltype in adata.obs["celltype"].unique():

    # Select the cluster by exact label. A prefix match would also pull in
    # e.g. "... 10" when asking for "... 1"; exact equality is also how the
    # per-batch violin plots of the DE genes select individual clusters.
    in_celltype = celltype_labels == celltype
    adata_control_ct = adata[np.logical_and(control_cells, in_celltype)]
    adata_experiment_ct = adata[np.logical_and(experiment_cells, in_celltype)]

    ## Compute logFC (difference of the per-gene means, control minus experiment)
    control_mean = nd(adata_control_ct.X.mean(axis=0))
    experiment_mean = nd(adata_experiment_ct.X.mean(axis=0))
    df_upset[f"logFC_{celltype}"] = control_mean - experiment_mean

    ## Compute p-value (Welch's t-test on the densified matrices)
    matrix_1 = adata_control_ct.X.toarray()
    matrix_2 = adata_experiment_ct.X.toarray()
    _, df_upset[f"p-value_{celltype}"] = stats.ttest_ind(
        matrix_1, matrix_2, equal_var=False
    )

    # Add True/False column to indicate whether the genes are perturbed for this celltype
    # Perturbed (True) is defined as p-value < Bonferroni-corrected alpha and
    # |logFC| > log(1.5); NaN p-values or fold changes compare as False.
    df_upset[celltype] = (df_upset[f"p-value_{celltype}"] < alpha) & (
        df_upset[f"logFC_{celltype}"].abs() > min_fold_change
    )

In [None]:
# Preview the first rows: gene name plus, per celltype, logFC, p-value and the perturbed flag
df_upset.head()

In [None]:
# Look at barplot of (raw) counts of a marker gene
fig, ax = plt.subplots()

# Raw counts of the gene in every cell, flattened to 1D
arr = nd(adata.raw.X[:, adata.var.index == '_ENSTGUG00000017273.2'].todense())
# Distinct count values and the number of cells showing each of them
uniq, counts = np.unique(arr, return_counts=True)

# Exclude counts of 0. Selecting by value (rather than dropping the first bin)
# stays correct even if no cell has a zero count.
nonzero = uniq > 0
x = uniq[nonzero]
height = counts[nonzero]

ax.bar(x, height)
ax.set(xlabel="Raw count per cell", ylabel="Number of cells")
plt.show()

Make all celltype columns index for plotting with package 'upsetplot':

In [None]:
# celltypes = np.unique(adata.obs["celltype"].values)
celltypes = celltypes_standard

# Move the per-celltype True/False columns into the index for the UpSet plot
membership_columns = [celltype for celltype in celltypes]
df_upset2 = df_upset.set_index(membership_columns)

Plot UpSet plot:

In [None]:
# help(UpSet)

In [None]:
# Intersections with at least this many perturbed genes are highlighted in red
min_subset_size = 15

# min_degree=1 and show_counts=True are passed straight through to UpSet
upset = UpSet(df_upset2, min_degree=1, show_counts=True)

upset.style_subsets(
    min_subset_size=min_subset_size,
    facecolor="red",
    # Derive the legend text from the threshold so the two cannot drift apart
    label=f"Number of perturbed genes ≥ {min_subset_size}",
)

upset.plot()
plt.tight_layout()
plt.show()

### Plot DE genes in violin plot per dataset

In [None]:
# One AnnData subset per individual batch
adata_ctrl1, adata_ctrl2, adata_exp1, adata_exp2 = (
    adata[adata.obs["batch"] == batch_name]
    for batch_name in ("control1", "control2", "experiment1", "experiment2")
)

# celltypes = list(adata.obs["celltype"].unique())
celltypes = celltypes_standard  # Only includes clusters with > 100 cells

# Violin colors, used in the order control1, control2, experiment1, experiment2
colors = [
    "#0571b0",
    "#a6cee3",
    "#e66101",
    "#fdb863",
]

In [None]:
# For every cluster in `celltypes`, draw one figure with one row per gene flagged
# as perturbed for that cluster in df_upset. Each row shows, for every celltype,
# four violins (control1, control2, experiment1, experiment2).
fig_name = "DE-gene-expression-per-batch"

# The four batches in plotting order; colors[i] and batch_offsets[i] belong to batch_adatas[i]
batch_adatas = [adata_ctrl1, adata_ctrl2, adata_exp1, adata_exp2]
# Horizontal offset of each batch's violin relative to the celltype position
batch_offsets = [-0.9, -0.3, 0.3, 0.9]

# x-position of each celltype group (spaced 4 apart to fit the four violins)
lidx = np.arange(len(celltypes)) * 4

## Find the indices of the celltypes/clusters to be used
# If the first celltype does not contain a number (e.g. "microglia" versus "microglia_1"),
# use the general celltype (celltype_g) to find all of the clusters for that celltype
# (this is the case for data from the separately clustered dataset).
# Else, use individual clusters.
# This only depends on `celltypes`, so it is done once instead of once per cluster.
if not any(map(str.isdigit, celltypes[0])):
    print("Including all clusters for each celltype (obs column: celltype_g).")
    celltype_column = "celltype_g"
else:
    print("Individual cluster analysis (obs column: celltype).")
    celltype_column = "celltype"

# celltype_idx_per_batch[b][c]: positional indices of the cells of celltypes[c] within batch b
celltype_idx_per_batch = [
    [np.where(batch_adata.obs[celltype_column] == celltype)[0] for celltype in celltypes]
    for batch_adata in batch_adatas
]

# Tick labels including the number of cells per celltype and batch
xticklabels = [
    "{} \n(Control1: {}; Control2: {}; TetX1: {}; TetX2: {})".format(
        celltype, *(len(batch_idx[i]) for batch_idx in celltype_idx_per_batch)
    )
    for i, celltype in enumerate(celltypes)
]

for cluster_idx, cluster in enumerate(celltypes):
    # Genes flagged as perturbed for this cluster
    genes = df_upset.loc[df_upset[cluster], "gene"].values
    labels = genes

    # plt.subplots cannot create a figure with zero rows
    if len(genes) == 0:
        print(f"No perturbed genes for '{cluster}' - skipping plot.")
        continue

    # squeeze=False always returns a 2D array of axes, also for a single gene
    fig, axs = plt.subplots(figsize=(20, len(genes)), nrows=len(genes), squeeze=False)
    axs = axs[:, 0]

    for cidx, (gene, ax) in enumerate(zip(genes, axs)):
        for batch_adata, celltype_idx, offset, color in zip(
            batch_adatas, celltype_idx_per_batch, batch_offsets, colors
        ):
            # Values of this gene for all cells of the batch. The gene is matched
            # exactly: a substring/regex match could select several columns, and
            # flattening those would misalign the cell indices below.
            gene_mask = batch_adata.var.index == gene
            x_all = nd(batch_adata.X[:, gene_mask].todense())
            # Group the values per celltype
            x_grouped = [x_all[idx_array] for idx_array in celltype_idx]

            violin = ax.violinplot(
                x_grouped, showmeans=True, showextrema=False, positions=lidx + offset
            )

            # Fully opaque violin bodies in the batch color, black mean bar
            for pc in violin["bodies"]:
                pc.set_facecolor(color)
                pc.set_edgecolor(color)
                pc.set_alpha(1)
            violin["cmeans"].set_color("black")

        ## Set up x- and y- tick labels, and distinct top and bottom axes
        if cidx == 0:
            ax_top = ax.twiny()
            ax_top.set_xlim(ax.get_xlim())  # DO NOT DELETE THIS
            ax_top.set_ylim(ax.get_ylim()[0], 2.5 * ax.get_ylim()[1])  # Slightly increase space on top of first row of plots
            ax_top.set_xticks(lidx)
            ax_top.set_xticklabels(xticklabels, rotation=45, ha="left")
            ax_top.get_xticklabels()[cluster_idx].set_color('red')  # Mark cluster for which these are the significant DE genes in red
            ax_top.spines["top"].set_visible(False)
            ax_top.spines["left"].set_visible(False)
            ax_top.spines["bottom"].set_visible(False)
            ax_top.xaxis.grid(False)

        if cidx == len(genes) - 1:
            ax_bot = ax.twiny()
            ax_bot.set_xticks([])
            ax_bot.set_xticklabels([])
            ax_bot.spines["top"].set_visible(False)
            ax_bot.spines["left"].set_visible(False)
            ax_bot.spines["bottom"].set_visible(False)

        ax.yaxis.tick_right()
        ax.set_ylabel(labels[cidx], color="black", rotation="horizontal", ha="right", va="center")

        ax.spines["top"].set_visible(False)
        ax.spines["left"].set_visible(False)
        ax.spines["bottom"].set_visible(False)

        ax.set_axisbelow(True)
        ax.xaxis.grid(False)

        # Set y axis on log scale including 0
        ax.set_yscale('symlog')

        ax.tick_params(
            axis="x",          # changes apply to the x-axis
            which="both",      # both major and minor ticks are affected
            bottom=False,      # ticks along the bottom edge are off
            top=False,         # ticks along the top edge are off
            labelbottom=False) # labels along the bottom edge are off

    plt.savefig(f"figures/4_vplot_{fig_name}_{cluster.replace(' ', '-').replace('/-', '')}.png", bbox_inches="tight", dpi=300, transparent=True)
    plt.show()
    # Release the figure; one is created per cluster
    plt.close(fig)

Look up DE genes:

In [None]:
# Remove gene name and "_" from Ensembl ID
# NOTE(review): `cluster` is the variable left over from the plotting loop above,
# i.e. this looks up the genes of the last cluster that loop visited - confirm
# this is the intended cluster.
# The ID is taken as the part after the LAST "_", so the result does not depend
# on whether the gene name in front of it contains underscores itself.
de_genes = [
    gene.rsplit("_", 1)[-1]
    for gene in df_upset[df_upset[cluster]==True]["gene"].unique()
]

In [None]:
# Look up the DE gene IDs collected in `de_genes` with gget.info and keep the result in `df`
df = gget.info(de_genes, wrap_text = True)

___

# Figures for schematics

In [None]:
# def violinplot_mainfig(genes, labels, celltypes, fig_name, alpha=0.05, fold_change_min=2):
#     fig, axs = plt.subplots(figsize=(10, 2), nrows=len(genes))

#     # Define celltypes and celltype indeces in both datasets
#     celltype_exp_idx = [np.where(adata_exp.obs.celltype == i)[0] for i in celltypes]
#     celltype_ctrl_idx = [np.where(adata_ctrl.obs.celltype == i)[0] for i in celltypes]

#     lidx = np.arange(len(celltypes))*2
    
#     fontsize_star = 20

#     for cidx, (gene, ax) in enumerate(zip(genes, axs)):
#         ## Get counts for this gene for all EXP cells
#         x_exp_temp = nd(adata_exp.X[:, adata_exp.var.index.str.contains(gene)].todense())
#         # Group EXP gene counts per celltype
#         x_exp=[]
#         for idx_array in celltype_exp_idx:
#             x_exp.append([x_exp_temp[i] for i in idx_array])

#         v1 = ax.violinplot(x_exp, showmedians=False, showextrema=False, positions=lidx+0.3)

#         for pcidx, pc in enumerate(v1["bodies"]):
#             pc.set_facecolor("red")
#             pc.set_edgecolor("black")
#             pc.set_alpha(1)

#         ## Get counts for this gene for all CTRL cells
#         x_ctrl_temp = nd(adata_ctrl.X[:, adata_ctrl.var.index.str.contains(gene)].todense())
#         # Group CTRL gene counts per celltype
#         x_ctrl=[]
#         for idx_array in celltype_ctrl_idx:
#             x_ctrl.append([x_ctrl_temp[i] for i in idx_array])

#         v2 = ax.violinplot(x_ctrl, showmedians=False, showextrema=False, positions=lidx-0.3)
        
#         for pcidx2, pc2 in enumerate(v2["bodies"]):
#             pc2.set_facecolor("blue")
#             pc2.set_edgecolor("black")
#             pc2.set_alpha(1)
            
#         ## Welch's t-test and fold change of mean calculation
#         for index, cell_array in enumerate(x_exp):
#             s, p = stats.ttest_ind(cell_array, x_ctrl[index], equal_var=False)
            
#             if np.mean(cell_array) > np.mean(x_ctrl[index]):
#                 fold_change = np.mean(cell_array) / np.mean(x_ctrl[index])
#                 if p < alpha and fold_change > fold_change_min:
#                     ax.annotate("*", (lidx[index], 0.6*ax.get_ylim()[1]), ha="center", c="black", fontsize=fontsize_star)
                    
#             if np.mean(cell_array) <= np.mean(x_ctrl[index]):
#                 fold_change = np.mean(x_ctrl[index]) / np.mean(cell_array)
#                 if p < alpha and fold_change > fold_change_min:
#                     ax.annotate("*", (lidx[index], 0.6*ax.get_ylim()[1]), ha="center", c="black", fontsize=fontsize_star)

#         ## Set up x- and y- tick labels, and distinct top and bottom axes  
#         # Get total number of cells per celltype cluster
#         cellcounts_exp=[]
#         for array in x_exp:
#             cellcounts_exp.append(len(array)) 
#         cellcounts_ctrl=[]
#         for array in x_ctrl:
#             cellcounts_ctrl.append(len(array)) 

#         xticklabels=[]    
#         for i2, (celltype, cellcount) in enumerate(zip(celltypes, cellcounts_exp)):
#             xticklabels.append("{}".format(celltype))

#         if cidx==0:
#             ax_top = ax.twiny()
#             ax_top.set_xlim(ax.get_xlim()) # DO NOT DELETE THIS
#             ax_top.set_ylim(ax.get_ylim()[0], 1.2*ax.get_ylim()[1]) # Slightly increase space on top of first row of plots
#             ax_top.set_xticks(lidx)
#             ax_top.set_xticklabels(xticklabels, rotation=45, ha="left")
#             ax_top.spines["top"].set_visible(True)
#             ax_top.spines["left"].set_visible(False)
#             ax_top.spines["bottom"].set_visible(False)
#             ax_top.xaxis.grid(False) 

#         if cidx == len(celltypes)-1:
#             ax_bot = ax.twiny()
#             ax_bot.set_xticks([])
#             ax_bot.set_xticklabels([])
#             ax_bot.spines["top"].set_visible(False)
#             ax_bot.spines["left"].set_visible(False)
#             ax_bot.spines["bottom"].set_visible(True)

# #         ax.set_xticklabels("")
#         ax.yaxis.tick_right()
#         ax.set_ylabel(labels[cidx], color="black",rotation="horizontal", ha="right", va="center")

# #         lim = nd(x.mean(axis=0))[cidx]*4   
# #         ax.set_ylim(-lim*0.1, lim)
# #         ax.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,.0f}'))

#         ax.spines["top"].set_visible(False)
#         ax.spines["left"].set_visible(False)
#         ax.spines["bottom"].set_visible(False)

#         ax.set_axisbelow(True)
#         ax.xaxis.grid(False) 

#         ax.tick_params(
#             axis="x",          # changes apply to the x-axis
#             which="both",      # both major and minor ticks are affected
#             bottom=False,      # ticks along the bottom edge are off
#             top=False,         # ticks along the top edge are off
#             labelbottom=False) # labels along the bottom edge are off

#     # plt.tight_layout(pad=0, w_pad=0, h_pad=0)
#     plt.savefig("figures/vp_{}.png".format(fig_name), bbox_inches="tight", dpi=300)
#     plt.show()

In [None]:
# Gene IDs to plot; labels[i] is the display name for genes[i]
genes = [
    "_ENSTGUG00000017273.2",
    "_ENSTGUG00000004607.2",
]
labels = [
    "MHC1",
    "B2M",
]
# Subset of clusters shown in the schematic figure
celltypes = [
    'astrocytes 1',
    'astrocytes 2',
    'glutamatergic neurons 1',
    'glutamatergic neurons 2',
    'glutamatergic neurons 3',
    'microglia 1',
    'microglia 2',
]

violinplot(
    adata_exp,
    adata_ctrl,
    genes,
    labels,
    celltypes,
    fig_name="schematic",
    fold_change_min=2,
)