In [1303]:
# Isaac Berez
# 17.01.23
import sys
from scipy.io import mmread
import os
import glob
import pandas as pd
import numpy as np
from pandas_ods_reader import read_ods
from copy import deepcopy
import pprint
import json
import re
from datetime import datetime
import logging
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import HuberRegressor
from sklearn import preprocessing
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from sklearn.manifold import TSNE
from sklearn import metrics
from sklearn.cluster import DBSCAN
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from collections import Counter
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
import harmonypy as hm
from matplotlib.cm import ScalarMappable
from datetime import date
import mpld3
import hvplot.pandas
import holoviews as hv
from holoviews import opts
import panel as pn
import bokeh
from bokeh.resources import INLINE

import dimorph_processing as dp
import cell_comparison as cc

today = str(date.today())
%matplotlib notebook
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Table of Contents
1. [Introduction](#introduction)
2. [Data Loading](#data_loading)
3. [GABA Analysis](#gaba_analysis)
4. [Vglut1 Analysis](#vglut1_analysis)
5. [Vglut2 Analysis](#vglut2_analysis)
6. [Nonneuronal Analysis](#nn_analysis)

In [2]:
status_df = dp.intialize_status_df()

## Introduction <a name="introduction"></a>

### This notebook compares the amygdala data from Hochgerner et al.  (https://www.nature.com/articles/s41593-023-01469-3) to the sexual dimorphism data analyzed in both "dimorph_processing_nb.ipynb" (level1) and "dimorph_cell_analysis_nb.ipynb" (level2)

In [241]:
IEG_list = []

with open('/bigdata/isaac/IEG_cluster.txt', 'r') as fh:
    for g in fh:
        IEG_list.append(g[:-1])

In [242]:
IEG_list

['Btg2',
 'Jun',
 'Egr4',
 'Fosb',
 'Junb',
 'Gadd45g',
 'Fos',
 'Arc',
 'Nr4a1',
 'Npas4',
 'Coq10b',
 'Tns1',
 'Per2',
 'Ptgs2',
 'Rnd3',
 'Tnfaip6',
 'Srxn1',
 'Tiparp',
 'Ccnl1',
 'Mcl1',
 'Dnajb5',
 'Nr4a3',
 'Fosl2',
 'Nptx2',
 'Rasl11a',
 'Mest',
 'Sertad1',
 'Egr2',
 'Midn',
 'Gadd45b',
 'Dusp6',
 'Irs2',
 'Plat',
 'Ier2',
 'Rrad',
 'Tpbg',
 'Csrnp1',
 'Peli1',
 'Per1',
 'Kdm6b',
 'Inhba',
 'Plk2',
 'Ifrd1',
 'Baz1a',
 'Trib1',
 'Pim3',
 'Lrrk2',
 'Dusp1',
 'Cdkn1a',
 'Pim1',
 'Sik1',
 'Frat2',
 'Dusp5']

In [427]:
sex_gene_list = []

with open('/bigdata/isaac/sex_gene_list.txt', 'r') as fh:
    for g in fh:
        sex_gene_list.append(g[:-1])

### Data Loading - Load data from Hochgerner et al. <a name="data_loading"></a>

In [3]:
amy_df = pd.read_csv('/bigdata/isaac/amy_fc_data/Amy_FC_allcells_with_metadata_31-Jul-2022.txt', encoding='utf-8', delimiter='\t') 

  amy_df = pd.read_csv('/bigdata/isaac/amy_fc_data/Amy_FC_allcells_with_metadata_31-Jul-2022.txt', encoding='utf-8', delimiter='\t')


In [4]:
amy_df.head()

Unnamed: 0,cellID,GGGTATTTCTCGCGTT-1_10-1,GGTAACTAGACATCCT-1_18-1,GGTAATCGTGGACTAG-1_18-1,TTCCGGTAGTGGTGGT-1_18-1,AGGGTGAAGTACAACA-1_19-1,AGTACCATCCCTGGTT-1_19-1,CATGAGTTCCGGCTTT-1_19-1,TTGCCTGAGACGGTTG-1_19-1,AAACGCTTCACCATGA-1_23-1,...,TCGAACAAGGAGCTGT-1_76-1,TGTACAGTCTGCAGCG-1_76-1,AGGACTTTCATGGAGG-1_76-2,CCCATTGGTACCTAGT-1_76-2,CTATCTACAATTGCTG-1_76-2,CTCATCGTCACCCATC-1_76-2,CTGCCATGTATCGCTA-1_76-2,GCATGATTCTCGTCGT-1_76-2,TGGGCGTAGAAGCCAC-1_76-2,TTCTGTAGTGGTATGG-1_76-2
0,celltype,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,...,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm
1,sample,10-1,18-1,18-1,18-1,19-1,19-1,19-1,19-1,23-1,...,76-1,76-1,76-2,76-2,76-2,76-2,76-2,76-2,76-2,76-2
2,FC time,2,2,2,2,2,2,2,2,0,...,2,2,2,2,2,2,2,2,2,2
3,batch,1,1,1,1,1,1,1,1,1,...,3,3,3,3,3,3,3,3,3,3
4,Xkr4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
type(amy_df.columns[1])

str

In [6]:
np.array(amy_df.iloc[:,1:2])

array([['GABA-11-Adora2a-Id4'],
       ['10-1'],
       ['2'],
       ...,
       [0],
       [0],
       [0]], dtype=object)

In [7]:
#save as feather for faster loading
#folder = '/bigdata/isaac/amy_fc_data/'
#amy_df = amy_df.astype(str)
#amy_df.to_feather(folder + 'amy_df.feather')

In [8]:
amy_df.columns

Index(['cellID', 'GGGTATTTCTCGCGTT-1_10-1', 'GGTAACTAGACATCCT-1_18-1',
       'GGTAATCGTGGACTAG-1_18-1', 'TTCCGGTAGTGGTGGT-1_18-1',
       'AGGGTGAAGTACAACA-1_19-1', 'AGTACCATCCCTGGTT-1_19-1',
       'CATGAGTTCCGGCTTT-1_19-1', 'TTGCCTGAGACGGTTG-1_19-1',
       'AAACGCTTCACCATGA-1_23-1',
       ...
       'TCGAACAAGGAGCTGT-1_76-1', 'TGTACAGTCTGCAGCG-1_76-1',
       'AGGACTTTCATGGAGG-1_76-2', 'CCCATTGGTACCTAGT-1_76-2',
       'CTATCTACAATTGCTG-1_76-2', 'CTCATCGTCACCCATC-1_76-2',
       'CTGCCATGTATCGCTA-1_76-2', 'GCATGATTCTCGTCGT-1_76-2',
       'TGGGCGTAGAAGCCAC-1_76-2', 'TTCTGTAGTGGTATGG-1_76-2'],
      dtype='object', length=55515)

In [9]:
amy_df = amy_df.set_index('cellID')

In [10]:
amy_df

Unnamed: 0_level_0,GGGTATTTCTCGCGTT-1_10-1,GGTAACTAGACATCCT-1_18-1,GGTAATCGTGGACTAG-1_18-1,TTCCGGTAGTGGTGGT-1_18-1,AGGGTGAAGTACAACA-1_19-1,AGTACCATCCCTGGTT-1_19-1,CATGAGTTCCGGCTTT-1_19-1,TTGCCTGAGACGGTTG-1_19-1,AAACGCTTCACCATGA-1_23-1,TACCCACCAGTGACCC-1_23-1,...,TCGAACAAGGAGCTGT-1_76-1,TGTACAGTCTGCAGCG-1_76-1,AGGACTTTCATGGAGG-1_76-2,CCCATTGGTACCTAGT-1_76-2,CTATCTACAATTGCTG-1_76-2,CTCATCGTCACCCATC-1_76-2,CTGCCATGTATCGCTA-1_76-2,GCATGATTCTCGTCGT-1_76-2,TGGGCGTAGAAGCCAC-1_76-2,TTCTGTAGTGGTATGG-1_76-2
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
celltype,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,...,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm
sample,10-1,18-1,18-1,18-1,19-1,19-1,19-1,19-1,23-1,23-1,...,76-1,76-1,76-2,76-2,76-2,76-2,76-2,76-2,76-2,76-2
FC time,2,2,2,2,2,2,2,2,0,0,...,2,2,2,2,2,2,2,2,2,2
batch,1,1,1,1,1,1,1,1,1,1,...,3,3,3,3,3,3,3,3,3,3
Xkr4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC168977.1,0,0,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PISD,9,0,0,3,8,0,0,4,4,0,...,17,0,0,9,0,0,8,10,9,8
DHRSX,0,0,0,0,0,0,0,4,0,0,...,0,0,6,0,6,0,0,0,4,8
Vmn2r122,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
amy_df.shape

(28002, 55514)

In [12]:
np.unique(np.array(amy_df.loc['celltype']))

array(['Astro', 'Astro_SC', 'Astro_agt', 'COP', 'EC', 'Epend',
       'GABA-1-Foxp2_Fmod', 'GABA-10-Prkcd-Adora2a',
       'GABA-11-Adora2a-Id4', 'GABA-12-Adora2a-Scn4b',
       'GABA-13-Adora2a-Crh', 'GABA-14-Drd1-Scn4b', 'GABA-15-Drd1-Ebf1',
       'GABA-16-Prkcd-Nts', 'GABA-17-Pdyn-Ebf1', 'GABA-18-Isl1-Tac1',
       'GABA-19-Isl1-Aldoc', 'GABA-2-Foxp2_Adra2a',
       'GABA-20-Gpr101-Gabre', 'GABA-21-Vdr-Nts', 'GABA-22-Gal-Avp',
       'GABA-23-Fign-Lrpprc', 'GABA-24-Fign-Ucn3', 'GABA-25-Lhx8-Th',
       'GABA-26-Cbln4-Sst', 'GABA-27-Lhx6-Nxph2', 'GABA-28-Cbln4-Luzp2',
       'GABA-29-Prlr-Greb1', 'GABA-3-Foxp2_Col6a1', 'GABA-30-Prlr-Calcr',
       'GABA-31-Prlr-Cbln1', 'GABA-32-Prlr-Tac1', 'GABA-33-Prlr-St18',
       'GABA-34-Prlr-Satb1', 'GABA-35-Chodl-Moxd1', 'GABA-36-Sst-Fign',
       'GABA-37-Sst-Npy', 'GABA-38-Sst-Tmtc4', 'GABA-39-Sst-Nek7',
       'GABA-4-Foxp2_Htr1f', 'GABA-40-Rpb4-Sst', 'GABA-41-Moxd1-Pvalb',
       'GABA-42-Moxd1-Vwc2', 'GABA-43-Crabp1-Etv1', 'GABA-44-Pthlh

### seperate out metadata

In [13]:
amy_metadata_df = amy_df.iloc[:4,:]

### add "cell_class" row to metadata

In [14]:
cls = [x.split('-')[0] for x in np.array(amy_metadata_df.loc['celltype'])]

In [244]:
all_cell_classes = list(np.unique(cls))
all_cell_classes

['Astro',
 'Astro_SC',
 'Astro_agt',
 'COP',
 'EC',
 'Epend',
 'GABA',
 'OL',
 'OPC',
 'OPC_cycling',
 'Peri',
 'VGLUT1',
 'VGLUT2',
 'VLMC',
 'VSM',
 'microglia',
 'pvm']

In [16]:
amy_metadata_df.loc['cell_class'] = cls

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amy_metadata_df.loc['cell_class'] = cls


In [17]:
#re.split(r"-|_",markers[0][0])[2:4]

In [18]:
markers = [re.split(r"-|_",x)[2:4] for x in np.array(amy_metadata_df.loc['celltype'])]
markers

[['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 'Id4'],
 ['Adora2a', 

In [19]:
amy_metadata_df.loc['markers'] = markers

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amy_metadata_df.loc['markers'] = markers


In [20]:
amy_metadata_df

Unnamed: 0_level_0,GGGTATTTCTCGCGTT-1_10-1,GGTAACTAGACATCCT-1_18-1,GGTAATCGTGGACTAG-1_18-1,TTCCGGTAGTGGTGGT-1_18-1,AGGGTGAAGTACAACA-1_19-1,AGTACCATCCCTGGTT-1_19-1,CATGAGTTCCGGCTTT-1_19-1,TTGCCTGAGACGGTTG-1_19-1,AAACGCTTCACCATGA-1_23-1,TACCCACCAGTGACCC-1_23-1,...,TCGAACAAGGAGCTGT-1_76-1,TGTACAGTCTGCAGCG-1_76-1,AGGACTTTCATGGAGG-1_76-2,CCCATTGGTACCTAGT-1_76-2,CTATCTACAATTGCTG-1_76-2,CTCATCGTCACCCATC-1_76-2,CTGCCATGTATCGCTA-1_76-2,GCATGATTCTCGTCGT-1_76-2,TGGGCGTAGAAGCCAC-1_76-2,TTCTGTAGTGGTATGG-1_76-2
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
celltype,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,...,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm
sample,10-1,18-1,18-1,18-1,19-1,19-1,19-1,19-1,23-1,23-1,...,76-1,76-1,76-2,76-2,76-2,76-2,76-2,76-2,76-2,76-2
FC time,2,2,2,2,2,2,2,2,0,0,...,2,2,2,2,2,2,2,2,2,2
batch,1,1,1,1,1,1,1,1,1,1,...,3,3,3,3,3,3,3,3,3,3
cell_class,GABA,GABA,GABA,GABA,GABA,GABA,GABA,GABA,GABA,GABA,...,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm
markers,"[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]",...,[],[],[],[],[],[],[],[],[],[]


### cell classes of interest: GABA, Vglut1, Vglut2, NN = microglia + epend + astro

In [21]:
amy_df.head()

Unnamed: 0_level_0,GGGTATTTCTCGCGTT-1_10-1,GGTAACTAGACATCCT-1_18-1,GGTAATCGTGGACTAG-1_18-1,TTCCGGTAGTGGTGGT-1_18-1,AGGGTGAAGTACAACA-1_19-1,AGTACCATCCCTGGTT-1_19-1,CATGAGTTCCGGCTTT-1_19-1,TTGCCTGAGACGGTTG-1_19-1,AAACGCTTCACCATGA-1_23-1,TACCCACCAGTGACCC-1_23-1,...,TCGAACAAGGAGCTGT-1_76-1,TGTACAGTCTGCAGCG-1_76-1,AGGACTTTCATGGAGG-1_76-2,CCCATTGGTACCTAGT-1_76-2,CTATCTACAATTGCTG-1_76-2,CTCATCGTCACCCATC-1_76-2,CTGCCATGTATCGCTA-1_76-2,GCATGATTCTCGTCGT-1_76-2,TGGGCGTAGAAGCCAC-1_76-2,TTCTGTAGTGGTATGG-1_76-2
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
celltype,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,...,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm
sample,10-1,18-1,18-1,18-1,19-1,19-1,19-1,19-1,23-1,23-1,...,76-1,76-1,76-2,76-2,76-2,76-2,76-2,76-2,76-2,76-2
FC time,2,2,2,2,2,2,2,2,0,0,...,2,2,2,2,2,2,2,2,2,2
batch,1,1,1,1,1,1,1,1,1,1,...,3,3,3,3,3,3,3,3,3,3
Xkr4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
amy_df.loc['celltype'].index

Index(['GGGTATTTCTCGCGTT-1_10-1', 'GGTAACTAGACATCCT-1_18-1',
       'GGTAATCGTGGACTAG-1_18-1', 'TTCCGGTAGTGGTGGT-1_18-1',
       'AGGGTGAAGTACAACA-1_19-1', 'AGTACCATCCCTGGTT-1_19-1',
       'CATGAGTTCCGGCTTT-1_19-1', 'TTGCCTGAGACGGTTG-1_19-1',
       'AAACGCTTCACCATGA-1_23-1', 'TACCCACCAGTGACCC-1_23-1',
       ...
       'TCGAACAAGGAGCTGT-1_76-1', 'TGTACAGTCTGCAGCG-1_76-1',
       'AGGACTTTCATGGAGG-1_76-2', 'CCCATTGGTACCTAGT-1_76-2',
       'CTATCTACAATTGCTG-1_76-2', 'CTCATCGTCACCCATC-1_76-2',
       'CTGCCATGTATCGCTA-1_76-2', 'GCATGATTCTCGTCGT-1_76-2',
       'TGGGCGTAGAAGCCAC-1_76-2', 'TTCTGTAGTGGTATGG-1_76-2'],
      dtype='object', length=55514)

## GABA Analysis <a name="gaba_analysis"></a>

In [1032]:
amy_df_gaba_expr_ge_cv_ls, amy_metadata_df_gaba = cc.process_amy_data_class(amy_df,amy_metadata_df,IEG_list,sex_gene_list,cell_class='GABA')

removing # duplicate gene rows:  65
removing  53  genes found in  ['Btg2', 'Jun', 'Egr4', 'Fosb', 'Junb', 'Gadd45g', 'Fos', 'Arc', 'Nr4a1', 'Npas4', 'Coq10b', 'Tns1', 'Per2', 'Ptgs2', 'Rnd3', 'Tnfaip6', 'Srxn1', 'Tiparp', 'Ccnl1', 'Mcl1', 'Dnajb5', 'Nr4a3', 'Fosl2', 'Nptx2', 'Rasl11a', 'Mest', 'Sertad1', 'Egr2', 'Midn', 'Gadd45b', 'Dusp6', 'Irs2', 'Plat', 'Ier2', 'Rrad', 'Tpbg', 'Csrnp1', 'Peli1', 'Per1', 'Kdm6b', 'Inhba', 'Plk2', 'Ifrd1', 'Baz1a', 'Trib1', 'Pim3', 'Lrrk2', 'Dusp1', 'Cdkn1a', 'Pim1', 'Sik1', 'Frat2', 'Dusp5']
removing  6  genes found in  ['Xist', 'Tsix', 'Ddx3y', 'Eif2s3y', 'Kdm5d', 'Uty']
Total genes reduced from 27874 to 14298


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

column (gene) mean after standardization: -0.00
column (gene) sigma after standardization: 1.00


In [1033]:
amy_df_gaba_expr_ge_cv_ls.shape

(519, 13006)

In [1034]:
amy_df_gaba_expr_ge_cv_ls

Unnamed: 0_level_0,GGGTATTTCTCGCGTT-1_10-1,GGTAACTAGACATCCT-1_18-1,GGTAATCGTGGACTAG-1_18-1,TTCCGGTAGTGGTGGT-1_18-1,AGGGTGAAGTACAACA-1_19-1,AGTACCATCCCTGGTT-1_19-1,CATGAGTTCCGGCTTT-1_19-1,TTGCCTGAGACGGTTG-1_19-1,AAACGCTTCACCATGA-1_23-1,TACCCACCAGTGACCC-1_23-1,...,GTCAAACTCCCGATCT-1_76-2,GTGTAACGTGAGACCA-1_76-2,GTGTGATAGGTGAGAA-1_76-2,TACTTCAGTAACTAAG-1_76-2,TCCGATCTCGTTTACT-1_76-2,TCGCTCAAGATTGAGT-1_76-2,TGCAGGCTCTTACACT-1_76-2,TTACAGGTCCGAGATT-1_76-2,TTGCGTCCAACACGAG-1_76-2,TTGCTGCAGCACTCCG-1_76-2
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sst,-0.545385,0.306249,-0.545385,-0.545385,-0.545385,-0.545385,-0.545385,1.157884,-0.545385,0.443332,...,-0.545385,-0.545385,-0.545385,-0.545385,-0.545385,-0.545385,-0.545385,-0.545385,-0.545385,-0.545385
Npy,1.105514,-0.442248,-0.442248,-0.442248,-0.442248,-0.442248,-0.442248,-0.442248,-0.442248,-0.442248,...,-0.442248,-0.442248,-0.442248,-0.442248,-0.442248,-0.442248,-0.442248,-0.442248,-0.442248,-0.442248
Gal,-0.112651,-0.112651,-0.112651,-0.112651,-0.112651,-0.112651,-0.112651,-0.112651,-0.112651,-0.112651,...,-0.112651,-0.112651,-0.112651,-0.112651,-0.112651,-0.112651,-0.112651,-0.112651,-0.112651,-0.112651
Avp,-0.108845,-0.108845,-0.108845,-0.108845,-0.108845,-0.108845,-0.108845,-0.108845,-0.108845,-0.108845,...,-0.108845,-0.108845,-0.108845,-0.108845,-0.108845,-0.108845,-0.108845,-0.108845,-0.108845,-0.108845
Vip,-0.222700,-0.222700,-0.222700,-0.222700,-0.222700,-0.222700,-0.222700,-0.222700,-0.222700,-0.222700,...,4.745730,5.499811,5.231021,4.307494,5.422867,4.210936,5.386956,5.816290,5.602638,5.026132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Mdfic,-0.047916,-0.047916,-0.047916,-0.047916,-0.047916,-0.047916,-0.047916,-0.047916,-0.047916,-0.047916,...,-0.047916,-0.047916,-0.047916,-0.047916,-0.047916,-0.047916,-0.047916,-0.047916,-0.047916,-0.047916
Pld5,-0.429433,-0.429433,-0.429433,-0.429433,-0.429433,-0.429433,2.052045,-0.429433,-0.429433,-0.429433,...,-0.429433,-0.429433,-0.429433,-0.429433,-0.429433,-0.429433,-0.429433,-0.429433,-0.429433,-0.429433
Kcnip4,-0.813752,1.404627,-0.813752,-0.813752,-0.813752,1.178894,-0.813752,-0.813752,1.178894,1.178894,...,-0.813752,0.902620,-0.813752,-0.813752,0.902620,0.902620,1.760806,-0.813752,-0.813752,-0.813752
Tppp3,-0.539772,-0.539772,-0.539772,1.656778,-0.539772,-0.539772,1.656778,-0.539772,2.941677,-0.539772,...,-0.539772,-0.539772,-0.539772,-0.539772,-0.539772,-0.539772,-0.539772,-0.539772,-0.539772,-0.539772


In [1035]:
amy_metadata_df_gaba.shape

(7, 13006)

In [1036]:
amy_metadata_df_gaba

Unnamed: 0_level_0,GGGTATTTCTCGCGTT-1_10-1,GGTAACTAGACATCCT-1_18-1,GGTAATCGTGGACTAG-1_18-1,TTCCGGTAGTGGTGGT-1_18-1,AGGGTGAAGTACAACA-1_19-1,AGTACCATCCCTGGTT-1_19-1,CATGAGTTCCGGCTTT-1_19-1,TTGCCTGAGACGGTTG-1_19-1,AAACGCTTCACCATGA-1_23-1,TACCCACCAGTGACCC-1_23-1,...,GTCAAACTCCCGATCT-1_76-2,GTGTAACGTGAGACCA-1_76-2,GTGTGATAGGTGAGAA-1_76-2,TACTTCAGTAACTAAG-1_76-2,TCCGATCTCGTTTACT-1_76-2,TCGCTCAAGATTGAGT-1_76-2,TGCAGGCTCTTACACT-1_76-2,TTACAGGTCCGAGATT-1_76-2,TTGCGTCCAACACGAG-1_76-2,TTGCTGCAGCACTCCG-1_76-2
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
celltype,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,...,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh
sample,10-1,18-1,18-1,18-1,19-1,19-1,19-1,19-1,23-1,23-1,...,76-2,76-2,76-2,76-2,76-2,76-2,76-2,76-2,76-2,76-2
FC time,2,2,2,2,2,2,2,2,0,0,...,2,2,2,2,2,2,2,2,2,2
batch,1,1,1,1,1,1,1,1,1,1,...,2,2,2,2,2,2,2,2,2,2
cell_class,GABA,GABA,GABA,GABA,GABA,GABA,GABA,GABA,GABA,GABA,...,GABA,GABA,GABA,GABA,GABA,GABA,GABA,GABA,GABA,GABA
markers,"[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]",...,"[Vip, Crh]","[Vip, Crh]","[Vip, Crh]","[Vip, Crh]","[Vip, Crh]","[Vip, Crh]","[Vip, Crh]","[Vip, Crh]","[Vip, Crh]","[Vip, Crh]"
cluster_label,11,11,11,11,11,11,11,11,11,11,...,51,51,51,51,51,51,51,51,51,51


In [1037]:
#save to file

folder = '/bigdata/isaac/gaba_files/'

file1 = 'amy_df_gaba_expr_ge_cv_ls_orig' 
file2 = 'amy_metadata_df_gaba_orig' 

amy_df_gaba_expr_ge_cv_ls.to_feather(folder+file1+'.feather')
amy_metadata_df_gaba.to_json(folder+file2+'.json')

In [1038]:
folder = '/bigdata/isaac/gaba_files/'

In [1039]:
amy_df_gaba_expr_ge_cv_ls_orig = pd.read_feather(folder + 'amy_df_gaba_expr_ge_cv_ls_orig.feather')
amy_metadata_df_gaba_orig = pd.read_json(folder + 'amy_metadata_df_gaba_orig.json')

### Load gaba_df_prelinkage_ls saved from "dimorph_cell_analysis_nb.ipynb" and get intersection
### already has feature selected genes

In [1052]:
folder = '/bigdata/isaac/gaba_files/'

GABA_df_prelinkage_ls_orig = pd.read_feather(folder + 'GABA_df_pre_linkage_ls_2024-09-02.feather')
GABA_metadata_df_prelinkage_ls_orig = pd.read_json(folder + 'GABA_meta_data_df_pre_linkage_2024-09-02.json')
GABA_meta_data_df_plis_filtered = pd.read_json(folder + 'GABAmeta_data_df_plis_filtered_2024-09-02.json')


In [1053]:
GABA_df_plis_filtered = GABA_df_prelinkage_ls_orig.reindex(index = GABA_meta_data_df_plis_filtered.columns)

### Get intersection

In [1054]:
amy_df_GABA_expr_ge_cv_ls, GABA_df_plis_filtered = cc.get_df_gene_intersection(GABA_df_plis_filtered,amy_df_gaba_expr_ge_cv_ls,IEG_list)

removing  0  genes found in  ['Btg2', 'Jun', 'Egr4', 'Fosb', 'Junb', 'Gadd45g', 'Fos', 'Arc', 'Nr4a1', 'Npas4', 'Coq10b', 'Tns1', 'Per2', 'Ptgs2', 'Rnd3', 'Tnfaip6', 'Srxn1', 'Tiparp', 'Ccnl1', 'Mcl1', 'Dnajb5', 'Nr4a3', 'Fosl2', 'Nptx2', 'Rasl11a', 'Mest', 'Sertad1', 'Egr2', 'Midn', 'Gadd45b', 'Dusp6', 'Irs2', 'Plat', 'Ier2', 'Rrad', 'Tpbg', 'Csrnp1', 'Peli1', 'Per1', 'Kdm6b', 'Inhba', 'Plk2', 'Ifrd1', 'Baz1a', 'Trib1', 'Pim3', 'Lrrk2', 'Dusp1', 'Cdkn1a', 'Pim1', 'Sik1', 'Frat2', 'Dusp5']
Index(['Sst', 'Npy', 'Gal', 'Avp', 'Vip', 'Ucn3', 'Cartpt', 'Ripply3', 'Cck',
       'Cnr1',
       ...
       'Nbl1', 'Jsrp1', 'Sostdc1', 'Shisa6', 'Pdlim5', 'Fam19a2', 'Ppargc1a',
       'Pld5', 'Kcnip4', 'Lhfp'],
      dtype='object', length=376)


In [1055]:
GABA_df_plis_filtered.shape

(376, 10022)

In [1056]:
amy_df_GABA_expr_ge_cv_ls.shape

(376, 13006)

In [1057]:
folder

'/bigdata/isaac/gaba_files/'

### Compute avg expression for all genes, for every cluster

In [1060]:
amy_avgs = dp.compute_avg_expr_per_cluster_label(amy_df_GABA_expr_ge_cv_ls,amy_metadata_df_gaba)

In [1061]:
amy_avgs

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,47,48,49,50,51,52,53,54,55,56
Sst,-0.155349,-0.280595,-0.455989,-0.492804,-0.148882,-0.314029,0.423542,-0.263417,-0.395131,-0.501013,...,0.155029,-0.447587,-0.450272,-0.514228,-0.427441,-0.440583,-0.451893,-0.427688,-0.345629,-0.444237
Npy,-0.359158,-0.22505,0.068509,0.900939,-0.086038,-0.150077,-0.38883,-0.387799,-0.364963,-0.282428,...,-0.333043,-0.372397,-0.366435,-0.401907,-0.352834,-0.374402,-0.396366,-0.403158,-0.264757,-0.154209
Gal,-0.112651,-0.112651,-0.089581,-0.112651,-0.074812,-0.069099,-0.112651,-0.087391,-0.068467,-0.112651,...,-0.097408,-0.105911,-0.050846,0.248763,-0.100601,-0.041433,-0.112651,-0.112651,-0.112651,-0.0807
Avp,-0.108845,-0.071297,-0.028345,-0.108845,-0.052378,-0.021984,-0.073287,-0.055146,-0.05929,-0.052248,...,-0.086099,-0.053048,0.014219,0.012632,0.005873,-0.026616,-0.08045,-0.026667,-0.108845,-0.067684
Vip,-0.211028,-0.20211,-0.196752,-0.158658,-0.172872,-0.205202,-0.204007,-0.19946,-0.199775,-0.193786,...,-0.172384,-0.192407,-0.048683,2.22628,4.872482,-0.2227,-0.116881,0.244626,3.712796,-0.111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Fam19a2,-0.017843,-0.491078,0.078994,0.086547,0.612419,-0.051607,-0.517742,-0.527878,-0.35638,-0.344517,...,-0.001721,0.100983,0.527271,-0.000624,0.678563,-0.361065,0.000579,0.544766,0.589401,-0.154694
Ppargc1a,0.367969,-0.343563,-0.508327,-0.56609,-0.381029,-0.377084,-0.113144,-0.000711,0.085683,0.054693,...,0.26656,0.462964,0.207191,-0.251954,-0.049666,-0.366242,-0.364607,-0.226413,-0.300026,-0.28419
Pld5,0.36542,0.951358,-0.240701,-0.005046,0.487558,0.130078,0.07014,-0.385895,-0.406659,-0.206469,...,-0.077231,-0.328914,0.729462,0.605465,-0.204186,-0.336766,-0.136654,-0.280694,-0.142431,-0.358911
Kcnip4,0.535392,0.517315,0.247973,0.685572,0.714297,0.258686,-0.060652,-0.047448,0.049714,0.321894,...,-0.681736,0.666536,0.522226,-0.083875,0.150193,-0.050973,-0.617036,0.345685,0.713779,-0.487957


In [1062]:
sd_avgs = dp.compute_avg_expr_per_cluster_label(GABA_df_plis_filtered,GABA_meta_data_df_plis_filtered)

In [1063]:
sd_avgs

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,41,42,43,44,45,46,47,48,49,50
Sst,-0.194687,-0.358431,-0.345789,-0.207372,0.090203,-0.34105,-0.370597,-0.361555,-0.350722,0.180259,...,-0.33354,-0.326563,-0.339515,-0.368771,0.23559,-0.211878,-0.072571,-0.112722,-0.342363,2.50577
Npy,-0.150086,-0.279075,-0.258327,-0.302156,-0.275489,-0.264786,0.177284,-0.069338,-0.287106,0.146964,...,-0.26414,-0.239101,-0.276908,-0.295374,0.130791,-0.246612,0.510632,-0.244332,-0.202569,-0.01045
Gal,1.748356,-0.126186,-0.126186,-0.126186,-0.126186,-0.126186,-0.126186,-0.126186,-0.126186,-0.107422,...,-0.115246,-0.126186,-0.105918,-0.126186,-0.026339,0.266744,-0.126186,-0.060247,0.200657,-0.068699
Avp,-0.201913,0.009672,-0.071693,-0.201913,-0.033852,0.119244,-0.018978,-0.201913,-0.072571,-0.093718,...,0.011376,-0.080687,-0.101037,-0.035125,0.253785,-0.031275,0.003348,-0.077676,0.073884,-0.001628
Vip,-0.154338,0.219668,6.220309,-0.154338,-0.146482,-0.143328,-0.138736,-0.129051,-0.154338,-0.144733,...,-0.148738,-0.154338,0.077692,-0.134358,-0.043747,-0.154338,-0.154338,-0.0836,-0.046081,-0.139625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Fam19a2,-0.347327,0.612164,0.538219,0.187342,0.094747,0.426905,-0.219798,-0.097128,1.817951,1.098405,...,0.386086,0.274835,0.499132,-0.042106,0.463752,0.19086,0.856596,0.196763,-0.409315,-0.159485
Ppargc1a,0.035735,-0.12285,-0.361228,-0.272818,0.429596,0.626033,-0.57095,-0.095978,1.284642,1.123727,...,-0.211634,0.011189,0.047088,-0.185085,0.251431,0.414077,-0.16618,-0.023663,1.226648,0.673166
Pld5,3.470845,-0.385868,-0.224568,-0.265871,0.035717,-0.261483,0.83445,0.303112,-0.399942,-0.348703,...,0.503254,0.782529,0.387346,0.019482,0.302959,0.514698,0.309709,-0.168676,0.424207,-0.373807
Kcnip4,1.070099,-0.333966,-0.106023,-0.809251,-0.716891,1.066976,-0.521923,-0.789103,-0.800419,-0.674472,...,0.195124,0.282605,0.15152,0.759663,0.162445,0.141167,0.307313,0.935139,0.780004,1.089625


In [1064]:
heatmap2, heatmap_argmax_df_alt_lco, corr_matrix_manual_alt_lco, corr_matrix_manual_alt_lco_sorted = cc.plot_correlation(sd_avgs,amy_avgs)
# Display the plot
hvplot.show(heatmap2)
    
# Ensure output is displayed inline
hv.output(heatmap2, backend='bokeh')

Launching server at http://localhost:35833


In [1066]:
error_genes,all_marker_labels = cc.amy_gene_spell_checker(amy_df_gaba_expr_ge_cv_ls_orig,amy_metadata_df_gaba)
error_genes

  all_m = [amy_metadata_df.loc['markers'][x] for x in range((amy_metadata_df.loc['markers'].shape[0]))]


gene,  Cyp26a1 not found in index
gene,  Hctr2 not found in index
gene,  Scng not found in index
gene,  Lrpprc not found in index
gene,  Luzp2 not found in index
gene,  Rpb4 not found in index


['Cyp26a1', 'Hctr2', 'Scng', 'Lrpprc', 'Luzp2', 'Rpb4']

In [1067]:
all_marker_labels

[['Adora2a', 'Id4'],
 ['Adora2a', 'Scn4b'],
 ['Prlr', 'Calcr'],
 ['Chat', 'Vip'],
 ['Crabp1', 'Etv1'],
 ['Adora2a', 'Crh'],
 ['Pax6', 'Cyp26a1'],
 ['Reln', 'Crim1'],
 ['Isl1', 'Tac1'],
 ['Drd1', 'Ebf1'],
 ['Drd1', 'Scn4b'],
 ['Isl1', 'Aldoc'],
 ['Sncg', 'Vip'],
 ['Prlr', 'Cbln1'],
 ['Prlr', 'Satb1'],
 ['Foxp2', 'Fmod'],
 ['Gal', 'Avp'],
 ['Lhx8', 'Th'],
 ['Gpr101', 'Gabre'],
 ['Moxd1', 'Vwc2'],
 ['Moxd1', 'Pvalb'],
 ['Lamp5', 'Hctr2'],
 ['Foxp2', 'Htr1f'],
 ['Scng', 'Kcnc2'],
 ['Prlr', 'Greb1'],
 ['Lamp5', 'Kit'],
 ['Sncg', 'Krt73'],
 ['Lhx6', 'Nxph2'],
 ['Fign', 'Lrpprc'],
 ['Cbln4', 'Luzp2'],
 ['Npnt', 'Calca'],
 ['Prkcd', 'Oprk1'],
 ['Pax6', 'Enpp2'],
 ['Pax6', 'Th'],
 ['Foxp2', 'Col6a1'],
 ['Prkcd', 'Adora2a'],
 ['Prkcd', 'Ezr'],
 ['Prkcd', 'Nts'],
 ['Calb2', 'Rgs12'],
 ['Reln', 'Ndnf'],
 ['Htr3a', 'Kcnc2'],
 ['Rpb4', 'Sst'],
 ['Foxp2', 'Adra2a'],
 ['Cbln4', 'Sst'],
 ['Sst', 'Nek7'],
 ['Chodl', 'Moxd1'],
 ['Sst', 'Fign'],
 ['Sst', 'Tmtc4'],
 ['Sst', 'Npy'],
 ['Prlr', 'St18'],
 ['Pr

In [1069]:
correct_gene_names = ['Hcrtr2','Sncg','Rbp4']

In [1070]:
all_marker_labels_c = cc.correct_error_genes(error_genes,correct_gene_names,all_marker_labels)

Cyp26a1
Cyp26a1
Cyp26a1
Hctr2
Hctr2
Scng
Scng
Scng
Lrpprc
Lrpprc
Lrpprc
Luzp2
Luzp2
Luzp2
Rpb4
Rpb4
Rpb4
Rpb4


In [1071]:
all_marker_labels_c

[['Adora2a', 'Id4'],
 ['Adora2a', 'Scn4b'],
 ['Prlr', 'Calcr'],
 ['Chat', 'Vip'],
 ['Crabp1', 'Etv1'],
 ['Adora2a', 'Crh'],
 ['Pax6', 'Cyp26a1'],
 ['Reln', 'Crim1'],
 ['Isl1', 'Tac1'],
 ['Drd1', 'Ebf1'],
 ['Drd1', 'Scn4b'],
 ['Isl1', 'Aldoc'],
 ['Sncg', 'Vip'],
 ['Prlr', 'Cbln1'],
 ['Prlr', 'Satb1'],
 ['Foxp2', 'Fmod'],
 ['Gal', 'Avp'],
 ['Lhx8', 'Th'],
 ['Gpr101', 'Gabre'],
 ['Moxd1', 'Vwc2'],
 ['Moxd1', 'Pvalb'],
 ['Lamp5', 'Hcrtr2'],
 ['Foxp2', 'Htr1f'],
 ['Sncg', 'Kcnc2'],
 ['Prlr', 'Greb1'],
 ['Lamp5', 'Kit'],
 ['Sncg', 'Krt73'],
 ['Lhx6', 'Nxph2'],
 ['Fign', 'Lrpprc'],
 ['Cbln4', 'Luzp2'],
 ['Npnt', 'Calca'],
 ['Prkcd', 'Oprk1'],
 ['Pax6', 'Enpp2'],
 ['Pax6', 'Th'],
 ['Foxp2', 'Col6a1'],
 ['Prkcd', 'Adora2a'],
 ['Prkcd', 'Ezr'],
 ['Prkcd', 'Nts'],
 ['Calb2', 'Rgs12'],
 ['Reln', 'Ndnf'],
 ['Htr3a', 'Kcnc2'],
 ['Rbp4', 'Sst'],
 ['Foxp2', 'Adra2a'],
 ['Cbln4', 'Sst'],
 ['Sst', 'Nek7'],
 ['Chodl', 'Moxd1'],
 ['Sst', 'Fign'],
 ['Sst', 'Tmtc4'],
 ['Sst', 'Npy'],
 ['Prlr', 'St18'],
 ['P

In [1072]:
folder = '/bigdata/isaac/gaba_files/'
 
GABA_cl_mg_dict_filtered = {int(k): v for k, v in GABA_cl_mg_dict_filtered.items()}

In [1073]:
GABA_cl_mg_dict_filtered

{1: ['Chat',
  'Isl1',
  'Ecel1',
  'Th',
  'Zic1',
  'Luzp2',
  'Lhx8',
  'Clstn2',
  'Pbx3',
  'Ngfr',
  'Slc5a7',
  'Slc18a3',
  'Slc10a4'],
 2: ['Htr3a',
  'Zeb2',
  'Col19a1',
  'Npy1r',
  'Krt73',
  'Cxcl14',
  'Sncg',
  'Synpr',
  'Sln',
  'Pde5a',
  'Necab1',
  'Npas1',
  'Rgs12',
  'Cnr1'],
 3: ['Cbln2',
  'Prox1',
  'Calb2',
  'Prox1os',
  'Gpd1',
  'Plpp4',
  'Vip',
  'Aebp1',
  'Tac2',
  'Svil',
  'Adra1b',
  'Asic4'],
 4: ['Cpne4', 'Pde3a'],
 5: ['Ndnf', 'Tnnt1', 'Rgs5', 'Ppp1r1c', 'Timp3', '1810011O10Rik', 'Reln'],
 6: ['Dab1', 'Nfib', 'Nrip3', 'Rxfp1', 'Ifi27l2a'],
 7: ['Pdlim5', 'Nfix', 'Sema5a', 'Egfr', 'Cplx3', 'Crispld1', 'Kit', 'Thrsp'],
 8: ['Hcrtr2', 'Id2', 'Cryab', 'Vcan', 'Hapln1', 'Trpc5'],
 9: ['Thsd7a',
  'Gabrd',
  'Pthlh',
  'Tcap',
  'Pvalb',
  'Itih5',
  'Igfbp4',
  'Nek7',
  'C1ql1'],
 10: ['Kitl', 'Rbp4', 'Lgals1', 'Cort', 'Fam150b'],
 11: ['Vwc2', '3110035E14Rik', 'Tmem132c', 'Moxd1', 'Gpr83'],
 12: ['Maf', 'Mafb'],
 13: ['Sst', 'Sox6', 'Adgrg6', 'Npy'

In [1074]:
sd_shared_cl_mg_dict, connector_df_alt_lco_marker_shared_top = cc.build_corr_table_shared_top(heatmap_argmax_df_alt_lco,
                                                     corr_matrix_manual_alt_lco,
                                                     all_marker_labels_c,
                                                     amy_metadata_df_gaba,
                                                     GABA_cl_mg_dict_filtered)

In [1075]:
sd_shared_cl_mg_dict

{'1': ['Chat', 'Isl1'],
 '2': ['Sncg', 'Htr3a'],
 '3': ['Vip', 'Cbln2'],
 '5': ['Reln', 'Ndnf'],
 '7': ['Kit', 'Pdlim5'],
 '8': ['Hcrtr2', 'Id2'],
 '9': ['Pthlh', 'Pvalb'],
 '10': ['Rbp4', 'Kitl'],
 '11': ['Moxd1', 'Vwc2'],
 '13': ['Npy', 'Sst'],
 '16': ['St18', 'Gm17660'],
 '17': ['Calcr', 'Peg10'],
 '18': ['Prlr', 'Pde1c'],
 '19': ['Cbln1', 'Grp'],
 '29': ['Gal', 'Asb4'],
 '40': ['Htr1f', 'Tshz1']}

In [1076]:
GABA_mg_cl_dict_final = cc.create_mg_cl_dict_final(GABA_cl_mg_dict_filtered,sd_shared_cl_mg_dict)

In [1077]:
GABA_mg_cl_dict_final

{1: ['Chat', 'Isl1'],
 2: ['Sncg', 'Htr3a'],
 3: ['Vip', 'Cbln2'],
 4: ['Cpne4', 'Pde3a'],
 5: ['Reln', 'Ndnf'],
 6: ['Dab1', 'Nfib'],
 7: ['Kit', 'Pdlim5'],
 8: ['Hcrtr2', 'Id2'],
 9: ['Pthlh', 'Pvalb'],
 10: ['Rbp4', 'Kitl'],
 11: ['Moxd1', 'Vwc2'],
 12: ['Maf', 'Mafb'],
 13: ['Npy', 'Sst'],
 14: ['Spon1', 'Tox'],
 15: ['Gm28884', 'Fam159b'],
 16: ['St18', 'Gm17660'],
 17: ['Calcr', 'Peg10'],
 18: ['Prlr', 'Pde1c'],
 19: ['Cbln1', 'Grp'],
 20: ['Tmem100'],
 21: ['Unc13c', 'Crtac1'],
 22: ['Igfbp6', 'Rprm'],
 23: ['Wfs1', 'Prok2'],
 24: ['Jsrp1', 'Col18a1'],
 25: ['Lpl'],
 26: ['Rmst', 'Mgat4c'],
 27: ['Oprk1', 'Trhde'],
 28: ['Lmo1', 'Chn2'],
 29: ['Gal', 'Asb4'],
 30: ['Igsf1', 'A230065H16Rik'],
 31: ['Foxp2', 'Gpr88'],
 32: ['Npy2r'],
 33: ['Nwd2'],
 34: ['Lypd1'],
 35: ['Hs3st4', 'Car10'],
 36: ['BC039966', 'Ngb'],
 37: ['Col23a1'],
 38: ['Col6a1'],
 39: ['Myh7'],
 40: ['Htr1f', 'Tshz1'],
 41: ['Col11a1', 'Pde11a'],
 42: ['Pax6', 'Npnt'],
 43: ['Calca'],
 44: ['Dsc3', 'Mfge8'],
 4

In [1078]:
sd_labels_df = cc.generate_sd_labels_df(GABA_mg_cl_dict_final)

In [1079]:
sd_labels_df

Unnamed: 0,lco_index,sd_label,sd_label_complete
0,1,"[Chat, Isl1]",1 Chat-Isl1
1,2,"[Sncg, Htr3a]",2 Sncg-Htr3a
2,3,"[Vip, Cbln2]",3 Vip-Cbln2
3,4,"[Cpne4, Pde3a]",4 Cpne4-Pde3a
4,5,"[Reln, Ndnf]",5 Reln-Ndnf
5,6,"[Dab1, Nfib]",6 Dab1-Nfib
6,7,"[Kit, Pdlim5]",7 Kit-Pdlim5
7,8,"[Hcrtr2, Id2]",8 Hcrtr2-Id2
8,9,"[Pthlh, Pvalb]",9 Pthlh-Pvalb
9,10,"[Rbp4, Kitl]",10 Rbp4-Kitl


In [1080]:
amy_labels_df,amy_labels = cc.generate_amy_labels_df(connector_df_alt_lco_marker_shared_top,corr_matrix_manual_alt_lco)

In [1081]:
amy_labels_df

Unnamed: 0,0
1,1 Foxp2-Fmod
2,2 Foxp2-Adra2a
3,3 Foxp2-Col6a1
4,4 Foxp2-Htr1f
5,5 Pax6-Enpp2
6,6 Pax6-Th
7,7 Pax6-Cyp26a1
8,8 Prkcd-Oprk1
9,9 Prkcd-Ezr
10,10 Prkcd-Adora2a


In [473]:
##update corr plot with dropped/merged clusters (from dimorph_processing.py)

In [1082]:
folder

'/bigdata/isaac/gaba_files/'

In [1083]:
folder = '/bigdata/isaac/gaba_files/'
cc.plot_correlation_w_labels(corr_matrix_manual_alt_lco_sorted,sd_labels_df,amy_labels_df, folder, 'GABA_filtered_merged',savefig=True)

Launching server at http://localhost:46711


In [1084]:
today

'2024-09-02'

In [740]:
#write labels df's to file

In [1086]:
folder = '/bigdata/isaac/gaba_files/'
sd_labels_df.to_csv(folder + 'GABA_sd_labels_df_' + today + '.csv')
amy_labels_df.to_csv(folder + 'GABA_amy_labels_df_' + today + '.csv')

In [1087]:
folder

'/bigdata/isaac/gaba_files/'

In [1088]:
folder = '/bigdata/isaac/gaba_files/'
file = 'GABA_mg_cl_dict_final_' + today
#write dict to file
with open(folder+file+'.json', "w") as outfile: 
    json.dump(GABA_mg_cl_dict_final, outfile)

In [906]:
folder

'/bigdata/isaac/Vglut1_files/'

In [1089]:
cc.plot_connector_plot_with_labels(connector_df_alt_lco_marker_shared_top,GABA_mg_cl_dict_final,list(sd_labels_df['sd_label_complete']),amy_labels, folder,'GABA',savefig = True)

<IPython.core.display.Javascript object>

In [1090]:
xticks = cc.gene_explorer('Chat', 'amy', amy_df_GABA_expr_ge_cv_ls, amy_metadata_df_gaba,output_folder = '/bigdata/isaac/gaba_files/gene_explorer/',markers = True)

<IPython.core.display.Javascript object>

  ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, ha="right")


In [1092]:
fig,ax = plt.subplots(figsize = (12,10))
ax.plot(np.array(amy_avgs.loc['Chat']), '.-')
#ax.set_xticklabels(xticks)
#ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, ha="right")
plt.show()

<IPython.core.display.Javascript object>

## Vglut1 Analysis <a name="vglut1_analysis"></a>

In [320]:
all_cell_classes

['Astro',
 'Astro_SC',
 'Astro_agt',
 'COP',
 'EC',
 'Epend',
 'GABA',
 'OL',
 'OPC',
 'OPC_cycling',
 'Peri',
 'VGLUT1',
 'VGLUT2',
 'VLMC',
 'VSM',
 'microglia',
 'pvm']

### Process VGLUT1 Amy data

In [432]:
amy_df_VGLUT1_expr_ge_cv_ls, amy_metadata_df_VGLUT1 = cc.process_amy_data_class(amy_df,amy_metadata_df,IEG_list,sex_gene_list,cell_class='VGLUT1')

removing # duplicate gene rows:  65
removing  53  genes found in  ['Btg2', 'Jun', 'Egr4', 'Fosb', 'Junb', 'Gadd45g', 'Fos', 'Arc', 'Nr4a1', 'Npas4', 'Coq10b', 'Tns1', 'Per2', 'Ptgs2', 'Rnd3', 'Tnfaip6', 'Srxn1', 'Tiparp', 'Ccnl1', 'Mcl1', 'Dnajb5', 'Nr4a3', 'Fosl2', 'Nptx2', 'Rasl11a', 'Mest', 'Sertad1', 'Egr2', 'Midn', 'Gadd45b', 'Dusp6', 'Irs2', 'Plat', 'Ier2', 'Rrad', 'Tpbg', 'Csrnp1', 'Peli1', 'Per1', 'Kdm6b', 'Inhba', 'Plk2', 'Ifrd1', 'Baz1a', 'Trib1', 'Pim3', 'Lrrk2', 'Dusp1', 'Cdkn1a', 'Pim1', 'Sik1', 'Frat2', 'Dusp5']
removing  6  genes found in  ['Xist', 'Tsix', 'Ddx3y', 'Eif2s3y', 'Kdm5d', 'Uty']
Total genes reduced from 27874 to 13757


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

column (gene) mean after standardization: 0.00
column (gene) sigma after standardization: 1.00


In [433]:
amy_df_VGLUT1_expr_ge_cv_ls.shape

(603, 11947)

In [434]:
today

'2024-08-24'

In [436]:
#save to file

folder = '/bigdata/isaac/Vglut1_files/'

file1 = 'amy_df_VGLUT1_expr_ge_cv_ls_orig' 
file2 = 'amy_metadata_df_VGLUT1_orig' 

#amy_df_VGLUT1_expr_ge_cv_ls.to_feather(folder+file1+'.feather')
#amy_metadata_df_VGLUT1.to_json(folder+file2+'.json')

In [350]:
folder = '/bigdata/isaac/Vglut1_files/'
amy_df_VGLUT1_expr_ge_cv_ls_orig = pd.read_feather(folder + 'amy_df_VGLUT1_expr_ge_cv_ls_orig.feather')
amy_metadata_df_VGLUT1_orig = pd.read_json(folder + 'amy_metadata_df_VGLUT1_orig.json')

In [809]:
amy_df_VGLUT1_expr_ge_cv_ls_orig

Unnamed: 0_level_0,AGAGAATCAATCCTAG-1_08-1,CGATGCGCAGTGGCTC-1_08-1,GAGAGGTTCGCGTCGA-1_08-1,GCCAGGTTCACTTCTA-1_08-1,GGGCTCAAGCTGGCCT-1_08-1,GTTACCCAGCACACCC-1_08-1,TTATTGCGTATGACAA-1_08-1,AACACACAGACGCCAA-1_10-1,ACAAAGATCTCATGGA-1_10-1,AGGTGTTAGCAAGGAA-1_10-1,...,TTACGTTAGATTGGGC-1_74-1,ACACAGTCATGTGCCG-1_74-2,AGAACCTAGGTCCCTG-1_74-2,TCATTACCAACAGCTT-1_74-2,GACCGTGGTATAATGG-1_75-2,AGTCACAGTAACATGA-1_75-3,TGCGGCACACGTAGTT-1_75-3,TTCATTGTCACTTATC-1_75-3,TCTAACTCAGTATGAA-1_76-1,AGCTCAAGTAAGATTG-1_76-2
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Tac1,-0.244471,-0.244471,-0.244471,-0.244471,-0.244471,-0.244471,-0.244471,-0.244471,-0.244471,-0.244471,...,-0.244471,-0.244471,-0.244471,-0.244471,-0.244471,-0.244471,-0.244471,-0.244471,-0.244471,-0.244471
Xist,-0.234641,-0.234641,-0.234641,-0.234641,-0.234641,-0.234641,-0.234641,5.183820,1.830350,3.895341,...,-0.234641,-0.234641,-0.234641,-0.234641,-0.234641,-0.234641,-0.234641,-0.234641,-0.234641,-0.234641
Slamf1,-0.105871,-0.105871,-0.105871,-0.105871,-0.105871,-0.105871,-0.105871,-0.105871,-0.105871,-0.105871,...,-0.105871,-0.105871,-0.105871,-0.105871,-0.105871,-0.105871,-0.105871,-0.105871,-0.105871,-0.105871
Cartpt,-0.260137,-0.260137,-0.260137,-0.260137,4.922390,-0.260137,-0.260137,-0.260137,-0.260137,-0.260137,...,-0.260137,-0.260137,-0.260137,-0.260137,-0.260137,-0.260137,-0.260137,-0.260137,-0.260137,-0.260137
Fibcd1,-0.200757,-0.200757,-0.200757,-0.200757,-0.200757,-0.200757,-0.200757,-0.200757,-0.200757,-0.200757,...,-0.200757,-0.200757,-0.200757,-0.200757,-0.200757,-0.200757,-0.200757,-0.200757,-0.200757,-0.200757
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Mgat5b,1.238385,-0.787189,2.056063,-0.787189,1.238385,1.238385,-0.787189,-0.787189,1.238385,-0.787189,...,-0.787189,-0.787189,-0.787189,-0.787189,-0.787189,0.225598,-0.787189,-0.787189,-0.787189,-0.787189
Cdc42ep3,-0.402102,-0.402102,-0.402102,-0.402102,-0.402102,-0.402102,2.068810,-0.402102,-0.402102,-0.402102,...,-0.402102,-0.402102,-0.402102,2.715841,-0.402102,-0.402102,-0.402102,-0.402102,-0.402102,-0.402102
Gm28075,-0.038868,-0.038868,-0.038868,-0.038868,-0.038868,21.677573,-0.038868,-0.038868,-0.038868,-0.038868,...,-0.038868,-0.038868,-0.038868,-0.038868,-0.038868,-0.038868,-0.038868,-0.038868,-0.038868,-0.038868
Dpy19l3,-0.540816,-0.540816,-0.540816,-0.540816,1.474359,-0.540816,-0.540816,2.002052,-0.540816,-0.540816,...,-0.540816,-0.540816,0.730618,-0.540816,0.730618,-0.540816,-0.540816,-0.540816,-0.540816,0.730618


### Load VGLUT1 SD data

In [324]:
folder

'/bigdata/isaac/Vglut1_files/'

In [802]:
folder = '/bigdata/isaac/Vglut1_files/'

#VGLUT1_meta_data_df_plis = pd.read_json(folder + 'Vglut1_meta_data_df_plis2024-08-21.json')

VGLUT1_df_prelinkage_ls_orig = pd.read_feather(folder + 'Vglut1_df_pre_linkage_ls_2024-08-21.feather')
#VGLUT1_metadata_df_prelinkage_ls_orig = pd.read_json(folder + 'Vglut1_meta_data_df_pre_linkage_2024-08-21.json')
VGLUT1_meta_data_df_plis_filtered = pd.read_json(folder + 'Vglut1meta_data_df_plis_filtered_2024-08-29.json')


In [810]:
VGLUT1_df_plis_filtered = VGLUT1_df_prelinkage_ls_orig.reindex(index = VGLUT1_meta_data_df_plis_filtered.columns)

In [812]:
VGLUT1_df_plis_filtered

gene,S100a3,Loxl2,Avp,Gm26737,Snca,Cx3cr1,Gm15124,Mcm8,Cartpt,Grp,...,Gpsm3,Itpr1,Ppp1r3b,Ptcra,Ier3,Llgl2,Dlx6os1,Acan,Nhs,Smoc2
AGGTTACTCCATCTGC-1_10X51_1,-0.060873,-0.097924,-0.279639,-0.092088,-0.104791,-0.080927,-0.128169,-0.240404,-0.226919,-0.218716,...,-0.064748,-0.780670,-0.055517,-0.123987,-0.263428,-0.135938,-0.080606,-0.090912,-0.299721,-0.172689
GACCCTTAGGGACCAT-1_10X51_1,-0.060873,-0.097924,-0.279639,-0.092088,-0.104791,-0.080927,-0.128169,-0.240404,-0.226919,-0.218716,...,-0.064748,-0.780670,-0.055517,-0.123987,3.223543,-0.135938,-0.080606,-0.090912,-0.299721,-0.172689
CAGTGCGTCTCAGTCC-1_10X51_2,-0.060873,-0.097924,-0.279639,-0.092088,-0.104791,-0.080927,-0.128169,-0.240404,-0.226919,-0.218716,...,-0.064748,-0.780670,-0.055517,-0.123987,-0.263428,-0.135938,-0.080606,-0.090912,-0.299721,-0.172689
TTGGGCGCAGAGGTAC-1_10X36_2,-0.060873,-0.097924,-0.279639,-0.092088,-0.104791,-0.080927,-0.128169,-0.240404,-0.226919,-0.218716,...,-0.064748,-0.780670,-0.055517,-0.123987,-0.263428,-0.135938,-0.080606,-0.090912,-0.299721,-0.172689
AGTACTGCAACCGCTG-1_10X51_2,-0.060873,-0.097924,-0.279639,-0.092088,-0.104791,-0.080927,-0.128169,-0.240404,-0.226919,-0.218716,...,-0.064748,-0.780670,-0.055517,-0.123987,3.223543,-0.135938,-0.080606,-0.090912,-0.299721,-0.172689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AGGACGAGTCTGTAAC-1_10X38_1,-0.060873,-0.097924,2.972951,-0.092088,-0.104791,-0.080927,-0.128169,-0.240404,-0.226919,-0.218716,...,14.623429,0.588399,-0.055517,-0.123987,-0.263428,-0.135938,-0.080606,-0.090912,-0.299721,-0.172689
AGTCATGGTTCAAAGA-1_10X52_3,-0.060873,-0.097924,-0.279639,-0.092088,-0.104791,-0.080927,-0.128169,-0.240404,-0.226919,-0.218716,...,-0.064748,0.588399,-0.055517,-0.123987,3.223543,-0.135938,-0.080606,-0.090912,-0.299721,-0.172689
CGTGATAAGCACTCGC-1_10X51_3,-0.060873,-0.097924,-0.279639,-0.092088,-0.104791,-0.080927,-0.128169,-0.240404,-0.226919,-0.218716,...,-0.064748,-0.780670,-0.055517,-0.123987,-0.263428,-0.135938,-0.080606,-0.090912,-0.299721,-0.172689
TTCTAGTGTTTACGTG-1_10X52_1,-0.060873,-0.097924,-0.279639,-0.092088,-0.104791,-0.080927,-0.128169,-0.240404,-0.226919,-0.218716,...,-0.064748,0.588399,-0.055517,-0.123987,-0.263428,-0.135938,-0.080606,-0.090912,-0.299721,-0.172689


In [804]:
VGLUT1_meta_data_df_plis_filtered.shape

(29, 9550)

### Get intersection

In [690]:
lco = pd.unique(VGLUT1_meta_data_df_plis.loc['cluster_label'])
lco

array([28, 27, 17, 11, 14, 18, 19, 3, 6, 8, 12, 15, 0, 16, 5, 9, 26, 13,
       22, 21, 25, 24, 1, 10, 20, 23, 2, 7, 4], dtype=object)

In [438]:
#amy_df_VGLUT1_expr_ge_cv_ls, VGLUT1_df_prelinkage_ls = cc.get_df_gene_intersection(VGLUT1_df_prelinkage_ls_orig,amy_df_VGLUT1_expr_ge_cv_ls,IEG_list)

removing  0  genes found in  ['Btg2', 'Jun', 'Egr4', 'Fosb', 'Junb', 'Gadd45g', 'Fos', 'Arc', 'Nr4a1', 'Npas4', 'Coq10b', 'Tns1', 'Per2', 'Ptgs2', 'Rnd3', 'Tnfaip6', 'Srxn1', 'Tiparp', 'Ccnl1', 'Mcl1', 'Dnajb5', 'Nr4a3', 'Fosl2', 'Nptx2', 'Rasl11a', 'Mest', 'Sertad1', 'Egr2', 'Midn', 'Gadd45b', 'Dusp6', 'Irs2', 'Plat', 'Ier2', 'Rrad', 'Tpbg', 'Csrnp1', 'Peli1', 'Per1', 'Kdm6b', 'Inhba', 'Plk2', 'Ifrd1', 'Baz1a', 'Trib1', 'Pim3', 'Lrrk2', 'Dusp1', 'Cdkn1a', 'Pim1', 'Sik1', 'Frat2', 'Dusp5']
Index(['Tac1', 'Slamf1', 'Cartpt', 'Fibcd1', 'Pbld2', 'Tspan15', 'Snca',
       'Dlk1', 'Gm26710', 'Tshz2',
       ...
       'Emilin1', 'Hspb2', 'Vwc2', 'Fgfr1', 'Wnt5a', 'Gabrg3', 'Nhs',
       'Tmem215', 'Rims3', 'Itpr2'],
      dtype='object', length=314)


In [813]:
amy_df_VGLUT1_expr_ge_cv_ls, VGLUT1_df_plis_filtered = cc.get_df_gene_intersection(VGLUT1_df_plis_filtered,amy_df_VGLUT1_expr_ge_cv_ls,IEG_list)

removing  0  genes found in  ['Btg2', 'Jun', 'Egr4', 'Fosb', 'Junb', 'Gadd45g', 'Fos', 'Arc', 'Nr4a1', 'Npas4', 'Coq10b', 'Tns1', 'Per2', 'Ptgs2', 'Rnd3', 'Tnfaip6', 'Srxn1', 'Tiparp', 'Ccnl1', 'Mcl1', 'Dnajb5', 'Nr4a3', 'Fosl2', 'Nptx2', 'Rasl11a', 'Mest', 'Sertad1', 'Egr2', 'Midn', 'Gadd45b', 'Dusp6', 'Irs2', 'Plat', 'Ier2', 'Rrad', 'Tpbg', 'Csrnp1', 'Peli1', 'Per1', 'Kdm6b', 'Inhba', 'Plk2', 'Ifrd1', 'Baz1a', 'Trib1', 'Pim3', 'Lrrk2', 'Dusp1', 'Cdkn1a', 'Pim1', 'Sik1', 'Frat2', 'Dusp5']
Index(['Tac1', 'Slamf1', 'Cartpt', 'Fibcd1', 'Pbld2', 'Tspan15', 'Snca',
       'Dlk1', 'Gm26710', 'Tshz2',
       ...
       'Emilin1', 'Hspb2', 'Vwc2', 'Fgfr1', 'Wnt5a', 'Gabrg3', 'Nhs',
       'Tmem215', 'Rims3', 'Itpr2'],
      dtype='object', length=314)


In [814]:
amy_df_VGLUT1_expr_ge_cv_ls.shape

(314, 11947)

In [815]:
VGLUT1_df_plis_filtered.shape

(314, 9550)

In [441]:
sorted(list(amy_df_VGLUT1_expr_ge_cv_ls.index))

['1700031P21Rik',
 '2900060B14Rik',
 '9330185C12Rik',
 'A330069E16Rik',
 'A830036E02Rik',
 'AW551984',
 'Acan',
 'Ackr1',
 'Acvr1c',
 'Adcyap1',
 'Agl',
 'Aldh1a3',
 'Amigo2',
 'Angpt1',
 'Ap1s2',
 'Arhgap36',
 'Arhgap6',
 'Arhgdib',
 'Arhgef40',
 'Atf3',
 'B230312C02Rik',
 'B930036N10Rik',
 'BC026585',
 'BC030499',
 'BC048546',
 'Bace2',
 'Baiap3',
 'Batf3',
 'Bcl11a',
 'Bdnf',
 'Bok',
 'Btbd3',
 'C1ql1',
 'C1ql2',
 'C730034F03Rik',
 'Cacna2d2',
 'Calb2',
 'Car8',
 'Cartpt',
 'Cav1',
 'Cbln2',
 'Cbln4',
 'Ccnb1',
 'Cd24a',
 'Cd44',
 'Cdkn1c',
 'Cenpf',
 'Chrdl1',
 'Cited2',
 'Cntn4',
 'Cntn5',
 'Col11a1',
 'Col12a1',
 'Col1a1',
 'Col23a1',
 'Col24a1',
 'Col25a1',
 'Col27a1',
 'Col6a1',
 'Cplx3',
 'Cpne2',
 'Cpne5',
 'Crh',
 'Cryab',
 'Csf3r',
 'Ctxn3',
 'Cx3cr1',
 'Cxcl12',
 'Cxcl14',
 'Cyp26b1',
 'Dach1',
 'Dcn',
 'Dio3',
 'Diras2',
 'Dlk1',
 'Doc2b',
 'Dock10',
 'Dqx1',
 'Ecel1',
 'Ecm2',
 'Emilin1',
 'Endou',
 'Eps8',
 'Ermn',
 'Esr1',
 'Etl4',
 'Exoc1',
 'Fam46a',
 'Fam83a',
 'Fbl

### Checkpoint - save intersected data/metadata to file / load from here

In [442]:
folder

'/bigdata/isaac/Vglut1_files/'

In [443]:
#save to file

folder = '/bigdata/isaac/Vglut1_files/'

file1 = 'amy_df_VGLUT1_expr_ge_cv_ls_cell_comp' 
file2 = 'amy_metadata_df_VGLUT1_cell_comp' 

amy_df_VGLUT1_expr_ge_cv_ls.to_feather(folder+file1+'.feather')
amy_metadata_df_VGLUT1.to_json(folder+file2+'.json')

#################################

file1 = 'VGLUT1_df_prelinkage_ls_cell_comp' 
file2 = 'VGLUT1_meta_data_df_prelinkage_cell_comp' 

VGLUT1_df_prelinkage_ls.to_feather(folder+file1+'.feather')
VGLUT1_metadata_df_prelinkage_ls_orig.to_json(folder+file2+'.json')


In [52]:
#load data
folder = '/bigdata/isaac/VGLUT1_files/'

#amy_df_VGLUT1_expr_ge_cv_ls = pd.read_feather(folder + 'amy_df_VGLUT1_expr_ge_cv_ls_cell_comp.feather')
#amy_metadata_df_VGLUT1 = pd.read_json(folder + 'amy_metadata_df_VGLUT1_cell_comp.json')

#VGLUT1_df_prelinkage_ls = pd.read_feather(folder + 'VGLUT1_df_prelinkage_ls_cell_comp.feather')
#VGLUT1_meta_data_df_prelinkage = pd.read_json(folder + 'VGLUT1_meta_data_df_prelinkage_cell_comp.json' )

### Compute avg expression for all genes, for every cluster

In [1001]:
amy_avgs = dp.compute_avg_expr_per_cluster_label(amy_df_VGLUT1_expr_ge_cv_ls,amy_metadata_df_VGLUT1)

In [1002]:
amy_avgs

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,23,24,25,26,27,28,29,30,31,32
Tac1,-0.194167,-0.126893,-0.178076,-0.016315,-0.036845,-0.118406,-0.124385,-0.161997,-0.079559,4.152479,...,-0.123742,-0.217711,0.004919,4.263656,-0.096108,-0.135938,-0.244471,-0.179483,-0.128544,-0.203009
Slamf1,-0.105871,0.001953,-0.066666,0.052993,-0.105871,0.010102,-0.039187,0.125944,-0.105871,-0.105871,...,-0.005924,-0.105871,-0.016506,-0.047439,0.043648,0.049546,0.094511,0.015168,-0.024519,-0.037588
Cartpt,-0.021226,-0.123727,-0.000883,0.004818,-0.219082,-0.181355,-0.130713,-0.083667,-0.124451,-0.169563,...,-0.14453,0.818864,-0.072145,-0.089928,-0.101676,-0.166746,-0.172837,-0.260137,0.144345,-0.21274
Fibcd1,-0.200757,-0.096971,-0.083385,-0.200757,-0.200757,-0.041012,-0.172215,-0.149736,0.496483,-0.016769,...,-0.185782,-0.200757,0.091133,-0.112006,-0.112697,-0.181824,-0.075747,-0.200757,-0.159702,0.054471
Pbld2,-0.081712,0.451428,0.051361,0.359408,-0.055383,0.123805,0.088671,0.150994,-0.213967,-0.114765,...,-0.195439,-0.076586,-0.136937,-0.238935,-0.177578,-0.045297,-0.009962,-0.02766,-0.165691,-0.167791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Gabrg3,0.143111,0.459558,0.008212,-0.115661,-0.417358,0.409043,0.560066,0.745752,-0.405415,0.356747,...,-0.567281,-0.613648,-0.389966,0.09582,-0.193156,0.680269,0.222021,0.15227,-0.377551,-0.05809
Nhs,0.052265,-0.201807,-0.231616,-0.265213,-0.139388,-0.171996,0.278677,-0.082943,0.539967,-0.214153,...,0.025154,-0.265213,0.115833,0.113901,0.073143,0.007143,-0.054186,0.318324,0.248706,-0.079451
Tmem215,-0.20894,0.404455,0.600378,0.659114,-0.187673,-0.238815,0.616408,-0.206852,0.446499,0.99352,...,-0.358244,-0.411616,0.059611,0.963422,-0.064432,-0.26546,-0.072393,-0.129807,-0.067544,0.02234
Rims3,0.03584,-0.20809,-0.216283,-0.181919,-0.095112,-0.117987,1.084829,-0.090978,0.94758,0.651262,...,-0.04311,0.059749,-0.066852,0.648454,-0.050085,0.362177,-0.026185,-0.139134,-0.002799,-0.081028


In [421]:
VGLUT1_df_prelinkage_ls.shape

(315, 10369)

In [1003]:
sd_avgs = dp.compute_avg_expr_per_cluster_label(VGLUT1_df_plis_filtered,VGLUT1_meta_data_df_plis_filtered)

In [819]:
sd_avgs

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,13,14,15,16,17,18,19,20,21,22
Tac1,-0.096371,-0.203812,-0.128785,-0.088291,-0.058328,-0.120016,-0.064116,-0.083966,-0.074248,-0.016609,...,-0.043609,1.28618,0.004949,-0.203812,-0.033596,-0.040876,0.148131,-0.053088,-0.054177,-0.0705
Slamf1,0.151819,-0.117678,-0.071013,-0.081061,0.004962,-0.117678,-0.012686,-0.023074,-0.037092,0.119507,...,-0.017217,0.161074,0.252833,-0.117678,-0.033012,-0.117678,0.149866,-0.033431,0.087075,0.186349
Cartpt,-0.077219,1.961464,-0.152357,-0.099881,0.122446,-0.226919,-0.084758,-0.101595,0.394353,0.023777,...,-0.078759,0.366756,-0.068677,-0.118502,-0.095172,0.176751,0.033282,0.103509,-0.067612,-0.226919
Fibcd1,-0.193053,-0.232025,-0.223494,-0.107613,0.449114,-0.232025,-0.218476,0.582846,-0.148495,-0.193001,...,-0.192073,-0.061643,-0.200538,-0.232025,-0.232025,-0.232025,-0.232025,-0.223408,-0.202361,-0.232025
Pbld2,0.357262,-0.061706,-0.177195,-0.136383,-0.003001,-0.12123,-0.034769,-0.034348,0.012313,-0.077536,...,-0.110454,0.022744,0.53304,0.310322,-0.01273,-0.210516,0.274168,-0.07431,-0.032423,-0.210516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Gabrg3,1.478926,-0.322647,-0.271008,-0.255009,-0.031226,-0.21774,-0.213352,-0.218246,-0.285432,-0.306217,...,0.909307,0.935079,1.511242,-0.322647,-0.322647,-0.322647,-0.229982,-0.295231,-0.228091,-0.322647
Nhs,-0.128213,1.560369,0.714317,0.387355,-0.024658,-0.124721,-0.265787,-0.246684,-0.243329,-0.299721,...,0.854816,0.792793,0.360712,0.593509,-0.105891,0.09032,0.042359,-0.081527,0.01278,0.257098
Tmem215,-0.058926,0.066053,0.037052,-0.091517,-0.20144,-0.059545,-0.223587,-0.09838,-0.234701,-0.131028,...,0.563962,0.727548,2.352586,0.656652,-0.056595,0.118386,1.25249,-0.16515,-0.214299,-0.247941
Rims3,-0.109146,-0.298423,-0.237889,-0.212924,-0.228668,-0.298423,-0.219108,-0.248727,-0.267062,-0.261501,...,2.003608,1.529063,1.617096,-0.298423,-0.015557,-0.226118,-0.090185,-0.0335,-0.139057,-0.110895


In [1005]:
heatmap2, heatmap_argmax_df_alt_lco, corr_matrix_manual_alt_lco, corr_matrix_manual_alt_lco_sorted = cc.plot_correlation(sd_avgs,amy_avgs)
# Display the plot
hvplot.show(heatmap2)
    
# Ensure output is displayed inline
hv.output(heatmap2, backend='bokeh')

Launching server at http://localhost:35191


In [1006]:
error_genes,all_marker_labels = cc.amy_gene_spell_checker(amy_df_VGLUT1_expr_ge_cv_ls_orig,amy_metadata_df_VGLUT1)
error_genes

  all_m = [amy_metadata_df.loc['markers'][x] for x in range((amy_metadata_df.loc['markers'].shape[0]))]


gene,  Zic4 not found in index


['Zic4']

In [1007]:
all_marker_labels

[['Bok', 'Pamr1'],
 ['Car12', 'Celsr1'],
 ['Cartpt', 'Fam46a'],
 ['Cbln1', 'Coch'],
 ['Cd36', 'Calb2'],
 ['Dcn', 'C1ql2'],
 ['Eps8', 'Cd44'],
 ['Ermn', 'Mpped1'],
 ['Fbln1', 'Il33'],
 ['Fibcd1', 'Vit'],
 ['Fmo1', 'Rxfp3'],
 ['Gpr101', 'Grem1'],
 ['Gpx3', 'Rxfp1'],
 ['Grp', 'Cpne8'],
 ['Gsg1l', 'Prox1'],
 ['Igfn1', 'Ndst4'],
 ['Mid1', 'Cdh22'],
 ['Oasl2', 'Ifit1'],
 ['Plcxd3', 'Reln'],
 ['Rspo2', 'Sema3e'],
 ['Sema5a', 'Dcn'],
 ['Sim1', 'C1ql1'],
 ['St8sia2', 'Cald1'],
 ['Tac1', 'Igfbp5'],
 ['Thrsp', 'Lamp5'],
 ['Trh', 'Kit'],
 ['Trh', 'Mdga1'],
 ['Trh', 'Medag'],
 ['Trh', 'Rxfp1'],
 ['Trh', 'Slc23a3'],
 ['Wfs1', 'Sorcs3'],
 ['Zic4', 'Trp73']]

In [1018]:
folder = '/bigdata/isaac/Vglut1_files/'

In [1009]:
with open(folder + 'Vglut1_cl_mg_filtered_2024-08-29.json') as json_data:
    VGLUT1_cl_mg_dict_filtered = json.load(json_data)

In [1010]:
VGLUT1_cl_mg_dict_filtered = {int(k): v for k, v in VGLUT1_cl_mg_dict_filtered.items()}

In [1011]:
VGLUT1_cl_mg_dict_filtered

{1: ['Arhgdib', 'Sim1'],
 2: ['Gm11549',
  'Prox1',
  'Eps8',
  'Peg10',
  'Plekhg1',
  'Penk',
  'Pappa',
  'Grp',
  'Atf3'],
 3: ['Il33', 'Fbln1', 'Prox1os'],
 4: ['Cplx3', 'Fst', 'Cd44', 'Gm15261'],
 5: ['Arhgap6', 'Galnt9', 'Dlk1', 'Id4', 'Dcn', 'Mirt1', 'Meis2'],
 6: ['2410022M11Rik', 'Cd24a', 'Dio3'],
 7: ['Pde11a', 'Vgll3', 'Ecm2', 'Slc1a3', 'Rspo1'],
 8: ['Lypd6'],
 9: ['Bdnf', 'Pcdh11x', 'Abhd11os', 'Llgl2', 'C1ql2'],
 10: ['Igfbp4', 'Mgp', 'Col6a1', 'Pdlim1', 'Col25a1', 'Sox5'],
 11: ['Cpne2',
  'Adgrg6',
  'S100a16',
  'Fgf1',
  'Tspan18',
  'Smoc2',
  'Calb2',
  'Kit',
  'Col12a1'],
 12: ['Angpt1', 'Mt2', 'Tac1', 'Nxph3', 'Nhlh2', 'Cxcl12', 'Igfbpl1'],
 13: ['Sema5a',
  'Medag',
  'Stxbp6',
  'Cox6a2',
  'Sorcs1',
  'Nxph1',
  'Vwc2',
  'Tshz2',
  'Rims3'],
 14: ['Snca', 'Foxp2', 'Oprk1', 'Olfm3', 'Rgs12'],
 15: ['Spon1',
  'Tmem215',
  'Scube1',
  'Rph3a',
  'Myl4',
  'Cyp26b1',
  'Sostdc1',
  'Htr2c',
  'Ptn'],
 16: ['Rxfp1',
  'Fam3c',
  'Col23a1',
  'Pam',
  'Krt12',
  

In [1012]:
VGLUT1_df_prelinkage_ls.shape

(314, 10369)

In [1013]:
VGLUT1_cl_mg_dict_filtered.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])

In [1014]:
sd_shared_cl_mg_dict, connector_df_alt_lco_marker_shared_top = cc.build_corr_table_shared_top(heatmap_argmax_df_alt_lco,
                                                     corr_matrix_manual_alt_lco,
                                                     all_marker_labels,
                                                     amy_metadata_df_VGLUT1,
                                                     VGLUT1_cl_mg_dict_filtered)

In [1015]:
sd_shared_cl_mg_dict

{'1': ['Sim1', 'Arhgdib'],
 '3': ['Il33', 'Fbln1'],
 '4': ['Cd44', 'Cplx3'],
 '5': ['Dcn', 'Arhgap6'],
 '12': ['Tac1', 'Angpt1'],
 '21': ['Reln', 'BC048546']}

In [1016]:
VGLUT1_mg_cl_dict_final = cc.create_mg_cl_dict_final(VGLUT1_cl_mg_dict_filtered,sd_shared_cl_mg_dict)

In [1017]:
VGLUT1_mg_cl_dict_final

{1: ['Sim1', 'Arhgdib'],
 2: ['Gm11549', 'Prox1'],
 3: ['Il33', 'Fbln1'],
 4: ['Cd44', 'Cplx3'],
 5: ['Dcn', 'Arhgap6'],
 6: ['2410022M11Rik', 'Cd24a'],
 7: ['Pde11a', 'Vgll3'],
 8: ['Lypd6'],
 9: ['Bdnf', 'Pcdh11x'],
 10: ['Igfbp4', 'Mgp'],
 11: ['Cpne2', 'Adgrg6'],
 12: ['Tac1', 'Angpt1'],
 13: ['Sema5a', 'Medag'],
 14: ['Snca', 'Foxp2'],
 15: ['Spon1', 'Tmem215'],
 16: ['Rxfp1', 'Fam3c'],
 17: ['Moxd1'],
 18: ['Nppc', 'Plk5'],
 19: ['Rnf152', 'Ecel1'],
 20: ['Rab3b'],
 21: ['Reln', 'BC048546'],
 22: ['Gfod2']}

In [868]:
VGLUT1_mg_cl_dict_final.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])

In [1019]:
list(VGLUT1_mg_cl_dict_final.values())

[['Sim1', 'Arhgdib'],
 ['Gm11549', 'Prox1'],
 ['Il33', 'Fbln1'],
 ['Cd44', 'Cplx3'],
 ['Dcn', 'Arhgap6'],
 ['2410022M11Rik', 'Cd24a'],
 ['Pde11a', 'Vgll3'],
 ['Lypd6'],
 ['Bdnf', 'Pcdh11x'],
 ['Igfbp4', 'Mgp'],
 ['Cpne2', 'Adgrg6'],
 ['Tac1', 'Angpt1'],
 ['Sema5a', 'Medag'],
 ['Snca', 'Foxp2'],
 ['Spon1', 'Tmem215'],
 ['Rxfp1', 'Fam3c'],
 ['Moxd1'],
 ['Nppc', 'Plk5'],
 ['Rnf152', 'Ecel1'],
 ['Rab3b'],
 ['Reln', 'BC048546'],
 ['Gfod2']]

In [1020]:
sd_labels_df = cc.generate_sd_labels_df(VGLUT1_mg_cl_dict_final)

In [1021]:
sd_labels_df

Unnamed: 0,lco_index,sd_label,sd_label_complete
0,1,"[Sim1, Arhgdib]",1 Sim1-Arhgdib
1,2,"[Gm11549, Prox1]",2 Gm11549-Prox1
2,3,"[Il33, Fbln1]",3 Il33-Fbln1
3,4,"[Cd44, Cplx3]",4 Cd44-Cplx3
4,5,"[Dcn, Arhgap6]",5 Dcn-Arhgap6
5,6,"[2410022M11Rik, Cd24a]",6 2410022M11Rik-Cd24a
6,7,"[Pde11a, Vgll3]",7 Pde11a-Vgll3
7,8,[Lypd6],8 Lypd6
8,9,"[Bdnf, Pcdh11x]",9 Bdnf-Pcdh11x
9,10,"[Igfbp4, Mgp]",10 Igfbp4-Mgp


In [1022]:
amy_labels_df,amy_labels = cc.generate_amy_labels_df(connector_df_alt_lco_marker_shared_top,corr_matrix_manual_alt_lco)

In [1023]:
amy_labels_df

Unnamed: 0,0
1,1 Zic4-Trp73
2,2 Rspo2-Sema3e
3,3 Sema5a-Dcn
4,4 Wfs1-Sorcs3
5,5 Cbln1-Coch
6,6 Thrsp-Lamp5
7,7 Bok-Pamr1
8,8 Sim1-C1ql1
9,9 Gpx3-Rxfp1
10,10 Ermn-Mpped1


In [473]:
##update corr plot with dropped/merged clusters (from dimorph_processing.py)

In [1024]:
folder

'/bigdata/isaac/Vglut1_files/'

In [1025]:
folder = '/bigdata/isaac/Vglut1_files/'
cc.plot_correlation_w_labels(corr_matrix_manual_alt_lco_sorted,sd_labels_df,amy_labels_df, folder, 'VGLUT1_filtered_merged',savefig=True)

Launching server at http://localhost:41505


In [1027]:
today

'2024-08-28'

In [740]:
#write labels df's to file

In [1028]:
folder = '/bigdata/isaac/Vglut1_files/'
sd_labels_df.to_csv(folder + 'VGLUT1_sd_labels_df_' + today + '.csv')
amy_labels_df.to_csv(folder + 'VGLUT1_amy_labels_df_' + today + '.csv')

In [770]:
folder

'/bigdata/isaac/Vglut1_files/'

In [1029]:
folder = '/bigdata/isaac/Vglut1_files/'
file = 'VGLUT1_mg_cl_dict_final_' + today
#write dict to file
with open(folder+file+'.json', "w") as outfile: 
    json.dump(VGLUT1_mg_cl_dict_final, outfile)

In [493]:
VGLUT1_meta_data_df_plis.shape

(29, 10369)

In [903]:
list(sd_labels_df['sd_label_complete'])

['1 Arhgdib-Sim1',
 '2 Gm11549-Prox1',
 '3 Il33-Fbln1',
 '4 Cplx3-Fst',
 '5 Arhgap6-Galnt9',
 '6 2410022M11Rik-Cd24a',
 '7 Pde11a-Vgll3',
 '8 Lypd6',
 '9 Bdnf-Pcdh11x',
 '10 Igfbp4-Mgp',
 '11 Cpne2-Adgrg6',
 '12 Angpt1-Mt2',
 '13 Sema5a-Medag',
 '14 Snca-Foxp2',
 '15 Spon1-Tmem215',
 '16 Rxfp1-Fam3c',
 '17 Moxd1',
 '18 Nppc-Plk5',
 '19 Rnf152-Ecel1',
 '20 Rab3b',
 '21 BC048546-Reln',
 '22 Gfod2']

In [906]:
folder

'/bigdata/isaac/Vglut1_files/'

In [1030]:
cc.plot_connector_plot_with_labels(connector_df_alt_lco_marker_shared_top,VGLUT1_mg_cl_dict_final,list(sd_labels_df['sd_label_complete']),amy_labels, folder,'Vglut1',savefig = True)

<IPython.core.display.Javascript object>

## Vglut2 Analysis <a name="vglut2_analysis"></a>

### Process VGLUT1 Amy data

In [494]:
amy_df_VGLUT2_expr_ge_cv_ls, amy_metadata_df_VGLUT2 = cc.process_amy_data_class(amy_df,amy_metadata_df,IEG_list,sex_gene_list,cell_class='VGLUT2')

removing # duplicate gene rows:  65
removing  53  genes found in  ['Btg2', 'Jun', 'Egr4', 'Fosb', 'Junb', 'Gadd45g', 'Fos', 'Arc', 'Nr4a1', 'Npas4', 'Coq10b', 'Tns1', 'Per2', 'Ptgs2', 'Rnd3', 'Tnfaip6', 'Srxn1', 'Tiparp', 'Ccnl1', 'Mcl1', 'Dnajb5', 'Nr4a3', 'Fosl2', 'Nptx2', 'Rasl11a', 'Mest', 'Sertad1', 'Egr2', 'Midn', 'Gadd45b', 'Dusp6', 'Irs2', 'Plat', 'Ier2', 'Rrad', 'Tpbg', 'Csrnp1', 'Peli1', 'Per1', 'Kdm6b', 'Inhba', 'Plk2', 'Ifrd1', 'Baz1a', 'Trib1', 'Pim3', 'Lrrk2', 'Dusp1', 'Cdkn1a', 'Pim1', 'Sik1', 'Frat2', 'Dusp5']
removing  6  genes found in  ['Xist', 'Tsix', 'Ddx3y', 'Eif2s3y', 'Kdm5d', 'Uty']
Total genes reduced from 27874 to 12720


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

column (gene) mean after standardization: 0.00
column (gene) sigma after standardization: 1.00


In [495]:
amy_df_VGLUT2_expr_ge_cv_ls.shape

(338, 5231)

In [496]:
today

'2024-08-27'

In [497]:
#save to file

folder = '/bigdata/isaac/Vglut2_files/'

file1 = 'amy_df_VGLUT2_expr_ge_cv_ls_orig' 
file2 = 'amy_metadata_df_VGLUT2_orig' 

amy_df_VGLUT2_expr_ge_cv_ls.to_feather(folder+file1+'.feather')
amy_metadata_df_VGLUT2.to_json(folder+file2+'.json')

In [909]:
folder = '/bigdata/isaac/Vglut2_files/'
amy_df_VGLUT2_expr_ge_cv_ls_orig = pd.read_feather(folder + 'amy_df_VGLUT2_expr_ge_cv_ls_orig.feather')
amy_metadata_df_VGLUT2_orig = pd.read_json(folder + 'amy_metadata_df_VGLUT2_orig.json')

### Load VGLUT2 SD data

In [499]:
folder

'/bigdata/isaac/Vglut2_files/'

In [928]:
folder = '/bigdata/isaac/Vglut2_files/'

VGLUT2_df_prelinkage_ls_orig = pd.read_feather(folder + 'Vglut2_df_pre_linkage_ls_2024-09-01.feather')
VGLUT2_metadata_df_prelinkage_ls_orig = pd.read_json(folder + 'Vglut2_meta_data_df_pre_linkage_2024-09-01.json')
VGLUT2_meta_data_df_plis_filtered = pd.read_json(folder + 'Vglut2meta_data_df_plis_filtered_2024-09-01.json')


In [922]:
VGLUT2_df_plis_filtered = VGLUT2_df_prelinkage_ls_orig.reindex(index = VGLUT2_meta_data_df_plis_filtered.columns)

### Get intersection

In [501]:
lco = pd.unique(VGLUT2_meta_data_df_plis.loc['cluster_label'])
lco

array([18, 15, 25, 24, 12, 16, 5, 10, 7, 0, 21, 13, 14, 22, 8, 1, 4, 3, 6,
       2, 9, 19, 20, 23, 11, 17], dtype=object)

In [942]:
amy_df_VGLUT2_expr_ge_cv_ls, VGLUT2_df_plis_filtered = cc.get_df_gene_intersection(VGLUT2_df_plis_filtered,amy_df_VGLUT2_expr_ge_cv_ls,IEG_list)

removing  0  genes found in  ['Btg2', 'Jun', 'Egr4', 'Fosb', 'Junb', 'Gadd45g', 'Fos', 'Arc', 'Nr4a1', 'Npas4', 'Coq10b', 'Tns1', 'Per2', 'Ptgs2', 'Rnd3', 'Tnfaip6', 'Srxn1', 'Tiparp', 'Ccnl1', 'Mcl1', 'Dnajb5', 'Nr4a3', 'Fosl2', 'Nptx2', 'Rasl11a', 'Mest', 'Sertad1', 'Egr2', 'Midn', 'Gadd45b', 'Dusp6', 'Irs2', 'Plat', 'Ier2', 'Rrad', 'Tpbg', 'Csrnp1', 'Peli1', 'Per1', 'Kdm6b', 'Inhba', 'Plk2', 'Ifrd1', 'Baz1a', 'Trib1', 'Pim3', 'Lrrk2', 'Dusp1', 'Cdkn1a', 'Pim1', 'Sik1', 'Frat2', 'Dusp5']
Index(['Avp', 'Oxt', 'Cartpt', 'Nts', 'Aldh1a3', 'Tcf7l2', 'Gm26730', 'Tac1',
       'Dlk1', 'Ebf1',
       ...
       'Rab3b', 'Emx2', 'Pld5', 'Pax6', 'Igf1', 'Angpt1', 'Plcxd3', 'Gpc6',
       'Agl', 'Zfhx4'],
      dtype='object', length=211)


In [943]:
VGLUT2_df_plis_filtered.shape

(211, 5276)

In [924]:
amy_df_VGLUT2_expr_ge_cv_ls.shape

(211, 5231)

### Checkpoint - save intersected data/metadata to file / load from here

In [506]:
folder

'/bigdata/isaac/Vglut2_files/'

In [933]:
#save to file

folder = '/bigdata/isaac/Vglut2_files/'

filea = 'amy_df_VGLUT2_expr_ge_cv_ls_cell_comp' 
fileb = 'amy_metadata_df_VGLUT2_cell_comp' 

amy_df_VGLUT2_expr_ge_cv_ls.to_feather(folder+filea+'.feather')
amy_metadata_df_VGLUT2.to_json(folder+fileb+'.json')

#################################

filea = 'VGLUT2_df_prelinkage_ls_cell_comp' 
fileb = 'VGLUT2_meta_data_df_prelinkage_cell_comp' 

VGLUT2_df_prelinkage_ls.to_feather(folder+filea+'.feather')
VGLUT2_metadata_df_prelinkage_ls_orig.to_json(folder+fileb+'.json')


### Compute avg expression for all genes, for every cluster

In [935]:
amy_avgs = dp.compute_avg_expr_per_cluster_label(amy_df_VGLUT2_expr_ge_cv_ls,amy_metadata_df_VGLUT2)

In [936]:
amy_avgs

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,33,34,35,36,37,38,39,40,41,42
Avp,-0.074957,-0.083856,-0.06656,-0.051051,-0.078361,-0.056969,-0.094268,0.004841,-0.094268,-0.056093,...,-0.094268,-0.066106,-0.054488,-0.094268,-0.003613,-0.094268,0.033305,-0.022689,0.030524,-0.025886
Oxt,-0.049104,-0.053249,-0.077603,-0.012273,-0.077603,-0.077603,-0.077603,0.044878,-0.077603,-0.077603,...,-0.077603,0.054148,-0.077603,-0.077603,-0.077603,-0.077603,-0.077603,-0.077603,-0.077603,-0.077603
Cartpt,0.63498,-0.242793,-0.097604,-0.224493,-0.085537,-0.148096,-0.293011,0.798742,-0.222168,-0.382185,...,0.856544,-0.268684,-0.326491,-0.410308,-0.410308,-0.374722,-0.328834,-0.238307,-0.238018,0.120074
Nts,-0.28427,-0.226141,-0.250016,-0.21925,-0.238319,-0.24635,-0.191323,-0.241853,-0.285121,-0.242121,...,-0.222689,-0.245339,-0.246173,0.061517,1.686555,-0.06565,-0.104321,-0.232857,0.11924,0.553831
Aldh1a3,0.072111,0.050965,-0.085769,0.090257,0.134404,0.255158,-0.037875,-0.013407,-0.074213,-0.125211,...,-0.125211,-0.004946,-0.000072,-0.125211,0.006921,-0.024973,0.010284,-0.125211,0.11659,-0.054216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Angpt1,-0.287876,-0.308563,-0.256184,-0.291374,-0.318779,-0.189815,0.131889,-0.040483,-0.126554,-0.221948,...,-0.276371,-0.304962,-0.229457,0.377195,0.341955,-0.129283,-0.283625,-0.318779,-0.318779,-0.256193
Plcxd3,1.164858,0.54521,-0.365467,-0.17352,-0.256397,-0.107099,0.408585,-0.003676,0.486204,-0.394461,...,0.534533,-0.116488,-0.19402,-0.01967,-0.044583,1.215055,0.826795,-0.148406,-0.485876,0.742251
Gpc6,1.114039,0.084834,0.770507,0.376039,0.563825,-0.04179,-0.050843,-0.293494,-0.391481,-0.349388,...,-0.516973,0.059973,-0.050772,-0.371761,-0.027412,1.219179,0.860614,-0.469255,0.168634,-0.264058
Agl,-0.093313,0.229351,-0.062085,0.153333,0.017545,0.039397,0.143498,-0.071363,-0.001362,-0.055102,...,-0.117948,0.222558,-0.020059,0.188599,0.019847,0.059182,-0.010739,0.149255,-0.042172,-0.006387


In [946]:
sd_avgs = dp.compute_avg_expr_per_cluster_label(VGLUT2_df_plis_filtered,VGLUT2_meta_data_df_plis_filtered)

In [947]:
sd_avgs

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
Avp,-0.117072,8.50212,-0.175117,-0.108008,-0.114807,0.003997,-0.066767,-0.077087,-0.027367,-0.072685,-0.092196,-0.100263,-0.115987,-0.084902,-0.070544,-0.105992,-0.049997,-0.063081,-0.063753,-0.08014
Oxt,-0.112308,5.402613,-0.112308,-0.082253,-0.054886,-0.062216,-0.112308,-0.039014,-0.0481,-0.060375,-0.061859,-0.053587,-0.060298,-0.112308,-0.055349,-0.020582,-0.0038,-0.045485,-0.050353,-0.0653
Cartpt,-0.126503,-0.263607,-0.326183,-0.323796,-0.341493,-0.1206,-0.028677,0.114583,-0.34107,0.150258,-0.301298,-0.311876,1.32149,-0.270984,-0.328824,-0.163384,-0.167559,-0.05952,0.905679,-0.1718
Nts,2.921619,-0.180101,-0.145731,-0.000918,-0.213939,-0.007324,-0.155271,-0.186644,-0.067271,0.037343,-0.052475,-0.206313,-0.214957,0.535446,0.286029,0.034192,0.223432,0.894561,-0.212267,-0.236032
Aldh1a3,-0.139155,0.176692,-0.139155,-0.036701,-0.040504,0.00573,0.192637,-0.033355,-0.058144,-0.008108,0.170008,-0.05416,-0.000238,-0.063636,-0.06729,0.068289,-0.060473,-0.017626,0.082373,0.121851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Angpt1,0.941584,3.874725,3.254361,1.046694,0.573458,-0.052327,-0.11414,-0.195139,-0.007289,0.117668,-0.091426,-0.098995,-0.09267,0.397145,0.300167,-0.22887,-0.249914,-0.20043,-0.2658,-0.261069
Plcxd3,0.101494,0.060177,0.45432,-0.067032,0.195583,0.23526,0.404927,0.048342,-0.082475,0.247591,-0.388842,-0.446429,-0.285851,-0.217217,-0.170668,0.158504,-0.312187,1.015286,1.658383,0.60124
Gpc6,0.338394,-0.295491,-0.357939,-0.221585,-0.165908,-0.255542,0.052512,-0.130963,0.008831,-0.26984,-0.330146,-0.140497,-0.326155,-0.385276,-0.389527,-0.200018,0.256035,-0.148674,1.523913,0.434883
Agl,0.457367,0.073973,-0.241604,-0.046992,-0.027482,0.007192,0.033883,0.050836,-0.21884,-0.080927,-0.015864,-0.026754,0.18191,-0.210852,-0.010958,0.010103,-0.073585,-0.022017,0.090357,0.251886


In [949]:
heatmap2, heatmap_argmax_df_alt_lco, corr_matrix_manual_alt_lco, corr_matrix_manual_alt_lco_sorted = cc.plot_correlation(sd_avgs,amy_avgs)
# Display the plot
hvplot.show(heatmap2)
    
# Ensure output is displayed inline
hv.output(heatmap2, backend='bokeh')

Launching server at http://localhost:34325


In [667]:
heatmap2, heatmap_argmax_df_alt_lco, corr_matrix_manual_alt_lco, corr_matrix_manual_alt_lco_sorted = cc.plot_correlation(sd_avgs,amy_avgs,lco)
# Display the plot
hvplot.show(heatmap2)
    
# Ensure output is displayed inline
hv.output(heatmap2, backend='bokeh')

Launching server at http://localhost:46469


In [950]:
error_genes,all_marker_labels = cc.amy_gene_spell_checker(amy_df_VGLUT2_expr_ge_cv_ls_orig,amy_metadata_df_VGLUT2)
error_genes

  all_m = [amy_metadata_df.loc['markers'][x] for x in range((amy_metadata_df.loc['markers'].shape[0]))]


gene,  Papp2 not found in index
gene,  Cdkn1c not found in index
gene,  Matn2 not found in index
gene,  Lxh1 not found in index
gene,  Papp2 not found in index
gene,  Skor1 not found in index


['Papp2', 'Cdkn1c', 'Matn2', 'Lxh1', 'Papp2', 'Skor1']

In [951]:
all_marker_labels_c = cc.correct_error_genes(error_genes,['Pappa2','Lhx1'],all_marker_labels)

Papp2
Papp2
Papp2
Papp2
Cdkn1c
Cdkn1c
Matn2
Matn2
Lxh1
Lxh1
Lxh1
Skor1
Skor1


In [952]:
len(all_marker_labels_c)

45

In [518]:
folder

'/bigdata/isaac/Vglut2_files/'

In [953]:
with open(folder + 'Vglut2_cl_mg_filtered_2024-09-01.json') as json_data:
    VGLUT2_cl_mg_dict_filtered = json.load(json_data)

In [954]:
VGLUT2_cl_mg_dict_filtered = {int(k): v for k, v in VGLUT2_cl_mg_dict_filtered.items()}

In [955]:
VGLUT2_cl_mg_dict_filtered

{1: ['Foxb1', 'BC048546', 'B230323A14Rik'],
 2: ['Sim1',
  'Gal',
  'Angpt1',
  'Otp',
  'Fam46a',
  'Ebf3',
  'Pou3f2',
  'Pla2r1',
  'Oxt',
  'Avp',
  'Fign',
  'Epha4',
  'C1ql2'],
 3: ['Zfhx3',
  'Gabrq',
  'Gpr101',
  'Pcdh18',
  'Trhr',
  'Adcyap1',
  'Penk',
  'Cbln1',
  'Asb4',
  'Brs3',
  'Tmem255a',
  'BC039966',
  'Gm2694',
  'Zic1'],
 4: ['Lpl', 'Zic5', 'Zic4', 'Pcdh20'],
 5: ['Pgr15l'],
 6: ['Grp', 'Tcf4', 'Emx2'],
 7: ['Tshz1', 'Noa1', 'Col12a1', 'Ppp1r17', 'Calcr'],
 8: ['Nr4a2', 'Zfhx4'],
 9: ['H2-Q2', 'Medag', 'Sst', 'Igfbp6'],
 10: ['Ebf1'],
 11: ['Hmbs', 'Cd27', 'Krt17', 'B930036N10Rik', 'Hist4h4', '2900060B14Rik'],
 12: ['Lamb3', 'Col18a1'],
 13: ['Cartpt', 'Tac1', 'Arhgap6'],
 14: ['Bhlhe22', 'Thrsp'],
 15: ['Lmo3', 'Mylk'],
 16: ['Lmo1'],
 17: ['Gpr88',
  'Sparcl1',
  'Sema3a',
  'Mpped1',
  'Crabp1',
  'Krt9',
  'Nefm',
  'Necab1',
  'Stc1'],
 18: ['Foxp2', 'Sema3c', 'Peg10', 'Hrh3', 'Fam19a1', 'Ecel1'],
 19: ['Ndnf',
  'Sox6',
  'Mafb',
  'Slc17a7',
  'Lhfp',
  

In [961]:
sd_shared_cl_mg_dict, connector_df_alt_lco_marker_shared_top = cc.build_corr_table_shared_top(heatmap_argmax_df_alt_lco,
                                                     corr_matrix_manual_alt_lco,
                                                     all_marker_labels_c,
                                                     amy_metadata_df_VGLUT2,
                                                     VGLUT2_cl_mg_dict_filtered)

In [962]:
sd_shared_cl_mg_dict

{'2': ['Otp', 'Sim1'], '13': ['Cartpt', 'Tac1'], '15': ['Mylk', 'Lmo3']}

In [979]:
type(list(VGLUT2_mg_cl_dict_final.keys())[0])

int

In [985]:
sd_shared_cl_mg_dict_tmp = {int(k): v for k, v in sd_shared_cl_mg_dict.items()}

In [986]:
sd_shared_cl_mg_dict_tmp

{2: ['Otp', 'Sim1'], 13: ['Cartpt', 'Tac1'], 15: ['Mylk', 'Lmo3']}

In [988]:
VGLUT2_mg_cl_dict_final = cc.create_mg_cl_dict_final(VGLUT2_cl_mg_dict_filtered,sd_shared_cl_mg_dict)

match!
match!
match!


In [989]:
VGLUT2_mg_cl_dict_final

{1: ['Foxb1', 'BC048546'],
 2: ['Otp', 'Sim1'],
 3: ['Zfhx3', 'Gabrq'],
 4: ['Lpl', 'Zic5'],
 5: ['Pgr15l'],
 6: ['Grp', 'Tcf4'],
 7: ['Tshz1', 'Noa1'],
 8: ['Nr4a2', 'Zfhx4'],
 9: ['H2-Q2', 'Medag'],
 10: ['Ebf1'],
 11: ['Hmbs', 'Cd27'],
 12: ['Lamb3', 'Col18a1'],
 13: ['Cartpt', 'Tac1'],
 14: ['Bhlhe22', 'Thrsp'],
 15: ['Mylk', 'Lmo3'],
 16: ['Lmo1'],
 17: ['Gpr88', 'Sparcl1'],
 18: ['Foxp2', 'Sema3c'],
 19: ['Ndnf', 'Sox6'],
 20: ['Pbx3', '4930523C07Rik']}

In [990]:
sd_labels_df = cc.generate_sd_labels_df(VGLUT2_mg_cl_dict_final)

In [991]:
sd_labels_df

Unnamed: 0,lco_index,sd_label,sd_label_complete
0,1,"[Foxb1, BC048546]",1 Foxb1-BC048546
1,2,"[Otp, Sim1]",2 Otp-Sim1
2,3,"[Zfhx3, Gabrq]",3 Zfhx3-Gabrq
3,4,"[Lpl, Zic5]",4 Lpl-Zic5
4,5,[Pgr15l],5 Pgr15l
5,6,"[Grp, Tcf4]",6 Grp-Tcf4
6,7,"[Tshz1, Noa1]",7 Tshz1-Noa1
7,8,"[Nr4a2, Zfhx4]",8 Nr4a2-Zfhx4
8,9,"[H2-Q2, Medag]",9 H2-Q2-Medag
9,10,[Ebf1],10 Ebf1


In [992]:
amy_labels_df,amy_labels = cc.generate_amy_labels_df(connector_df_alt_lco_marker_shared_top,corr_matrix_manual_alt_lco)

In [993]:
amy_labels_df

Unnamed: 0,0
1,1 Tac1-Cartpt
2,2 Lhx1-Trp73
3,3 Cd34-Pappa2
4,4 Cd34-Pappa2
5,5 Peg10-Calcr
6,6 Oxt-Avp
7,7 Pkib-Emx2
8,8 Cdkn1c-Penk
9,9 Rxfp1-Cd24a
10,10 Gpr88-Dgkk


In [473]:
##update corr plot with dropped/merged clusters (from dimorph_processing.py)

In [995]:
folder = '/bigdata/isaac/Vglut2_files/'
cc.plot_correlation_w_labels(corr_matrix_manual_alt_lco_sorted,sd_labels_df,amy_labels_df, folder, 'VGLUT2_filtered_merged',savefig=True)

Launching server at http://localhost:44851


In [996]:
folder = '/bigdata/isaac/Vglut2_files/'
sd_labels_df.to_csv(folder + 'VGLUT2_sd_labels_df_' + today + '.csv')
amy_labels_df.to_csv(folder + 'VGLUT2_amy_labels_df_' + today + '.csv')

In [997]:
folder = '/bigdata/isaac/Vglut2_files/'
file = 'VGLUT2_mg_cl_dict_final_' + today
#write dict to file
with open(folder+file+'.json', "w") as outfile: 
    json.dump(VGLUT2_mg_cl_dict_final, outfile)

In [999]:
cc.plot_connector_plot_with_labels(connector_df_alt_lco_marker_shared_top,VGLUT2_mg_cl_dict_final,list(sd_labels_df['sd_label_complete']),amy_labels, folder,'Vglut2',savefig = True)

<IPython.core.display.Javascript object>

## Nonneuronal Analysis <a name="nn_analysis"></a>

To get nonneuronal class, combine all cell classes except GABA,VGLUT1/2
For markers row, just use cell (sub) class

In [1093]:
all_cell_classes

['Astro',
 'Astro_SC',
 'Astro_agt',
 'COP',
 'EC',
 'Epend',
 'GABA',
 'OL',
 'OPC',
 'OPC_cycling',
 'Peri',
 'VGLUT1',
 'VGLUT2',
 'VLMC',
 'VSM',
 'microglia',
 'pvm']

In [1118]:
NN_subclasses = [x for x in all_cell_classes if x not in ['GABA','VGLUT1','VGLUT2']]


In [1119]:
cell_class = NN_subclasses

In [1146]:
Counter(amy_metadata_df.loc['cell_class'])

Counter({'GABA': 13006,
         'VGLUT1': 11947,
         'OL': 7177,
         'Astro': 5447,
         'VGLUT2': 5231,
         'OPC': 3817,
         'microglia': 2385,
         'Astro_agt': 1848,
         'EC': 1845,
         'Peri': 1163,
         'VSM': 577,
         'COP': 391,
         'OPC_cycling': 289,
         'VLMC': 144,
         'pvm': 119,
         'Epend': 72,
         'Astro_SC': 56})

In [None]:
amy_df_NN_init = pd.DataFrame()

In [1153]:
amy_df.shape

(28002, 55514)

In [1152]:
amy_metadata_df.shape

(6, 55514)

In [1187]:
np.array(amy_metadata_df_gaba.loc['celltype'].apply(lambda x: int(re.search(r'-(\d+)-', x).group(1))))

array([11, 11, 11, ..., 51, 51, 51])

In [1189]:
amy_metadata_df_gaba

Unnamed: 0_level_0,GGGTATTTCTCGCGTT-1_10-1,GGTAACTAGACATCCT-1_18-1,GGTAATCGTGGACTAG-1_18-1,TTCCGGTAGTGGTGGT-1_18-1,AGGGTGAAGTACAACA-1_19-1,AGTACCATCCCTGGTT-1_19-1,CATGAGTTCCGGCTTT-1_19-1,TTGCCTGAGACGGTTG-1_19-1,AAACGCTTCACCATGA-1_23-1,TACCCACCAGTGACCC-1_23-1,...,GTCAAACTCCCGATCT-1_76-2,GTGTAACGTGAGACCA-1_76-2,GTGTGATAGGTGAGAA-1_76-2,TACTTCAGTAACTAAG-1_76-2,TCCGATCTCGTTTACT-1_76-2,TCGCTCAAGATTGAGT-1_76-2,TGCAGGCTCTTACACT-1_76-2,TTACAGGTCCGAGATT-1_76-2,TTGCGTCCAACACGAG-1_76-2,TTGCTGCAGCACTCCG-1_76-2
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
celltype,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,GABA-11-Adora2a-Id4,...,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh,GABA-51-Vip-Crh
sample,10-1,18-1,18-1,18-1,19-1,19-1,19-1,19-1,23-1,23-1,...,76-2,76-2,76-2,76-2,76-2,76-2,76-2,76-2,76-2,76-2
FC time,2,2,2,2,2,2,2,2,0,0,...,2,2,2,2,2,2,2,2,2,2
batch,1,1,1,1,1,1,1,1,1,1,...,2,2,2,2,2,2,2,2,2,2
cell_class,GABA,GABA,GABA,GABA,GABA,GABA,GABA,GABA,GABA,GABA,...,GABA,GABA,GABA,GABA,GABA,GABA,GABA,GABA,GABA,GABA
markers,"[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]","[Adora2a, Id4]",...,"[Vip, Crh]","[Vip, Crh]","[Vip, Crh]","[Vip, Crh]","[Vip, Crh]","[Vip, Crh]","[Vip, Crh]","[Vip, Crh]","[Vip, Crh]","[Vip, Crh]"
cluster_label,11,11,11,11,11,11,11,11,11,11,...,51,51,51,51,51,51,51,51,51,51


In [1188]:
amy_metadata_df_NN_init

Unnamed: 0_level_0,AAACGCTCAGCCTTCT-1_08-1,AAAGGTATCATGGGAG-1_08-1,AAATGGAGTAACTGCT-1_08-1,AACCTGAAGACGGTCA-1_08-1,AACCTGAGTGAAAGTT-1_08-1,AAGAACAAGAAGGGAT-1_08-1,AATAGAGTCATCACTT-1_08-1,AATCGTGGTCGAATTC-1_08-1,AATGGCTCACGGTGAA-1_08-1,ACAGCCGTCCCAAGTA-1_08-1,...,TCGAACAAGGAGCTGT-1_76-1,TGTACAGTCTGCAGCG-1_76-1,AGGACTTTCATGGAGG-1_76-2,CCCATTGGTACCTAGT-1_76-2,CTATCTACAATTGCTG-1_76-2,CTCATCGTCACCCATC-1_76-2,CTGCCATGTATCGCTA-1_76-2,GCATGATTCTCGTCGT-1_76-2,TGGGCGTAGAAGCCAC-1_76-2,TTCTGTAGTGGTATGG-1_76-2
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
celltype,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,...,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm
sample,08-1,08-1,08-1,08-1,08-1,08-1,08-1,08-1,08-1,08-1,...,76-1,76-1,76-2,76-2,76-2,76-2,76-2,76-2,76-2,76-2
FC time,0,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
batch,1,1,1,1,1,1,1,1,1,1,...,3,3,3,3,3,3,3,3,3,3
cell_class,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,...,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm
markers,[],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]


In [1154]:
#cell_cnt = 0
amy_df_NN_init = pd.DataFrame()
amy_metadata_df_NN_init = pd.DataFrame()
for x in NN_subclasses:
    NN_df_tmp = amy_df.loc[:,amy_metadata_df.loc['cell_class'] == x]
    NN_metadata_df_tmp = amy_metadata_df.loc[:,amy_metadata_df.loc['cell_class']==x]
    amy_df_NN_init = pd.concat([amy_df_NN_init,NN_df_tmp], axis = 1)
    amy_metadata_df_NN_init = pd.concat([amy_metadata_df_NN_init,NN_metadata_df_tmp],axis = 1)
    #print (x, amy_df.loc[:,amy_metadata_df.loc['cell_class']==x].shape)
    #cell_cnt+=amy_df.loc[:,amy_metadata_df.loc['cell_class']==x].shape[1]
#print ('total NN cells', cell_cnt)

In [1150]:
amy_df_NN_init.shape

(28002, 25330)

In [1158]:
amy_df_NN_init

Unnamed: 0_level_0,AAACGCTCAGCCTTCT-1_08-1,AAAGGTATCATGGGAG-1_08-1,AAATGGAGTAACTGCT-1_08-1,AACCTGAAGACGGTCA-1_08-1,AACCTGAGTGAAAGTT-1_08-1,AAGAACAAGAAGGGAT-1_08-1,AATAGAGTCATCACTT-1_08-1,AATCGTGGTCGAATTC-1_08-1,AATGGCTCACGGTGAA-1_08-1,ACAGCCGTCCCAAGTA-1_08-1,...,TCGAACAAGGAGCTGT-1_76-1,TGTACAGTCTGCAGCG-1_76-1,AGGACTTTCATGGAGG-1_76-2,CCCATTGGTACCTAGT-1_76-2,CTATCTACAATTGCTG-1_76-2,CTCATCGTCACCCATC-1_76-2,CTGCCATGTATCGCTA-1_76-2,GCATGATTCTCGTCGT-1_76-2,TGGGCGTAGAAGCCAC-1_76-2,TTCTGTAGTGGTATGG-1_76-2
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
celltype,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,...,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm
sample,08-1,08-1,08-1,08-1,08-1,08-1,08-1,08-1,08-1,08-1,...,76-1,76-1,76-2,76-2,76-2,76-2,76-2,76-2,76-2,76-2
FC time,0,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
batch,1,1,1,1,1,1,1,1,1,1,...,3,3,3,3,3,3,3,3,3,3
Xkr4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC168977.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PISD,0,8,0,0,8,7,0,0,0,0,...,17,0,0,9,0,0,8,10,9,8
DHRSX,0,0,0,0,0,0,0,0,0,0,...,0,0,6,0,6,0,0,0,4,8
Vmn2r122,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1157]:
amy_df_NN_init.loc['celltype'].index

Index(['AAACGCTCAGCCTTCT-1_08-1', 'AAAGGTATCATGGGAG-1_08-1',
       'AAATGGAGTAACTGCT-1_08-1', 'AACCTGAAGACGGTCA-1_08-1',
       'AACCTGAGTGAAAGTT-1_08-1', 'AAGAACAAGAAGGGAT-1_08-1',
       'AATAGAGTCATCACTT-1_08-1', 'AATCGTGGTCGAATTC-1_08-1',
       'AATGGCTCACGGTGAA-1_08-1', 'ACAGCCGTCCCAAGTA-1_08-1',
       ...
       'TCGAACAAGGAGCTGT-1_76-1', 'TGTACAGTCTGCAGCG-1_76-1',
       'AGGACTTTCATGGAGG-1_76-2', 'CCCATTGGTACCTAGT-1_76-2',
       'CTATCTACAATTGCTG-1_76-2', 'CTCATCGTCACCCATC-1_76-2',
       'CTGCCATGTATCGCTA-1_76-2', 'GCATGATTCTCGTCGT-1_76-2',
       'TGGGCGTAGAAGCCAC-1_76-2', 'TTCTGTAGTGGTATGG-1_76-2'],
      dtype='object', length=25330)

In [1155]:
amy_metadata_df_NN_init.shape

(6, 25330)

In [1160]:
amy_df_NN_expr_ge_cv_ls, amy_metadata_df_NN = cc.process_amy_data_class(amy_df_NN_init,amy_metadata_df_NN_init, IEG_list,sex_gene_list)

non neuronal
removing # duplicate gene rows:  65
removing  53  genes found in  ['Btg2', 'Jun', 'Egr4', 'Fosb', 'Junb', 'Gadd45g', 'Fos', 'Arc', 'Nr4a1', 'Npas4', 'Coq10b', 'Tns1', 'Per2', 'Ptgs2', 'Rnd3', 'Tnfaip6', 'Srxn1', 'Tiparp', 'Ccnl1', 'Mcl1', 'Dnajb5', 'Nr4a3', 'Fosl2', 'Nptx2', 'Rasl11a', 'Mest', 'Sertad1', 'Egr2', 'Midn', 'Gadd45b', 'Dusp6', 'Irs2', 'Plat', 'Ier2', 'Rrad', 'Tpbg', 'Csrnp1', 'Peli1', 'Per1', 'Kdm6b', 'Inhba', 'Plk2', 'Ifrd1', 'Baz1a', 'Trib1', 'Pim3', 'Lrrk2', 'Dusp1', 'Cdkn1a', 'Pim1', 'Sik1', 'Frat2', 'Dusp5']
removing  6  genes found in  ['Xist', 'Tsix', 'Ddx3y', 'Eif2s3y', 'Kdm5d', 'Uty']
Total genes reduced from 27874 to 16577


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

column (gene) mean after standardization: 0.00
column (gene) sigma after standardization: 1.00


In [1166]:
amy_metadata_df_NN

Unnamed: 0_level_0,AAACGCTCAGCCTTCT-1_08-1,AAAGGTATCATGGGAG-1_08-1,AAATGGAGTAACTGCT-1_08-1,AACCTGAAGACGGTCA-1_08-1,AACCTGAGTGAAAGTT-1_08-1,AAGAACAAGAAGGGAT-1_08-1,AATAGAGTCATCACTT-1_08-1,AATCGTGGTCGAATTC-1_08-1,AATGGCTCACGGTGAA-1_08-1,ACAGCCGTCCCAAGTA-1_08-1,...,TCGAACAAGGAGCTGT-1_76-1,TGTACAGTCTGCAGCG-1_76-1,AGGACTTTCATGGAGG-1_76-2,CCCATTGGTACCTAGT-1_76-2,CTATCTACAATTGCTG-1_76-2,CTCATCGTCACCCATC-1_76-2,CTGCCATGTATCGCTA-1_76-2,GCATGATTCTCGTCGT-1_76-2,TGGGCGTAGAAGCCAC-1_76-2,TTCTGTAGTGGTATGG-1_76-2
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
celltype,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,...,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm
sample,08-1,08-1,08-1,08-1,08-1,08-1,08-1,08-1,08-1,08-1,...,76-1,76-1,76-2,76-2,76-2,76-2,76-2,76-2,76-2,76-2
FC time,0,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
batch,1,1,1,1,1,1,1,1,1,1,...,3,3,3,3,3,3,3,3,3,3
cell_class,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,...,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm
markers,[],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]


In [1192]:
np.unique(amy_metadata_df_NN.loc['cell_class'])

array(['Astro', 'Astro_SC', 'Astro_agt', 'COP', 'EC', 'Epend', 'OL',
       'OPC', 'OPC_cycling', 'Peri', 'VLMC', 'VSM', 'microglia', 'pvm'],
      dtype=object)

In [1165]:
amy_df_NN_expr_ge_cv_ls

Unnamed: 0_level_0,AAACGCTCAGCCTTCT-1_08-1,AAAGGTATCATGGGAG-1_08-1,AAATGGAGTAACTGCT-1_08-1,AACCTGAAGACGGTCA-1_08-1,AACCTGAGTGAAAGTT-1_08-1,AAGAACAAGAAGGGAT-1_08-1,AATAGAGTCATCACTT-1_08-1,AATCGTGGTCGAATTC-1_08-1,AATGGCTCACGGTGAA-1_08-1,ACAGCCGTCCCAAGTA-1_08-1,...,TCGAACAAGGAGCTGT-1_76-1,TGTACAGTCTGCAGCG-1_76-1,AGGACTTTCATGGAGG-1_76-2,CCCATTGGTACCTAGT-1_76-2,CTATCTACAATTGCTG-1_76-2,CTCATCGTCACCCATC-1_76-2,CTGCCATGTATCGCTA-1_76-2,GCATGATTCTCGTCGT-1_76-2,TGGGCGTAGAAGCCAC-1_76-2,TTCTGTAGTGGTATGG-1_76-2
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Plp1,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833,0.283968,-0.831833,...,-0.831833,-0.157297,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833
Ptgds,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320,...,-0.677320,-0.677320,0.415777,0.303966,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320
Ccl8,-0.041290,-0.041290,-0.041290,-0.041290,-0.041290,-0.041290,-0.041290,-0.041290,-0.041290,-0.041290,...,13.166384,25.233417,-0.041290,17.657943,-0.041290,-0.041290,-0.041290,14.372630,27.332388,-0.041290
Acta2,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,...,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287
Ccl4,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,...,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Myrf,-0.523801,-0.523801,-0.523801,-0.523801,-0.523801,-0.523801,-0.523801,-0.523801,-0.523801,-0.523801,...,-0.523801,-0.523801,-0.523801,-0.523801,-0.523801,-0.523801,-0.523801,-0.523801,-0.523801,-0.523801
Anks1b,-0.755874,-0.755874,-0.755874,-0.755874,1.507510,-0.755874,-0.755874,1.201632,-0.755874,-0.755874,...,-0.755874,-0.755874,-0.755874,-0.755874,-0.755874,-0.755874,-0.755874,-0.755874,-0.755874,-0.755874
Clec12a,-0.030818,-0.030818,-0.030818,-0.030818,-0.030818,-0.030818,-0.030818,-0.030818,-0.030818,-0.030818,...,-0.030818,-0.030818,-0.030818,-0.030818,28.245740,-0.030818,41.139388,-0.030818,-0.030818,-0.030818
Gpr50,-0.047986,-0.047986,-0.047986,-0.047986,-0.047986,-0.047986,-0.047986,-0.047986,-0.047986,-0.047986,...,-0.047986,-0.047986,-0.047986,-0.047986,-0.047986,-0.047986,-0.047986,-0.047986,-0.047986,-0.047986


In [1167]:
#save to file

folder = '/bigdata/isaac/Nonneuronal_files/'

file1 = 'amy_df_NN_expr_ge_cv_ls' 
file2 = 'amy_metadata_df_NN' 

amy_df_NN_expr_ge_cv_ls.to_feather(folder+file1+'.feather')
amy_metadata_df_NN.to_json(folder+file2+'.json')

In [1168]:
folder = '/bigdata/isaac/Nonneuronal_files/'
amy_df_NN_expr_ge_cv_ls_orig = pd.read_feather(folder + 'amy_df_NN_expr_ge_cv_ls.feather')
amy_metadata_df_NN_orig = pd.read_json(folder + 'amy_metadata_df_NN.json')

### Load NN SD data

In [324]:
folder

'/bigdata/isaac/Vglut1_files/'

In [1170]:
folder = '/bigdata/isaac/Nonneuronal_files/'

#NN_meta_data_df_plis = pd.read_json(folder + 'NN_meta_data_df_plis2024-08-21.json')

NN_df_prelinkage_ls_orig = pd.read_feather(folder + 'NN_df_pre_linkage_ls_2024-09-02.feather')
#NN_metadata_df_prelinkage_ls_orig = pd.read_json(folder + 'NN_meta_data_df_pre_linkage_2024-08-21.json')
NN_meta_data_df_plis_filtered = pd.read_json(folder + 'NNmeta_data_df_plis_filtered_2024-09-02.json')


In [1171]:
NN_df_plis_filtered = NN_df_prelinkage_ls_orig.reindex(index = NN_meta_data_df_plis_filtered.columns)

In [1205]:
NN_df_plis_filtered

Unnamed: 0,GTGTGGCAGCCACTCG-1_10X37_1,GTAATGCCACATGAAA-1_10X52_2,CGTGCTTGTCTGTCAA-1_10X37_2,ACCACAAGTATCGGTT-1_10X35_2,TCTGTCGGTCACCACG-1_10X35_1,TGCTCCAGTAACAGGC-1_10X37_2,AAGTCGTTCGCCTATC-1_10X38_1,CTTGAGAGTTATTCTC-1_10X35_2,TCGAACATCGGTTAGT-1_10X51_4,ATGACCAAGAAGCGGG-1_10X52_1,...,CAAGAGGGTCAATCTG-1_10X36_2,TATACCTAGAGTAACT-1_10X35_1,GCACGTGGTTATTCCT-1_10X37_1,GATGATCTCAAGCCTA-1_10X36_1,GGTGTCGAGAAGCTGC-1_10X36_2,GAGAGGTCACATCATG-1_10X37_2,ATCAGGTTCCACTAGA-1_10X35_2,TCCACGTAGAGCATCG-1_10X37_1,CTGCGAGCACTACGGC-1_10X35_2,CTCTGGTTCGAAGCAG-1_10X37_1
Plp1,-0.577533,0.093173,-0.577533,-0.577533,-0.046011,-0.046011,-0.242180,-0.577533,-0.577533,-0.242180,...,2.857944,2.838410,2.854743,2.853939,2.891084,3.013789,2.992821,2.958752,2.942952,2.973423
Ptgds,-0.447747,0.354358,0.058325,-0.447747,-0.447747,0.058325,-0.447747,-0.447747,-0.447747,-0.447747,...,1.902377,1.302973,0.564396,0.354358,1.366501,2.577184,1.662534,1.931012,1.872572,-0.447747
Acta2,-0.230281,0.821739,0.821739,1.437131,2.212434,0.821739,3.541171,1.437131,1.873759,2.212434,...,-0.230281,-0.230281,-0.230281,-0.230281,-0.230281,-0.230281,-0.230281,-0.230281,-0.230281,-0.230281
Ccl4,-0.217776,-0.217776,-0.217776,-0.217776,-0.217776,-0.217776,-0.217776,-0.217776,-0.217776,-0.217776,...,-0.217776,-0.217776,-0.217776,-0.217776,-0.217776,-0.217776,-0.217776,-0.217776,-0.217776,-0.217776
Cxcl10,-0.138062,-0.138062,4.570505,-0.138062,-0.138062,-0.138062,-0.138062,-0.138062,-0.138062,-0.138062,...,-0.138062,-0.138062,-0.138062,-0.138062,-0.138062,-0.138062,-0.138062,-0.138062,-0.138062,-0.138062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Serpinf1,-0.132839,-0.132839,-0.132839,-0.132839,-0.132839,-0.132839,-0.132839,-0.132839,-0.132839,-0.132839,...,-0.132839,-0.132839,-0.132839,-0.132839,-0.132839,-0.132839,-0.132839,-0.132839,-0.132839,-0.132839
Pid1,-0.740683,-0.740683,0.555463,-0.740683,-0.740683,-0.740683,-0.740683,0.555463,-0.740683,-0.740683,...,-0.740683,-0.740683,-0.740683,-0.740683,-0.740683,-0.740683,-0.740683,-0.740683,-0.740683,-0.740683
Evi2a,-0.413490,-0.413490,-0.413490,-0.413490,-0.413490,-0.413490,-0.413490,-0.413490,-0.413490,-0.413490,...,2.485197,-0.413490,-0.413490,2.485197,2.951781,3.333008,3.333008,2.951781,3.333008,-0.413490
Anks1b,-0.837112,-0.032725,-0.032725,-0.837112,-0.837112,-0.837112,-0.837112,-0.837112,1.030618,-0.837112,...,1.712736,3.288819,3.070587,3.255179,1.421089,1.945611,1.945611,2.046587,2.139475,2.898348


In [1175]:
NN_meta_data_df_plis_filtered.shape

(29, 1976)

### Get intersection

In [1176]:
amy_df_NN_expr_ge_cv_ls, NN_df_plis_filtered = cc.get_df_gene_intersection(NN_df_plis_filtered,amy_df_NN_expr_ge_cv_ls,IEG_list)

removing  0  genes found in  ['Btg2', 'Jun', 'Egr4', 'Fosb', 'Junb', 'Gadd45g', 'Fos', 'Arc', 'Nr4a1', 'Npas4', 'Coq10b', 'Tns1', 'Per2', 'Ptgs2', 'Rnd3', 'Tnfaip6', 'Srxn1', 'Tiparp', 'Ccnl1', 'Mcl1', 'Dnajb5', 'Nr4a3', 'Fosl2', 'Nptx2', 'Rasl11a', 'Mest', 'Sertad1', 'Egr2', 'Midn', 'Gadd45b', 'Dusp6', 'Irs2', 'Plat', 'Ier2', 'Rrad', 'Tpbg', 'Csrnp1', 'Peli1', 'Per1', 'Kdm6b', 'Inhba', 'Plk2', 'Ifrd1', 'Baz1a', 'Trib1', 'Pim3', 'Lrrk2', 'Dusp1', 'Cdkn1a', 'Pim1', 'Sik1', 'Frat2', 'Dusp5']
Index(['Plp1', 'Ptgds', 'Acta2', 'Ccl4', 'Cxcl10', 'Gdf15', 'Tagln', 'Myoc',
       'Ccl12', 'C1qa',
       ...
       'Fxyd6', 'Nusap1', 'Lims2', 'Tst', 'Hspa1b', 'Serpinf1', 'Pid1',
       'Evi2a', 'Anks1b', 'Gpr50'],
      dtype='object', length=457)


In [1177]:
amy_df_NN_expr_ge_cv_ls.shape

(457, 25330)

In [1178]:
NN_df_plis_filtered.shape

(457, 1976)

### Checkpoint - save intersected data/metadata to file / load from here

In [1190]:
folder

'/bigdata/isaac/Nonneuronal_files/'

In [1206]:
#save to file

folder = '/bigdata/isaac/Nonneuronal_files/'

file1 = 'amy_df_NN_expr_ge_cv_ls_cell_comp' 
file2 = 'amy_metadata_df_NN_cell_comp' 

amy_df_NN_expr_ge_cv_ls.to_feather(folder+file1+'.feather')
amy_metadata_df_NN.to_json(folder+file2+'.json')

#################################

file1 = 'NN_df_prelinkage_ls_cell_comp' 
file2 = 'NN_meta_data_df_prelinkage_cell_comp' 

NN_df_plis_filtered.to_feather(folder+file1+'.feather')
NN_meta_data_df_plis_filtered.to_json(folder+file2+'.json')


In [52]:
#load data
folder = '/bigdata/isaac/Nonneuronal_files/'

#amy_df_VGLUT1_expr_ge_cv_ls = pd.read_feather(folder + 'amy_df_VGLUT1_expr_ge_cv_ls_cell_comp.feather')
#amy_metadata_df_VGLUT1 = pd.read_json(folder + 'amy_metadata_df_VGLUT1_cell_comp.json')

#VGLUT1_df_prelinkage_ls = pd.read_feather(folder + 'VGLUT1_df_prelinkage_ls_cell_comp.feather')
#VGLUT1_meta_data_df_prelinkage = pd.read_json(folder + 'VGLUT1_meta_data_df_prelinkage_cell_comp.json' )

### Compute avg expression for all genes, for every cluster

In [None]:
# add cluster label corresponding to NN subclass

In [1193]:
NN_subclasses

['Astro',
 'Astro_SC',
 'Astro_agt',
 'COP',
 'EC',
 'Epend',
 'OL',
 'OPC',
 'OPC_cycling',
 'Peri',
 'VLMC',
 'VSM',
 'microglia',
 'pvm']

In [1198]:
NN_subclass_2_cl_dict = dict(zip(NN_subclasses,np.arange(1,len(NN_subclasses)+1)))

In [1199]:
NN_subclass_2_cl_dict

{'Astro': 1,
 'Astro_SC': 2,
 'Astro_agt': 3,
 'COP': 4,
 'EC': 5,
 'Epend': 6,
 'OL': 7,
 'OPC': 8,
 'OPC_cycling': 9,
 'Peri': 10,
 'VLMC': 11,
 'VSM': 12,
 'microglia': 13,
 'pvm': 14}

In [1181]:
amy_metadata_df_NN

Unnamed: 0_level_0,AAACGCTCAGCCTTCT-1_08-1,AAAGGTATCATGGGAG-1_08-1,AAATGGAGTAACTGCT-1_08-1,AACCTGAAGACGGTCA-1_08-1,AACCTGAGTGAAAGTT-1_08-1,AAGAACAAGAAGGGAT-1_08-1,AATAGAGTCATCACTT-1_08-1,AATCGTGGTCGAATTC-1_08-1,AATGGCTCACGGTGAA-1_08-1,ACAGCCGTCCCAAGTA-1_08-1,...,TCGAACAAGGAGCTGT-1_76-1,TGTACAGTCTGCAGCG-1_76-1,AGGACTTTCATGGAGG-1_76-2,CCCATTGGTACCTAGT-1_76-2,CTATCTACAATTGCTG-1_76-2,CTCATCGTCACCCATC-1_76-2,CTGCCATGTATCGCTA-1_76-2,GCATGATTCTCGTCGT-1_76-2,TGGGCGTAGAAGCCAC-1_76-2,TTCTGTAGTGGTATGG-1_76-2
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
celltype,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,...,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm
sample,08-1,08-1,08-1,08-1,08-1,08-1,08-1,08-1,08-1,08-1,...,76-1,76-1,76-2,76-2,76-2,76-2,76-2,76-2,76-2,76-2
FC time,0,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
batch,1,1,1,1,1,1,1,1,1,1,...,3,3,3,3,3,3,3,3,3,3
cell_class,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,...,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm
markers,[],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]


In [1200]:
tmp = np.array(amy_metadata_df_NN.loc['cell_class'])
#print (np.unique(tmp))
tmp_c = [NN_subclass_2_cl_dict[x] for x in tmp]
amy_metadata_df_NN.loc['cluster_label'] = tmp_c

In [1201]:
amy_metadata_df_NN

Unnamed: 0_level_0,AAACGCTCAGCCTTCT-1_08-1,AAAGGTATCATGGGAG-1_08-1,AAATGGAGTAACTGCT-1_08-1,AACCTGAAGACGGTCA-1_08-1,AACCTGAGTGAAAGTT-1_08-1,AAGAACAAGAAGGGAT-1_08-1,AATAGAGTCATCACTT-1_08-1,AATCGTGGTCGAATTC-1_08-1,AATGGCTCACGGTGAA-1_08-1,ACAGCCGTCCCAAGTA-1_08-1,...,TCGAACAAGGAGCTGT-1_76-1,TGTACAGTCTGCAGCG-1_76-1,AGGACTTTCATGGAGG-1_76-2,CCCATTGGTACCTAGT-1_76-2,CTATCTACAATTGCTG-1_76-2,CTCATCGTCACCCATC-1_76-2,CTGCCATGTATCGCTA-1_76-2,GCATGATTCTCGTCGT-1_76-2,TGGGCGTAGAAGCCAC-1_76-2,TTCTGTAGTGGTATGG-1_76-2
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
celltype,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,...,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm
sample,08-1,08-1,08-1,08-1,08-1,08-1,08-1,08-1,08-1,08-1,...,76-1,76-1,76-2,76-2,76-2,76-2,76-2,76-2,76-2,76-2
FC time,0,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
batch,1,1,1,1,1,1,1,1,1,1,...,3,3,3,3,3,3,3,3,3,3
cell_class,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,Astro,...,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm,pvm
markers,[],[],[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
cluster_label,1,1,1,1,1,1,1,1,1,1,...,14,14,14,14,14,14,14,14,14,14


In [1202]:
amy_df_NN_expr_ge_cv_ls

Unnamed: 0,AAACGCTCAGCCTTCT-1_08-1,AAAGGTATCATGGGAG-1_08-1,AAATGGAGTAACTGCT-1_08-1,AACCTGAAGACGGTCA-1_08-1,AACCTGAGTGAAAGTT-1_08-1,AAGAACAAGAAGGGAT-1_08-1,AATAGAGTCATCACTT-1_08-1,AATCGTGGTCGAATTC-1_08-1,AATGGCTCACGGTGAA-1_08-1,ACAGCCGTCCCAAGTA-1_08-1,...,TCGAACAAGGAGCTGT-1_76-1,TGTACAGTCTGCAGCG-1_76-1,AGGACTTTCATGGAGG-1_76-2,CCCATTGGTACCTAGT-1_76-2,CTATCTACAATTGCTG-1_76-2,CTCATCGTCACCCATC-1_76-2,CTGCCATGTATCGCTA-1_76-2,GCATGATTCTCGTCGT-1_76-2,TGGGCGTAGAAGCCAC-1_76-2,TTCTGTAGTGGTATGG-1_76-2
Plp1,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833,0.283968,-0.831833,...,-0.831833,-0.157297,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833,-0.831833
Ptgds,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320,...,-0.677320,-0.677320,0.415777,0.303966,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320,-0.677320
Acta2,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,...,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287,-0.176287
Ccl4,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,...,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429,-0.140429
Cxcl10,-0.050704,-0.050704,-0.050704,-0.050704,-0.050704,-0.050704,-0.050704,-0.050704,-0.050704,-0.050704,...,-0.050704,-0.050704,-0.050704,-0.050704,-0.050704,-0.050704,-0.050704,-0.050704,-0.050704,-0.050704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Serpinf1,-0.133879,-0.133879,-0.133879,-0.133879,-0.133879,-0.133879,-0.133879,-0.133879,-0.133879,-0.133879,...,-0.133879,-0.133879,-0.133879,-0.133879,-0.133879,-0.133879,-0.133879,-0.133879,-0.133879,-0.133879
Pid1,-0.590355,1.437783,-0.590355,-0.590355,-0.590355,-0.590355,-0.590355,-0.590355,-0.590355,-0.590355,...,-0.590355,-0.590355,1.205808,-0.590355,1.205808,2.024829,-0.590355,-0.590355,-0.590355,1.968870
Evi2a,-0.570896,-0.570896,-0.570896,-0.570896,-0.570896,-0.570896,-0.570896,-0.570896,-0.570896,-0.570896,...,-0.570896,1.882151,1.290119,-0.570896,1.290119,-0.570896,-0.570896,-0.570896,0.968326,-0.570896
Anks1b,-0.755874,-0.755874,-0.755874,-0.755874,1.507510,-0.755874,-0.755874,1.201632,-0.755874,-0.755874,...,-0.755874,-0.755874,-0.755874,-0.755874,-0.755874,-0.755874,-0.755874,-0.755874,-0.755874,-0.755874


In [1203]:
amy_avgs = dp.compute_avg_expr_per_cluster_label(amy_df_NN_expr_ge_cv_ls,amy_metadata_df_NN)

In [1204]:
amy_avgs

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
Plp1,-0.66546,-0.578277,-0.667195,1.029069,-0.70141,-0.539389,1.470026,-0.395822,-0.394724,-0.700734,-0.650205,-0.727498,-0.710365,-0.690616
Ptgds,-0.59562,-0.517051,-0.588183,-0.008392,-0.625089,-0.526547,1.400389,-0.48231,-0.506764,-0.6045,0.424494,-0.61824,-0.614362,-0.374213
Acta2,-0.167423,-0.176287,-0.169854,-0.170864,-0.143382,1.580089,-0.166018,-0.161931,-0.167771,0.170143,-0.084717,6.148721,-0.168185,-0.176287
Ccl4,-0.13712,-0.140429,-0.134243,-0.122402,-0.127292,-0.140429,-0.13487,-0.134695,-0.140429,-0.120598,-0.140429,-0.128073,1.269295,0.213776
Cxcl10,-0.044022,-0.050704,-0.050704,-0.050704,-0.007912,0.037556,-0.047404,-0.050704,-0.050704,-0.024555,0.18037,-0.050704,0.391192,0.128604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Serpinf1,-0.086766,-0.011413,-0.128533,-0.133879,-0.119879,-0.133879,-0.129943,-0.131981,-0.133879,-0.101128,5.742457,-0.133879,0.756146,0.281902
Pid1,-0.138306,0.089628,-0.098495,0.16159,-0.558331,-0.380757,-0.539256,1.302543,1.003597,0.308255,0.117441,0.104624,0.013563,0.545974
Evi2a,-0.551346,-0.531573,-0.563983,-0.167022,-0.566082,-0.540311,1.027545,-0.464814,-0.453019,-0.56468,-0.557973,-0.570896,0.339503,0.092551
Anks1b,-0.572246,-0.338028,-0.58388,0.164259,-0.58714,-0.426372,0.511481,0.996851,0.850098,-0.575877,-0.501712,-0.574387,-0.553477,-0.532704


In [1208]:
sd_avgs = dp.compute_avg_expr_per_cluster_label(NN_df_plis_filtered,NN_meta_data_df_plis_filtered)

In [1209]:
sd_avgs

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11
Plp1,-0.298094,-0.474753,-0.455696,0.048752,-0.500343,-0.497147,-0.475469,-0.300942,-0.356264,2.320411,2.592855
Ptgds,-0.282364,-0.393036,-0.380672,0.014366,-0.414812,-0.412778,-0.425743,-0.332763,-0.375272,2.574494,2.031106
Acta2,1.811885,-0.230281,-0.204412,-0.193369,-0.221641,-0.224147,4.608164,-0.230281,-0.227823,-0.230281,-0.230281
Ccl4,-0.166938,-0.183426,2.625442,3.249038,-0.204727,-0.217776,-0.162517,-0.201551,-0.183374,-0.196234,-0.205192
Cxcl10,0.169112,0.190126,0.73471,0.855016,-0.005857,-0.026118,-0.138062,-0.138062,0.051523,-0.138062,-0.138062
...,...,...,...,...,...,...,...,...,...,...,...
Serpinf1,-0.132839,-0.132839,0.216344,1.004096,-0.132839,-0.132839,-0.132839,-0.132839,0.197922,-0.042584,-0.132839
Pid1,-0.347434,1.024666,0.080209,-0.085067,0.047597,-0.715799,0.432261,1.321005,-0.318878,-0.705864,-0.68935
Evi2a,-0.41349,-0.273062,0.651019,1.043503,-0.404562,-0.409265,-0.41349,-0.275711,-0.383013,2.791634,1.965869
Anks1b,-0.234765,0.438778,0.825773,-0.319138,-0.769546,-0.77574,0.91294,0.515754,-0.735941,0.872978,1.551485


In [1210]:
heatmap2, heatmap_argmax_df_alt_lco, corr_matrix_manual_alt_lco, corr_matrix_manual_alt_lco_sorted = cc.plot_correlation(sd_avgs,amy_avgs)
# Display the plot
hvplot.show(heatmap2)
    
# Ensure output is displayed inline
hv.output(heatmap2, backend='bokeh')

Launching server at http://localhost:39505


In [1211]:
error_genes,all_marker_labels = cc.amy_gene_spell_checker(amy_df_NN_expr_ge_cv_ls_orig,amy_metadata_df_NN)
error_genes

  


[]

In [1212]:
all_marker_labels

[[]]

In [1213]:
folder = '/bigdata/isaac/Nonneuronal_files/'

In [1214]:
with open(folder + 'NN_cl_mg_filtered_2024-09-02.json') as json_data:
    NN_cl_mg_filtered = json.load(json_data)

In [1215]:
NN_cl_mg_filtered = {int(k): v for k, v in NN_cl_mg_filtered.items()}

In [1217]:
NN_cl_mg_filtered

{1: ['Tmem212',
  'Sntn',
  'Gm16160',
  'Gm10714',
  'Spag17',
  'Calml4',
  'Cdhr3',
  'Stoml3',
  'Ccdc153'],
 2: ['Pdgfra',
  'Prc1',
  'Lockd',
  'Ube2c',
  'Mki67',
  'Top2a',
  'Melk',
  'Esco2',
  'Neil3',
  'Pbk',
  'Ska1'],
 3: ['Ccl2'],
 4: ['Fcgr3',
  'Aif1',
  'Ctss',
  'C1qa',
  'Dusp2',
  'Lyz2',
  'Gm43936',
  'Rgs1',
  'Fcrls',
  'Tyrobp',
  'Trem2',
  'C1qb',
  'C1qc',
  'Ccl4',
  'Cx3cr1',
  'P2ry12',
  'Ccl12',
  'Il1b'],
 5: ['Atp13a5',
  'Ifitm1',
  'Vtn',
  'Abcc9',
  'Itm2a',
  'Slc6a13',
  'Slco1a4',
  'Cldn5',
  'Kcnj8',
  'Nostrin'],
 6: ['Foxq1', 'Egfl7', 'Kank3', 'Gm9946', 'Cd93', 'Lef1'],
 7: ['Rab3c',
  'Vgf',
  'Scn2a1',
  'Prlr',
  'Synpr',
  'Pcp4',
  'Cnn1',
  'Acta2',
  'Tagln',
  'Myh11',
  'Pln',
  'Tpm2',
  'Fbxl22'],
 8: ['Lhfpl3', 'Vcan', 'Fam89a', 'C1ql1', 'Matn4', 'Snx22'],
 9: ['Gm11681', 'Aldoc', 'Myoc', 'Gfap', 'Aqp4', 'Slc4a4', 'C4b', 'Ntsr2'],
 10: ['Ptgds',
  'Anln',
  'Trf',
  'Klk6',
  'Hapln2',
  'Gm28322',
  'Il33',
  'Apod',
  'Aspa

# load marker csv to map filtered dict markers to nn subclass

In [1222]:
cns_marker_df = pd.read_csv('/home/isaac/from_muhammad/CNS_cell_markers_expanded_IB.csv')

Match capitlization with NN dict

In [1227]:
cns_marker_df['Markers'] = cns_marker_df['Markers'].apply(lambda x: ', '.join([marker.capitalize() for marker in x.split(', ')]))

In [1228]:
cns_marker_df

Unnamed: 0,CellType,Markers
0,Neuron,"Map2, Nefl, Rbfox3, Syn1, Dcx, Camk2a, Gad1, G..."
1,Astrocyte,"Gfap, Aldh1l1, Slc1a2, Slc1a3, Aqp4, Glul, Cd4..."
2,Oligodendrocyte,"Mog, Mbp, Mag, Plp1, Mobp, Olig1, Olig2, Sox10..."
3,Microglia,"P2ry12, Cx3cr1, Csf1r, Trem2, Aif1, Tmem119, C..."
4,Ependymal,"Vim, S100b, Foxj1, Ttr, Gfap, Dnah5, Mcidas, S..."
5,Endothelial,"Cldn5, Pecam1, Flt1, Vwf, Cdh5, Tie1, Tie2, Ic..."
6,Pericyte,"Pdgfrb, Ng2, Rgs5, Cd146, Acta2, Anpep, Mcam, ..."
7,Choroid Plexus,"Ttr, Kirrel2, Slc4a5, Aqp1, Tjp1, Ocln, Clic6,..."
8,Neural Stem/Progenitor,"Nestin, Sox2, Prom1, Pax6, Fabp7, Hes1, Vim, D..."
9,Ciliated Cell,"Foxj1, Dnah5, Rsph1, Ccdc39, Ccdc40, Mcidas, Rfx3"


In [1229]:
cns_marker_df.iloc[0,1]

'Map2, Nefl, Rbfox3, Syn1, Dcx, Camk2a, Gad1, Gad2, Slc17a7, Slc1a1, Chat, Th, Calb1, Pvalb, Sst, Vip, Crh, Pdyn, Penk'

Create new NN dict mapping markers to cell types from cns_marker_df

In [1314]:
NN_cl_ct = dict.fromkeys(NN_cl_mg_filtered)
NN_cl_ct

{1: None,
 2: None,
 3: None,
 4: None,
 5: None,
 6: None,
 7: None,
 8: None,
 9: None,
 10: None,
 11: None}

In [1315]:
NN_cl_ct = dict.fromkeys(NN_cl_mg_filtered)
for celltype, markers in zip(cns_marker_df['CellType'],cns_marker_df['Markers']):
    for k,v in NN_cl_mg_filtered.items():
        for g in v:
            if g in markers:
                #use cell type if found
                NN_cl_ct.update({k:celltype})

In [1316]:
NN_cl_ct

{1: None,
 2: 'Fibroblast',
 3: None,
 4: 'Monocyte/Macrophage',
 5: 'Pericyte',
 6: None,
 7: 'Smooth Muscle Cell',
 8: None,
 9: 'Smooth Muscle Cell',
 10: None,
 11: 'Oligodendrocyte'}

Update /overwrite manually from Amit Meeting, matching format with NN_subclass

In [1336]:
NN_cl_ct[1] = 'Epend'
NN_cl_ct[2] = 'Proliferating_Cells'
NN_cl_ct[3] = 'Microglia_1'
NN_cl_ct[4] = 'Microglia_2'
NN_cl_ct[5] = 'Peri_1'
NN_cl_ct[6] = 'Peri_2' 
NN_cl_ct[7] = 'VSM'
NN_cl_ct[8] = 'OPC'
NN_cl_ct[9] = 'Astro'
NN_cl_ct[10] = 'OL_1'
NN_cl_ct[11] = 'OL_2'




In [1337]:
NN_cl_ct

{1: 'Epend',
 2: 'Proliferating_Cells',
 3: 'Microglia_1',
 4: 'Microglia_2',
 5: 'Peri_1',
 6: 'Peri_2',
 7: 'VSM',
 8: 'OPC',
 9: 'Astro',
 10: 'OL_1',
 11: 'OL_2'}

Match formatting of NN_subclass_list with NN_cl_ct dict

In [1326]:
NN_subclasses

['Astro',
 'Astro_SC',
 'Astro_agt',
 'COP',
 'EC',
 'Epend',
 'OL',
 'OPC',
 'OPC_cycling',
 'Peri',
 'VLMC',
 'VSM',
 'microglia',
 'pvm']

In [1338]:
NN_sd_labels_df = pd.DataFrame.from_dict(NN_cl_ct,orient='index')
NN_sd_labels_df = NN_sd_labels_df.rename(columns = {0 : 'sd_label_complete'})
NN_sd_labels_df

Unnamed: 0,sd_label_complete
1,Epend
2,Proliferating_Cells
3,Microglia_1
4,Microglia_2
5,Peri_1
6,Peri_2
7,VSM
8,OPC
9,Astro
10,OL_1


In [1328]:
NN_amy_dict = dict(zip(np.arange(1,len(NN_subclasses)+1),NN_subclasses))
NN_amy_dict

{1: 'Astro',
 2: 'Astro_SC',
 3: 'Astro_agt',
 4: 'COP',
 5: 'EC',
 6: 'Epend',
 7: 'OL',
 8: 'OPC',
 9: 'OPC_cycling',
 10: 'Peri',
 11: 'VLMC',
 12: 'VSM',
 13: 'microglia',
 14: 'pvm'}

In [1329]:
NN_amy_labels_df = pd.DataFrame.from_dict(NN_amy_dict,orient='index')
NN_amy_labels_df

Unnamed: 0,0
1,Astro
2,Astro_SC
3,Astro_agt
4,COP
5,EC
6,Epend
7,OL
8,OPC
9,OPC_cycling
10,Peri


In [1330]:
##update corr plot with dropped/merged clusters (from dimorph_processing.py)

In [1024]:
folder

'/bigdata/isaac/Vglut1_files/'

In [1339]:
folder = '/bigdata/isaac/Nonneuronal_files/'
cc.plot_correlation_w_labels(corr_matrix_manual_alt_lco_sorted,NN_sd_labels_df,NN_amy_labels_df, folder, 'NN_filtered_merged',savefig=True)

Launching server at http://localhost:35295


In [1304]:
today

'2024-09-09'

In [740]:
#write labels df's to file

In [1340]:
folder = '/bigdata/isaac/Nonneuronal_files/'
NN_sd_labels_df.to_csv(folder + 'NN_sd_labels_df_' + today + '.csv')
NN_amy_labels_df.to_csv(folder + 'NN_amy_labels_df_' + today + '.csv')

In [1341]:
NN_cl_ct = {str(k):v for k,v in NN_cl_ct.items()}
NN_amy_dict = {str(k):v for k,v in NN_amy_dict.items()}

In [1342]:
folder = '/bigdata/isaac/Nonneuronal_files/'
file = 'NN_cl_ct' + today
#write dict to file
with open(folder+file+'.json', "w") as outfile: 
    json.dump(NN_cl_ct, outfile)
    
file2 = 'NN_amy_dict'
with open(folder+file2+'.json', "w") as outfile: 
    json.dump(NN_amy_dict, outfile)

In [906]:
folder

'/bigdata/isaac/Vglut1_files/'

In [None]:
###################old/backup/scratch###################

In [None]:
tips = sns.load_dataset("tips") 

In [None]:
tips

In [None]:
fig,ax = plt.subplots()
sns.stripplot(x="day", y="total_bill", data=tips, jitter = 0.1)
plt.show()

### corrwith() vs. corr() example

In [None]:
np.random.seed(5)
index = ["g1", "g2", "g3", "g4", "g5"]
columns = ["one", "two", "three", "four","five"]
df1 = pd.DataFrame(np.random.rand(5,5), index=index, columns=columns)
df2 = pd.DataFrame(np.random.rand(5,4), index=index, columns=columns[:4])


In [None]:
df1

In [None]:
df2

In [None]:
df1.corrwith(df2, axis=0)

In [None]:
pd.concat([df1, df2],axis=1).corr()

In [None]:

# Initialize an empty dataframe to hold the correlation coefficients
correlation_matrix_chat = pd.DataFrame(index=df1.columns, columns=df2.columns)

# Compute the correlation coefficients
for col1 in df1.columns:
    for col2 in df2.columns:
        correlation_matrix_chat.loc[col1, col2] = df1[col1].corr(df2[col2])

In [None]:
correlation_matrix_chat

In [None]:
np.random.rand(4,4)

In [None]:
sd_avgs.columns

2024-06-07 gaba_cl_mg_dict is missing a few clusters for some reason, reran on 2024-06-27

In [None]:

#with open('/bigdata/isaac/gaba_files/gaba_cl_mg_dict2024-06-07.json') as json_data:
    #gaba_cl_mg_dict = json.load(json_data)

In [None]:
sorted(gaba_cl_mg_dict.items())

In [None]:
gaba_cl_mg_dict['31']

In [None]:
gaba_df_marker_log_and_std.index[:18]

In [None]:
gaba_df_marker_log_and_std.shape

In [None]:
gaba_df_marker

In [None]:
gaba_meta_data_df_plis

In [None]:
gaba_mean_marker_per_cluster = pd.DataFrame(columns=['gene', 'avg_expr','cluster_id'])
for k,v in gaba_cl_mg_dict.items():
    #print (k,v)
    tmp_df = pd.DataFrame({'gene':gaba_cl_mg_dict[k],
                           'avg_expr':np.array(gaba_df_marker.loc[gaba_cl_mg_dict[k],gaba_df_marker.iloc[:,np.where(gaba_meta_data_df_plis.loc['cluster_label']==int(k))[0]].columns].mean(axis=1)), 
                           'cluster_id':np.array(int(k)*np.ones(len(gaba_cl_mg_dict[k])))})
    gaba_mean_marker_per_cluster = pd.concat([gaba_mean_marker_per_cluster, tmp_df])

gaba_mean_marker_per_cluster

In [None]:
np.where(gaba_meta_data_df_plis.loc['cluster_label']==31)[0]

In [None]:
gaba_cl_mg_dict['63']

In [None]:
def compute_class_marker_mean(df,meta_data_df,cl_mg_dict):
    '''
    For level 2 analysis. Computes average expression of each marker gene for a specific cell class (e.g. gaba)                  
    ----------------------------------------------------------------------
    Parameters

    df: pandas.core.frame.DataFrame
        marker gene expression dataframe of a single cell class
    meta_data_df: pandas.core.frame.DataFrame
        gene expression metadata corresponding to df
    cl_mg_dict: pandas.core.frame.DataFrame
        2D input array, typically output from tsne, in dataframe format
    ----------------------------------------------------------------------
    Returns
    mean_marker_per_cluster: pandas.core.frame.DataFrame
        dataframe storing marker gene, avg expression, and cluster ID
    '''
    mean_marker_per_cluster = pd.DataFrame(columns=['gene', 'avg_expr','cluster_id'])
    for k,v in cl_mg_dict.items():
        #print (k,v)
        tmp_df = pd.DataFrame({'gene':cl_mg_dict[k],
                               'avg_expr':np.array(df.loc[cl_mg_dict[k],df.iloc[:,np.where(meta_data_df.loc['cluster_label']==int(k))[0]].columns].mean(axis=1)), 
                               'cluster_id':np.array(int(k)*np.ones(len(cl_mg_dict[k])))})
        
        #print (tmp_df.head())
        mean_marker_per_cluster = pd.concat([mean_marker_per_cluster, tmp_df])

    return mean_marker_per_cluster 

In [None]:
gaba_mean_marker_per_cluster = compute_class_marker_mean(gaba_df_marker, gaba_meta_data_df_plis, gaba_cl_mg_dict)

In [None]:
gaba_mean_marker_per_cluster