In [None]:
import scanpy as sc
import os
import pandas as pd
import matplotlib as mpl
import sys
import numpy as np
import matplotlib.pyplot as plt
sys.path.append('/wsfish/glioblastoma/')
import FISHspace as sp

%reload_ext autoreload
%autoreload 2

# Global matplotlib configuration for every figure produced by this notebook.
mpl.rcParams.update({
    # Type 42 (TrueType) fonts in PDF output.
    'pdf.fonttype': 42,
    # Save figures cropped to their content, with no padding.
    'savefig.pad_inches': 0,
    'savefig.bbox': 'tight',
    # Thin axes outline.
    'axes.linewidth': 0.2,
    # No gap between ticks and their labels.
    'xtick.minor.pad': 0,
    'xtick.major.pad': 0,
    'ytick.minor.pad': 0,
    'ytick.major.pad': 0,
    # Thin tick marks, matching the axes outline.
    'xtick.minor.width': 0.2,
    'xtick.major.width': 0.2,
    'ytick.minor.width': 0.2,
    'ytick.major.width': 0.2,
})

# Use the colorblind seaborn style. matplotlib 3.6 renamed the bundled seaborn
# styles to 'seaborn-v0_8-*', so pick whichever name this installation provides.
_colorblind_styles = ('seaborn-v0_8-colorblind', 'seaborn-colorblind')
plt.style.use(
    next(
        (name for name in _colorblind_styles if name in plt.style.available),
        _colorblind_styles[0],
    )
)

# Full Data

In [None]:
#adata_full = sc.read_h5ad('../OrganoidAnalysis/integration/GBMOrganoids_scVIsurgery.h5ad')

In [None]:
# Load the AnnData object analysed in the rest of this notebook.
# The path is relative, so the notebook must be run from its own directory.
adata_full = sc.read_h5ad('../OrganoidAnalysis/integration/GBMOrganoids_scVIsurgery20240408.h5ad')

In [None]:
# Colour (hex) for each label of the 'annotation_20240408' annotation; passed
# as `palette` to the polygon plots below.
# An older palette used to be assigned to this name first and was immediately
# overwritten by this one, so only the effective definition is kept.
# Note that 'AC1' and 'AC2' deliberately share the same colour value here.
annotation_colors = {
    '+HYP2': '#840034',
    'preOPC': '#7befb2',
    'RG': '#ff9470',
    '+HYP1': '#C50F53',
    'AC1': '#2ecc71',
    'nIPC': '#FFF192',
    'OPC': '#89c4f4',
    'GBL': '#b9e670',
    'hAC': '#EDE8F3',
    'AC2': '#2ecc71',
    'FBL': '#9f5afd',
    'rAC': '#e3ba8f',
    'cOPC': '#038aff',
    'Unknown': '#efeff0',
}

In [None]:
# Overview UMAPs of the full data set coloured by annotation, majority vote
# and line.
# scanpy builds its own figure when no `ax` is given, so a preceding
# plt.figure(figsize=...) only leaves an empty extra figure behind. The panel
# size is therefore set through rcParams, scoped to this one call.
with mpl.rc_context({'figure.figsize': (2.5, 2.5)}):
    sc.pl.umap(
        adata_full,
        color=["annotation_20240408", "majority_voting", "line"],
        frameon=False,
        ncols=4,
        s=10,
    )

In [None]:
import warnings

# Silence FutureWarning for the rest of the session (the filter is global).
warnings.filterwarnings('ignore', category=FutureWarning)

# Rank genes between conditions with a Wilcoxon test and show the top 25 per
# group. The result is written to the default key in adata_full.uns, so the
# later rank_genes_groups calls on adata_full overwrite it.
sc.tl.rank_genes_groups(
    adata_full,
    groupby='condition',
    method='wilcoxon',
)
sc.pl.rank_genes_groups(adata_full, n_genes=25)

In [None]:
import warnings

# Silence FutureWarning for the rest of the session (the filter is global).
warnings.filterwarnings('ignore', category=FutureWarning)

# Rank genes between time points with a t-test and show the top 10 per group.
# This overwrites the previous ranking stored under the default key.
sc.tl.rank_genes_groups(
    adata_full,
    groupby='time',
    method='t-test',
)
sc.pl.rank_genes_groups(adata_full, n_genes=10)

In [None]:
import warnings

# Silence FutureWarning for the rest of the session (the filter is global).
warnings.filterwarnings('ignore', category=FutureWarning)

# Rank genes between the 'annotation_20240408' groups with a t-test; this
# overwrites the previous ranking stored under the default key.
sc.tl.rank_genes_groups(
    adata_full,
    groupby='annotation_20240408',
    method='t-test',
)

In [None]:
# 4x4 inch panels with a grid. These rcParams are changed globally and stay in
# effect for every later figure.
mpl.rcParams.update({
    'figure.figsize': (4, 4),
    'axes.grid': True,
})
# Plot the most recent ranking stored on adata_full.
sc.pl.rank_genes_groups(adata_full)

In [None]:
# Colours stored in .uns for the 'annotation_20240408' categories, captured
# before the 'Unknown' cells are removed in the next cell.
# NOTE(review): despite its name this holds annotation colours, not Leiden
# colours, and it is not referenced again in the cells visible here.
leiden_colors = adata_full.uns['annotation_20240408_colors']

In [None]:
# Drop the cells annotated as 'Unknown'. The subset is copied so that
# adata_full is a real AnnData object rather than a view of the object it
# replaces; later cells subset it again and write new .obs columns.
# Re-running this cell is harmless: the filter is then a no-op.
adata_full = adata_full[adata_full.obs['annotation_20240408'] != 'Unknown'].copy()

In [None]:
cluster_key = 'annotation_20240408'
folder = 'annotation20240408'
filename = 'Hypoxia'

# Hypoxia-treated cells plus the shared 'initial' condition. Copied so the
# columns added below are written to a real object, not to a view of
# adata_full.
adata_c = adata_full[adata_full.obs.condition.isin(['hyp', 'initial'])].copy()

# Add Sample ('<line>-<time>') so that every line/time point is plotted
# separately.
adata_c.obs['Sample'] = pd.Categorical(
    [line + '-' + time for line, time in zip(adata_c.obs.line, adata_c.obs.time)]
)
# presumably plot_polygons reads the 'Sample' and 'Area' columns — verify
# against FISHspace.
adata_c.obs['Area'] = adata_c.obs.cell_area

# Not used by this cell any more; kept in case later cells rely on them.
line_order = adata_c.obs.line.cat.categories.tolist()
time_order = ['000h', '024h', '072h', '144h']

# savefig does not create missing directories.
os.makedirs(os.path.join('figures', folder), exist_ok=True)

for s in adata_c.obs['Sample'].cat.categories:
    fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(3, 3))

    sp.pl.plot_polygons(
        adata_c,
        sample=s,
        clusters=adata_c.obs[cluster_key].cat.categories.tolist(),
        cluster_key=cluster_key,
        palette=annotation_colors,
        area_min_size=3,
        ax=axs,
    )

    # The sample label already has the '<line>-<time>' form.
    axs.set_title(s)
    fig.savefig(
        'figures/{}/{}_{}.svg'.format(folder, filename, s),
        dpi=300,
        format='svg',
        bbox_inches='tight',
    )
    plt.show()
    # One figure per sample: close it so figures do not pile up in memory.
    plt.close(fig)

In [None]:
cluster_key = 'annotation_20240408'
folder = 'annotation20240408'
filename = 'Plasma'

# Plasma-treated cells plus the shared 'initial' condition. Copied so the
# columns added below are written to a real object, not to a view of
# adata_full.
adata_c = adata_full[adata_full.obs.condition.isin(['pla', 'initial'])].copy()

# Add Sample ('<line>-<time>') so that every line/time point is plotted
# separately.
adata_c.obs['Sample'] = pd.Categorical(
    [line + '-' + time for line, time in zip(adata_c.obs.line, adata_c.obs.time)]
)
# presumably plot_polygons reads the 'Sample' and 'Area' columns — verify
# against FISHspace.
adata_c.obs['Area'] = adata_c.obs.cell_area

# Not used by this cell any more; kept in case later cells rely on them.
line_order = adata_c.obs.line.cat.categories.tolist()
time_order = ['000h', '024h', '072h', '144h']

# savefig does not create missing directories.
os.makedirs(os.path.join('figures', folder), exist_ok=True)

for s in adata_c.obs['Sample'].cat.categories:
    fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(3, 3))

    sp.pl.plot_polygons(
        adata_c,
        sample=s,
        clusters=adata_c.obs[cluster_key].cat.categories.tolist(),
        cluster_key=cluster_key,
        palette=annotation_colors,
        area_min_size=3,
        ax=axs,
    )

    # The sample label already has the '<line>-<time>' form.
    axs.set_title(s)
    fig.savefig(
        'figures/{}/{}_{}.svg'.format(folder, filename, s),
        dpi=300,
        format='svg',
        bbox_inches='tight',
    )
    plt.show()
    # One figure per sample: close it so figures do not pile up in memory.
    plt.close(fig)

In [None]:
cluster_key = 'annotation_20240408'
folder = 'annotation20240408'
filename = 'HypoxiaPlasma'

# Cells of the 'hyppla' condition plus the shared 'initial' condition. Copied
# so the columns added below are written to a real object, not to a view of
# adata_full.
adata_c = adata_full[adata_full.obs.condition.isin(['hyppla', 'initial'])].copy()

# Add Sample ('<line>-<time>') so that every line/time point is plotted
# separately.
adata_c.obs['Sample'] = pd.Categorical(
    [line + '-' + time for line, time in zip(adata_c.obs.line, adata_c.obs.time)]
)
# presumably plot_polygons reads the 'Sample' and 'Area' columns — verify
# against FISHspace.
adata_c.obs['Area'] = adata_c.obs.cell_area

# Not used by this cell any more; kept in case later cells rely on them.
line_order = adata_c.obs.line.cat.categories.tolist()
time_order = ['000h', '024h', '072h', '144h']

# savefig does not create missing directories.
os.makedirs(os.path.join('figures', folder), exist_ok=True)

for s in adata_c.obs['Sample'].cat.categories:
    fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(3, 3))

    sp.pl.plot_polygons(
        adata_c,
        sample=s,
        clusters=adata_c.obs[cluster_key].cat.categories.tolist(),
        cluster_key=cluster_key,
        palette=annotation_colors,
        area_min_size=3,
        ax=axs,
    )

    # The sample label already has the '<line>-<time>' form.
    axs.set_title(s)
    fig.savefig(
        'figures/{}/{}_{}.svg'.format(folder, filename, s),
        dpi=300,
        format='svg',
        bbox_inches='tight',
    )
    plt.show()
    # One figure per sample: close it so figures do not pile up in memory.
    plt.close(fig)

In [None]:
# Features shown on the UMAP of the full data set, one panel each, in this
# order; the last entry is the per-cell total count.
umap_features_full = [
    'SOX2', 'SOX9',
    'FOXG1',
    'AQP4', 'HOPX',
    'HES1', 'PAX6',
    'EGFR', 'APOE',
    'PDGFRA', 'CHI3L1',
    'CDK4', 'STMN1',
    'OLIG1', 'OLIG2',
    'DLL3', 'CRYAB',
    'TNC', 'BCAN',
    'COL1A1', 'COL3A1', 'CTSH', 'DCN',
    'NHLH1', 'SOX10',
    'RGS16',
    'EPAS1', 'IGFBP3', 'CD44',
    'TGFBI', 'VEGFA',
    'total_counts',
]

# use_raw=False: colour by adata_full.X rather than adata_full.raw.
sc.pl.umap(
    adata_full,
    color=umap_features_full,
    s=30,
    use_raw=False,
    cmap='magma',
    wspace=0.4,
    frameon=False,
)

# Clustering at t0

In [None]:
# Independent copy of the cells whose time point is '000h'; all clustering
# below is done on this object.
adata_t0 = adata_full[adata_full.obs.time == '000h'].copy()

In [None]:
# Rebuild the neighbour graph on the 'X_scVI' representation and recompute the
# UMAP for the '000h' subset only.
sc.pp.neighbors(adata_t0, use_rep="X_scVI")
sc.tl.umap(adata_t0)

In [None]:
# Leiden clustering of the '000h' subset at several resolutions. Each result
# is stored in adata_t0.obs under 'leiden_<resolution>' (e.g. 'leiden_0.8').
# Resolution 0.5 is included because the UMAP overview that follows colours by
# 'leiden_0.5', which was otherwise never computed.
for r in (0.5, 1, 0.75, 0.8, 0.85, 0.95):
    sc.tl.leiden(adata_t0, resolution=r, key_added='leiden_{}'.format(r))



In [None]:
# Compare the Leiden clusterings on the '000h' UMAP.
# Every key listed in `color` must exist in adata_t0.obs, i.e. the matching
# resolution must have been clustered beforehand.
# scanpy builds its own figure when no `ax` is given, so a preceding
# plt.figure(figsize=...) only leaves an empty extra figure behind. The panel
# size is therefore set through rcParams, scoped to this one call.
with mpl.rc_context({'figure.figsize': (2.5, 2.5)}):
    sc.pl.umap(
        adata_t0,
        color=["leiden_0.75", "leiden_0.5", "leiden_0.85", "leiden_0.8"],
        frameon=False,
        ncols=4,
        s=10,
    )

In [None]:
import warnings

# Silence FutureWarning for the rest of the session (the filter is global).
warnings.filterwarnings('ignore', category=FutureWarning)

# Rank genes between the 'leiden_0.8' clusters of the '000h' subset (t-test).
sc.tl.rank_genes_groups(
    adata_t0,
    groupby='leiden_0.8',
    method='t-test',
)

In [None]:
# 4x4 inch panels with a grid. These rcParams are changed globally and stay in
# effect for every later figure.
mpl.rcParams.update({
    'figure.figsize': (4, 4),
    'axes.grid': True,
})
# Plot the ranking computed on the '000h' subset.
sc.pl.rank_genes_groups(adata_t0)

In [None]:
# Genes shown on the '000h' UMAP, one panel each, plus the per-cell total
# count. 'CD44' used to be listed twice, which drew the same panel twice.
sc.pl.umap(
    adata_t0,
    color=[
        'EPAS1', 'IGFBP3', 'CD44', 'TGFBI', 'VEGFA', 'IGFBP5', 'HILPDA', 'TIMP1', 'TGFB1', 'TGFB2',
        'MCM5', 'PROX1', 'SOX9', 'PLD5', 'POSTN', 'HIF1A', 'CNTNAP3B', 'STK32B', 'SLC2A1',
        'TP73', 'SERPINE1',
        'total_counts',
    ],
    s=30,
    use_raw=False,
    cmap='magma',
    wspace=0.4,
    frameon=False,
)

In [None]:
# Manual cell-type name for each 'leiden_0.8' cluster; the position in this
# list is the cluster id (0-9).
cluster_names = [
    '+HYP1',         # 0
    'Reactive Ast',  # 1
    'RG',            # 2
    'preOPC',        # 3
    'GBL',           # 4
    '+WH',           # 5
    '+HYP0',         # 6
    'FBL',           # 7
    'nIPC',          # 8
    'OPC',           # 9
]
annotation = dict(enumerate(cluster_names))

# Translate every cell's cluster id into its name; a cluster id outside the
# mapping raises KeyError.
cell_annotation = [annotation[int(cluster_id)] for cluster_id in adata_t0.obs['leiden_0.8']]
adata_t0.obs['annotation'] = pd.Categorical(cell_annotation)

# Persist the annotated '000h' object (overwrites an existing file).
adata_t0.write_h5ad('GBM_t0.h5ad')

In [None]:
# Features shown on the '000h' UMAP, one panel each, in this order; the last
# entry is the per-cell total count.
umap_features_t0 = [
    'SOX2', 'SOX9', 'SOX4', 'FOXG1',
    'HES1', 'PAX6',
    'TNC', 'BCAN', 'EDNRB', 'TTYH1',
    'CRYAB', 'HOPX',
    'ST18', 'CNTN1', 'CDK4', 'STMN1', 'NHLH1', 'RGS16',
    'AQP4', 'SLC1A3', 'CHI3L1', 'MGST1', 'APOE', 'GJA1', 'GFAP',
    'EGFR', 'DLL3',
    'SOX10',
    'PDGFRA', 'OLIG1', 'OLIG2',
    'COL1A1', 'COL3A1', 'DCN', 'TENM1', 'CD74', 'ARHGAP24', 'CALCRL',
    'SEMA5A',
    'MCM5',
    'total_counts',
]

# use_raw=False: colour by adata_t0.X rather than adata_t0.raw.
sc.pl.umap(
    adata_t0,
    color=umap_features_t0,
    s=30,
    use_raw=False,
    cmap='magma',
    wspace=0.4,
    frameon=False,
)

In [None]:
# Add Sample so that each line is plotted separately.
# presumably plot_polygons reads the 'Sample' and 'Area' columns — verify
# against FISHspace.
adata_t0.obs['Sample'] = adata_t0.obs.line
adata_t0.obs['Area'] = adata_t0.obs.cell_area

cluster = 'leiden_0.8'
samples = adata_t0.obs['Sample'].cat.categories

# One row per line instead of a hard-coded four, so no line is silently
# dropped; squeeze=False keeps `axs` two-dimensional even for a single line.
fig, axs = plt.subplots(nrows=len(samples), ncols=1, figsize=(10, 10), squeeze=False)

# Clusters and colours both come from the key that is actually plotted (the
# original read the categories from a different column, obs.leiden).
# adata_t0.uns['<key>_colors'] must already exist — presumably written by an
# earlier scanpy plot coloured by this key; verify.
cluster_ids = adata_t0.obs[cluster].cat.categories.tolist()
cluster_palette = dict(zip(cluster_ids, adata_t0.uns['{}_colors'.format(cluster)]))

for n, line in enumerate(samples):
    sp.pl.plot_polygons(
        adata_t0,
        sample=line,
        clusters=cluster_ids,
        cluster_key=cluster,
        palette=cluster_palette,
        area_min_size=3,
        ax=axs[n, 0],
    )

In [None]:
# Full rank_genes_groups result stored on the '000h' object.
# `markers` is reassigned two cells further down.
markers = adata_t0.uns['rank_genes_groups']

In [None]:
# Filter the ranked genes with min_fold_change=1; all other thresholds keep
# their scanpy defaults.
sc.tl.filter_rank_genes_groups(adata_t0,min_fold_change=1,)

In [None]:
# Gene names per group from the 'rank_genes_groups' result.
# NOTE(review): this reads the unfiltered key. The filter call in the previous
# cell presumably stores its output under a separate key
# ('rank_genes_groups_filtered' by default) — confirm which one is intended.
markers = adata_t0.uns['rank_genes_groups']['names']

In [None]:
# Disabled alternative, kept for reference: take the first 5 ranked genes of
# every cluster from `markers`.
#   dic_markers = {annotation[c]: [] for c in annotation}
#   for m in range(5):
#       for c in annotation:
#           dic_markers[annotation[c]].append(markers[m][c])

# Hand-curated marker genes per annotated cell type.
dic_markers = {
    '+HYP1': ['VEGFA', 'HILPDA', 'CD44', 'IGFBP5'],
    'Reactive Ast': ['APOE', 'MGST1', 'CHI3L1', 'TP73', 'SERPINE1'],
    'RG': ['TOP2A', 'CENPF', 'PAX6'],
    'preOPC': ['SOX4', 'ERBB3', 'SOX10', 'CNTN1', 'EGFR'],
    'GBL': ['SLC1A3', 'EDNRB', 'AQP4', 'TTYH1', 'BCAN', 'TNC'],
    '+WH': ['TGFBI', 'HOPX', 'TIMP1'],
    '+HYP0': ['EPAS1', 'IGFBP3', 'SLC2A1', 'PROX1', 'HIF1A'],
    'FBL': ['COL3A1', 'DCN', 'TENM1', 'CD74', 'ARHGAP24', 'CALCRL'],
    'nIPC': ['SOX11', 'SOX4', 'STMN1', 'ST18', 'CEMIP2', 'MEIS2', 'NHLH1', 'DLX2'],
    'OPC': ['BCAN', 'SOX10', 'DLL3', 'SOX4', 'PDGFRA'],
}

# Sorted, de-duplicated union of all marker genes.
all_enriched_genes = np.unique(
    [gene for gene_list in dic_markers.values() for gene in gene_list]
)

# Binary gene x cell-type membership matrix: 1 where the gene is a marker of
# that cell type, 0 otherwise.
membership_rows = [
    [1 if gene in gene_list else 0 for gene_list in dic_markers.values()]
    for gene in all_enriched_genes
]
enrichment_table = pd.DataFrame(
    data=np.array(membership_rows).astype(np.int64),
    index=all_enriched_genes,
    columns=list(dic_markers.keys()),
)
enrichment_table.to_parquet('Marker_table_GBO_t0.parquet')

In [None]:
# NOTE: the `enrich_` class is defined in a later cell of this notebook; that
# cell has to be run before this one.
labels = adata_t0.obs['leiden_0.8'].values.astype(int)
enrichment = enrich_(labels)

# Score every gene against the real cluster labels. permute=True would shuffle
# the labels first and rank genes against random groups, which is not what a
# marker table needs. The counts layer is transposed to genes x cells because
# _fit treats columns as cells.
e = enrichment._fit(
    adata_t0.layers['counts'].astype('int64').T,
    permute=False,
)

# Top 25 genes per cluster by enrichment score.
# assumes column c of `e` corresponds to cluster id c, i.e. every id from 0 to
# the maximum occurs in `labels` — TODO confirm.
dic_markers = {}
for c in range(e.shape[1]):
    order = e[:, c].argsort()[::-1]
    markers = adata_t0.var_names[order[:25]]
    dic_markers[annotation[c]] = markers

# Binary gene x cell-type membership matrix over the union of those genes.
# This replaces the `enrichment_table` built from the curated markers.
all_enriched_genes = np.unique(
    np.concatenate([np.asarray(genes) for genes in dic_markers.values()])
)
enrichment_table = np.array(
    [
        [1 if g in dic_markers[c] else 0 for c in dic_markers]
        for g in all_enriched_genes
    ]
)
enrichment_table = pd.DataFrame(
    data=enrichment_table.astype(np.int64),
    index=all_enriched_genes,
    columns=list(dic_markers.keys()),
)
enrichment_table.to_parquet('Enrichment_table_GBO_t0.parquet')

In [None]:
# (number of marker genes, number of cell types) of the membership table.
enrichment_table.shape

# CellTypist

In [None]:
import celltypist

In [None]:
# Work on a copy so the normalisation below does not alter adata_t0.
adata_t0_ct = adata_t0.copy()

In [None]:
# Normalise every cell to a total of 1e4 and log1p-transform, in place.
# assumes adata_t0_ct.X holds raw counts at this point — TODO confirm.
sc.pp.normalize_total(adata_t0_ct, target_sum=1e4)
sc.pp.log1p(adata_t0_ct)

In [None]:
# Train a CellTypist model on the '000h' cells, using the manual 'annotation'
# column as labels, with feature selection enabled and 10 parallel jobs.
# NOTE(review): the trained model is neither saved nor used in the cells
# visible here.
model = celltypist.train(adata_t0_ct, labels = 'annotation', n_jobs = 10, feature_selection = True)

# CellAssign Annotation

In [None]:
# Display a summary of the '000h' AnnData object.
adata_t0

In [None]:
# Make sure cell and gene identifiers are strings and unique before the
# object is handed to CellAssign.
adata_t0.obs.index = adata_t0.obs.index.astype("str")
adata_t0.var.index = adata_t0.var.index.astype("str")
adata_t0.var_names_make_unique()
adata_t0.obs_names_make_unique()

In [None]:
# Library size per cell. For a sparse X, .sum(axis=1) returns a 2-D
# (n_cells, 1) matrix; flatten it to a plain 1-D array so the size factor is
# stored as an ordinary numeric column whether X is dense or sparse.
# assumes adata_t0.X holds raw counts — TODO confirm.
lib_size = np.asarray(adata_t0.X.sum(axis=1)).ravel()

# Size factor = library size relative to the mean library size.
adata_t0.obs["size_factor"] = lib_size / np.mean(lib_size)

# Restrict to the marker genes listed in the membership table (same order as
# its index); copied so it is an independent object.
adata_t0_markers = adata_t0[:, enrichment_table.index].copy()

In [None]:
import scvi
from scvi.external import CellAssign

# Register the marker-gene subset with CellAssign, using the per-cell size
# factor stored in .obs['size_factor'].
CellAssign.setup_anndata(
    adata_t0_markers,
    size_factor_key='size_factor',
)

In [None]:
# Build the CellAssign model from the marker-gene subset and the binary
# gene x cell-type table, then train it with default settings.
state_model = CellAssign(adata_t0_markers, enrichment_table,)
state_model.train()

In [None]:
# Plot the 'elbo_validation' entry of the training history.
state_model.history["elbo_validation"].plot()

In [None]:
import seaborn as sns
# Model predictions, shown as a clustered heatmap.
# `predictions` is used as a table with one column per cell type in the next
# cell (idxmax over axis 1).
predictions = state_model.predict()
sns.clustermap(predictions, cmap="viridis")

In [None]:
# Label every cell with the column (cell type) that has the highest value in
# its prediction row.
adata_t0_markers.obs['cellassign_predictions'] = predictions.idxmax(axis=1).values

In [None]:
# Cross-tabulate the CellAssign labels (rows) against the manual annotation
# (columns).
df = adata_t0_markers.obs
confusion_matrix = pd.crosstab(
    df["cellassign_predictions"],
    df["annotation"],
    rownames=["cellassign_predictions"],
    colnames=["Original predictions"],
)

# Normalise every row to sum to 1 (fraction of each CellAssign label per
# manual annotation). DataFrame.div aligns on the row index and replaces the
# deprecated Series.ravel() followed by a manual reshape.
confusion_matrix = confusion_matrix.div(confusion_matrix.sum(axis=1), axis=0)

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    confusion_matrix,
    cmap=sns.diverging_palette(245, 320, s=60, as_cmap=True),
    ax=ax,
    square=True,
    cbar_kws=dict(shrink=0.4, aspect=12),
)

# CellAssign example data (follicular lymphoma / HGSC reference files)

In [None]:
import gdown
def download_data(save_path: str):
    """Download the four example files from Google Drive into ``save_path``.

    Parameters
    ----------
    save_path
        Directory that receives the files. It is created if it does not
        exist yet.

    Returns
    -------
    tuple of str
        Local paths, in this order: ``sce_follicular.h5ad``,
        ``sce_hgsc.h5ad``, ``fl_celltype.csv``, ``hgsc_celltype.csv``.
    """
    # Make sure the target directory exists before anything is written to it
    # (an empty save_path means the current directory).
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    # (Google Drive file id, local file name), in download and return order.
    sources = (
        ("10l6m2KKKioCZnQlRHomheappHh-jTFmx", "sce_follicular.h5ad"),
        ("1Pae7VEcoZbKRvtllGAEWG4SOLWSjjtCO", "sce_hgsc.h5ad"),
        ("1tJSOI9ve0i78WmszMLx2ul8F8tGycBTd", "fl_celltype.csv"),
        ("1Mk5uPdnPC4IMRnuG5N4uFvypT8hPdJ74", "hgsc_celltype.csv"),
    )

    local_paths = []
    for file_id, file_name in sources:
        target = os.path.join(save_path, file_name)
        gdown.download(
            "https://drive.google.com/uc?id=" + file_id,
            target,
            quiet=False,
        )
        local_paths.append(target)

    return tuple(local_paths)


In [None]:
# Directory (relative to the working directory) that receives the example
# files.
save_dir = 'cellassign_dir'

# Download the files and keep their local paths.
(
    sce_follicular_path,
    sce_hgsc_path,
    fl_celltype_path,
    hgsc_celltype_path,
) = download_data(save_dir)


In [None]:
# Load the downloaded follicular data set and its marker table (first CSV
# column used as index).
follicular_adata = sc.read(sce_follicular_path)
fl_celltype_markers = pd.read_csv(fl_celltype_path, index_col=0)

# Make sure cell and gene identifiers are strings and unique.
follicular_adata.obs.index = follicular_adata.obs.index.astype("str")
follicular_adata.var.index = follicular_adata.var.index.astype("str")
follicular_adata.var_names_make_unique()
follicular_adata.obs_names_make_unique()

# Display a summary of the loaded object.
follicular_adata

In [None]:
# (rows, columns) of the downloaded marker table.
fl_celltype_markers.shape

In [None]:
class enrich_:
    """Per-cluster enrichment scores for the rows of a sparse matrix.

    The matrix passed to :meth:`_fit` is treated as features x cells: its
    columns are grouped by the cluster labels given at construction, and every
    row receives one score per cluster that compares the cluster with all
    other cells.

    Attributes set by :meth:`_fit`
    ------------------------------
    sizes : number of cells per cluster.
    nnz : non-zero count per row and cluster.
    means : mean value per row and cluster.
    f_nnz : ``nnz`` divided by the cluster sizes.
    permute_labs : the shuffled labels, when ``permute=True`` was used.
    """

    def __init__(self,
                 labels_attr: np.array) -> None:
        """Store the numeric cluster labels, one per matrix column."""
        self.labels_attr = labels_attr
        self.permute_labs = None
        self.sizes = None
        self.nnz = None
        self.means = None
        self.f_nnz = None

    def _shuffle(self):
        """Store a random permutation of the labels in ``self.permute_labs``.

        Uses NumPy's global random state, so the result is only reproducible
        if the caller seeds it.
        """
        self.permute_labs = np.random.permutation(self.labels_attr)

    def _sort_col(self, arr, ordering):
        """Return sparse matrix ``arr`` with its columns reordered by ``ordering``.

        The reordering is applied to row chunks of at most about 1e8 entries
        (rows x columns) and the chunks are stacked again afterwards.
        """
        from scipy import sparse

        # At least one row per chunk, otherwise the chunk loop cannot advance
        # for very wide matrices; max(1, ...) also avoids dividing by zero for
        # a matrix without columns.
        chunksize = max(1, 100000000 // max(1, arr.shape[1]))
        chunks = [
            arr[start:start + chunksize, :][:, ordering]
            for start in range(0, arr.shape[0], chunksize)
        ]
        if not chunks:
            # No rows at all: nothing to stack.
            return arr[:, ordering]

        stacked = sparse.vstack(chunks)
        # _fit slices columns out of the result, which needs a compressed
        # format; vstack may fall back to COO for other input formats.
        if stacked.format not in ('csr', 'csc'):
            stacked = stacked.tocsr()
        return stacked

    def _fit(self, mtx, permute: bool = False):
        """Compute the enrichment score of every row for every cluster.

        Parameters
        ----------
        mtx
            SciPy sparse matrix with one column per cell (features x cells).
        permute
            If True, the labels are shuffled first, so the scores are computed
            against random groups instead of the real clusters.

        Returns
        -------
        numpy.ndarray
            Array of shape (rows of ``mtx``, number of distinct labels);
            columns follow the sorted distinct labels.

        Raises
        ------
        ValueError
            If the number of labels differs from the number of columns.

        Notes
        -----
        With a single cluster there are no "other" cells and the score is not
        defined (division by zero yields nan/inf).
        """
        import logging

        if permute:
            # Must be called on the instance: the labels live on `self`.
            self._shuffle()
            labels = self.permute_labs
            logging.info(f'permute{labels}')
        else:
            labels = self.labels_attr
        labels = np.asarray(labels).astype(float)

        if labels.shape[0] != mtx.shape[1]:
            raise ValueError(
                'expected one label per column: got {} labels for {} columns'.format(
                    labels.shape[0], mtx.shape[1]
                )
            )

        # Sort the columns by label so that every cluster is one contiguous
        # column range; `starts` holds the first column of each cluster.
        order = np.argsort(labels)
        clusters, starts = np.unique(labels[order], return_index=True)
        bounds = np.concatenate([starts, [mtx.shape[1]]])
        mtx_ = self._sort_col(mtx, order)

        n_cells = mtx_.shape[1]
        n_clusters = len(clusters)

        sizes = np.zeros(n_clusters)
        nnz = np.zeros([mtx_.shape[0], n_clusters])
        means = np.zeros([mtx_.shape[0], n_clusters])
        for i in range(n_clusters):
            block = mtx_[:, bounds[i]:bounds[i + 1]]
            nnz[:, i] = block.getnnz(axis=1)
            # np.asarray(...).ravel() works for both sparse matrices and
            # sparse arrays, unlike the matrix-only `.A` attribute.
            means[:, i] = np.asarray(block.mean(axis=1)).ravel()
            sizes[i] = block.shape[1]

        self.sizes, self.nnz, self.means = sizes, nnz, means

        # Non-zeros and means over all cells
        nnz_overall = mtx_.getnnz(axis=1)
        means_overall = np.asarray(mtx_.mean(axis=1)).ravel()

        # Scale by number of cells
        f_nnz = nnz / sizes
        f_nnz_overall = nnz_overall / n_cells

        self.f_nnz = f_nnz

        # Means and fraction non-zero values in other clusters (per cluster):
        # overall total minus the cluster's own total, over the remaining cells.
        means_other = ((means_overall * n_cells)[None].T - (means * sizes)) / (n_cells - sizes)
        f_nnz_other = ((f_nnz_overall * n_cells)[None].T - (f_nnz * sizes)) / (n_cells - sizes)

        # The small constants keep the ratios finite when a value is zero.
        enrichment = (f_nnz + 0.1) / (f_nnz_other + 0.1) * (means + 0.01) / (means_other + 0.01)

        return enrichment
