# RA - Notch Dataset Analysis

In [None]:
import sys
sys.path.append("/data/srlab/lrumker/MCSC_Project/cna-display/")

In [None]:
import sys
sys.path.append("/data/srlab/lrumker/MCSC_Project/statutils/")

In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
import cna
import pp, pf
# Apply the matplotlib style sheet located one directory above the working directory
plt.style.use('../pp.mplstyle')
# Seed NumPy's global RNG; the bootstrap draws and the plot shuffling below use np.random
np.random.seed(0) # for reproducibility

In [None]:
import statutils
import vis

## Read Data

In [None]:
# Load the Notch dataset from its .h5ad file via cna.read
notch_h5ad_path = "/data/srlab1/laurie-yakir/notch.real/notch_CNAready.h5ad"
print('Reading')
d = cna.read(notch_h5ad_path)

In [None]:
# Copy information from cell metadata to sample metadata
# (takes the per-sample mean across cells)
cell_level_columns = ['nUMI', 'nGene', 'percent_mito', 'score_notch', 'time']
d.obs_to_sample(cell_level_columns)

# Association test of the sample-level phenotype, with no batch or
# covariate information supplied
phenotype = d.samplem.pheno.values
batches = None
covariates = None
res = cna.tl._association.association(d, phenotype, batches, covariates)

In [None]:
# Sanity check: the count of kept cells should equal the number of rows in d.obs
d.uns['keptcells'].sum() == d.obs.shape[0]

### Export results for gene expression analysis

In [None]:
# Threshold on the per-cell score, taken from the association result
FDR_thresh = res.fdr_5p_t

# Cell scores per neighborhood
d.obs['ncorrs'] = res.ncorrs

# The flags are assigned as whole boolean columns. The chained form
# `d.obs[col].loc[mask] = True` writes into a temporary Series, which raises
# SettingWithCopy warnings and leaves d.obs unchanged under copy-on-write.

# Positively-associated cells
d.obs['poscells'] = d.obs['ncorrs'] > FDR_thresh

# Negatively-associated cells
d.obs['negcells'] = d.obs['ncorrs'] < -FDR_thresh

In [None]:
# Save objects for interpretation
# Each column is listed exactly once; repeating "poscells"/"negcells" in the
# selection would write duplicate columns to the CSV.
cna_export_columns = ["cell_id", "time", "score_notch", "ncorrs", "poscells", "negcells"]
dummy_df_cna = pd.DataFrame(d.obs.loc[:, cna_export_columns])
dummy_df_cna.to_csv("/data/srlab/lrumker/MCSC_Project/notch/notch_cna_res.txt")

# First two columns of the NAM neighborhood-by-PC table
dummy_df_nampcs = pd.DataFrame(d.uns['NAM_nbhdXpc'].iloc[:,0:2])
dummy_df_nampcs.to_csv("/data/srlab/lrumker/MCSC_Project/notch/notch_cna_NAM_PCs.txt")

### NAM PC1 is the dominant signal in the dataset

In [None]:
# Magnitude of each NAM singular value, plotted against its index
nam_svs = d.uns['NAM_svs']
plt.plot(np.arange(nam_svs.shape[0]), np.abs(nam_svs))

In [None]:
# Variance explained: first two entries of NAM_varexp
d.uns['NAM_varexp'][:2]

### Clusters Separated

In [None]:
# Mean of the first NAM PC column over cells whose cell_subtype is 'lining'
# NOTE(review): this cell groups by 'cell_subtype' while the figure and
# bootstrap cells group by 'cell_type' — confirm which column is intended.
nam_pc1 = d.uns['NAM_nbhdXpc'].iloc[:,0]
is_lining = d.obs['cell_subtype']=='lining'
np.mean(nam_pc1.loc[is_lining])

In [None]:
# Mean of the first NAM PC column over cells whose cell_subtype is 'sublining'
# NOTE(review): this cell groups by 'cell_subtype' while the figure and
# bootstrap cells group by 'cell_type' — confirm which column is intended.
nam_pc1 = d.uns['NAM_nbhdXpc'].iloc[:,0]
is_sublining = d.obs['cell_subtype']=='sublining'
np.mean(nam_pc1.loc[is_sublining])

In [None]:
# Two-sample t-test (unequal variances) of the first NAM PC column between
# 'lining' and 'sublining' cells
# NOTE(review): this cell groups by 'cell_subtype' while the figure and
# bootstrap cells group by 'cell_type' — confirm which column is intended.
nam_pc1 = d.uns['NAM_nbhdXpc'].iloc[:,0]
pc1_lining = nam_pc1.loc[d.obs['cell_subtype']=='lining']
pc1_sublining = nam_pc1.loc[d.obs['cell_subtype']=='sublining']
st.ttest_ind(pc1_lining, pc1_sublining, equal_var = False)

### Key Correlations

In [None]:
# NAM PC1 to Notch: Pearson is printed, Spearman is the cell's displayed value
nam_pc1 = d.uns['NAM_nbhdXpc'].iloc[:,0]
notch_score = d.obs['score_notch']
print(np.corrcoef(nam_pc1, notch_score)[0,1])
st.spearmanr(nam_pc1, notch_score)

In [None]:
# Correlation of NAM PC1 to Pseudotime: Pearson is printed, Spearman is displayed
nam_pc1 = d.uns['NAM_nbhdXpc'].iloc[:,0]
pseudotime = d.obs['time']
print(np.corrcoef(nam_pc1, pseudotime)[0,1])
st.spearmanr(nam_pc1, pseudotime)

In [None]:
# Correlation of pseudotime to notch: Pearson is printed, Spearman is displayed
notch_score = d.obs['score_notch']
pseudotime = d.obs['time']
print(np.corrcoef(notch_score, pseudotime)[0,1])
st.spearmanr(notch_score, pseudotime)

In [None]:
# Correlation of naive gene expression PC (first column of d.X) to notch:
# Pearson is printed, Spearman is displayed
notch_score = d.obs['score_notch']
expr_first_col = d.X[:,0]
print(np.corrcoef(notch_score, expr_first_col)[0,1])
st.spearmanr(notch_score, expr_first_col)

In [None]:
# Donor-level bootstrap of
#   |Spearman(notch, NAM PC1)| - |Spearman(notch, pseudotime)|
# Each replicate resamples d.N donors with replacement and pools all cells
# belonging to the drawn donors (a donor drawn k times contributes k copies).
nreps = 10000

# The positions of each donor's cells never change between replicates, so
# they are located once here instead of re-scanning d.obs per draw.
cell_donor = d.obs['donor'].to_numpy()
donor_cells = {donor: np.flatnonzero(cell_donor == donor)
               for donor in pd.unique(np.asarray(d.samplem['donor']))}
time_values = d.obs['time'].to_numpy()
nampc1_values = d.uns['NAM_nbhdXpc'].iloc[:,0].to_numpy()
notch_values = d.obs['score_notch'].to_numpy()

corr_magnitude_diff = []
for rep in np.arange(nreps):
    donors = np.random.choice(d.samplem['donor'], d.N)
    loc_cells = np.concatenate([donor_cells[donor] for donor in donors])
    bootstrap_time = time_values[loc_cells]
    bootstrap_NAMPC1 = nampc1_values[loc_cells]
    bootstrap_notch = notch_values[loc_cells]
    # spearmanr returns (correlation, p-value); only the correlation ([0])
    # belongs in the comparison.
    notch_time_corr = np.abs(st.spearmanr(bootstrap_notch, bootstrap_time)[0])
    notch_nampc1_corr = np.abs(st.spearmanr(bootstrap_notch, bootstrap_NAMPC1)[0])
    # One scalar per replicate, so len(corr_magnitude_diff) == nreps.
    corr_magnitude_diff.append(notch_nampc1_corr - notch_time_corr)

In [None]:
# P-value: fraction of entries in corr_magnitude_diff that are negative
np.mean(np.array(corr_magnitude_diff) < 0)

## Make Figure

In [None]:
fig, axs = plt.subplots(2,3, figsize=(6,3))

# Random draw order for the kept cells, so that no group is systematically on top
plot_order = np.random.choice(d.uns['keptcells'].sum(),d.uns['keptcells'].sum(),replace=False)
umap = d.obsm['X_umap'][d.uns['keptcells']][plot_order,:]
ix = np.repeat([True], d.uns['keptcells'].sum())  # all-True selector
use_cols = np.repeat('grey', d.obs.shape[0])  # not used by any panel in this cell


def _umap_panel(ax, values, title, standardize, percentile_limits, r_value=None):
    # Draw one UMAP scatter coloured by `values` on a symmetric 'seismic' scale.
    #   standardize       -- centre and scale `values` before colouring
    #   percentile_limits -- colour limit from the 10th/90th percentiles
    #                        instead of the maximum absolute value
    #   r_value           -- optional number written on the panel as "R = ..."
    c = values
    if standardize:
        c = c-np.mean(c)
        c = c/np.std(c)
    if percentile_limits:
        limit = np.max([-np.percentile(c, 10), np.percentile(c, 90)])
    else:
        limit = max(np.abs(c))
    ax.scatter(*umap[ix].T, alpha=0.5, c=c[ix], cmap='seismic',
               vmin=-limit, vmax=limit, **pp.umapprops)
    ax.set_title(title)
    if r_value is not None:
        ax.text(0.05, 0.75, '$R = {:.2f}$'.format(r_value),
                transform=ax.transAxes, fontsize=6, color="black")
    ax.axis('off')


def _cluster_panel(ax, cluster, point_color, line_color, title, r_value):
    # Scatter ncorrs against the Notch score for one cell_type, with vertical
    # lines at +/- FDR_thresh and a degree-1 polynomial fit.
    in_cluster = d.obs['cell_type']==cluster
    xs = d.obs['ncorrs'].loc[in_cluster]
    ys = d.obs['score_notch'].loc[in_cluster]
    ax.scatter(xs, ys, color = point_color, alpha = 0.6, s = 1)
    ax.axvline(x=-FDR_thresh, color = "black", lw = 0.5)
    ax.axvline(x=FDR_thresh, color = "black", lw = 0.5)
    # NOTE(review): the x data are d.obs['ncorrs'] but the axis is labelled
    # 'NAM PC1' — confirm the label is intended.
    ax.set_xlabel('NAM PC1')
    ax.set_ylabel('Notch Activation')
    ax.set_title(title)
    trend = np.poly1d(np.polyfit(xs, ys, 1))
    ax.plot(xs, trend(xs), color=line_color, lw=1)
    ax.text(0.21, 0.89, '$R = {:.2f}$'.format(r_value),
            transform=ax.transAxes, fontsize=6, color="black")
    ax.set_xticks([-0.8, 0.8])
    ax.set_yticks([0,60])


# The R values written on the panels are hard-coded constants; they are not
# computed in this cell.
_umap_panel(axs[0,1], -d.uns['NAM_nbhdXpc'].PC1[ix][plot_order], 'NAM PC1',
            standardize=False, percentile_limits=False, r_value=0.56)
_umap_panel(axs[1,0], d.obs['time'][ix][plot_order], 'Pseudotime',
            standardize=True, percentile_limits=False, r_value=0.43)
_umap_panel(axs[1,1], d.X[:,0][ix][plot_order], 'Gene Expression PC1',
            standardize=True, percentile_limits=True, r_value=0.22)
_umap_panel(axs[0,0], d.obs['score_notch'][ix][plot_order], 'Notch Activation',
            standardize=True, percentile_limits=True)

_cluster_panel(axs[0,2], "lining", "C4", 'rebeccapurple', 'Lining Cluster', 0.36)
_cluster_panel(axs[1,2], "sublining", "C9", 'darkcyan', 'Sublining Cluster', 0.33)

plt.tight_layout()
plt.savefig('../_figs/rawmainfig.notch.pdf')

### Correlation to Notch within Clusters

In [None]:
# Boolean selector for cells whose cell_type is "sublining"
mask_sublining = d.obs['cell_type'].eq("sublining")

In [None]:
# Absolute Pearson correlation matrix of Notch score vs. the first NAM PC
# column, over cells that are NOT in the sublining mask
notch_rest = d.obs['score_notch'].loc[~mask_sublining]
nam_pc1_rest = d.uns['NAM_nbhdXpc'].iloc[:,0].loc[~mask_sublining]
np.abs(np.corrcoef(notch_rest, nam_pc1_rest))

In [None]:
# Absolute Pearson correlation matrix of Notch score vs. the first NAM PC
# column, over cells in the sublining mask
notch_sub = d.obs['score_notch'].loc[mask_sublining]
nam_pc1_sub = d.uns['NAM_nbhdXpc'].iloc[:,0].loc[mask_sublining]
np.abs(np.corrcoef(notch_sub, nam_pc1_sub))

In [None]:
# Donor-level bootstrap of the Pearson correlation between Notch score and
# (sign-flipped) NAM PC1, computed separately within the sublining cells and
# within the remaining (non-sublining) cells.
nreps = 1000

# The positions of each donor's cells never change between replicates, so
# they are located once here instead of re-scanning d.obs per draw.
cell_donor = d.obs['donor'].to_numpy()
donor_cells = {donor: np.flatnonzero(cell_donor == donor)
               for donor in pd.unique(np.asarray(d.samplem['donor']))}
nampc1_values = d.uns['NAM_nbhdXpc'].iloc[:,0].to_numpy()
notch_values = d.obs['score_notch'].to_numpy()
cell_is_sublining = (d.obs['cell_type']=='sublining').to_numpy()

sublining_cors = []
lining_cors = []
for rep in np.arange(nreps):
    print(rep)
    donors = np.random.choice(d.samplem['donor'], d.N)
    loc_cells = np.concatenate([donor_cells[donor] for donor in donors])
    bootstrap_notch = notch_values[loc_cells]
    bootstrap_NAMPC1 = nampc1_values[loc_cells]
    mask_sublining_bootstrapped = cell_is_sublining[loc_cells]

    sublining_cors.append(np.corrcoef(bootstrap_notch[mask_sublining_bootstrapped],
                                -bootstrap_NAMPC1[mask_sublining_bootstrapped])[0,1])
    # The lining estimate must use the complement of the sublining mask;
    # reusing the sublining mask here would just duplicate sublining_cors.
    lining_cors.append(np.corrcoef(bootstrap_notch[~mask_sublining_bootstrapped],
                                -bootstrap_NAMPC1[~mask_sublining_bootstrapped])[0,1])

In [None]:
# Fraction of the nreps bootstrap replicates in which the lining correlation is <= 0
lining_nonpositive = np.array(lining_cors) <= 0
np.sum(lining_nonpositive)/nreps

In [None]:
# Fraction of the nreps bootstrap replicates in which the sublining correlation is <= 0
sublining_nonpositive = np.array(sublining_cors) <= 0
np.sum(sublining_nonpositive)/nreps

In [None]:
# With nreps replicates, a fraction of zero above corresponds to a p-value below...
1 / nreps

### Fraction of associated populations in each cluster

In [None]:
# Threshold from the association result, used below to select cells by their res.ncorrs value
FDR_thresh = res.fdr_5p_t

In [None]:
# Share of each cell_type among cells whose ncorrs exceeds the FDR threshold
above_thresh = res.ncorrs>FDR_thresh
counts = d.obs['cell_type'].loc[above_thresh].value_counts()
counts/np.sum(counts)

In [None]:
# Share of each cell_type among cells whose ncorrs is below the negative FDR threshold
below_thresh = res.ncorrs<-FDR_thresh
counts = d.obs['cell_type'].loc[below_thresh].value_counts()
counts/np.sum(counts)