In [None]:
import os
import pdb
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from matplotlib import pyplot as plt
import decoupler as dc
import seaborn as sns
from scipy import stats
from collections import Counter

from sklearn.mixture import BayesianGaussianMixture as GMM
from scipy.stats import norm
import scipy.stats as stats
import matplotlib.font_manager
import sklearn.metrics
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Notebook-wide matplotlib defaults: SVG font handling, Arial fonts and the
# default figure size.
plt.rcParams.update({
    'svg.fonttype': 'none',
    'font.sans-serif': "Arial",
    'font.family': 'Arial',
    'figure.figsize': (3,4),
})

In [None]:
sc.settings.verbosity = 0             # verbosity: errors (0), warnings (1), info (2), hints (3)
# NOTE(review): both calls below configure scanpy figure defaults. The second call
# passes different dpi / dpi_save / format values than the first, so it presumably
# overrides the svg/transparent settings requested just above -- confirm which
# configuration is actually intended and drop the other.
sc.settings.set_figure_params(dpi=80, dpi_save=300, facecolor='white', format="svg", transparent=True)
sc.set_figure_params(scanpy=True, dpi=100, dpi_save=150, fontsize=10, format='png')

In [None]:
# Directory prefix (with trailing slash) prepended to .h5ad file names when loading.
writeDir="write/"

In [None]:
# Mouse/human homology table and lookup dictionaries in both directions.
homology = pd.read_csv("../data/hgncHM_121.csv")
mouseGenes = homology["mouse"].to_numpy(copy=True)
humanGenes = homology["human"].to_numpy(copy=True)
m2h = {mouse: human for mouse, human in zip(mouseGenes, humanGenes)}
h2m = {human: mouse for human, mouse in zip(humanGenes, mouseGenes)}
homology

In [None]:
def convertDf(inDf, convDict):
    """Translate every column of a gene-signature table through ``convDict``.

    Parameters
    ----------
    inDf : pandas.DataFrame
        One gene signature per column.
    convDict : mapping
        Maps a gene name to its translated name.

    Returns
    -------
    pandas.DataFrame
        Same column names as ``inDf``; each column holds the translated genes
        in their original order. Genes that are not keys of ``convDict`` are
        dropped, so columns can end up shorter than the input.
    """
    translated = [
        [convDict[gene] for gene in inDf[column] if gene in convDict.keys()]
        for column in inDf
    ]
    return pd.DataFrame(translated, index=inDf.columns).T

In [None]:
# EMT signature table and its translation through the human->mouse dictionary.
sigFile = "data/geneSigs/EMT.csv"
hEMT = pd.read_csv(filepath_or_buffer=sigFile)
mEMT = convertDf(inDf=hEMT, convDict=h2m)

In [None]:
# First 500 entries of the "gene" column of the KAC table (first CSV line skipped),
# stored as a one-column frame named "KAC", plus the h2m translation.
sigFile = "../mhCompTiss/data/LUAD/geneSigs/KAC.csv"
kacTable = pd.read_csv(sigFile, skiprows=1)
hKac = pd.DataFrame(kacTable["gene"].values[:500], columns=["KAC"])
mKac = convertDf(hKac, h2m)

In [None]:
# First 500 rows of the skin signature table, plus its m2h translation.
sigFile = "data/geneSigs/skinLowerSpike.csv"
mSkinGS = pd.read_csv(sigFile).iloc[:500]
hSkinGS = convertDf(mSkinGS, m2h)

In [None]:
# Only the 'coreHRC' column of the colon signature sheet is kept (first 500 rows).
sigFile = "data/geneSigs/colonHRC.xlsx"
colonSheet = pd.read_excel(sigFile, sheet_name=0, skiprows=1)[:500]
hColonGS = colonSheet[['coreHRC']]
mColonGS = convertDf(hColonGS, h2m)

In [None]:
# Read the first 500 "Gene" entries from five sheets of the prostate workbook,
# one signature per sheet; only the ADENO signature is kept afterwards.
sigFile = "data/geneSigs/prostateL1geneSig.xlsx"
geneSignName = ["L1","L2","B","ADENO","NEPC"]
geneLists = [
    pd.read_excel(sigFile, sheet_name=sheet)["Gene"][:500]
    for sheet in [0,2,4,6,8]
]
hProstGS = pd.DataFrame(geneLists, index=geneSignName).T[['ADENO']]
mProstGS = convertDf(hProstGS, h2m)

In [None]:
# First 200 entries of the "gene" column of the Burdziak table.
sigFile = "data/geneSigs/rengenGeneLists/PancreasPlasticBurdziak.csv"
sigGenes = pd.read_csv(sigFile, skiprows=1)
Fplas = sigGenes.loc[:, "gene"].values[:200]
hPancrBGS = pd.DataFrame(Fplas, columns=["Burdziak_Plasticity"])
mPancrBGS = convertDf(inDf=hPancrBGS, convDict=h2m)

In [None]:
# Keep only the "scBasal" column of the PDAC signature table.
hPancrGS = pd.read_csv("../mhCompTiss/data/PDAC/basClaSigGenes.csv")[["scBasal"]]
mPancrGS = convertDf(hPancrGS, h2m)

In [None]:
# NOTE(review): `mMultiGS` is not assigned in any earlier cell visible in this
# notebook, so this cell presumably fails with NameError on a fresh kernel --
# TODO restore the cell that loads it.
mLungGS = mMultiGS[["Stress"]]
hLungGS = convertDf(mLungGS, m2h)
mStressGS = mMultiGS[["Stress"]]
# Flat array of the non-missing entries of the "Stress" column.
stress = mStressGS.dropna().values.flatten()
# Built from mLungGS, which selects the same "Stress" column as mStressGS.
hStressGS = convertDf(mLungGS, m2h)

In [None]:
# "hpcs" column of the human LUAD signature table, as an array.
sigFile = "../mhCompTiss/data/LUAD/humanLUADgeneSig.csv"
sigGenes = pd.read_csv(sigFile)
hhpcs = sigGenes.loc[:, "hpcs"].values

In [None]:
# First 250 entries of the "HPCS" column; exposed under two names (mhpcs, hpcs).
sigFile = "../mhCompTiss/data/LUAD/simpLUADbig.csv"
sigGenes = pd.read_csv(sigFile)
hpcsColumn = sigGenes["HPCS"]
mhpcs = hpcsColumn.values[:250]
hpcs = hpcsColumn.values[:250]

In [None]:
# Two intestinal gene lists (first CSV line skipped in both files),
# combined column-wise and translated through m2h.
sigFile = "data/geneSigs/rengenGeneLists/RengenIntestMorralCl14.csv"
sigGenes = pd.read_csv(sigFile, skiprows=1)
revSC = sigGenes["genes"]

sigFile = "data/geneSigs/rengenGeneLists/RegenIntesAyyaz.csv"
sigGenes = pd.read_csv(sigFile, skiprows=1)
# The trailing space in the column name is part of the file's header.
ssc2 = sigGenes["Revival Signature_Gene "].values

intestLists = [revSC, ssc2]
mIntestGS = pd.DataFrame(intestLists, index=["revSC","ssc2"]).T
hIntestGS = convertDf(mIntestGS, m2h)

In [None]:
# Last 200 entries of the "gene" column of the Burdziak table.
sigFile = "data/geneSigs/rengenGeneLists/PancreasPlasticBurdziak.csv"
sigGenes = pd.read_csv(sigFile, skiprows=1)
plas = sigGenes.loc[:, "gene"].values[-200:]
plasRows = [plas]
hPlasGS = pd.DataFrame(plasRows, index=["BurdziakPlastic"]).T
mPlasGS = convertDf(inDf=hPlasGS, convDict=h2m)

In [None]:
# Six marker genes; the second list is the upper-cased spelling of the first.
# Both names are rebound here (they held the homology columns before).
mouseGenes = ["Plaur", "Procr", "F2r", "Slc4a11", "Itga2", "Cldn4"]
humanGenes = [symbol.upper() for symbol in mouseGenes]

In [None]:
# Display whatever table `sigGenes` currently holds.
sigGenes

In [None]:
# Join all mouse signature tables side by side on the row index and export them.
hpcsColumnDf = pd.DataFrame(mhpcs, columns=['HPCS'])
mouseGS = mProstGS
for extraSig in (mColonGS, mSkinGS, mPancrGS, mPancrBGS, mEMT, mLungGS, mKac, mIntestGS, hpcsColumnDf):
    mouseGS = mouseGS.join(extraSig)
mouseGS.to_csv("figures/cellSigs.csv")
mouseGS

In [None]:
# Human dataset files.
hAdataFiles = [
    "Pelka2021_Colorectal.h5ad",
    'luadBischoff.h5ad', 'luadDost.h5ad', 'luadKim.h5ad', 'luadMDA.h5ad',
    "Qian2020_Ovarian.h5ad",
    "Dong2020_Prostate.h5ad",
    "Steele2020_Pancreas.h5ad",
    "Ji2020_Skin.h5ad",
]
# Mouse dataset files.
mAdataFiles = [
    "mouseColon11302.h5ad",
    "mouse_Prostate.h5ad",
    "mouse_Pancreas_Burdziak.h5ad",
    "mouse_Pancreas_TT.h5ad",
    "mouse_Skin_GSE261766_Pap.h5ad",
]

In [None]:
# Short display names, in the same order as hAdataFiles / mAdataFiles.
mColNames = ['CRC', 'PCa', 'PDAC_B', 'PDAC_T', 'SCC']
hColNames = ['CRC'] + [f'LUAD_{suffix}' for suffix in 'BDKM'] + ['Ovar', 'PCa', 'PDAC', 'SCC']

# Two references to the mouse names followed by two to the human names.
colNames = [mColNames] * 2 + [hColNames] * 2

In [None]:
# Display the pairing of human short names with their files.
{shortName: fileName for shortName, fileName in zip(hColNames, hAdataFiles)}

In [None]:
# Display the pairing of mouse short names with their files.
{shortName: fileName for shortName, fileName in zip(mColNames, mAdataFiles)}

In [None]:
# Per-dataset configuration: the .h5ad file(s), the signature table to score
# ("gs") and the HPCS gene list to use ("hpcs").
infoDict = {
    # datasets paired with the mouse HPCS list
    "mColon": dict(file=["mouseColon11302.h5ad"], gs=mColonGS, hpcs=mhpcs),
    "mProstate": dict(file=["mouse_Prostate.h5ad"], gs=mProstGS, hpcs=mhpcs),
    "mPancreas": dict(file=["mouse_Pancreas_TT.h5ad"], gs=mPancrGS, hpcs=mhpcs),
    "mPancreasB": dict(file=["mouse_Pancreas_Burdziak.h5ad"], gs=mPancrBGS, hpcs=mhpcs),
    "mSkin": dict(file=["mouse_Skin_GSE261766_Pap.h5ad"], gs=mSkinGS, hpcs=mhpcs),

    # datasets paired with the human HPCS list
    "hColon": dict(file=["Pelka2021_Colorectal.h5ad"], gs=hColonGS, hpcs=hhpcs),
    "hSkin": dict(file=["Ji2020_Skin.h5ad"], gs=hSkinGS, hpcs=hhpcs),
    "hProstate": dict(file=["Dong2020_Prostate.h5ad"], gs=hProstGS, hpcs=hhpcs),
    "hPancreas": dict(file=["Steele2020_Pancreas.h5ad"], gs=hPancrGS, hpcs=hhpcs),

    "hOvar": dict(file=["Qian2020_Ovarian.h5ad"], gs=hStressGS, hpcs=hhpcs),

    # the four LUAD cohorts all use the KAC signature
    "hLungM": dict(file=["luadMDA.h5ad"], gs=hKac, hpcs=hhpcs),
    "hLungK": dict(file=["luadKim.h5ad"], gs=hKac, hpcs=hhpcs),
    "hLungB": dict(file=["luadBischoff.h5ad"], gs=hKac, hpcs=hhpcs),
    "hLungD": dict(file=["luadDost.h5ad"], gs=hKac, hpcs=hhpcs),
}

In [None]:
def plotCorr(name, corrScorePear, corrScoreSign, corrName="HPCS", figsize=(5,5),save=False):
    """Draw a correlation heatmap annotated with significance markers.

    Parameters
    ----------
    name : str
        Prefix of the output file name when ``save`` is set.
    corrScorePear : 2-D table
        Values handed to ``sns.heatmap`` (colour scale fixed to [-1, 1]).
    corrScoreSign : 2-D array
        P-values laid out like ``corrScorePear``; cells below 0.001 get a
        filled dot, cells below 0.05 an open 'o'.
    corrName : str
        Name inserted into the title and the output file name.
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    save : str or False
        Directory to write ``{name}_pearCorr{corrName}vO.svg`` into; nothing is
        written when falsy.
    """
    fig, ax = plt.subplots(figsize=figsize)
    ax.grid(False)
    sns.heatmap(
        corrScorePear,
        cmap="bwr",
        xticklabels=True,
        ax=ax,
        vmax=1,
        vmin=-1,
        square=True,
        linewidths=0.1,
        cbar=False,
    )

    # Overlay one marker per cell, centred on the heatmap square.
    for (row, col), pval in np.ndenumerate(corrScoreSign):
        if pval < 0.001:
            marker = "●"
        elif pval < 0.05:
            marker = 'o'
        else:
            continue
        ax.text(col + 0.5, row + 0.5, marker, color='black', ha='center', va='center', fontsize=12)

    ax.set_title(f"Pearson Correlation of {corrName} score with other Cell State Scores")
    ax.tick_params(axis='y', labelrotation=0)
    fig.text(1, 0.4, f"P-Value\no < 0.05\n● < 0.001")
    fig.tight_layout()
    plt.show()
    if save:
        fig.savefig(f"{save}/{name}_pearCorr{corrName}vO.svg")

In [None]:
def plotHeatmap(data, name, title, xlabel, ylabel, save=False):
    """Plot each column of a 2-row matrix as its own 2x1 heatmap with a shared colour scale.

    Parameters
    ----------
    data : numpy.ndarray
        Array with two rows; column ``i`` is drawn in the ``i``-th panel.
    name : str
        Prefix of the output file name when ``save`` is set.
    title : str
        Used in the figure title and the output file name.
    xlabel : sequence
        X tick labels applied to every panel.
    ylabel : sequence
        ``ylabel[i]`` labels the first row of panel ``i``; the second row is
        always labelled "Other".
    save : str or False
        Directory to write ``{name}_{title}avgExpVo.svg`` into; nothing is
        written when falsy.
    """
    # `cm` and `colors` are not imported at the top of the notebook, so the
    # original body raised NameError when building the colour bar.
    from matplotlib import cm, colors

    numPanels = len(data[0])
    # One extra, blank axis is reserved at the right to host the colour bar.
    fig, axs = plt.subplots(nrows=1, ncols=numPanels+1, figsize=(6,2))
    plt.grid(False)
    vmax, vmin = np.max(data), np.min(data)
    for i,ax in enumerate(axs):
        if(i >= numPanels):
            ax.axis("off")
            break
        sns.heatmap(data[:,i].reshape(2,1), cmap="bwr", xticklabels=True, ax=ax, square=True, linewidths=0.1, vmax=vmax, vmin=vmin, cbar=False)
        ax.set_xticklabels(xlabel)
        ax.set_yticklabels([ylabel[i],"Other"])

    colorNorm = colors.Normalize(vmin, vmax)
    fig.suptitle(f"{title} Expression")
    # The colour bar is attached to the last (blank) axis.
    fig.colorbar(cm.ScalarMappable(norm=colorNorm, cmap=plt.colormaps["bwr"]), ax=axs[-1])
    fig.tight_layout()
    plt.show()
    if(save):
        fig.savefig(f"{save}/{name}_{title}avgExpVo.svg")

In [None]:
def plotBoxPlot(dataDF, label, geneExp, name, save=False):
    """Violin plot of a score for gene-positive vs gene-negative cells with a t-test bracket.

    Parameters
    ----------
    dataDF : object with an ``.obs`` DataFrame, or a DataFrame
        Source of the ``label`` column. The original body ignored this
        argument and always read the notebook-global ``adata``; that global is
        now only a fallback when ``dataDF`` does not provide the column.
    label : str
        Name of the score column to compare.
    geneExp : boolean array
        Mask selecting the "positive" cells; its negation selects the rest.
    name : str
        Plot title and prefix of the output file name.
    save : str or False
        Directory to write ``{name}{label}_boxSlc4a11.svg`` into; nothing is
        written when falsy.
    """
    obs = getattr(dataDF, "obs", dataDF)
    if not isinstance(obs, pd.DataFrame) or label not in obs.columns:
        # Legacy behaviour: fall back to the notebook-level `adata`.
        obs = adata.obs

    scoreValues = obs[label].values
    slcPos = scoreValues[geneExp]
    slcNeg = scoreValues[np.logical_not(geneExp)]

    fig, ax = plt.subplots()
    ax.grid(False)
    ax.violinplot(slcPos, positions=[1], showmeans=True, widths=0.6)
    ax.violinplot(slcNeg, positions=[2], showmeans=True, widths=0.6)

    # Two-sample t-test between the groups.
    t_stat, p_val = stats.ttest_ind(slcPos, slcNeg)

    # Bracket above the taller group, with the p-value and the star code.
    x1, x2 = 1, 2
    y, h, col = max(np.max(slcPos), np.max(slcNeg)), 0.1, 'k'
    ax.plot([x1, x1, x2, x2], [y, y+h, y+h, y], lw=1.5, c=col)
    ax.text((x1+x2)*.5, y+h-0.1, f'{p_val:.3e}', ha='center', va='bottom', color=col)

    if p_val < 0.001:
        stars = '***'
    elif p_val < 0.01:
        stars = '**'
    elif p_val < 0.05:
        stars = '*'
    else:
        stars = 'ns'  # not significant

    ax.text((x1+x2)*.5, y+h, stars, ha='center', va='bottom', color=col)

    ax.set_xticks([1, 2])
    ax.set_xticklabels(['Slc4a11+', 'Slc4a11-'])

    ax.set_title(name)
    ax.set_ylabel(f'{label} Score')

    plt.show()
    if(save):
        fig.savefig(f"{save}/{name}{label}_boxSlc4a11.svg")
    

In [None]:
def scoreAndLabel(adata, sigGenes, labelOfSigGenes, ogLabel="leiden",translate = False, toGenes=None, fromGenes=None):
    """Score every signature of ``sigGenes`` on ``adata`` and relabel via ``getNewLabels``.

    Scores are written by ``scoreGeneSig`` under ``"<column>Score"``. The
    categories of ``adata.obs[ogLabel]``, the score names and a
    score-name -> label mapping are then passed to ``getNewLabels``.

    Returns
    -------
    tuple
        ``(scoreNames, newBClabel, ogLabelScoreMe)`` where the last two items
        are whatever ``getNewLabels`` returns.
    """
    scoreGeneSig(adata, sigGenes, translate=translate, toGenes=toGenes, fromGenes=fromGenes)
    scoreNames = [f"{column}Score" for column in sigGenes.columns]
    clusterCategories = adata.obs[ogLabel].cat.categories
    scoreToLabel = dict(zip(scoreNames, labelOfSigGenes))
    newBClabel, ogLabelScoreMe = getNewLabels(adata, clusterCategories, scoreNames, scoreToLabel)
    return scoreNames, newBClabel, ogLabelScoreMe

def scoreGeneSig(adata, geneSig, translate = False, toGenes=None, fromGenes=None):
    """Score every column of ``geneSig`` on ``adata`` with ``sc.tl.score_genes``.

    Parameters
    ----------
    adata : AnnData
        Object the scores are written to, under ``"<column>Score"``.
    geneSig : pandas.DataFrame
        One gene signature per column; missing entries are dropped.
    translate : bool
        When true, each gene found in ``fromGenes`` is replaced by the entry of
        ``toGenes`` at the position of its first occurrence; genes without a
        match are kept unchanged.
    toGenes, fromGenes : sequence, optional
        Parallel sequences used for the translation.
    """
    lookup = {}
    if translate and fromGenes is not None and toGenes is not None:
        # setdefault keeps the first occurrence, as the original first-match scan did.
        for source, target in zip(fromGenes, toGenes):
            lookup.setdefault(source, target)

    for j,sigName in enumerate(geneSig.columns):
        clustGenes = geneSig.iloc[:,j].dropna()
        if translate:
            # Build a fresh Series instead of assigning `clustGenes[i] = ...`:
            # after dropna() the index labels can have gaps, so label-based
            # assignment with a positional counter hit the wrong rows (or
            # appended new ones) and wrote into a slice of `geneSig`.
            clustGenes = pd.Series([lookup.get(gene, gene) for gene in clustGenes], dtype=object)
        sc.tl.score_genes(adata, clustGenes, score_name=f"{sigName}Score")

In [None]:
# TODO: alphabetize. Keys of infoDict to process, in processing order.
infoLabels = (
    ['mColon', 'mSkin', 'mProstate', 'mPancreas', 'mPancreasB']
    + ['hColon', 'hSkin', 'hProstate', 'hPancreas']
    + ['hLungM', 'hLungK', 'hLungB', 'hLungD']
    + ["hOvar"]
)

In [None]:
def getCorrDataGS(adataFiles, dfGS, hpcs=False, gene=False):
    """Correlate a reference score with every signature score, per dataset.

    For each file in ``adataFiles`` (read from ``write/``) every column of
    ``dfGS`` is scored with ``sc.tl.score_genes`` and Pearson-correlated with a
    reference vector:

    * ``gene`` given: the expression of that gene in ``adata.raw``;
    * otherwise: ``adata.obs["HPCS"]``, which is computed here when ``hpcs`` is
      a numpy array and must already exist in the file otherwise.

    A dataset is treated as human when more than 100 of its ``var_names`` are
    fully upper case; gene names are then upper-cased before scoring.

    Parameters
    ----------
    adataFiles : sequence of str
        File names of the datasets.
    dfGS : pandas.DataFrame
        One gene signature per column.
    hpcs : numpy.ndarray or False
        Gene list for the HPCS reference score.
    gene : str or False
        Single gene to use as reference instead of the HPCS score.

    Returns
    -------
    tuple
        ``(corrScorePear, corrScoreSign)``: a DataFrame of Pearson r
        (signatures x datasets) and an array of the matching p-values.

    Raises
    ------
    ValueError
        If ``gene`` does not match exactly one variable of a dataset.
    """
    corrScore = np.zeros((2,len(adataFiles),len(dfGS.columns)))

    for j,adataFile in enumerate(adataFiles):
        print(adataFile)
        adata = sc.read_h5ad(f"write/{adataFile}")
        human = sum([g == g.upper() for g in adata.var_names])>100

        # Per-file names are derived from the arguments without rebinding them;
        # the original overwrote `gene`/`hpcs`, so the casing chosen for one
        # file leaked into all following files.
        geneScore = None
        if(gene):
            geneName = gene.upper() if human else gene
            rdata = adata.raw.to_adata()
            geneX = rdata[:,rdata.var_names==geneName].X
            if(not isinstance(geneX, np.ndarray)):
                geneX = geneX.todense()
            geneScore = np.array(geneX).flatten()
            if(geneScore.size != adata.n_obs):
                raise ValueError(f"gene {geneName!r} does not match exactly one variable in {adataFile}")
        elif(isinstance(hpcs, np.ndarray)):
            if(human):
                hpcsGenes = np.array([str(g).upper() for g in hpcs])
            else:
                hpcsGenes = np.array([g[0]+g[1:].lower() for g in hpcs])
            sc.tl.score_genes(adata, hpcsGenes, score_name="HPCS")

        for gs in dfGS:
            gsList = dfGS[gs].dropna().values
            if(human):
                gsList = [str(g).upper() for g in gsList]
            sc.tl.score_genes(adata, gsList, score_name=gs)

        refScore = geneScore if gene else adata.obs["HPCS"].values
        for i,gs in enumerate(dfGS):
            corrScore[0,j,i], corrScore[1,j,i] = stats.pearsonr(refScore, adata.obs[gs].values)

    adataNames = [adataFile.split(".")[0] for adataFile in adataFiles]

    corrScorePear = pd.DataFrame(corrScore[0,:,:].T, columns = adataNames, index=dfGS.columns)
    corrScoreSign = corrScore[1,:,:].T
    return(corrScorePear, corrScoreSign)

In [None]:
def gmmScoreGeneSig(scoreMat, meansInit=[[0.1],[0.4]],plotLen = 3, show=False):
    """Classify cells by thresholding each score column with a two-component mixture.

    NOTE(review): a second ``gmmScoreGeneSig`` is defined later in this
    notebook and replaces this one once its cell has run.

    For every column a ``BayesianGaussianMixture`` with two components is
    fitted. The threshold is the first point of a 0.01-step grid (from the
    smaller component mean to the maximum score) that the model assigns to the
    same component as the last grid point; if the whole grid falls into one
    component the maximum score is used, so no cell passes.

    Parameters
    ----------
    scoreMat : pandas.DataFrame
        Cells x scores; column names are expected to end in a 5-character
        suffix, which is stripped to form the class name.
    meansInit : list
        Unused.
    plotLen : int
        Number of diagnostic panels per row when ``show`` is true.
    show : bool
        Plot the histogram, fitted components and threshold for each score.

    Returns
    -------
    numpy.ndarray
        Object array with one class per cell: the single score above its
        threshold, the highest raw score among several above threshold, or
        "Negative".
    """
    scoreNames = scoreMat.columns
    numScores = len(scoreNames)
    if(show):
        # squeeze=False keeps `axs` two-dimensional even with a single row,
        # which the `axs[x,y]` indexing below requires.
        fig, axs = plt.subplots((numScores//plotLen)+1,plotLen, squeeze=False)
        plt.rcParams["figure.figsize"] = (15,5)

    dfScoreBoundry = pd.DataFrame(np.zeros(numScores),scoreNames, columns=["boundry"])
    gmm = GMM(n_components = 2, random_state=10)

    for i, scoreName in enumerate(scoreNames):
        scoreCount = np.array(scoreMat[scoreName]).reshape(-1, 1)
        fitGMM = gmm.fit(scoreCount)
        mean = fitGMM.means_
        covs  = fitGMM.covariances_
        weights = fitGMM.weights_
        binEx = np.arange(min(min(mean),max(mean))[0],max(scoreCount)[0],0.01).reshape(-1,1)
        fitGmmBound = fitGMM.predict(binEx)
        furtherBound = fitGmmBound[-1]
        if (len(np.unique(fitGmmBound)) == 2):
            scoreBoundry = binEx[np.where(fitGmmBound == furtherBound)[0][0]][0]
        else:
            scoreBoundry = max(scoreCount)

        dfScoreBoundry.loc[scoreName] = scoreBoundry

        if(show):
            x_axis = np.arange(min(scoreCount), max(scoreCount), 0.01)
            y_axis0 = norm.pdf(x_axis, float(mean[0][0]), np.sqrt(float(covs[0][0][0])))*weights[0] # 1st gaussian
            y_axis1 = norm.pdf(x_axis, float(mean[1][0]), np.sqrt(float(covs[1][0][0])))*weights[1] # 2nd gaussian

            x,y = i//plotLen, i%plotLen
            axs[x,y].set_title(scoreName)
            axs[x,y].axvline(scoreBoundry, c='C2', linestyle='dashed', linewidth=1)  #green
            axs[x,y].hist(scoreCount, density=True, color='black', bins=100)
            axs[x,y].plot(x_axis, y_axis0, lw=3, c='C6')                            #pink
            axs[x,y].plot(x_axis, y_axis1, lw=3, c='C1')                            #orange
            axs[x,y].plot(x_axis, y_axis0+y_axis1, lw=3, c='C0', ls=':')            #dotted blue

    if(show):
        plt.tight_layout(pad=1.0)
        plt.show()

    scoreIDs = scoreMat.copy()
    scoreID = np.array(scoreNames)

    # 1.0 where the cell's score exceeds the column threshold, else 0.0.
    for scoreName in scoreNames:
        scoreIDs.loc[:,scoreName] = (scoreMat.loc[:,scoreName] > dfScoreBoundry.loc[scoreName].values[0]).astype("float")

    classification = np.empty(len(scoreMat), dtype="object")
    i = 0
    for cellBar, scoreBool in scoreIDs.iterrows():
        scoreBool = scoreBool.astype("bool")
        numscorees = sum(scoreBool)
        if (numscorees == 1):
            classif = scoreID[scoreBool.values][0][:-5]
        elif (numscorees > 1):
            # Tie-break on the scores that were passed in. The original read
            # the notebook-global `adata.obs` here, which silently depended on
            # whichever dataset happened to be loaded.
            maxTrue = np.argmax(scoreMat.loc[cellBar,scoreID[scoreBool]])
            classif = scoreID[scoreBool][maxTrue][:-5]
        else:
            classif = "Negative"
        classification[i] = classif
        i = i + 1

    return(classification)

In [None]:
# Output directory passed as `save=` to the plotting helpers.
saveDir = "figures/"

In [None]:
# Genes of interest (mouse casing; upper-cased per dataset for human data).
# Fixes relative to the previous version of this cell:
#   * "Plaur"."Plau" was a syntax error (missing comma);
#   * the result arrays were sized from `genes` before it was defined, and
#     `perScores` only had room for two genes;
#   * the final table is labelled with `inGenes` instead of whatever casing the
#     last processed dataset left in `genes`.
inGenes = ["Slc4a11","Plaur","Plau"]
numRows = len(infoLabels)*3          # three score rows per dataset
perScores = np.zeros((2,numRows,len(inGenes)))      # [0]=Pearson r, [1]=p-value
avgScores = np.zeros((numRows,len(inGenes)*2))      # mean score in gene+ / gene- cells
pvalScores = np.zeros((numRows,len(inGenes)))       # t-test p-value gene+ vs gene-

datasetNames = []
for ki, key in enumerate(infoLabels):
    genes = inGenes
    print(key)
    adataFiles = infoDict[key]["file"]
    dfGS = infoDict[key]["gs"]
    hpcsList = infoDict[key]["hpcs"]
    datasetNames.append(f"{key}-{dfGS.columns[0]}")
    datasetNames.append(f"{key}-HPCS")
    datasetNames.append(f"{key}-stress")

    adata = sc.read_h5ad(f"write/{adataFiles[0]}")
    human=sum([g==g.upper() for g in adata.var_names])>100
    rdata = adata.raw.to_adata()

    gSig = dfGS.columns[0]

    # NOTE(review): the HPCS score below uses the global `hpcs` list, while the
    # "HPCS" column joined into dfGS further down uses `hpcsList` from infoDict
    # -- confirm that this difference is intended.
    if(human):
        sc.tl.score_genes(adata, [str(gene).upper() for gene in hpcs], score_name="HPCS")
        sc.tl.score_genes(adata, [str(gene).upper() for gene in stress], score_name="Stress")
        sc.tl.score_genes(adata, [str(gene).upper() for gene in dfGS.iloc[:,0].values], score_name=gSig)
        genes = [g.upper() for g in genes]
    else:
        sc.tl.score_genes(adata, hpcs, score_name="HPCS")
        sc.tl.score_genes(adata, stress, score_name="Stress")
        sc.tl.score_genes(adata, dfGS.iloc[:,0].values, score_name=gSig)

    if("HPCS" not in dfGS.columns):
        dfGS = dfGS.join(pd.DataFrame(hpcsList, columns=["HPCS"]))
    if("stress" not in dfGS.columns):
        dfGS = dfGS.join(pd.DataFrame(stress, columns=["stress"]))

    for gi, gene in enumerate(genes):
        corrOut = getCorrDataGS(adataFiles, dfGS, hpcs=False, gene=gene)

        # The first three rows of the result are stored for the single file.
        for row in range(3):
            perScores[0,ki*3+row,gi] = corrOut[0].values[row][0]
            perScores[1,ki*3+row,gi] = corrOut[1][row][0]

        geneX = rdata[:,rdata.var_names==gene].X
        if(not isinstance(geneX, np.ndarray)):
            geneX = geneX.todense()
        geneExp = np.array(geneX).flatten() > 0

        for gsi,gs in enumerate([gSig, "HPCS", "Stress"]):

            geneExpPos = adata.obs[gs][geneExp]
            geneExpNeg = adata.obs[gs][np.logical_not(geneExp)]

            avgScores[ki*3+gsi,gi*2] = np.mean(geneExpPos)
            avgScores[ki*3+gsi,gi*2+1] = np.mean(geneExpNeg)

            t_stat, p_val = stats.ttest_ind(geneExpPos, geneExpNeg)
            pvalScores[ki*3+gsi,gi] = p_val


perCorrs = pd.DataFrame(perScores[0,:,:], index=datasetNames, columns=inGenes)

In [None]:
# Heatmap of the per-dataset gene/score correlations, with p-value markers.
plotCorr(
    name="PearCorrGenesFull",
    corrScorePear=perCorrs,
    corrScoreSign=perScores[1],
    corrName="pearCorrallGenesWnfkbvO",
    figsize=(1,10),
    save=saveDir,
)

In [None]:
# Unique entries of the first column of the NFkB gene list file.
# NOTE(review): the directory "dageneSigsta" looks like "geneSigs" typed into the
# middle of "data" -- verify the intended path (other cells read from
# "data/LUAD/..." and "geneSigs/LUAD/...").
nfkb = np.unique(pd.read_table("dageneSigsta/LUAD/nfkbGeneList.txt",header=None)[0].values)#BU list

In [None]:
# LUAD cell-state signatures: the first `lenGenes` non-missing genes per column.
geneSigs = pd.read_csv("geneSigs/LUAD/geneSigs.csv", index_col=0)
lenGenes = 200
adi, pats, kac, dapt, at0 = (
    geneSigs[column].dropna().values[:lenGenes]
    for column in ("hADI", "hPATS", "hKAC", "hDAPT", "hAT0")
)

# Signatures compared against HPCS below (AT0 is loaded but not included).
sigDict = dict(KAC=kac, ADI=adi, DATP=dapt, PATS=pats)

In [None]:
# LUAD sample files, in processing order. The first eight entries are the
# LUADpdx* files; the remaining eight are the luadBischoff*, luadKim* and
# LUAD_P*_Malig_re files.
# Disabled entries: luadPdx, LUADpdxLX29A (non KRAS, dup), LUADpdxJHU55A (dup),
# LUADpdxLX369 (with dups), luadBischoff, luadKim, luadDost, luadLaughney,
# luadMouse_shKras.
luadFiles = [
    'LUADpdxLX55A.h5ad',
    'LUADpdxRu210.h5ad',
    'LUADpdxLX278.h5ad',
    'LUADpdxRu318.h5ad',
    'LUADpdxRu465.h5ad',
    'LUADpdxRu699.h5ad',
    'LUADpdxRu890.h5ad',
    'LUADpdxLX29.h5ad',        # non KRAS

    'luadBischoffp018.h5ad',
    'luadBischoffp032.h5ad',
    'luadBischoffp024.h5ad',   # non KRAS

    'luadKimP0030.h5ad',
    'luadKimP0034.h5ad',

    'LUAD_P2_Malig_re.h5ad',
    'LUAD_P3_Malig_re.h5ad',   # non KRAS
    'LUAD_P5_Malig_re.h5ad',   # non KRAS
]

In [None]:
sigFile = "data/LUAD/humanLUADgeneSig.csv"
sigGenes = pd.read_csv(sigFile)

# Long-format marker table: one (gene, signature-column) pair per row, with
# missing genes dropped. `melt` stacks the columns in order, which replaces the
# fixed 500 x 12 pre-allocated buffer: that buffer overwrote rows for tables
# with more than 500 rows and raised IndexError for more than 12 columns.
markers = (
    sigGenes
    .melt(var_name="clustName", value_name="genes")
    .loc[:, ["genes", "clustName"]]
    .dropna()
)
markers

In [None]:
#set(nfkb).intersection(set(sigGenes["HPCS"].values))

In [None]:
# File name -> short display name; a trailing "*" matches the files annotated
# "non KRAS" in luadFiles. Insertion order is kept as-is because other cells
# read `luadNameDict.values()`.
# Disabled entries: LUADpdxJHU55A, LUADpdxLX369, LUADpdxLX29A.
luadNameDict = dict([
    ("LUADpdxLX55A.h5ad", "pdxLX55A"),
    ("LUADpdxRu890.h5ad", "pdxRu890"),
    ("LUADpdxRu465.h5ad", "pdxRu465"),
    ("LUADpdxLX278.h5ad", "pdxLX278"),
    ("LUADpdxRu699.h5ad", "pdxRu699"),
    ("LUADpdxRu318.h5ad", "pdxRu318"),
    ("LUADpdxRu210.h5ad", "pdxRu210"),
    ('LUADpdxLX29.h5ad', "pdxLX29*"),
    ("luadBischoffp018.h5ad", "Bischoffp018"),
    ("luadBischoffp032.h5ad", "Bischoffp032"),
    ("luadBischoffp024.h5ad", "Bischoffp024*"),
    ("luadKimP0034.h5ad", "KimP0034"),
    ("luadKimP0030.h5ad", "KimP0030"),
    ("LUAD_P2_Malig_re.h5ad", "HanP2"),
    ("LUAD_P3_Malig_re.h5ad", "HanP3*"),
    ("LUAD_P5_Malig_re.h5ad", "HanP5*"),
])

In [None]:
def gmmScoreGeneSig(scoreMat, plotLen = 3, show=False):
    """Assign each cell to one signature using per-score mixture-model thresholds.

    For every column of ``scoreMat`` a two-component ``BayesianGaussianMixture``
    is fitted and its predictions are evaluated on a 0.01-step grid between the
    10th and 95th percentile of the scores. If both components occur on the
    grid, the threshold is the first grid point assigned to the component of
    the last grid point (or to the other component when the first and last
    grid points agree). Otherwise the maximum score is used, so no cell passes.

    Parameters
    ----------
    scoreMat : pandas.DataFrame
        Cells x scores; column names are expected to end in a 5-character
        suffix, which is stripped to form the class name.
    plotLen : int
        Number of diagnostic panels per row when ``show`` is true.
    show : bool
        Plot the histogram, fitted components and threshold for each score.

    Returns
    -------
    numpy.ndarray
        Object array with one class per cell: the single score above its
        threshold, the score with the highest column-wise z-score among
        several above threshold, or "Negative".
    """
    scoreNames = scoreMat.columns
    numScores = len(scoreNames)
    if(show):
        # squeeze=False keeps `axs` two-dimensional even with a single row,
        # which the `axs[x,y]` indexing below requires.
        fig, axs = plt.subplots((numScores//plotLen)+1,plotLen, squeeze=False)
        plt.rcParams["figure.figsize"] = (15,5)

    dfScoreBoundry = pd.DataFrame(np.zeros(numScores),scoreNames, columns=["boundry"])
    gmm = GMM(n_components = 2, random_state=10)

    for i, scoreName in enumerate(scoreNames):
        scoreCount = np.array(scoreMat[scoreName]).reshape(-1, 1)
        fitGMM = gmm.fit(scoreCount)
        mean = fitGMM.means_
        covs  = fitGMM.covariances_
        weights = fitGMM.weights_
        binEx = np.arange(np.percentile(scoreCount,10),np.percentile(scoreCount,95),0.01).reshape(-1,1)

        # Default: threshold at the maximum (plain scalar), i.e. nothing passes.
        scoreBoundry = scoreCount.max()
        # An empty grid (10th percentile >= 95th percentile) cannot be passed
        # to predict(); the default threshold is kept in that case.
        if binEx.shape[0] > 0:
            fitGmmBound = fitGMM.predict(binEx)
            furtherBound = fitGmmBound[-1]
            fitGmmBoundUniq = np.unique(fitGmmBound)
            if (len(fitGmmBoundUniq) == 2):
                if(fitGmmBound[0] == fitGmmBound[-1]):
                    # Both grid ends share a label: use the other component.
                    furtherBound = fitGmmBoundUniq[fitGmmBoundUniq != furtherBound][0]
                scoreBoundry = binEx[np.where(fitGmmBound == furtherBound)[0][0]][0]

        dfScoreBoundry.loc[scoreName] = scoreBoundry

        if(show):
            x_axis = np.arange(scoreCount.min(), scoreCount.max(), 0.01)
            y_axis0 = norm.pdf(x_axis, float(mean[0][0]), np.sqrt(float(covs[0][0][0])))*weights[0] # 1st gaussian
            y_axis1 = norm.pdf(x_axis, float(mean[1][0]), np.sqrt(float(covs[1][0][0])))*weights[1] # 2nd gaussian

            x,y = i//plotLen, i%plotLen
            axs[x,y].set_title(scoreName)
            axs[x,y].axvline(scoreBoundry, c='C2', linestyle='dashed', linewidth=1)  #green
            axs[x,y].hist(scoreCount, density=True, color='black', bins=100)
            axs[x,y].plot(x_axis, y_axis0, lw=3, c='C6')                            #pink
            axs[x,y].plot(x_axis, y_axis1, lw=3, c='C1')                            #orange
            axs[x,y].plot(x_axis, y_axis0+y_axis1, lw=3, c='C0', ls=':')            #dotted blue

    if(show):
        plt.tight_layout(pad=1.0)
        plt.show()

    scoreIDs = scoreMat.copy()
    scoreZscores = scoreMat.apply(stats.zscore)
    scoreID = np.array(scoreNames)

    # 1.0 where the cell's score exceeds the column threshold, else 0.0.
    for scoreName in scoreNames:
        scoreIDs.loc[:,scoreName] = (scoreMat.loc[:,scoreName] > dfScoreBoundry.loc[scoreName].values[0]).astype("float")

    classification = np.empty(len(scoreMat), dtype="object")
    for i, (cellBar, scoreBool) in enumerate(scoreIDs.iterrows()):
        scoreBool = scoreBool.astype("bool")
        numscorees = sum(scoreBool)
        if (numscorees == 1):
            classif = scoreID[scoreBool.values][0][:-5]
        elif (numscorees > 1):
            # Several scores pass: pick the one with the largest z-score.
            maxTrue = np.argmax(scoreZscores.loc[cellBar,scoreID[scoreBool]])
            classif = scoreID[scoreBool][maxTrue][:-5]
        else:
            classif = "Negative"
        classification[i] = classif

    return(classification)

In [None]:
# Per-sample LUAD analysis: score NFkB / HPCS / cell-state signatures,
# correlate them, classify cells with gmmScoreGeneSig and summarise the NFkB
# z-score in HPCS vs other cells.
luadNames = [luadNameDict[luadf] for luadf in luadFiles]
hpcsNfkbScore = pd.DataFrame(np.zeros((2,len(luadFiles))), columns = luadNames, index=["HPCS","Other"])
_, ncs = sigGenes.shape
cellStateProp = pd.DataFrame(np.zeros((2,len(luadFiles))), columns = luadNames, index=["HPCS","Other"])

# First axis: [0] = statistic, [1] = p-value; then signature x sample.
nfkbCorrScore = np.zeros((2,len(sigDict),len(luadFiles)))
corrScore = np.zeros((2,len(sigDict),len(luadFiles)))
cosScore = np.zeros((2,len(sigDict),len(luadFiles)))

sigScoreNames = [f"{geneSig}_score" for geneSig in sigDict.keys()]

adatas = np.empty(len(luadFiles), dtype=object)

# Seeded generator so the permutation p-values are reproducible across runs
# (the previous version shuffled with the unseeded global numpy state).
numPerm = 1000
permRng = np.random.default_rng(0)

for i,luadf in enumerate(luadFiles):
    print(luadf)
    sampleName = luadNameDict[luadf]
    adata = sc.read_h5ad(writeDir+luadf)
    sc.tl.leiden(adata, resolution=0.3)
    sc.tl.score_genes(adata, nfkb, score_name="nfkbScore")
    sc.tl.score_genes(adata, sigGenes["HPCS"], score_name="HPCSscore")
    for j,geneSig in enumerate(sigDict):
        scoreName=f"{geneSig}_score"
        sc.tl.score_genes(adata, sigDict[geneSig], score_name=scoreName)

        corrScore[0,j,i], corrScore[1,j,i] = stats.pearsonr(adata.obs["HPCSscore"].values, adata.obs[scoreName].values)
        nfkbCorrScore[0,j,i], nfkbCorrScore[1,j,i] = stats.pearsonr(adata.obs["nfkbScore"].values, adata.obs[scoreName].values)

        hpcsScores = adata.obs["HPCSscore"].values.copy()
        otherScore = adata.obs[scoreName].values.copy()
        otherRow = otherScore.reshape(1, -1)
        ogCos = cosine_similarity(hpcsScores.reshape(1, -1), otherRow)[0][0]
        cosScore[0,j,i] = ogCos

        # Permutation null: break the cell pairing by permuting one of the two
        # vectors (permuting both, as before, gives the same null distribution).
        permCos = np.zeros(numPerm)
        for k in range(numPerm):
            permCos[k] = cosine_similarity(permRng.permutation(hpcsScores).reshape(1, -1), otherRow)[0][0]

        # One-sided p-value from a normal approximation of the permutation null.
        cosScore[1,j,i] = stats.norm.sf((ogCos-np.mean(permCos))/np.std(permCos))

    scoreNames, newBClabel, ogLabelScoreMed = scoreAndLabel(adata, sigGenes, labelOfSigGenes= sigGenes.columns, ogLabel="leiden", translate=False)

    gmmClass = gmmScoreGeneSig(adata.obs[scoreNames],show=False)
    adata.obs["gmmClass"] = gmmClass
    isHpcs = (adata.obs["gmmClass"]=="HPCS").values
    adata.obs["hpcs"] = ["HPCS" if h else "Other" for h in isHpcs]
    countSC = Counter(gmmClass)
    cellStateProp.loc["HPCS",sampleName] = countSC["HPCS"]/len(gmmClass)
    cellStateProp.loc["Other",sampleName] = (len(gmmClass)-countSC["HPCS"])/len(gmmClass)

    adata.obs["nfkbZscore"] = stats.zscore(adata.obs["nfkbScore"])
    # Index the obs table directly instead of subsetting the whole AnnData.
    hpcsNFKb = adata.obs.loc[isHpcs, "nfkbZscore"]
    otherNFKb = adata.obs.loc[np.logical_not(isHpcs), "nfkbZscore"]

    hpcsNfkbScore.loc["HPCS",sampleName] = np.mean(hpcsNFKb)
    hpcsNfkbScore.loc["Other",sampleName] = np.mean(otherNFKb)

    adatas[i] = adata


In [None]:
# Best-effort UMAP overview per sample. A failing sample is reported and
# skipped instead of being swallowed silently; `except Exception` (unlike a
# bare `except`) still lets KeyboardInterrupt stop the loop.
for adata in adatas:
    try:
        sc.pl.umap(adata, color=["hpcs", "HPCSscore","nfkbScore"], cmap="Reds")
    except Exception as err:
        print(f"Skipping UMAP plot: {err!r}")
        continue

In [None]:
# Cosine similarity between the NFkB score and the raw expression of each gene
# of interest, per sample (rows follow `adatas`, columns follow `genes`).
genes=[g.upper() for g in inGenes]
cosGeneSim = np.zeros((len(adatas), len(genes)))
for i,adata in enumerate(adatas):
    adata = adata.raw.to_adata()
    nfkbRow = adata.obs["nfkbScore"].values.reshape(1, -1)
    for j,gene in enumerate(genes):
        geneX = adata[:,adata.var_names==gene].X
        # .X can be a dense ndarray or a sparse matrix; only the latter has
        # .todense(), which the previous version called unconditionally.
        if(not isinstance(geneX, np.ndarray)):
            geneX = geneX.todense()
        geneExp = np.squeeze(np.asarray(geneX))
        cosGeneSim[i,j] = cosine_similarity(nfkbRow, geneExp.reshape(1, -1))[0][0]

In [None]:
# Rows of cosGeneSim follow `adatas`, i.e. the order of `luadFiles`, so they are
# labelled with `luadNames`. `luadNameDict.values()` is in a different order
# and mislabelled the samples.
sns.heatmap(pd.DataFrame(cosGeneSim,columns=genes,index=luadNames))

In [None]:
# Rows of cosGeneSim follow `adatas`, i.e. the order of `luadFiles`, so they are
# labelled with `luadNames`. `luadNameDict.values()` is in a different order
# and mislabelled the samples.
pd.DataFrame(cosGeneSim,columns=genes,index=luadNames)

In [None]:
# One-sided independent t-test across samples: mean NFkB z-score in HPCS cells
# greater than in the other cells.
hpcsMeans = hpcsNfkbScore.loc["HPCS"]
otherMeans = hpcsNfkbScore.loc["Other"]
stats.ttest_ind(hpcsMeans, otherMeans, alternative="greater")

In [None]:
# NFkB z-score heatmap for the first eight samples (the LUADpdx* files).
firstEightScores = hpcsNfkbScore.iloc[:,:8]
fig, ax = plt.subplots(figsize=(4,1))
ax.grid(False)
sns.heatmap(firstEightScores, cmap="bwr", xticklabels=True, ax=ax, vmax=1.1, vmin=-1.1, square=True, linewidths=0.1)
_, pval = stats.ttest_ind(firstEightScores.iloc[0], firstEightScores.iloc[1], alternative="greater")
print(pval)
ax.set_title("Average Z-score of NFkB score in HPCS v Other")
fig.text(-0.3, 0.5, f"P-value:{pval:.3E}")

In [None]:
# NFkB z-score heatmap for the samples after the first eight.
remainingScores = hpcsNfkbScore.iloc[:,8:]
fig, ax = plt.subplots(figsize=(4,1))
ax.grid(False)
sns.heatmap(remainingScores, cmap="bwr", xticklabels=True, ax=ax, vmax=1.1, vmin=-1.1, square=True, linewidths=0.1)
_, pval = stats.ttest_ind(remainingScores.iloc[0], remainingScores.iloc[1], alternative="greater")
print(pval)
ax.set_title("Average Z-score of NFkB score in HPCS v Other")
fig.text(-0.3, 0.5, f"P-value:{pval:.3E}")

In [None]:
# Split the stacked cosScore array: layer 0 becomes a labelled table of values
# (rows: sigDict keys, columns: luadNames); layer 1 is kept as a plain array and
# used by the plotting cells as the significance values.
cosScoreSign = cosScore[1,:,:]
cosScoreVals = pd.DataFrame(
    cosScore[0,:,:],
    index=sigDict.keys(),
    columns=luadNames,
)

In [None]:
# Cosine-similarity heatmap for the first eight columns, drawn as
# log(value + 1) and overlaid with significance markers.
fig, ax = plt.subplots(figsize=(5,2))
ax.grid(False)
sns.heatmap(np.log(cosScoreVals.iloc[:,:8]+1), ax=ax, cmap="bwr", vmin=-1, vmax=1,
            square=True, linewidths=0.1, xticklabels=True)

# Filled dot for values below 0.001, open 'o' for values below 0.05, nothing otherwise.
for (i, j), significant in np.ndenumerate(cosScoreSign[:,:8]):
    marker = "●" if significant < 0.001 else ('o' if significant < 0.05 else None)
    if marker is not None:
        ax.text(j + 0.5, i + 0.5, marker, color='black', ha='center', va='center', fontsize=12)

ax.tick_params(axis='y', labelrotation=0)
ax.set_title("Cosine similarity of HPCS score with other Cell State Scores")
fig.text(0.95, 0.4, f"P-Value\no < 0.05\n● < 0.001")

fig.show()
#fig.savefig("figures/cosSimHPCSvO_PDX.svg")

In [None]:
# Cosine-similarity heatmap for the columns from index 8 onwards, drawn as
# log(value + 1) and overlaid with significance markers.
fig, ax = plt.subplots(figsize=(5,2))
ax.grid(False)
sns.heatmap(np.log(cosScoreVals.iloc[:,8:]+1), ax=ax, cmap="bwr", vmin=-1, vmax=1,
            square=True, linewidths=0.1, xticklabels=True)

# Filled dot for values below 0.001, open 'o' for values below 0.05, nothing otherwise.
for (i, j), significant in np.ndenumerate(cosScoreSign[:,8:]):
    marker = "●" if significant < 0.001 else ('o' if significant < 0.05 else None)
    if marker is not None:
        ax.text(j + 0.5, i + 0.5, marker, color='black', ha='center', va='center', fontsize=12)

ax.tick_params(axis='y', labelrotation=0)
ax.set_title("Cosine similarity of HPCS score with other Cell State Scores")
fig.text(0.95, 0.4, f"P-Value\no < 0.05\n● < 0.001")
fig.show()
#fig.savefig("figures/cosSimHPCSvO_HUM.svg")

In [None]:
# Split the stacked corrScore array: layer 0 becomes a labelled table
# (rows: sigDict keys, columns: luadNames); layer 1 is kept as a plain array and
# used by the plotting cells as the significance values.
corrScoreSign = corrScore[1,:,:]
corrScorePear = pd.DataFrame(
    corrScore[0,:,:],
    index=sigDict.keys(),
    columns=luadNames,
)

In [None]:
# Heatmap of corrScorePear for the first eight columns, overlaid with
# significance markers taken from corrScoreSign.
fig, ax = plt.subplots(figsize=(5,2))
ax.grid(False)
sns.heatmap(corrScorePear.iloc[:,:8], ax=ax, cmap="bwr", vmin=-1, vmax=1,
            square=True, linewidths=0.1, xticklabels=True)

# Filled dot for values below 0.001, open 'o' for values below 0.05, nothing otherwise.
for (i, j), significant in np.ndenumerate(corrScoreSign[:,:8]):
    marker = "●" if significant < 0.001 else ('o' if significant < 0.05 else None)
    if marker is not None:
        ax.text(j + 0.5, i + 0.5, marker, color='black', ha='center', va='center', fontsize=12)

ax.tick_params(axis='y', labelrotation=0)
ax.set_title("Pearson Correlation of HPCS score with other Cell State Scores")
fig.text(0.95, 0.4, f"P-Value\no < 0.05\n● < 0.001")
fig.show()
#fig.savefig("figures/pearCorrHPCSvO_PDX.svg")

In [None]:
# Heatmap of corrScorePear for the columns from index 8 onwards, overlaid with
# significance markers taken from corrScoreSign.
fig, ax = plt.subplots(figsize=(5,2))
ax.grid(False)
sns.heatmap(corrScorePear.iloc[:,8:], ax=ax, cmap="bwr", vmin=-1, vmax=1,
            square=True, linewidths=0.1, xticklabels=True)

# Filled dot for values below 0.001, open 'o' for values below 0.05, nothing otherwise.
for (i, j), significant in np.ndenumerate(corrScoreSign[:,8:]):
    marker = "●" if significant < 0.001 else ('o' if significant < 0.05 else None)
    if marker is not None:
        ax.text(j + 0.5, i + 0.5, marker, color='black', ha='center', va='center', fontsize=12)

ax.tick_params(axis='y', labelrotation=0)
ax.set_title("Pearson Correlation of HPCS score with other Cell State Scores")
fig.text(0.95, 0.4, f"P-Value\no < 0.05\n● < 0.001")

fig.show()
#fig.savefig("figures/pearCorrHPCSvO_HUM.svg")

In [None]:
# Split the stacked nfkbCorrScore array: layer 0 becomes a labelled table
# (rows: sigDict keys, columns: luadNames); layer 1 is kept as a plain array and
# used by the plotting cells as the significance values.
nfkbCorrScoreSign = nfkbCorrScore[1,:,:]
nfkbCorrScorePear = pd.DataFrame(
    nfkbCorrScore[0,:,:],
    index=sigDict.keys(),
    columns=luadNames,
)

In [None]:
# Heatmap of nfkbCorrScorePear for the first eight columns, overlaid with
# significance markers taken from nfkbCorrScoreSign.
fig, ax = plt.subplots(figsize=(5,2))
ax.grid(False)
sns.heatmap(nfkbCorrScorePear.iloc[:,:8], ax=ax, cmap="bwr", vmin=-1, vmax=1,
            square=True, linewidths=0.1, xticklabels=True)

# Filled dot for values below 0.001, open 'o' for values below 0.05, nothing otherwise.
for (i, j), significant in np.ndenumerate(nfkbCorrScoreSign[:,:8]):
    marker = "●" if significant < 0.001 else ('o' if significant < 0.05 else None)
    if marker is not None:
        ax.text(j + 0.5, i + 0.5, marker, color='black', ha='center', va='center', fontsize=12)

ax.tick_params(axis='y', labelrotation=0)
ax.set_title("Pearson Correlation of NFkB score with other Cell State Scores")
fig.text(0.95, 0.4, f"P-Value\no < 0.05\n● < 0.001")
fig.show()
#fig.savefig("figures/pearCorrNFkBvO_PDX.svg")

In [None]:
# Heatmap of nfkbCorrScorePear for the columns from index 8 onwards, overlaid
# with significance markers taken from nfkbCorrScoreSign.
fig, ax = plt.subplots(figsize=(5,2))
ax.grid(False)
sns.heatmap(nfkbCorrScorePear.iloc[:,8:], ax=ax, cmap="bwr", vmin=-1, vmax=1,
            square=True, linewidths=0.1, xticklabels=True)

# Filled dot for values below 0.001, open 'o' for values below 0.05, nothing otherwise.
for (i, j), significant in np.ndenumerate(nfkbCorrScoreSign[:,8:]):
    marker = "●" if significant < 0.001 else ('o' if significant < 0.05 else None)
    if marker is not None:
        ax.text(j + 0.5, i + 0.5, marker, color='black', ha='center', va='center', fontsize=12)

ax.tick_params(axis='y', labelrotation=0)
ax.set_title("Pearson Correlation of NFkB score with other Cell State Scores")
fig.text(0.95, 0.4, f"P-Value\no < 0.05\n● < 0.001")

fig.show()
#fig.savefig("figures/pearCorrNFkBvO_HUM.svg")

In [None]:
# Stacked bar chart built from the first eight columns of cellStateProp:
# one bar per column, one stacked segment per row.
cellStatePropSub = cellStateProp.iloc[:,:8]
dataLabel = cellStatePropSub.columns
stacked = {}
for cs in cellStatePropSub.index.values:
    stacked[cs] = cellStatePropSub.loc[cs].values

fig, ax = plt.subplots(figsize=(4,4))
width = 0.75
# Running top of each bar; every new segment is drawn on top of the previous ones.
bottom = np.zeros(cellStatePropSub.shape[1])

for cs, stack in stacked.items():
    p = ax.bar(dataLabel.values, stack, width, bottom=bottom, label=cs)
    bottom += stack

ax.set_title("Proportion of Cell State per Dataset")
ax.legend(loc="center",bbox_to_anchor=(1.1, 0.5))
ax.set_xticks(range(len(dataLabel)))
ax.set_xticklabels(dataLabel, rotation=80)
ax.grid(False)
plt.show()
#fig.savefig("figures/hpcsPropPer_PDX.svg")

In [None]:
# Stacked bar chart built from the columns of cellStateProp from index 8
# onwards: one bar per column, one stacked segment per row.
cellStatePropSub = cellStateProp.iloc[:,8:]
dataLabel = cellStatePropSub.columns
stacked = {}
for cs in cellStatePropSub.index.values:
    stacked[cs] = cellStatePropSub.loc[cs].values

fig, ax = plt.subplots(figsize=(4,4))
width = 0.75
# Running top of each bar; every new segment is drawn on top of the previous ones.
bottom = np.zeros(cellStatePropSub.shape[1])

for cs, stack in stacked.items():
    p = ax.bar(dataLabel.values, stack, width, bottom=bottom, label=cs)
    bottom += stack

ax.set_title("Proportion of Cell State per Dataset")
ax.legend(loc="center",bbox_to_anchor=(1.1, 0.5))
ax.set_xticks(range(len(dataLabel)))
ax.set_xticklabels(dataLabel, rotation=80)
ax.grid(False)

plt.show()
#fig.savefig("figures/hpcsPropPer_HUM.svg")

In [None]:
# Stacked bar chart built from every column of cellStateProp:
# one bar per column, one stacked segment per row.
cellStatePropSub = cellStateProp
dataLabel = cellStatePropSub.columns
stacked = {}
for cs in cellStatePropSub.index.values:
    stacked[cs] = cellStatePropSub.loc[cs].values

fig, ax = plt.subplots(figsize=(4,4))
width = 0.75
# Running top of each bar; every new segment is drawn on top of the previous ones.
bottom = np.zeros(cellStatePropSub.shape[1])

for cs, stack in stacked.items():
    p = ax.bar(dataLabel.values, stack, width, bottom=bottom, label=cs)
    bottom += stack

ax.set_title("Proportion of Cell State per Dataset")
ax.legend(loc="center",bbox_to_anchor=(1.1, 0.5))
ax.set_xticks(range(len(dataLabel)))
ax.set_xticklabels(dataLabel, rotation=80)
ax.grid(False)

plt.show()
#fig.savefig("figures/hpcsPropPer.svg")

In [None]:
# Grid of violin plots (4 x 4 panels): for every file in luadFiles, reload the
# dataset, recompute the gene-set scores and the GMM-based class, then compare
# nfkbScore between cells labelled "HPCS" and "Other".
numRow, numCol = 4,4
figVio, axVio = plt.subplots(numRow, numCol,figsize=(12,12))

for i,luadf in enumerate(luadFiles):
    print(luadf)
    adata = sc.read_h5ad(writeDir+luadf)

    # Gene-set scores: NFkB, HPCS, and one "<name>_score" column per sigDict entry.
    sc.tl.score_genes(adata, nfkb, score_name="nfkbScore")
    sc.tl.score_genes(adata, sigGenes["HPCS"], score_name="HPCSscore")
    for geneSig in sigDict:
        scoreName=f"{geneSig}_score"
        sc.tl.score_genes(adata, sigDict[geneSig], score_name=scoreName)

    # Classify cells from their signature scores and collapse to HPCS / Other.
    scoreNames, newBClabel, ogLabelScoreMed = scoreAndLabel(adata, sigGenes, labelOfSigGenes= sigGenes.columns, ogLabel="leiden", translate=False)
    gmmClass = gmmScoreGeneSig(adata.obs[scoreNames],show=False)
    adata.obs["gmmClass"] = gmmClass
    isHpcsCell = adata.obs.gmmClass=="HPCS"
    adata.obs["hpcs"] = ["HPCS" if h else "Other" for h in isHpcsCell]

    # Panel i is filled row by row: (i // numCol, i % numCol).
    panelAx = axVio[divmod(i, numCol)]
    sc.pl.violin(adata, keys="nfkbScore", groupby="hpcs", ax=panelAx,
                 inner = "box", stripplot=False, multi_panel=False, show=False, density_norm='width')
    panelAx.set_title(luadNameDict[luadf])
    panelAx.set_xlabel("")
    panelAx.set_ylim([-0.1, 0.8])

figVio.tight_layout()
figVio.show()

In [None]:
#figVio.savefig("figures/violinNFkB.svg")

In [None]:
# Two 4 x 4 grids of matrix plots for the marker genes in `markPAR`, one panel per
# dataset in `adatas`:
#   figPAR    - rank_genes_groups scores for the `hpcs` grouping
#   figPARexp - expression grouped by the `hpcs` label
# Earlier, longer candidate list kept for reference:
#markPAR = ["PLAUR","UPAR", "PLAU","UPA", "PROCR","EPCR","CCD41", "PROC", "THBD","THRM", "F2R","PAR1","CF2R", "F2", "PLG","HAE4"]
markPAR = ["PROCR","PROC", "F2R", "F2", "PLAUR","PLAU","THBD"]

numRow, numCol = 4,4
figPAR, axPAR = plt.subplots(numRow, numCol,figsize=(12,12))
figPARexp, axPARexp = plt.subplots(numRow, numCol,figsize=(12,12))

classLabel = "hpcs"

for i,adata in enumerate(adatas):
    x,y = i//numCol,i%numCol
    print(luadFiles[i][4:][:-5])

    # Gene names can be read straight from .raw; no need to copy the whole matrix.
    adataGenes = set(adata.raw.var_names)
    # Keep the order of markPAR so every run and every panel lists genes identically
    # (a set intersection has no stable order).
    markPARavail = [g for g in markPAR if g in adataGenes]

    try:
        sc.tl.rank_genes_groups(adata, classLabel, method='wilcoxon', use_raw=True, key_added=classLabel)
    except Exception as err:
        # Fallback kept from the original cell: normalise, log-transform, reset the
        # stored log1p base and retry. The reason for the first failure is now reported.
        print(f"\trank_genes_groups failed ({type(err).__name__}: {err}); normalising and retrying")
        sc.pp.normalize_total(adata,target_sum=1e4)
        sc.pp.log1p(adata)
        adata.uns['log1p']['base']=None
        sc.tl.rank_genes_groups(adata, classLabel, method='wilcoxon', use_raw=True, key_added=classLabel)

    if not markPARavail:
        # Nothing to draw for this dataset; leave its panels empty instead of crashing.
        print("\tnone of the markPAR genes are present; skipping plots")
        axPAR[x,y].set_title(luadNameDict[luadFiles[i]])
        axPARexp[x,y].set_title(luadNameDict[luadFiles[i]])
        continue

    sc.pl.rank_genes_groups_matrixplot(adata, key = classLabel, var_names=markPARavail,
                                       values_to_plot="scores", cmap='bwr', colorbar_title='z-scores',
                                       dendrogram = False, ax=axPAR[x,y], show=False)
    axPAR[x,y].set_title(luadNameDict[luadFiles[i]])

    sc.pl.matrixplot(adata, markPARavail, groupby=classLabel, ax=axPARexp[x,y], show=False)
    axPARexp[x,y].set_title(luadNameDict[luadFiles[i]])

figPAR.tight_layout()
figPARexp.tight_layout()
figPAR.show()

In [None]:
# Show the figPAR grid (rank_genes_groups score matrix plots) as this cell's output.
figPAR

In [None]:
#figPAR.savefig("figures/zcoresPAR.svg")

In [None]:
# Show the figPARexp grid (expression matrix plots grouped by "hpcs") as this cell's output.
figPARexp

In [None]:
#figPARexp.savefig("figures/expPAR.svg")

In [None]:
# Per-gene summaries for every dataset in `adatas`, both indexed as
# [gene, group, dataset] with group 0 = "HPCS" and group 1 = everything else:
#   geneExp - mean raw expression of the gene within each group
#   geneZsc - rank_genes_groups score of the gene for each group
# Entries stay 0 when a gene is absent from a dataset.
genes = ['PROCR', 'PROC', 'F2R', "PLAUR","PLAU","THBD"]#, "F2"]
geneExp = np.zeros((len(genes),2,len(adatas)))
geneZsc = np.zeros((len(genes),2,len(adatas)))

for i,adata in enumerate(adatas):
    adataR = adata.raw.to_adata()
    # Group membership is the same for every gene, so compute it once per dataset.
    isHpcs = (adataR.obs["hpcs"]=="HPCS").values
    isOther = (adataR.obs["hpcs"]!="HPCS").values
    for gi, gene in enumerate(genes):
        geneInd = np.where(adataR.var_names.values==gene)[0]
        if(len(geneInd) > 0):
            geneInd = geneInd[0]
            # Pull out only this gene's column instead of densifying the whole
            # matrix twice per gene.
            geneCol = adataR.X[:, geneInd]
            if hasattr(geneCol, "todense"):
                geneCol = geneCol.todense()
            geneCol = np.asarray(geneCol).ravel()
            geneExp[gi][0,i] = np.mean(geneCol[isHpcs])
            geneExp[gi][1,i] = np.mean(geneCol[isOther])

            # Each row of "names" is read as a (g0, g1) pair, one gene name per group;
            # take the score from whichever group lists the gene first and mirror it
            # with the opposite sign for the other group.
            rankScores = adataR.uns["hpcs"]["scores"]
            for j,(g0, g1) in enumerate(adataR.uns["hpcs"]["names"]):
                if g1==gene:
                    geneZsc[gi][0,i] = -rankScores[j][1]
                    geneZsc[gi][1,i] = rankScores[j][1]
                    break
                elif g0==gene:
                    geneZsc[gi][0,i] = rankScores[j][0]
                    geneZsc[gi][1,i] = -rankScores[j][0]
                    break


In [None]:
# One small heatmap per gene: geneZsc for the first eight datasets, saved as SVG.
firstEightLabels = list(luadNameDict.values())[:8]
for gi, gene in enumerate(genes):
    fig, ax = plt.subplots(figsize=(5,1))
    ax.grid(False)
    sns.heatmap(geneZsc[gi, :, :8], ax=ax, cmap='bwr', square=True, linewidths=0.1, xticklabels=True)
    ax.set_title(f"Z-score of {gene} in HPCS v Other")
    # Rows are the two groups; columns are relabelled with the dataset names.
    ax.set_yticklabels(["HPCS","Other"])
    ax.tick_params(axis='y', labelrotation=0)
    ax.set_xticklabels(firstEightLabels)
    ax.tick_params(axis='x', labelrotation=90)
    fig.show()
    fig.savefig(f"figures/zSc{gene}HPCSvO_PDX.svg")

In [None]:
# One small heatmap per gene: geneZsc for the datasets from index 8 onwards, saved as SVG.
remainingLabels = list(luadNameDict.values())[8:]
for gi, gene in enumerate(genes):
    fig, ax = plt.subplots(figsize=(5,1))
    ax.grid(False)
    sns.heatmap(geneZsc[gi, :, 8:], ax=ax, cmap='bwr', square=True, linewidths=0.1, xticklabels=True)
    ax.set_title(f"Z-score of {gene} in HPCS v Other")
    # Rows are the two groups; columns are relabelled with the dataset names.
    ax.set_yticklabels(["HPCS","Other"])
    ax.tick_params(axis='y', labelrotation=0)
    ax.set_xticklabels(remainingLabels)
    ax.tick_params(axis='x', labelrotation=90)
    fig.show()
    fig.savefig(f"figures/zSc{gene}HPCSvO_HUM.svg")

In [None]:
# One small heatmap per gene: geneExp (group means) for the first eight datasets, saved as SVG.
firstEightLabels = list(luadNameDict.values())[:8]
for gi, gene in enumerate(genes):
    fig, ax = plt.subplots(figsize=(5,1))
    ax.grid(False)
    sns.heatmap(geneExp[gi, :, :8], ax=ax, cmap='Reds', square=True, linewidths=0.1, xticklabels=True)
    ax.set_title(f"Average Exp of {gene} in HPCS v Other")
    # Rows are the two groups; columns are relabelled with the dataset names.
    ax.set_yticklabels(["HPCS","Other"])
    ax.tick_params(axis='y', labelrotation=0)
    ax.set_xticklabels(firstEightLabels)
    ax.tick_params(axis='x', labelrotation=90)
    fig.show()
    fig.savefig(f"figures/exp{gene}HPCSvO_PDX.svg")

In [None]:
# One small heatmap per gene: geneExp (group means) for the datasets from index 8 onwards, saved as SVG.
remainingLabels = list(luadNameDict.values())[8:]
for gi, gene in enumerate(genes):
    fig, ax = plt.subplots(figsize=(5,1))
    ax.grid(False)
    sns.heatmap(geneExp[gi, :, 8:], ax=ax, cmap='Reds', square=True, linewidths=0.1, xticklabels=True)
    ax.set_title(f"Average Exp of {gene} in HPCS v Other")
    # Rows are the two groups; columns are relabelled with the dataset names.
    ax.set_yticklabels(["HPCS","Other"])
    ax.tick_params(axis='y', labelrotation=0)
    ax.set_xticklabels(remainingLabels)
    ax.tick_params(axis='x', labelrotation=90)
    fig.show()
    fig.savefig(f"figures/exp{gene}HPCSvO_HUM.svg")

In [None]:
# Four-panel UMAP for the first dataset: hpcs label, PROCR, F2R and the NFkB score.
firstDatasetName = list(luadNameDict.values())[0]
sc.pl.umap(
    adatas[0],
    color=["hpcs",'PROCR', 'F2R', "nfkbScore"],
    ncols=4,
    cmap="Reds",
    title=f"{firstDatasetName} HPCS v Other",
)
#save="HPCSvO_PDX_NFkB.svg")


In [None]:
# Four-panel UMAP for the dataset at index 9: hpcs label, PROCR, F2R and the NFkB score.
tenthDatasetName = list(luadNameDict.values())[9]
sc.pl.umap(
    adatas[9],
    color=["hpcs",'PROCR', 'F2R', "nfkbScore"],
    ncols=4,
    cmap="Reds",
    title=f"{tenthDatasetName} HPCS v Other",
)
#save="HPCSvO_B32_NFkB.svg")

In [None]:
# Best-effort overview: one UMAP row per dataset, coloured by the `hpcs` label,
# SLC4A11, F2R and the NFkB score. Datasets that cannot be plotted are skipped.
for adata in adatas:
    try:
        sc.pl.umap(adata, color=["hpcs",'SLC4A11', 'F2R', "nfkbScore"], cmap="Reds")
    except Exception as err:
        # Still skip, but report the reason and let KeyboardInterrupt /
        # SystemExit propagate.
        print(f"Skipping UMAP for one dataset: {type(err).__name__}: {err}")
        continue

In [None]:
#fig, ax = plt.subplots(1,3, figsize=(9,3)) 
# UMAP of the `hpcs` label, highlighting only the "HPCS" group, for the datasets
# at positions `pdxInd`; each plot is also saved through scanpy's `save=` option.
pdxInd = [0,1,4]
allNames = list(luadNameDict.values())
# Select element by element: indexing with a list of positions fails when
# `adatas` is a plain Python list, while this form works for lists and arrays alike.
for i,adata in enumerate([adatas[k] for k in pdxInd]):
    pdxName = allNames[pdxInd[i]]
    sc.pl.umap(adata, color=["hpcs"], palette = "autumn", groups="HPCS",na_in_legend=False,
               title=f"{pdxName}", save=f"_{pdxName}_HPCS.svg")


In [None]:
# Concatenate the raw-count versions of the datasets at positions `pdxInd` into
# one AnnData; the source of every cell is recorded in the "PDXm" column.
adataNames = ['pdxLX55A','pdxRu890','pdxRu699']
# Select element by element: indexing with a list of positions fails when
# `adatas` is a plain Python list, while this form works for lists and arrays alike.
# It also builds a new list, so `adatas` itself is left untouched.
adatasMerge = [adatas[k].raw.to_adata() for k in pdxInd]
if len(adataNames) != len(adatasMerge):
    # The labels are hard-coded, so make a mismatch with pdxInd loud.
    raise ValueError(
        f"adataNames has {len(adataNames)} labels but pdxInd selects {len(adatasMerge)} datasets"
    )
adata = sc.concat(adatasMerge, label="PDXm", keys=adataNames)
adata