# Test associations to polygenic risk scores

In [1]:
import pandas as pd
import numpy as np
import pp, cna, os, pickle
pp.umapprops['s'] = 0.6
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as patches
import scipy.stats as st
import statsmodels.api as sm
# Absolute locations on the lab filesystem; edit these when running elsewhere.
# fig_dir: figure directory (not referenced by the cells visible in this notebook section).
fig_dir = '/data/srlab/lrumker/MCSC_Project/cna-prs/figs/'
# src_folder: holds sample_meta.csv, read via src_folder+"sample_meta.csv" below.
src_folder = "/data/srlab/lrumker/datasets/onek1k/pheno/"
# res_folder: results root; the PRS inputs and pickled results below live under its PRS/ subfolder.
res_folder = "/data/srlab/lrumker/MCSC_Project/cna-prs/results/"
# Seed NumPy's global RNG; it is re-seeded immediately before each association test below.
np.random.seed(0)

In [2]:
# From Okabe & Ito, "colorblind-friendly" palette 
from matplotlib.colors import LinearSegmentedColormap
# Named hex colors used for plotting; keys are the color names used in this notebook.
color_dict = dict(
    orange='#E69F00',
    skyblue='#56B4E9',
    green='#009E73',
    yellow='#F0E442',
    purple='#CC79A7',
    oceanblue='#0072B2',
    vermillion='#E63d00',  # O&I use '#D55E00'
)

In [3]:
# Render mathtext (e.g. subscripts) in the regular text font rather than
# math italics.
params = {}
params['mathtext.default'] = 'regular'
plt.rcParams.update(params)

## Review available clinical attributes

In [4]:
# Load per-sample metadata and collect the distinct free-text disease entries,
# listing the values of Autoimmune_Disease_Other first and Other_Disease second.
meta = pd.read_csv(src_folder + "sample_meta.csv", index_col=0)
other_dz = np.concatenate([
    meta[text_col].value_counts().index
    for text_col in ("Autoimmune_Disease_Other", "Other_Disease")
])

## Test for associations

In [5]:
# For each disease PRS and each cell-type object: restrict the cohort to
# individuals with clinical records and without the disease in question,
# screen clinical covariates for correlation with the PRS, run the CNA
# global association test, and pickle the result (with a few extra attributes).
covs = ['age', 'sex_M', 'gPC1', 'gPC2', 'gPC3', 'gPC4', 'gPC5', 'gPC6']
for scorename in ['RA', "SLE"]:
    for celltype in ["Myeloid", "B", "NK", "T", "allcells"]:
        print(celltype)
        d = cna.read("/data/srlab/lrumker/datasets/onek1k/pheno/"+celltype+".h5ad")

        # QC individuals
        # Remove individuals lacking documented clinical information:
        # columns with fewer than 400 missing values are dropped, and anyone
        # missing every remaining column is excluded.
        meta = pd.read_csv(src_folder+"sample_meta.csv", index_col = 0)
        meta_clin = meta.drop(columns = meta.columns[pd.isna(meta).sum(axis=0)<400]) # 32 clinical variables
        rm_ids = meta_clin.index[pd.isna(meta_clin).all(axis=1)]
        meta = meta.drop(index=rm_ids)

        if scorename=="SLE":
            # Remove individuals with lupus mentioned in EITHER free-text column.
            # Each column is null-filled on its own, so a missing value in one
            # column can neither hide a mention in the other nor raise a
            # TypeError from a substring test against NaN.
            # Matching is a literal, case-sensitive substring search.
            SLE_terms = ['SLE', 'lupus']
            has_SLE = pd.Series(False, index=meta.index)
            for text_col in ['Autoimmune_Disease_Other', 'Other_Disease']:
                text = meta[text_col].fillna('').astype(str)
                for SLE_term in SLE_terms:
                    has_SLE |= text.str.contains(SLE_term, regex=False)
            meta['has_SLE'] = has_SLE
            SLE_ids = np.array(meta.index[meta.has_SLE])
            meta = meta.drop(index=SLE_ids)

        if scorename=="RA":
            # Remove individuals with RA
            # NOTE(review): `!= 1` also keeps individuals whose RA status is
            # missing (NaN != 1 is True) -- confirm this matches the intended
            # "known absence of RA" cohort.
            meta = meta.loc[meta.Rheumatoid_arthritis.values!=1,:]

        # Load PRS
        PRS = pd.read_csv("/data/srlab/lrumker/MCSC_Project/cna-prs/results/PRS/"+scorename+"_PRS_noMHC.csv", index_col = 0)
        PRS = PRS.set_index('IDs', drop = True)
        PRS.columns = [scorename]
        meta = meta.join(PRS)
        d.samplem = d.samplem.join(meta.loc[:,[scorename]]) # Ensures PRS only available for known non-SLE/RA indiv.

        # Check for PRS-correlated clinical covariates
        # (free-text columns are left out of the screen)
        clin_covs = meta.loc[:,['Autoimmune_Disease', 'Diabetes_type1', 'Diabetes_type2',
                   'Rheumatoid_arthritis', 'UlcerativeColitis', 'Hyperthyroidism', #'Autoimmune_Disease_Other',
                   'Hypertension', 'Hypercholesterolaemia', #'Ca_Type'
                   'Cancer', 'Eye_Disease', 'Osteoporosis', #'Eye_DiseaseType',
                   'COPD', 'Statin', 'ACE_inhibitor', #'Other_Disease',
                   'AngiotensinReptorBlocker', 'CalciumChannelBlocker', 'BetaBlocker',
                   'BetaAgonist', 'Diuretic', 'Oral_hypoglycaemic', 'Insulin', 'OCP',
                   'Paracetamol', 'Aspirin', 'Colchicine', 'PPI', 'Thyroxine',
                        'SmokingStatus_N']] #'Other_Meds'
        clin_covs = clin_covs.join(d.samplem[scorename])
        # Rows of `meta` with no score after this join have a NaN PRS. They must
        # be masked too, otherwise np.corrcoef returns NaN and the screen below
        # can never flag anything.
        has_prs = clin_covs[scorename].notna()
        cor_records = []
        for clin_cov in clin_covs.columns.drop(scorename):
            keep = clin_covs[clin_cov].notna() & has_prs
            # A covariate that is constant among the kept individuals has no defined correlation
            if clin_covs.loc[keep, clin_cov].nunique() > 1:
                obs_cor = np.corrcoef(clin_covs.loc[keep, scorename], clin_covs.loc[keep, clin_cov])[0,1]
                cor_records.append({'Cov': clin_cov, 'Rsq': obs_cor**2})
        # Explicit columns keep `.Rsq` available even when no covariate qualified
        clin_cov_cors = pd.DataFrame(cor_records, columns=['Cov', 'Rsq'])
        # NOTE(review): the report is triggered by any R^2 > 0.1 but then lists
        # every covariate with R^2 >= 0.01 -- confirm both thresholds are intended.
        if (clin_cov_cors.Rsq > 0.1).any():
            print("Found correlated covariate(s):")
            print(clin_cov_cors.loc[clin_cov_cors.Rsq>=0.01,:])

        # Same k selection procedure as in GeNA
        # Select k based on cumulative % variance explained
        cum_varexp = np.cumsum(d.uns['NAM_varexp'])
        max_nampc = [np.min(np.where(cum_varexp>cum_var_exp)[0]) for cum_var_exp in [0.50, 0.80]]
        np.random.seed(0)  # re-seed so each association test is reproducible on its own
        res = cna.tl.association(d, d.samplem[scorename], batches = d.samplem.batch,
                             covs = d.samplem[covs], local_test = False, ks = max_nampc)

        # Save sample-level phenotype: sample loadings on the first res.k NAM PCs times res.beta
        sample_pheno = pd.DataFrame(np.dot(d.uns['NAM_sampleXpc'].iloc[:,:res.k], res.beta))
        sample_pheno.index = d.uns['NAM_sampleXpc'].index
        res.sample_pheno = sample_pheno

        # Save cohort size (individuals with a non-missing PRS in this object)
        res.N = np.sum(~pd.isna(d.samplem[scorename]))

        # Compute variable gene correlations to the phenotype
        vargene_cors = [np.corrcoef(d.X[res.kept, i_gene], res.ncorrs)[0,1]
                        for i_gene in np.arange(d.var.shape[0])]
        res.vargene_cors = pd.DataFrame({'gene':d.var.index, 'cor': vargene_cors})

        # Save result; the context manager closes the file handle deterministically
        with open("/data/srlab/lrumker/MCSC_Project/cna-prs/results/PRS/"+scorename+"_"+celltype+".p", 'wb') as res_file:
            pickle.dump(res, res_file)

Myeloid
B




NK
T
allcells




Myeloid
B
NK




T
allcells


### Test significant associations among individuals with documented absence of ANY autoimmune disease

In [6]:
# Re-test the (PRS, cell type) pairs listed below in a stricter cohort:
# individuals with clinical records and no free-text mention of any of the
# listed autoimmune diseases. Results are pickled with a "_noAID" suffix.
covs = ['age', 'sex_M', 'gPC1', 'gPC2', 'gPC3', 'gPC4', 'gPC5', 'gPC6']
scorenames = ["SLE", "RA"]
celltypes=["Myeloid", "allcells"]
# scorenames and celltypes are paired positionally (SLE-Myeloid, RA-allcells)
for scorename, celltype in zip(scorenames, celltypes):
    d = cna.read("/data/srlab/lrumker/datasets/onek1k/pheno/"+celltype+".h5ad")

    # QC individuals
    # Remove individuals without documented clinical information:
    # columns with fewer than 400 missing values are dropped, and anyone
    # missing every remaining column is excluded.
    meta = pd.read_csv(src_folder+"sample_meta.csv", index_col = 0)
    meta_clin = meta.drop(columns = meta.columns[pd.isna(meta).sum(axis=0)<400]) # 32 clinical variables
    rm_ids = meta_clin.index[pd.isna(meta_clin).all(axis=1)]
    meta = meta.drop(index=rm_ids)

    # Remove individuals with any autoimmune disease
    # other_dz lists the distinct free-text entries; it is not used further in this cell.
    other_dz = meta.Other_Disease.value_counts().index
    other_dz = np.concatenate([meta.Autoimmune_Disease_Other.value_counts().index, other_dz])
    AIDs = ['MS', 'SLE', 'Sarcoidosis', "Sjogren's", 'ankylosing spondylitis', 'multiple sclerosis',
            "Crohn's", 'dermatomyositis', 'lupus', 'palindromic rheumatism','psoriasis',
            'sarcoidosis', 'scleroderma', 'psoriatic arthritis']
    # Search BOTH free-text columns, null-filling each on its own, so a missing
    # value in one column can neither hide a mention in the other nor raise a
    # TypeError from a substring test against NaN.
    # Matching is a literal, case-sensitive substring search.
    # NOTE(review): only free-text mentions are used here; structured flags such
    # as Rheumatoid_arthritis are not consulted -- confirm that is intended for
    # an "any autoimmune disease" exclusion.
    any_AID = pd.Series(False, index=meta.index)
    for text_col in ['Autoimmune_Disease_Other', 'Other_Disease']:
        text = meta[text_col].fillna('').astype(str)
        for AID_term in AIDs:
            any_AID |= text.str.contains(AID_term, regex=False)
    meta['Any_AID'] = any_AID
    AID_ids = np.array(meta.index[meta.Any_AID])
    meta = meta.drop(index=AID_ids)

    # Load PRS
    PRS = pd.read_csv("/data/srlab/lrumker/MCSC_Project/cna-prs/results/PRS/"+scorename+"_PRS_noMHC.csv", index_col = 0)
    PRS = PRS.set_index('IDs', drop = True)
    PRS.columns = [scorename]
    meta = meta.join(PRS)
    # Left join: samples excluded above end up with a missing PRS in d.samplem
    d.samplem = d.samplem.join(meta.loc[:,[scorename]])

    # Same k selection procedure as in GeNA
    # Select k based on cumulative % variance explained
    cum_varexp = np.cumsum(d.uns['NAM_varexp'])
    max_nampc = [np.min(np.where(cum_varexp>cum_var_exp)[0]) for cum_var_exp in [0.50, 0.80]]
    np.random.seed(0)  # re-seed so each association test is reproducible on its own
    res = cna.tl.association(d, d.samplem[scorename], batches = d.samplem.batch,
                         covs = d.samplem[covs], local_test = False, ks = max_nampc)

    # Save sample-level phenotype: sample loadings on the first res.k NAM PCs times res.beta
    sample_pheno = pd.DataFrame(np.dot(d.uns['NAM_sampleXpc'].iloc[:,:res.k], res.beta))
    sample_pheno.index = d.uns['NAM_sampleXpc'].index
    res.sample_pheno = sample_pheno

    # Save cohort size (individuals with a non-missing PRS in this object)
    res.N = np.sum(~pd.isna(d.samplem[scorename]))

    # Compute variable gene correlations to the phenotype
    vargene_cors = [np.corrcoef(d.X[res.kept, i_gene], res.ncorrs)[0,1]
                    for i_gene in np.arange(d.var.shape[0])]
    res.vargene_cors = pd.DataFrame({'gene':d.var.index, 'cor': vargene_cors})

    # Save result; the context manager closes the file handle deterministically
    with open("/data/srlab/lrumker/MCSC_Project/cna-prs/results/PRS/"+scorename+"_"+celltype+"_noAID.p", 'wb') as res_file:
        pickle.dump(res, res_file)

### Assemble results

In [7]:
# Collect one row per (PRS, cell type) result, FDR-adjust the p-values across
# the primary tests, append the no-autoimmune-disease sensitivity results, and
# write the table as LaTeX.
# The pickles loaded here are the ones written by the association cells above;
# pickle.load must only be used on trusted files.
primary_rows = []
for scorename in ["SLE", "RA"]:
    for celltype in ["Myeloid", "B", "NK", "T", "allcells"]:
        with open("/data/srlab/lrumker/MCSC_Project/cna-prs/results/PRS/"+\
                  scorename+"_"+celltype+".p", 'rb') as res_file:
            res = pickle.load(res_file)
        primary_rows.append(pd.DataFrame({"Cell Type": [celltype], "Disease PRS": [scorename],
                                          "Cohort": ["Known absence of "+scorename],
                                          "N": [res.N], "P": [res.p]}))
# Concatenate once, with a fresh 0..n-1 index (row-wise concat left every row at index 0)
all_res = pd.concat(primary_rows, ignore_index=True)

# Benjamini-Hochberg adjustment across the primary tests only
_, ps_adj, _, _ = sm.stats.multipletests(all_res.P.values, method="fdr_bh", alpha=0.05)
all_res['P, adjusted'] = ps_adj

# Add results to table
# Sensitivity analyses are reported without adjustment ("N.A.")
scorenames = ["SLE", "RA"]
celltypes=["Myeloid", "allcells"]
noAID_rows = []
for scorename, celltype in zip(scorenames, celltypes):
    with open("/data/srlab/lrumker/MCSC_Project/cna-prs/results/PRS/"+\
              scorename+"_"+celltype+"_noAID.p", 'rb') as res_file:
        res = pickle.load(res_file)
    noAID_rows.append(pd.DataFrame({"Cell Type": [celltype], "Disease PRS": [scorename],
                                    "Cohort": ["Known absence of any autoimmune disease"],
                                    "N": [res.N], "P": [res.p], "P, adjusted": ["N.A."]}))
all_res = pd.concat([all_res] + noAID_rows, ignore_index=True)

table_latex = all_res.to_latex(index = False,  escape=False,
              column_format='p{1.5cm}|p{2cm}|p{3.5cm}|p{0.5cm}|p{1.5cm}|p{1.7cm}')
# Add lines between rows. "\\ \\hline\n" is the same runtime string as the
# former "\\ \hline\n" but avoids the invalid "\h" escape sequence.
table_latex = table_latex.replace("\\\n", "\\ \\hline\n")
with open('/data/srlab/lrumker/MCSC_Project/cna-qtl/tables/supptable.prs_results.tex','w') as tf:
    tf.write(table_latex)
all_res

Unnamed: 0,Cell Type,Disease PRS,Cohort,N,P,"P, adjusted"
0,Myeloid,SLE,Known absence of SLE,282,0.003996,0.03996
0,B,SLE,Known absence of SLE,492,0.318681,0.455259
0,NK,SLE,Known absence of SLE,511,0.24975,0.41625
0,T,SLE,Known absence of SLE,531,0.36963,0.462038
0,allcells,SLE,Known absence of SLE,532,0.053946,0.17982
0,Myeloid,RA,Known absence of RA,274,0.165834,0.331668
0,B,RA,Known absence of RA,478,0.13986,0.331668
0,NK,RA,Known absence of RA,498,0.527473,0.586081
0,T,RA,Known absence of RA,517,0.605395,0.605395
0,allcells,RA,Known absence of RA,518,0.026973,0.134865


### Review whether detected phenotypes are consistent

In [8]:
# For each re-tested (PRS, cell type) pair, compare the phenotype found in the
# disease-specific cohort with the one found in the no-autoimmune-disease cohort.
# The pickles loaded here are the ones written by the association cells above;
# pickle.load must only be used on trusted files.
scorenames = ["SLE", "RA"]
celltypes=["Myeloid", "allcells"]
for scorename, celltype in zip(scorenames, celltypes):
    print(celltype+", "+scorename)
    with open("/data/srlab/lrumker/MCSC_Project/cna-prs/results/PRS/"+\
              scorename+"_"+celltype+".p", 'rb') as res_file:
        res = pickle.load(res_file)
    with open("/data/srlab/lrumker/MCSC_Project/cna-prs/results/PRS/"+\
              scorename+"_"+celltype+"_noAID.p", 'rb') as res_file:
        noAID_res = pickle.load(res_file)

    # Put both sample-level phenotypes side by side, aligned on the sample index
    res.sample_pheno.columns=["noDZ_pheno"]
    noAID_res.sample_pheno.columns=["noAID_pheno"]
    res.sample_pheno = res.sample_pheno.join(noAID_res.sample_pheno)

    # Sample-level phenotype, restricted to samples that have a noAID value
    in_both = ~pd.isna(res.sample_pheno.noAID_pheno)
    rsq = res.sample_pheno.loc[in_both,:].corr().iloc[0,1]**2
    print("Sample-level phenotype r-sq: "+ str(rsq))

    # Proxy for neighborhood-level phenotype (when not all neighborhoods are shared between objects)
    # Compares the two per-gene correlation vectors position by position
    rsq = np.corrcoef(res.vargene_cors.cor.values, noAID_res.vargene_cors.cor.values)[0,1]**2
    print("Vargene correlation r-sq: "+ str(rsq))
    print("")

Myeloid, SLE
Sample-level phenotype r-sq: 0.9955333673752494
Vargene correlation r-sq: 0.9483535421356725

allcells, RA
Sample-level phenotype r-sq: 0.885751258968871
Vargene correlation r-sq: 0.972497761948934

