In [7]:
import scanpy as sc
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [6]:
# Cap how many DataFrame rows pandas renders in cell output.
pd.options.display.max_rows = 50

# load all the data files

In [67]:
MAGMA_PATH='/ahg/regevdata/projects/scgwas/data/avivalkes_grant/results/magma/MAGMA_v108_GENE_10_ZSTAT.v2.txt'

def load_magma(path=None):
    """Load the MAGMA score table and return it transposed, with gaps set to 0.

    Parameters
    ----------
    path : str, optional
        Tab-separated file to read. Defaults to ``MAGMA_PATH``.

    Returns
    -------
    pandas.DataFrame
        The table read from ``path``, transposed, with every missing value
        replaced by 0. Downstream code in this notebook selects traits from
        the index and genes from the columns of the returned frame.
    """
    if path is None:
        path = MAGMA_PATH
    # Transpose first, then fill: the order does not change the result, but
    # it keeps the returned frame in the orientation the callers index into.
    return pd.read_csv(path, sep='\t').T.fillna(0)

def _resolve_nmf_program(nmfprogram):
    """Return the 0-based NMF program index to load, as a plain ``int``."""
    if isinstance(nmfprogram, (int, np.integer)) and not isinstance(nmfprogram, bool):
        return int(nmfprogram)
    # Backward compatibility: earlier versions of this function ignored the
    # `nmfprogram` argument and read a notebook-level `nmf_program` variable
    # instead. Callers that pass a non-integer value keep that behaviour.
    legacy_index = globals().get('nmf_program')
    if isinstance(legacy_index, (int, np.integer)) and not isinstance(legacy_index, bool):
        return int(legacy_index)
    raise TypeError("nmfprogram must be an integer program index, got %r" % (nmfprogram,))


def compute_celltypeprogram_genes(magma_scores, celltype, trait, programtype, tissue=None, nmfprogram=None, status=None,
                                  score_threshold=0.8, min_program_genes=150, n_top_genes=50):
    """Return the genes of one gene program with the highest MAGMA score for a trait.

    The program's (gene, score) table is read from disk, restricted to its
    strongest member genes, intersected with the genes present in
    ``magma_scores``, and ranked by the MAGMA score of ``trait``.

    Parameters
    ----------
    magma_scores : pandas.DataFrame
        Frame indexed by trait with one column per gene (as returned by
        ``load_magma``).
    celltype : str or None
        Used only when ``programtype`` is neither ``'healthytopics'`` nor
        ``'jointtopics'``; it is substituted into the file name
        ``<programtype>/celltypeenriched_L2/<celltype>_L2.txt``.
    trait : str
        Row label of ``magma_scores`` to rank genes by.
    programtype : str
        ``'healthytopics'`` or ``'jointtopics'`` select NMF program files;
        any other value is used as a sub-directory of cell-type programs.
    tissue : str, optional
        Directory (and, for ``'healthytopics'``, file prefix) of the NMF
        program file.
    nmfprogram : int, optional
        0-based NMF program index; the file that is read is numbered
        ``nmfprogram + 1``.
    status : str, optional
        File-name prefix component for ``'jointtopics'`` programs.
    score_threshold : float, optional
        Program membership score a gene must reach to count as a member.
    min_program_genes : int, optional
        If no more than this many genes reach ``score_threshold``, the
        ``min_program_genes`` highest-scoring genes are used instead.
    n_top_genes : int, optional
        Maximum number of genes returned.

    Returns
    -------
    list of str
        Up to ``n_top_genes`` gene names, highest MAGMA score first. Ties
        keep the column order of ``magma_scores``.

    Raises
    ------
    KeyError
        If ``trait`` is not in ``magma_scores.index``.
    ValueError
        If a cell-type program is requested without a ``celltype``.
    TypeError
        If an NMF program is requested without an integer program index.
    """
    MODULE_DIR='/ahg/regevdata/projects/scgwas/data/singlecell/modules/'
    if programtype == 'healthytopics':
        program_number = _resolve_nmf_program(nmfprogram) + 1
        module_path = MODULE_DIR + 'nmf_membership/nmf_corprogram_healthy_new/%s/%s_corprog_%d.txt'%(tissue, tissue, program_number)
    elif programtype == 'jointtopics':
        program_number = _resolve_nmf_program(nmfprogram) + 1
        module_path = MODULE_DIR + '/nmf_membership/nmf_corprogram_shared_new/%s/nmf_%s%s.txt'%(tissue, status, program_number)
    else:
        if celltype is None:
            # Catches mistyped NMF program types, which would otherwise try
            # to open a file literally named 'None_L2.txt'.
            raise ValueError(
                "programtype %r is treated as a cell-type program and needs a celltype; "
                "use 'healthytopics' or 'jointtopics' for NMF programs" % (programtype,))
        module_path = MODULE_DIR+ '/' + programtype + '/celltypeenriched_L2/%s_L2.txt'%celltype

    module_scores = pd.read_csv(module_path, sep='\t', header=None)
    module_scores.columns = ['Gene', 'score']

    # Program members: every gene at or above the threshold, unless that
    # leaves too few genes, in which case fall back to the top-scoring ones.
    strong_members = module_scores.loc[module_scores['score'] >= score_threshold, 'Gene']
    if len(strong_members) > min_program_genes:
        program_genes = strong_members
    else:
        program_genes = module_scores.sort_values(by='score', ascending=False)['Gene'].iloc[:min_program_genes]

    # Select columns with a boolean mask rather than a set: pandas 2.x rejects
    # set indexers, and a mask keeps the column order deterministic.
    gene_mask = magma_scores.columns.isin(program_genes.values)
    trait_mask = magma_scores.index == trait
    trait_rows = magma_scores.loc[trait_mask, gene_mask]
    if trait_rows.shape[0] == 0:
        raise KeyError("trait %r not found in magma_scores index" % (trait,))

    # Rank by descending score; a stable sort makes tied genes reproducible.
    trait_scores = trait_rows.iloc[0].to_numpy(dtype=float)
    ranking = np.argsort(-trait_scores, kind='stable')
    return trait_rows.columns[ranking][:n_top_genes].tolist()

# cell types

In [36]:
# (trait, celltype) pairs, in the order they are unpacked by the loop that
# consumes this list. The celltype string has the form '<dataset>/<cell type>'.
celltype_trait_pairs = [
    ('PASS_Celiac', 'zheng_pbmc/T_Lymphocytes'),
    ('PASS_Ulcerative_Colitis', 'zheng_pbmc/B_Lymphocytes'),
    ('PASS_Alzheimers_Jansen2019', 'zheng_pbmc/Monocytes'),
    ('PASS_Alzheimers_Jansen2019', 'zheng_pbmc/DC'),
    ('PASS_Alzheimers_Jansen2019', 'alzheimers_brain/Microglia'),
    ('PASS_MDD_Wray2018', 'brain/GABAergic'),
    ('PASS_Intelligence_SavageJansen2018', 'brain/Glutamatergic'),
    ('PASS_Insomnia_Jansen2019', 'brain/Glutamatergic'),
    ('UKB_460K.biochemistry_TotalBilirubin', 'liver/Epithelial'),
    ('UKB_460K.biochemistry_Creatinine', 'kidney/Connecting_tubule'),
    ('PASS_AtrialFibrillation_Nielsen2018', 'heart/Atrial_Cardiomyocyte'),
    ('UKB_460K.bp_SYSTOLICadjMEDz', 'heart/Pericyte'),
    ('UKB_460K.bp_DIASTOLICadjMEDz', 'heart/Pericyte'),
    ('UKB_460K.bp_SYSTOLICadjMEDz', 'heart/Smooth_Muscle'),
    ('UKB_460K.bp_DIASTOLICadjMEDz', 'heart/Smooth_Muscle'),
    ('UKB_460K.lung_FEV1FVCzSMOKE', 'kropski_lung/Fibroblasts'),
    ('PASS_ChildOnsetAsthma_Ferreira2019', 'kropski_lung/T_Lymphocytes'),
    ('PASS_IBD_deLange2017', 'xavier_colon/Endothelial'),
    ('PASS_IBD_deLange2017', 'xavier_colon/M_cells'),
    ('PASS_UC_deLange2017', 'xavier_colon/Enteroendocrine'),
    ('UKB_460K.body_WHRadjBMIz', 'xavier_colon/Fibroblast'),
    ('UKB_460K.disease_ALLERGY_ECZEMA_DIAGNOSED', 'skin/Langerhans_cells'),
    ('UKB_460K.body_WHRadjBMIz', 'adipose/Fat'),
]

In [37]:
# Gene symbols as a list of strings, built by splitting one comma-separated
# literal. NOTE(review): the name suggests a monogenic-disease gene set;
# the source of the list is not recorded here.
monogenic_genes = (
    'ADA,ALPI,ANKZF1,ARPC1B,BACH2,BCL10,BTK,CD3G,CD40LG,CD55,'
    'COL7A1,CTLA4,CYBA,CYBB,DCLRE1C,DKC1,DOCK2,FERMT1,FOXP3,G6PC3,'
    'GUCY2C,HPS1,HPS3,HPS4,ICOS,IKBKG,IL10,IL10RA,IL10RB,IL21,'
    'IL2RA,IL2RG,IRF2BP2,ITCH,ITGB2,LACC1,LIG4,LRBA,MALT1,MASP2,'
    'MVK,NCF1,NCF2,NCF4,NFAT5,NLRC4,NPC1,PIK3CD,PIK3R1,POLA1,'
    'RAG1,RAG2,RBCK1,RELA,RIPK1,RTEL1,SIRT1,SKIV2L,SLC26A3,SLC37A4,'
    'SLC9A3,SLCO2A1,STAT1,STAT3,STIM1,TGFB1,TGFBR1,TGFBR2,TNFAIP3,TRIM22,'
    'TRNT1,TTC37,TTC7A,TYMP,WAS,WIPF1,XIAP,ZAP70'
).split(',')

In [38]:
# Load the MAGMA table once; later cells reuse `magma_scores`.
magma_scores = load_magma()

# One tab-separated row per (trait, cell type): trait, cell type, top genes.
for trait, celltype in celltype_trait_pairs:
    topgenes = compute_celltypeprogram_genes(magma_scores, celltype, trait, 'healthy')
    print(trait, celltype, ','.join(topgenes), sep='\t')

PASS_Celiac	zheng_pbmc/T_Lymphocytes	ETS1,CD247,CD28,RCAN3,ANKRD12,TXK,ANXA6,LBH,C12orf75,UBASH3A,PA2G4,C11orf58,NDFIP1,PYHIN1,GRAP2,RPL18,APRT,SOCS3,TNFAIP8,DSTN,RPL6,RORA,RAC2,LAT,PTPRCAP,TAF7,STMN1,BCL11B,HINT1,LEF1,RPS25,GZMK,PRR5,RPA2,SOD1,TMEM173,RGS10,ANXA2R,RPS20,SKAP1,GOLGA7,B2M,CCR7,TRAT1,RBL2,GATA3,HNRNPK,ARHGDIB,RPL35A,TERF2IP
PASS_Ulcerative_Colitis	zheng_pbmc/B_Lymphocytes	REL,GPX1,LSP1,IMPDH2,FAM26F,EIF6,BRK1,SHMT2,NFKBIA,LAPTM5,RPL23A,CTSS,BANK1,PRKCB,MS4A1,HHEX,ALOX5,CCDC50,TCF4,RPS5,ENSA,AFF3,RPS13,USF2,SLC50A1,SCIMP,CD19,DUSP1,DBNL,FAU,ARID5B,ZFP36L1,SELL,NCF4,ADK,PLAC8,RPL23,RPL28,EZR,PPAPDC1B,LSM10,PKIG,AKR1A1,PRR13,RNASET2,BLOC1S2,POU2F2,RPS11,WHSC1L1,LTA4H
PASS_Alzheimers_Jansen2019	zheng_pbmc/Monocytes	MS4A6A,MS4A4A,CD33,FCER1G,LILRA5,TMEM219,ALDOA,PILRA,PPM1N,ZYX,GMFG,ORAI3,FOSB,CTSH,ABI3,SPI1,ERCC1,NDUFA2,SUPT4H1,LAMTOR4,LACTB,VASP,GRN,SCIMP,SLC16A3,SYK,LILRB2,RAC1,SRGN,LILRB4,SHKBP1,RRBP1,CLTA,LRRC25,PYCARD,COX8A,RIN3,LILRB1,GAPDH,SIDT2,ITGAL,RASSF4,SLC12A9,T

# Disease progression cell types

In [45]:
# (trait, celltype) pairs for the disease-progression programs, in the order
# they are unpacked by the loop that consumes this list.
# ('PASS_IBD_deLange2017', 'UC/Disease_T_Lymphocytes') was listed twice; the
# repeat only recomputed and reprinted the same row, so it is listed once.
celltype_trait_pairs = [
    ('PASS_UC_deLange2017', 'UC/Disease_T_Lymphocytes'),
    ('PASS_IBD_deLange2017', 'UC/Disease_T_Lymphocytes'),
    ('PASS_UC_deLange2017', 'UC/Disease_Enterocytes'),
    ('PASS_UC_deLange2017', 'UC/Disease_TA'),
    ('PASS_UC_deLange2017', 'UC/Disease_M_cells'),
    ('PASS_IBD_deLange2017', 'UC/Disease_M_cells'),
    ('PASS_Multiple_sclerosis', 'MS/Disease_GABAergic'),
    ('PASS_Multiple_sclerosis', 'MS/Disease_Glia'),
    ('PASS_Multiple_sclerosis', 'MS/Disease_Endothelial'),
    ('PASS_Multiple_sclerosis', 'MS/Disease_Microglia'),
    ('PASS_Alzheimers_Jansen2019', 'Alzheimers/Disease_Microglia'),
    ('UKB_460K.lung_FEV1FVCzSMOKE', 'asthma_disease/Fibroblast'),
    ('PASS_ChildOnsetAsthma_Ferreira2019', 'asthma_disease/T_Lymphocyte'),
    ('IPF_Allen2020', 'Fibrosis/Disease_Mucous'),
]

In [48]:
# One tab-separated row per (trait, cell type): trait, cell type, top genes.
for trait, celltype in celltype_trait_pairs:
    topgenes = compute_celltypeprogram_genes(magma_scores, celltype, trait, 'disease')
    print(trait, celltype, ','.join(topgenes), sep='\t')

PASS_UC_deLange2017	UC/Disease_T_Lymphocytes	REL,STAT3,GPX1,ZFP36L1,LEF1,PRKCB,NUCKS1,TRPS1,FYB,ITLN1,CREM,SELL,CYTH1,RPLP2,TNFRSF25,NR3C1,BRD7,NDFIP1,GLCCI1,NFKB1,CAPZA1,SMCHD1,CTNNB1,CD44,PRDM1,RPS21,TNFRSF4,NSD1,PIM3,USF2,POLR2B,BATF,TNRC6B,ARID5A,CHD2,PDCD4,BPTF,ICA1,FAM107B,C6orf62,SRSF9,SLFN5,ETS1,RBPJ,YPEL3,ITGB2,DYNLL1,PCSK7,ASH1L,RNF168
PASS_IBD_deLange2017	UC/Disease_T_Lymphocytes	REL,CYLD,BRD7,CREM,STAT3,ZFP36L1,GPX1,LNPEP,NDFIP1,ITLN1,RPL37,ASH1L,SMCHD1,TNFRSF4,PTPRC,SELL,TNFRSF25,ETS1,TMEM258,CYTH1,VMP1,BRD4,NUCKS1,PRKCB,BATF,IRF2BP2,TNFRSF18,FYB,TRPS1,FAM89B,PRDM1,CAPZA1,CTNNB1,HSPE1,MSI2,DNAJC7,SKP1,CDC42SE2,HSPD1,PIM3,NSD1,RPL9,RPS29,USF2,SON,TCF7,CCR7,HINT1,PDCD1,PPP1CB
PASS_UC_deLange2017	UC/Disease_Enterocytes	C1orf106,REL,RHOA,APEH,GPR35,DLD,GSDMB,STAT3,TNFRSF14,DAG1,GPX1,RNF186,NDUFAF3,SEC16A,NR5A2,RBM5,FAM213B,RBM6,PEX13,UBA7,MED24,MST1R,NXPE1,ERRFI1,AHSA2,USP34,APOBR,CCL20,MTX1,MUC1,SERBP1,PTPN2,CLN3,KRTCAP2,HMGXB4,ZFP91,QARS,ZFP36L1,SMARCE1,CHP1,PDXDC1,HNF4A,MAP

# NMF programs

In [60]:
# (trait, nmf_program, tissue) triples, in the order they are unpacked by the
# loop that consumes this list. The trailing comment labels each program.
nmf_trait_pairs = [
    ('PASS_Alzheimers_Jansen2019', 8, 'krasnow_pbmc'),  # Classical
    ('PASS_Alzheimers_Jansen2019', 15, 'krasnow_pbmc'),  # Nonclassical
    ('UKB_460K.disease_ALLERGY_ECZEMA_DIAGNOSED', 5, 'krasnow_pbmc'),  # cd4 t cells
    ('UKB_460K.disease_ALLERGY_ECZEMA_DIAGNOSED', 7, 'zheng_pbmc'),  # IL2 signaling
    ('PASS_Celiac', 7, 'zheng_pbmc'),  # IL2 signaling
    ('PASS_UC_deLange2017', 5, 'zheng_pbmc'),  # MHC
    ('PASS_Alzheimers_Jansen2019', 5, 'zheng_pbmc'),  # MHC
    ('PASS_Celiac', 2, 'ICA_bonemarrow'),  # complement cascade
    ('PASS_MDD_Wray2018', 0, 'allen_brain'),  # SST
    ('PASS_MDD_Wray2018', 1, 'allen_brain'),  # PVALB
    ('UKB_460K.body_WHRadjBMIz', 9, 'allen_brain'),  # LAMP5
    ('UKB_460K.body_WHRadjBMIz', 10, 'allen_brain'),  # VIP
    ('PASS_Neuroticism_Nagel2018', 2, 'allen_brain'),  # IT
    ('Fluid_intelligence_score', 4, 'allen_brain'),  # L6
    ('UKB_460K.cov_EDU_YEARS', 5, 'allen_brain'),  # electron transport
]

"""
Alzheimers Healthy Blood Classical Monocytes
Alzheimers Healthy Blood MHC-II antigen processing
Ulcerative colitis Healthy Blood MHC-II antigen processing 
Eczema Healthy Blood IL-2 signaling 
Rheumatoid arthritis Healthy Blood Complement cascade
Major Depressive Disorder Healthy Brain SST 
Major Depressive Disorder Healthy Brain PVALB 
Number of children born Healthy Brain LAMP5 
Number of children born Healthy Brain VIP
Neuroticism Healthy Brain Neurexin.Neureligin
Neuroticism Healthy Brain Addiction
"""

'\nAlzheimers Healthy Blood Classical Monocytes\nAlzheimers Healthy Blood MHC-II antigen processing\nUlcerative colitis Healthy Blood MHC-II antigen processing \nEczema Healthy Blood IL-2 signaling \nRheumatoid arthritis Healthy Blood Complement cascade\nMajor Depressive Disorder Healthy Brain SST \nMajor Depressive Disorder Healthy Brain PVALB \nNumber of children born Healthy Brain LAMP5 \nNumber of children born Healthy Brain VIP\nNeuroticism Healthy Brain Neurexin.Neureligin\nNeuroticism Healthy Brain Addiction\n'

In [65]:
# One tab-separated row per entry: trait, tissue, program index, top genes.
# The program index passed to the function is this loop's own `nmf_program`,
# not a variable left over from an earlier cell.
for trait, nmf_program, tissue in nmf_trait_pairs:
    topgenes = compute_celltypeprogram_genes(magma_scores, None, trait, 'healthytopics', tissue=tissue, nmfprogram=nmf_program)
    print("%s\t%s\t%s\t%s"%(trait, tissue, nmf_program, ','.join(topgenes)))

PASS_Alzheimers_Jansen2019	krasnow_pbmc	8	MS4A6A,CD33,CR1,FCER1G,LILRA5,ALDOA,PILRA,CTSH,SPI1,LAMTOR4,CSGALNACT2,ITGAM,GRN,ANPEP,LILRB2,RAC1,SRGN,TREM1,PYCARD,GAPDH,TMEM170B,CD93,SAMSN1,CD55,CD14,FKBP5,FCGR2A,ROGDI,HNMT,SERPINA1,VNN1,NRG1,CD1D,GRINA,CPVL,FPR1,QPCT,FPR2,H2AFY,AQP9,FAM101B,ACSL1,CD68,NUDT16,NFE2,RNF130,TGFBI,ASPH,CD302,IFI30
PASS_Alzheimers_Jansen2019	krasnow_pbmc	15	MS4A6A,MS4A4A,CD33,FCER1G,LILRA5,ALDOA,PILRA,CTSH,ABI3,SPI1,VASP,GRN,SCIMP,SLC16A3,LILRB2,RAC1,SRGN,CLTA,LRRC25,PYCARD,LILRB1,GAPDH,TAGLN,ADGRE2,CD55,PPP1R17,CD14,FCGR2A,ROGDI,HNMT,SERPINA1,PDK4,CSTB,CD300A,SIGLEC10,TNFRSF1B,DUSP6,TBC1D8,ITM2B,GRINA,CPVL,FPR1,SYNGR2,H2AFY,ARPC3,ARL4A,CD68,NUDT16,MIS18BP1,RNF130
UKB_460K.disease_ALLERGY_ECZEMA_DIAGNOSED	krasnow_pbmc	5	IL7R,STMN3,NDFIP1,LIME1,RPL5,CD6,ARHGAP15,GPR183,HMGN1,ADTRP,GIMAP7,CD52,RPS16,RPS11,RPL13A,FLT3LG,NOSIP,BCL2,TAGAP,TNFSF8,RPL38,CCR7,SIRPG,SELM,RPS3A,RPL41,TRABD2A,CD5,RPL37,RPL27A,FOXP1,LCK,SOD1,RHOH,BTG1,RPS10,TSTD1,SH3YL1,RPS23,TOB1,RPS8,TRA

# joint NMF programs

In [68]:
# (trait, nmf_program, status, tissue) entries, in the order they are
# unpacked by the loop that consumes this list.
# NOTE(review): two entries appear twice with different trailing labels
# (alzheimers program 17, asthma program 31) — confirm whether the later
# ones were meant to reference other programs.
nmf_trait_pairs = [
    ('PASS_UC_deLange2017', 11, 'disease', 'colon'),  # EGFR1 pathway
    ('PASS_Multiple_sclerosis', 16, 'disease', 'ms'),  # L2_3
    ('PASS_Multiple_sclerosis', 21, 'disease', 'ms'),  # Complement Cascade
    ('PASS_Alzheimers_Jansen2019', 17, 'disease', 'alzheimers'),  # Complement Cascade
    ('PASS_ChildOnsetAsthma_Ferreira2019', 31, 'healthy', 'asthma'),  # macrophage neutrophil transition and asthma from asthma data
    ('PASS_ChildOnsetAsthma_Ferreira2019', 23, 'healthy', 'asthma'),  # asthma data asthma trait fc epsilon receptor
    ('UKB_460K.lung_FEV1FVCzSMOKE', 14, 'disease', 'asthma'),  # asthma data lung capacity trait mapk signaling
    ('PASS_Alzheimers_Jansen2019', 17, 'disease', 'alzheimers'),  # Apelin signaling
    ('PASS_ChildOnsetAsthma_Ferreira2019', 31, 'healthy', 'asthma'),  # Apelin signaling
    ('PASS_ChildOnsetAsthma_Ferreira2019', 32, 'healthy', 'asthma'),  # Apelin signaling
]

In [69]:
# One tab-separated row per entry: trait, tissue, program index, top genes.
# The program type must be spelled 'jointtopics' to reach the joint-NMF
# branch of compute_celltypeprogram_genes; any other spelling is treated as
# a cell-type program directory. The program index is this loop's own
# `nmf_program`, not a variable left over from an earlier cell.
for trait, nmf_program, status, tissue in nmf_trait_pairs:
    topgenes = compute_celltypeprogram_genes(magma_scores, None, trait, 'jointtopics', tissue=tissue, nmfprogram=nmf_program, status=status)
    print("%s\t%s\t%s\t%s"%(trait, tissue, nmf_program, ','.join(topgenes)))

PASS_UC_deLange2017	colon	11	C1orf106,SLC26A3,NXPE4,NXPE1,UQCRC1,CLDN3,CLDN4,HNF4A,CDH1,TSPO,SATB2,SULT1A1,ITPKA,DGAT1,CHP2,PPP1R1B,MS4A12,RAPGEFL1,LSR,GOLM1,RAB25,TSPAN1,DHRS11,GPA33,FXYD3,RETSAT,SCNN1B,CDX1,TDP2,PRR15L,GUCA2B,CES2,CHCHD10,BEST4,KRTCAP3,IFT172,TSPAN3,PIGR,ASL,MUC12,TMEM171,DDC,FAM162A,CES3,USMG5,HSBP1L1,CKB,FAM3D,GUCA2A,NR2F6
PASS_Multiple_sclerosis	ms	16	AHI1,STAT4,PITPNM2,SLC25A12,ANKRD33B,ICA1L,ROCK2,ARHGAP26,FAM163A,MED12L,RAPGEFL1,ICA1,CA10,MLIP,RCAN2,GAP43,ITPKA,KIAA1549L,CAMK2B,MAL2,CAMKK1,SLC8A1,RASAL2,NLK,STRBP,GNG2,EPB41L4B,RAPH1,SIDT1,PKIG,NEK10,PRICKLE1,C11orf87,NPTXR,CDYL2,GRM5,CCBE1,ITPR1,ANO3,PTPRR,ANKRD18B,PTH2R,FAM84A,STEAP2,KCNK1,LAMB1,SLC24A4,TARBP1,FBXW7,TMEM232
PASS_Multiple_sclerosis	ms	21	CD37,RGS14,NCF4,RGS1,EAF2,CSF2RB,SOCS1,SLAMF7,ICAM3,FCRL2,ISG20,CD27,ST6GALNAC4,FCRL1,PECAM1,AEN,CD48,RPS9,IKZF3,RPS6,SLC16A6,NUDT22,SEC24D,GLCCI1,RPL18,FCRL5,HCLS1,FAM46C,RPL34,LRRK1,PTPN1,RPL7A,RPS8,JSRP1,ERP29,SND1,MANF,LCP1,OSTC,RPL19,POU2AF1,PROM2,OOEP,SEC