In [1]:
import subprocess
import sys
import os
import shutil
import pandas as pd
import numpy as np

In [2]:
def shell_do(command, log=False, return_log=False):
    print(f'Executing: {(" ").join(command.split())}', file=sys.stderr)

    res=subprocess.run(command.split(), stdout=subprocess.PIPE)

    if log:
        print(res.stdout.decode('utf-8'))
    if return_log:
        return(res.stdout.decode('utf-8'))

In [3]:
#set paths
basedir = '/data/songy4/proteomics_196'
datadir = f'{basedir}/data_folder'
twasdir = f'/data/songy4/twas'
fusiondir = f'{twasdir}/fusion_twas'
sumstat_path = f'{datadir}/meta.txt'
top_dir = f'{basedir}/output/top_csf_cardio_case'
anal_dir = f'{basedir}/output/analysis_csf_cardio_case'
fusion_ldref_basename = f'{fusiondir}/LDREF/1000G.EUR'
fusion_post_script = f'{fusiondir}/FUSION.post_process.R'

!mkdir --parents output/top_csf_cardio_case
!mkdir --parents output/analysis_csf_cardio_case

#!for i in {1..22}; do mkdir --parents output/analysis_csf_cardio_case/PD.$i; done;

In [4]:
#grab all gene ID from to.analysis.joint_included.dat files
import glob

def read_id(file):
    return pd.read_csv(file, delim_whitespace=1, usecols=[1])

files = glob.glob(f'{basedir}/output/pd_csf_cardio_case/PD.*.dat')

gene_ls = []
for i in range(1, 23):
    pd_i = "/data/songy4/proteomics_196/output/pd_csf_cardio_case/PD." + str(i) + ".dat"
    if pd_i in files:
        pd_i = pd.read_csv(f"{basedir}/output/pd_csf_cardio_case/PD.{i}.dat", sep='\t', index_col=False)
        print('number of rows in PD.' + str(i) + '.dat:', pd_i.shape[0])
        gene_ls.append(pd_i.shape[0])
    else:
        print('number of rows in PD.' + str(i) + '.dat:', '0')
        gene_ls.append(0)
print('number of rows in dat files', gene_ls)

number of rows in PD.1.dat: 7
number of rows in PD.2.dat: 2
number of rows in PD.3.dat: 0
number of rows in PD.4.dat: 1
number of rows in PD.5.dat: 3
number of rows in PD.6.dat: 1
number of rows in PD.7.dat: 3
number of rows in PD.8.dat: 0
number of rows in PD.9.dat: 0
number of rows in PD.10.dat: 3
number of rows in PD.11.dat: 1
number of rows in PD.12.dat: 0
number of rows in PD.13.dat: 1
number of rows in PD.14.dat: 1
number of rows in PD.15.dat: 1
number of rows in PD.16.dat: 1
number of rows in PD.17.dat: 0
number of rows in PD.18.dat: 0
number of rows in PD.19.dat: 3
number of rows in PD.20.dat: 0
number of rows in PD.21.dat: 0
number of rows in PD.22.dat: 2
number of rows in dat files [7, 2, 0, 1, 3, 1, 3, 0, 0, 3, 1, 0, 1, 1, 1, 1, 0, 0, 3, 0, 0, 2]


In [5]:
#from each .dat file in pd, grab rows where TWAS.P column has smaller value than 0.05/number of rows and save them as .top file in top folder
#grab the i number in dat files
file_name = [s.replace('/data/songy4/proteomics_196/output/pd_csf_cardio_case/PD.', '') for s in files]
file_name = [s.replace('.dat', '') for s in file_name]

for i in file_name:
    pd_i = pd.read_csv(f"{basedir}/output/pd_csf_cardio_case/PD.{i}.dat", sep='\t', index_col=False)
    pd_i['TWAS.P'] = pd.to_numeric(pd_i['TWAS.P'], errors='coerce')
    pd_i['PANEL'] = pd_i['PANEL'].astype(str).str[3].fillna('NA')
    value_i = 0.05/pd_i.shape[0]
    df_i = pd.DataFrame(columns = pd_i.columns.tolist())
    for index, row in pd_i.iterrows():
        if row['TWAS.P'] < value_i:
            df_i.loc[index] = row
    df_i.to_csv(f'./output/top_csf_cardio_case/PD.{i}.top', sep='\t' ,index=False)

In [39]:
#use row numbers (number of genes which is ID) to divide 0.05  ---> this is manula way

#!cat ./output/pd/PD.1.dat | awk 'NR == 1 || $NF < 0.05/1036' > ./output/top/PD.1.top
#!cat ./output/pd/PD.2.dat | awk 'NR == 1 || $NF < 0.05/667' > ./output/top/PD.2.top
#!cat ./output/pd/PD.3.dat | awk 'NR == 1 || $NF < 0.05/567' > ./output/top/PD.3.top
#!cat ./output/pd/PD.4.dat | awk 'NR == 1 || $NF < 0.05/411' > ./output/top/PD.4.top
#!cat ./output/pd/PD.5.dat | awk 'NR == 1 || $NF < 0.05/465' > ./output/top/PD.5.top
#!cat ./output/pd/PD.6.dat | awk 'NR == 1 || $NF < 0.05/467' > ./output/top/PD.6.top
#!cat ./output/pd/PD.7.dat | awk 'NR == 1 || $NF < 0.05/498' > ./output/top/PD.7.top
#!cat ./output/pd/PD.8.dat | awk 'NR == 1 || $NF < 0.05/341' > ./output/top/PD.8.top
#!cat ./output/pd/PD.9.dat | awk 'NR == 1 || $NF < 0.05/386' > ./output/top/PD.9.top
#!cat ./output/pd/PD.10.dat | awk 'NR == 1 || $NF < 0.05/389' > ./output/top/PD.10.top
#!cat ./output/pd/PD.11.dat | awk 'NR == 1 || $NF < 0.05/575' > ./output/top/PD.11.top
#!cat ./output/pd/PD.12.dat | awk 'NR == 1 || $NF < 0.05/504' > ./output/top/PD.12.top
#!cat ./output/pd/PD.13.dat | awk 'NR == 1 || $NF < 0.05/178' > ./output/top/PD.13.top
#!cat ./output/pd/PD.14.dat | awk 'NR == 1 || $NF < 0.05/301' > ./output/top/PD.14.top
#!cat ./output/pd/PD.15.dat | awk 'NR == 1 || $NF < 0.05/320' > ./output/top/PD.15.top
#!cat ./output/pd/PD.16.dat | awk 'NR == 1 || $NF < 0.05/406' > ./output/top/PD.16.top
#!cat ./output/pd/PD.17.dat | awk 'NR == 1 || $NF < 0.05/520' > ./output/top/PD.17.top
#!cat ./output/pd/PD.18.dat | awk 'NR == 1 || $NF < 0.05/147' > ./output/top/PD.18.top
#!cat ./output/pd/PD.19.dat | awk 'NR == 1 || $NF < 0.05/692' > ./output/top/PD.19.top
#!cat ./output/pd/PD.20.dat | awk 'NR == 1 || $NF < 0.05/250' > ./output/top/PD.20.top
#!cat ./output/pd/PD.21.dat | awk 'NR == 1 || $NF < 0.05/112' > ./output/top/PD.21.top
#!cat ./output/pd/PD.22.dat | awk 'NR == 1 || $NF < 0.05/246' > ./output/top/PD.22.top

In [6]:
#fusion post process 
for i in file_name:
    anal_dir = f'output/analysis_csf_cardio_case'
    top_dir = f'output/top_csf_cardio_case'
    fusion_post_cmd_i = f'\
    Rscript {fusion_post_script} \
    --sumstats {sumstat_path} \
    --input {top_dir}/PD.{i}.top \
    --out {anal_dir}/PD.{i}.top.analysis \
    --ref_ld_chr {fusion_ldref_basename}. \
    --chr {i} \
    --plot --locus_win 100000'
    shell_do(fusion_post_cmd_i)

Executing: Rscript /data/songy4/twas/fusion_twas/FUSION.post_process.R --sumstats /data/songy4/proteomics_196/data_folder/meta.txt --input output/top_csf_cardio_case/PD.1.top --out output/analysis_csf_cardio_case/PD.1.top.analysis --ref_ld_chr /data/songy4/twas/fusion_twas/LDREF/1000G.EUR. --chr 1 --plot --locus_win 100000
Executing: Rscript /data/songy4/twas/fusion_twas/FUSION.post_process.R --sumstats /data/songy4/proteomics_196/data_folder/meta.txt --input output/top_csf_cardio_case/PD.10.top --out output/analysis_csf_cardio_case/PD.10.top.analysis --ref_ld_chr /data/songy4/twas/fusion_twas/LDREF/1000G.EUR. --chr 10 --plot --locus_win 100000
Executing: Rscript /data/songy4/twas/fusion_twas/FUSION.post_process.R --sumstats /data/songy4/proteomics_196/data_folder/meta.txt --input output/top_csf_cardio_case/PD.11.top --out output/analysis_csf_cardio_case/PD.11.top.analysis --ref_ld_chr /data/songy4/twas/fusion_twas/LDREF/1000G.EUR. --chr 11 --plot --locus_win 100000
Executing: Rscript 

In [9]:
#grab all gene ID from to.analysis.joint_included.dat files
import glob

def read_id(file):
    return pd.read_csv(file, delim_whitespace=1, usecols=[1])

files = glob.glob(f'{anal_dir}/PD.*.joint_included.dat')
    
csf_cardio_case_df = pd.concat([read_id(file) for file in files], axis=0)

csf_cardio_case_df.to_csv(r'./data_folder/pdbp_ppmi_proteomics196_csf_cardio_result_case.csv',index=False)

In [4]:
#compare twas_genes_case.csv, twas_genes_control.csv and gwas_genes.csv
gwas = pd.read_csv(f"{datadir}/gwas.csv")
csf_cardio_case = pd.read_csv(r"./data_folder/pdbp_ppmi_proteomics196_csf_cardio_result_case.csv", sep=' ')
#twas_cont = pd.read_csv(r"./data_folder/twas_genes_control.csv", sep=' ')

print('shape of gwas:', gwas.shape)
print('shape of csf_cardio_case:', csf_cardio_case.shape)
#print('shape of twas control:', twas_cont.shape)
csf_cardio_case

shape of gwas: (439, 2)
shape of csf_cardio_case: (4, 1)


Unnamed: 0,ID
0,FCGR2A
1,PM20D1
2,LGALS3
3,PON2


In [5]:
#TWAS case & GWAS significant hits
csf_cardio_case_gwas = csf_cardio_case[csf_cardio_case['ID'].isin(gwas['Gene'])].reset_index(drop=True)
print("number of csf_cardio_case genes in GWAS genes:", csf_cardio_case_gwas.shape[0])

#TWAS case & GWAS significant hits
#twas_cont_gwas = twas_cont[twas_cont['ID'].isin(gwas['ID'])].reset_index(drop=True)
#print("number of TWAS control genes in GWAS genes:", twas_cont_gwas.shape[0])

#TWAS case & GWAS significant hits
csf_cardio_case_not_gwas = csf_cardio_case[~csf_cardio_case['ID'].isin(gwas['Gene'])].reset_index(drop=True)
print("number of csf_cardio_case genes not in GWAS genes:", csf_cardio_case_not_gwas.shape[0])

#TWAS case & GWAS significant hits
#twas_cont_not_gwas = twas_cont[~twas_cont['ID'].isin(gwas['ID'])].reset_index(drop=True)
#print("number of TWAS control genes not in GWAS genes:", twas_cont_not_gwas.shape[0])

number of csf_cardio_case genes in GWAS genes: 1
number of csf_cardio_case genes not in GWAS genes: 3


In [6]:
#grab all gene ID from to.analysis.joint_included.dat files
import glob

def read_id(file):
    return pd.read_csv(file, delim_whitespace=1)

files = glob.glob(f'{top_dir}/*.top')
    
case_top = pd.concat([read_id(file) for file in files], axis=0).reset_index(drop=True)
case_top.drop(case_top.iloc[:,0:2], axis=1, inplace=True)
csf_cardio_case_top = case_top.merge(csf_cardio_case, on='ID', how='inner').reset_index(drop=True)
csf_cardio_case_top
#twas_case_top.to_csv(r'./data_folder/twas_case_top_all.csv',index=False)

Unnamed: 0,ID,CHR,P0,P1,HSQ,BEST.GWAS.ID,BEST.GWAS.Z,EQTL.ID,EQTL.R2,EQTL.Z,EQTL.GWAS.Z,NSNP,NWGT,MODEL,MODELCV.R2,MODELCV.PV,TWAS.Z,TWAS.P
0,FCGR2A,1,161475220,161493803,0.759,rs1801274,-6.79,rs12722986,0.1395,-6.21,-4.951,442,15,lasso,0.26,1.6e-07,4.676231,2.92e-06
1,PM20D1,1,205797150,205819260,0.587,rs823114,-11.31,rs12565968,0.1471,4.05,-7.559,386,15,lasso,0.19,1.2e-05,-8.463696,2.5900000000000002e-17
2,LGALS3,14,55590828,55612126,0.441,rs8018800,-8.22,rs17672376,0.191,-5.48,-2.34,431,15,lasso,0.28,3.3e-08,4.36,1.28e-05
3,PON2,7,95034175,95064510,0.46,rs7778623,-4.76,rs6973380,0.4133,-6.33,-4.19,475,1,top1,0.41,3.8e-12,4.19,2.8e-05


In [7]:
##grab significant GWAS hits (P-value threshold to 0.05/N-tests per datatype)
sig_csf_cardio_case_top = csf_cardio_case_top[csf_cardio_case_top['TWAS.P']< 1.40E-04]
#reset index
sig_csf_cardio_case_top = sig_csf_cardio_case_top.reset_index(drop=True)
print("shape of significant csf_cardio_case data:", sig_csf_cardio_case_top.shape)

sig_csf_cardio_case_top.head()

shape of significant csf_cardio_case data: (4, 18)


Unnamed: 0,ID,CHR,P0,P1,HSQ,BEST.GWAS.ID,BEST.GWAS.Z,EQTL.ID,EQTL.R2,EQTL.Z,EQTL.GWAS.Z,NSNP,NWGT,MODEL,MODELCV.R2,MODELCV.PV,TWAS.Z,TWAS.P
0,FCGR2A,1,161475220,161493803,0.759,rs1801274,-6.79,rs12722986,0.1395,-6.21,-4.951,442,15,lasso,0.26,1.6e-07,4.676231,2.92e-06
1,PM20D1,1,205797150,205819260,0.587,rs823114,-11.31,rs12565968,0.1471,4.05,-7.559,386,15,lasso,0.19,1.2e-05,-8.463696,2.5900000000000002e-17
2,LGALS3,14,55590828,55612126,0.441,rs8018800,-8.22,rs17672376,0.191,-5.48,-2.34,431,15,lasso,0.28,3.3e-08,4.36,1.28e-05
3,PON2,7,95034175,95064510,0.46,rs7778623,-4.76,rs6973380,0.4133,-6.33,-4.19,475,1,top1,0.41,3.8e-12,4.19,2.8e-05


In [8]:
from scipy import stats
#remove some columns
case_top = sig_csf_cardio_case_top[['ID', 'CHR', 'EQTL.ID', 'EQTL.Z', 'TWAS.Z', 'TWAS.P']]
#add EQTL.P
#case_top['EQTL.P'] = case_top.stats.norm.cdf(-Z)
#add GWAS.hit column 1 if ID is in twas_case_gwas 0 if not
case_top['GWAS.hit'] = case_top.ID.isin(csf_cardio_case_gwas.ID).astype(int)
#convert 0 to no and 1 to yes
case_top['GWAS.hit'] = case_top['GWAS.hit'].map({0: 'no', 1: 'yes'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case_top['GWAS.hit'] = case_top.ID.isin(csf_cardio_case_gwas.ID).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case_top['GWAS.hit'] = case_top['GWAS.hit'].map({0: 'no', 1: 'yes'})


In [9]:
#check data type for the dataframe
print("case_top data types: \n", case_top.dtypes)
#convert CHR from object data type to int
case_top['CHR'] = case_top['CHR'].astype(int)

case_top data types: 
 ID           object
CHR          object
EQTL.ID      object
EQTL.Z      float64
TWAS.Z      float64
TWAS.P      float64
GWAS.hit     object
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case_top['CHR'] = case_top['CHR'].astype(int)


In [10]:
#check data type for the dataframe
print("case_top data types: \n", case_top.dtypes)
#sort the rows by CHR order
case_top = case_top.sort_values('CHR').reset_index(drop=True)
case_top

case_top data types: 
 ID           object
CHR           int64
EQTL.ID      object
EQTL.Z      float64
TWAS.Z      float64
TWAS.P      float64
GWAS.hit     object
dtype: object


Unnamed: 0,ID,CHR,EQTL.ID,EQTL.Z,TWAS.Z,TWAS.P,GWAS.hit
0,FCGR2A,1,rs12722986,-6.21,4.676231,2.92e-06,yes
1,PM20D1,1,rs12565968,4.05,-8.463696,2.5900000000000002e-17,no
2,PON2,7,rs6973380,-6.33,4.19,2.8e-05,no
3,LGALS3,14,rs17672376,-5.48,4.36,1.28e-05,no


In [14]:
#check if GWAS.hit has 40 yes and 60 no
pd.value_counts(case_top['GWAS.hit'])

no    1
Name: GWAS.hit, dtype: int64

In [12]:
#save twas_cont_top and cont_top --run only once
#twas_case_top.to_csv(r'./data_folder/twas_case_top_all.csv',index=False)
sig_csf_cardio_case_top.to_csv(r'./data_folder/significant_csf_cardio_case_top_all.csv',index=False)
case_top.to_csv(r'./data_folder/csf_cardio_case_top.csv', index=False)

Convert QTL to SMR

In [None]:
geno_path = f'{datadir}/qc_proteomics196_csf_cardio'
sumstat_path = f'{datadir}/meta.txt'

In [None]:
!smr --bfile {geno_path}_case_hg19_lifted --gwas-summary mygwas.ma --beqtl-summary myeqtl --out mysmr --thread-num 10 
