In [1]:
import subprocess
import sys
import os
import shutil
import pandas as pd
import numpy as np

In [2]:
def shell_do(command, log=False, return_log=False):
    print(f'Executing: {(" ").join(command.split())}', file=sys.stderr)

    res=subprocess.run(command.split(), stdout=subprocess.PIPE)

    if log:
        print(res.stdout.decode('utf-8'))
    if return_log:
        return(res.stdout.decode('utf-8'))

In [3]:
#set paths
basedir = '/data/songy4/tes'
datadir = f'{basedir}/data_folder'
twasdir = f'/data/songy4/twas'
fusiondir = f'{twasdir}/fusion_twas'
sumstat_path = f'{datadir}/meta.txt'
top_dir = f'{basedir}/output/top_case'
anal_dir = f'{basedir}/output/analysis_case'
fusion_ldref_basename = f'{fusiondir}/LDREF/1000G.EUR'
fusion_post_script = f'{fusiondir}/FUSION.post_process.R'

!mkdir --parents output/top_case
!mkdir --parents output/analysis_case

#!for i in {1..22}; do mkdir --parents output/analysis_case/PD.$i; done;

In [5]:
#check shape of .dat files
gene_ls = []
for i in range(1, 23):
    pd_i = pd.read_csv(f"{basedir}/output/pd_case/PD_case.{i}.dat", sep='\t', index_col=False)
    print('number of rows in PD.' + str(i) + '.dat:', pd_i.shape[0])
    gene_ls.append(pd_i.shape[0])
print('number of rows in dat files', gene_ls)
##row numbers are number of genes per chromosom

number of rows in PD.1.dat: 1007
number of rows in PD.2.dat: 653
number of rows in PD.3.dat: 710
number of rows in PD.4.dat: 362
number of rows in PD.5.dat: 229
number of rows in PD.6.dat: 579
number of rows in PD.7.dat: 796
number of rows in PD.8.dat: 251
number of rows in PD.9.dat: 178
number of rows in PD.10.dat: 245
number of rows in PD.11.dat: 896
number of rows in PD.12.dat: 668
number of rows in PD.13.dat: 78
number of rows in PD.14.dat: 309
number of rows in PD.15.dat: 454
number of rows in PD.16.dat: 548
number of rows in PD.17.dat: 444
number of rows in PD.18.dat: 82
number of rows in PD.19.dat: 384
number of rows in PD.20.dat: 127
number of rows in PD.21.dat: 17
number of rows in PD.22.dat: 362
number of rows in dat files [1007, 653, 710, 362, 229, 579, 796, 251, 178, 245, 896, 668, 78, 309, 454, 548, 444, 82, 384, 127, 17, 362]


In [6]:
#check null value in TWAS.P in each dat file
for i in range(1,23):
    pd_i = pd.read_csv(f"{basedir}/output/pd_case/PD_case.{i}.dat", sep='\t', index_col=False)
    print('Number of null value in TWAS.P in pd_' + str(i) + ':', pd_i['TWAS.P'].isnull().sum())

Number of null value in TWAS.P in pd_1: 0
Number of null value in TWAS.P in pd_2: 0
Number of null value in TWAS.P in pd_3: 0
Number of null value in TWAS.P in pd_4: 0
Number of null value in TWAS.P in pd_5: 0
Number of null value in TWAS.P in pd_6: 0
Number of null value in TWAS.P in pd_7: 0
Number of null value in TWAS.P in pd_8: 0
Number of null value in TWAS.P in pd_9: 0
Number of null value in TWAS.P in pd_10: 0
Number of null value in TWAS.P in pd_11: 0
Number of null value in TWAS.P in pd_12: 0
Number of null value in TWAS.P in pd_13: 0
Number of null value in TWAS.P in pd_14: 0
Number of null value in TWAS.P in pd_15: 0
Number of null value in TWAS.P in pd_16: 0
Number of null value in TWAS.P in pd_17: 0
Number of null value in TWAS.P in pd_18: 0
Number of null value in TWAS.P in pd_19: 0
Number of null value in TWAS.P in pd_20: 0
Number of null value in TWAS.P in pd_21: 0
Number of null value in TWAS.P in pd_22: 0


In [7]:
#from each .dat file in pd, grab rows where TWAS.P column has smaller value than 0.05/number of rows and save them as .top file in top folder
for i in range(1,23):
    pd_i = pd.read_csv(f"{basedir}/output/pd_case/PD_case.{i}.dat", sep='\t', index_col=False)
    pd_i['TWAS.P'] = pd.to_numeric(pd_i['TWAS.P'], errors='coerce')
    pd_i['PANEL'] = pd_i['PANEL'].astype(str).str[3].fillna('NA')
    value_i = 0.05/pd_i.shape[0]
    df_i = pd.DataFrame(columns = pd_i.columns.tolist())
    for index, row in pd_i.iterrows():
        if row['TWAS.P'] < value_i:
            df_i.loc[index] = row
    df_i.to_csv(f'./output/top_case/PD.{i}.top', sep='\t' ,index=False)

In [39]:
#use row numbers (number of genes which is ID) to divide 0.05  ---> this is manula way

#!cat ./output/pd/PD.1.dat | awk 'NR == 1 || $NF < 0.05/1036' > ./output/top/PD.1.top
#!cat ./output/pd/PD.2.dat | awk 'NR == 1 || $NF < 0.05/667' > ./output/top/PD.2.top
#!cat ./output/pd/PD.3.dat | awk 'NR == 1 || $NF < 0.05/567' > ./output/top/PD.3.top
#!cat ./output/pd/PD.4.dat | awk 'NR == 1 || $NF < 0.05/411' > ./output/top/PD.4.top
#!cat ./output/pd/PD.5.dat | awk 'NR == 1 || $NF < 0.05/465' > ./output/top/PD.5.top
#!cat ./output/pd/PD.6.dat | awk 'NR == 1 || $NF < 0.05/467' > ./output/top/PD.6.top
#!cat ./output/pd/PD.7.dat | awk 'NR == 1 || $NF < 0.05/498' > ./output/top/PD.7.top
#!cat ./output/pd/PD.8.dat | awk 'NR == 1 || $NF < 0.05/341' > ./output/top/PD.8.top
#!cat ./output/pd/PD.9.dat | awk 'NR == 1 || $NF < 0.05/386' > ./output/top/PD.9.top
#!cat ./output/pd/PD.10.dat | awk 'NR == 1 || $NF < 0.05/389' > ./output/top/PD.10.top
#!cat ./output/pd/PD.11.dat | awk 'NR == 1 || $NF < 0.05/575' > ./output/top/PD.11.top
#!cat ./output/pd/PD.12.dat | awk 'NR == 1 || $NF < 0.05/504' > ./output/top/PD.12.top
#!cat ./output/pd/PD.13.dat | awk 'NR == 1 || $NF < 0.05/178' > ./output/top/PD.13.top
#!cat ./output/pd/PD.14.dat | awk 'NR == 1 || $NF < 0.05/301' > ./output/top/PD.14.top
#!cat ./output/pd/PD.15.dat | awk 'NR == 1 || $NF < 0.05/320' > ./output/top/PD.15.top
#!cat ./output/pd/PD.16.dat | awk 'NR == 1 || $NF < 0.05/406' > ./output/top/PD.16.top
#!cat ./output/pd/PD.17.dat | awk 'NR == 1 || $NF < 0.05/520' > ./output/top/PD.17.top
#!cat ./output/pd/PD.18.dat | awk 'NR == 1 || $NF < 0.05/147' > ./output/top/PD.18.top
#!cat ./output/pd/PD.19.dat | awk 'NR == 1 || $NF < 0.05/692' > ./output/top/PD.19.top
#!cat ./output/pd/PD.20.dat | awk 'NR == 1 || $NF < 0.05/250' > ./output/top/PD.20.top
#!cat ./output/pd/PD.21.dat | awk 'NR == 1 || $NF < 0.05/112' > ./output/top/PD.21.top
#!cat ./output/pd/PD.22.dat | awk 'NR == 1 || $NF < 0.05/246' > ./output/top/PD.22.top

In [12]:
#fusion post process 
for i in range(1, 23):
    anal_dir = f'output/analysis_case'
    top_dir = f'output/top_case'
    fusion_post_cmd_i = f'\
    Rscript {fusion_post_script} \
    --sumstats {sumstat_path} \
    --input {top_dir}/PD.{i}.top \
    --out {anal_dir}/PD.{i}.top.analysis \
    --ref_ld_chr {fusion_ldref_basename}. \
    --chr {i} \
    --plot --locus_win 100000'
    shell_do(fusion_post_cmd_i)

Executing: Rscript /data/songy4/twas/fusion_twas/FUSION.post_process.R --sumstats /data/songy4/tes/data_folder/meta.txt --input output/top_case/PD.1.top --out output/analysis_case/PD.1.top.analysis --ref_ld_chr /data/songy4/twas/fusion_twas/LDREF/1000G.EUR. --chr 1 --plot --locus_win 100000
Executing: Rscript /data/songy4/twas/fusion_twas/FUSION.post_process.R --sumstats /data/songy4/tes/data_folder/meta.txt --input output/top_case/PD.2.top --out output/analysis_case/PD.2.top.analysis --ref_ld_chr /data/songy4/twas/fusion_twas/LDREF/1000G.EUR. --chr 2 --plot --locus_win 100000
Executing: Rscript /data/songy4/twas/fusion_twas/FUSION.post_process.R --sumstats /data/songy4/tes/data_folder/meta.txt --input output/top_case/PD.3.top --out output/analysis_case/PD.3.top.analysis --ref_ld_chr /data/songy4/twas/fusion_twas/LDREF/1000G.EUR. --chr 3 --plot --locus_win 100000
Executing: Rscript /data/songy4/twas/fusion_twas/FUSION.post_process.R --sumstats /data/songy4/tes/data_folder/meta.txt --in

In [13]:
#grab all gene ID from to.analysis.joint_included.dat files
import glob

def read_id(file):
    return pd.read_csv(file, delim_whitespace=1, usecols=[1])

files = glob.glob(f'{anal_dir}/*.joint_included.dat')
    
case_df = pd.concat([read_id(file) for file in files], axis=0)

case_df.to_csv(r'./data_folder/tes_genes_case.csv',index=False)

In [15]:
case_df = case_df.reset_index(drop=True)
case_df

Unnamed: 0,ID
0,ENST00000469861.1
1,ENST00000649236.1
2,ENST00000416185.2
3,ENST00000464824.2
4,ENST00000256178.8
5,ENST00000531786.1
6,ENST00000613543.1
7,ENST00000563933.1
8,ENST00000533372.1
9,ENST00000659330.1


In [4]:
#compare twas_genes_case.csv, twas_genes_control.csv and gwas_genes.csv
#gwas = pd.read_csv(r"./data_folder/gwas_genes.csv", sep=' ')
tes_case = pd.read_csv(r"./data_folder/tes_genes_case.csv", sep=' ')
#twas_cont = pd.read_csv(r"./data_folder/twas_genes_control.csv", sep=' ')

#print('shape of gwas:', gwas.shape)
print('shape of tes cases:', tes_case.shape)
#print('shape of twas control:', twas_cont.shape)

shape of tes cases: (27, 1)


In [5]:
#TWAS case & GWAS significant hits
twas_case_gwas = twas_case[twas_case['ID'].isin(gwas['ID'])].reset_index(drop=True)
print("number of TWAS case genes in GWAS genes:", twas_case_gwas.shape[0])

#TWAS case & GWAS significant hits
twas_cont_gwas = twas_cont[twas_cont['ID'].isin(gwas['ID'])].reset_index(drop=True)
print("number of TWAS control genes in GWAS genes:", twas_cont_gwas.shape[0])

#TWAS case & GWAS significant hits
twas_case_not_gwas = twas_case[~twas_case['ID'].isin(gwas['ID'])].reset_index(drop=True)
print("number of TWAS case genes not in GWAS genes:", twas_case_not_gwas.shape[0])

#TWAS case & GWAS significant hits
twas_cont_not_gwas = twas_cont[~twas_cont['ID'].isin(gwas['ID'])].reset_index(drop=True)
print("number of TWAS control genes not in GWAS genes:", twas_cont_not_gwas.shape[0])

number of TWAS case genes in GWAS genes: 40
number of TWAS control genes in GWAS genes: 30
number of TWAS case genes not in GWAS genes: 60
number of TWAS control genes not in GWAS genes: 57


In [5]:
#grab all gene ID from to.analysis.joint_included.dat files
import glob

def read_id(file):
    return pd.read_csv(file, delim_whitespace=1)

files = glob.glob(f'{top_dir}/*.top')
    
case_top = pd.concat([read_id(file) for file in files], axis=0).reset_index(drop=True)
case_top.drop(case_top.iloc[:,0:2], axis=1, inplace=True)
tes_case_top = case_top.merge(tes_case, on='ID', how='inner').reset_index(drop=True)
tes_case_top
#twas_case_top.to_csv(r'./data_folder/twas_case_top_all.csv',index=False)

Unnamed: 0,ID,CHR,P0,P1,HSQ,BEST.GWAS.ID,BEST.GWAS.Z,EQTL.ID,EQTL.R2,EQTL.Z,EQTL.GWAS.Z,NSNP,NWGT,MODEL,MODELCV.R2,MODELCV.PV,TWAS.Z,TWAS.P
0,ENST00000469861.1,1,205828333,205830289,0.1732,rs11240599,-3.592,rs7517009,0.0714,-7.25,-0.22917,19,19,enet,0.11,2.6999999999999996e-19,-4.45835,8.26e-06
1,ENST00000649236.1,1,156243687,156248073,0.027,rs10737170,6.461,rs10737170,0.0125,3.36,6.46104,18,1,top1,0.012,0.0017,6.46104,1.04e-10
2,ENST00000416185.2,1,161536571,161538158,0.044,rs1801274,-6.787,rs1801274,0.0271,4.66,-6.78723,25,1,top1,0.027,6.7e-06,-6.78723,1.14e-11
3,ENST00000464824.2,1,155062161,155062775,0.0568,rs11264303,4.337,rs11264303,0.0232,4.57,4.33684,35,1,top1,0.023,2.8e-05,4.33684,1.45e-05
4,ENST00000256178.8,11,10556966,10568665,0.0663,rs7938782,6.0,rs7938782,0.0385,-5.8,6.0,31,1,top1,0.039,8.9e-08,-6.0,1.97e-09
5,ENST00000531786.1,11,10632032,10693804,0.0418,rs7938782,6.0,rs7938782,0.0317,-5.05,6.0,37,1,top1,0.032,1.2e-06,-6.0,1.97e-09
6,ENST00000613543.1,12,122634130,122634673,0.0369,rs11058868,-5.9273,rs11058868,0.0364,5.48,-5.92727,7,2,enet,0.037,1.9e-07,-6.21823,5.03e-10
7,ENST00000563933.1,12,40140926,40142876,0.0625,rs7138679,-5.2083,rs11175454,0.0556,-6.37,1.85185,7,5,enet,0.072,2.1e-13,-4.18936,2.8e-05
8,ENST00000533372.1,12,49582886,49605627,0.0372,rs7296288,-3.9072,rs4898506,0.00427,-3.29,-3.12121,16,5,enet,0.013,0.0015,5.66456,1.47e-08
9,ENST00000659330.1,14,88036880,88097053,0.021,rs979812,-6.56,rs979812,0.0213,4.15,-6.5591,7,1,top1,0.021,6e-05,-6.55914,5.41e-11


In [6]:
##grab significant GWAS hits (P-value threshold to 0.05/N-tests per datatype)
sig_tes_case_top = tes_case_top[tes_case_top['TWAS.P']< 2.28e-7]
#reset index
sig_tes_case_top = sig_tes_case_top.reset_index(drop=True)
print("shape of significant TES data:", sig_tes_case_top.shape)

sig_tes_case_top.head()

shape of significant TES data: (16, 18)


Unnamed: 0,ID,CHR,P0,P1,HSQ,BEST.GWAS.ID,BEST.GWAS.Z,EQTL.ID,EQTL.R2,EQTL.Z,EQTL.GWAS.Z,NSNP,NWGT,MODEL,MODELCV.R2,MODELCV.PV,TWAS.Z,TWAS.P
0,ENST00000649236.1,1,156243687,156248073,0.027,rs10737170,6.461,rs10737170,0.0125,3.36,6.46104,18,1,top1,0.012,0.0017,6.46104,1.04e-10
1,ENST00000416185.2,1,161536571,161538158,0.044,rs1801274,-6.787,rs1801274,0.0271,4.66,-6.78723,25,1,top1,0.027,6.7e-06,-6.78723,1.14e-11
2,ENST00000256178.8,11,10556966,10568665,0.0663,rs7938782,6.0,rs7938782,0.0385,-5.8,6.0,31,1,top1,0.039,8.9e-08,-6.0,1.97e-09
3,ENST00000531786.1,11,10632032,10693804,0.0418,rs7938782,6.0,rs7938782,0.0317,-5.05,6.0,37,1,top1,0.032,1.2e-06,-6.0,1.97e-09
4,ENST00000613543.1,12,122634130,122634673,0.0369,rs11058868,-5.9273,rs11058868,0.0364,5.48,-5.92727,7,2,enet,0.037,1.9e-07,-6.21823,5.03e-10


In [7]:
from scipy import stats
#remove some columns
case_top = sig_tes_case_top[['ID', 'CHR', 'EQTL.ID', 'EQTL.Z', 'TWAS.Z', 'TWAS.P']]
#add EQTL.P
#case_top['EQTL.P'] = case_top.stats.norm.cdf(-Z)
#add GWAS.hit column 1 if ID is in twas_case_gwas 0 if not
##case_top['GWAS.hit'] = case_top.ID.isin(tes_case_gwas.ID).astype(int)
#convert 0 to no and 1 to yes
##case_top['GWAS.hit'] = case_top['GWAS.hit'].map({0: 'no', 1: 'yes'})

In [8]:
#check data type for the dataframe
print("case_top data types: \n", case_top.dtypes)
#convert CHR from object data type to int
case_top['CHR'] = case_top['CHR'].astype(int)

case_top data types: 
 ID          object
CHR         object
EQTL.ID     object
EQTL.Z     float64
TWAS.Z     float64
TWAS.P     float64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case_top['CHR'] = case_top['CHR'].astype(int)


In [9]:
#check data type for the dataframe
print("case_top data types: \n", case_top.dtypes)
#sort the rows by CHR order
case_top = case_top.sort_values('CHR').reset_index(drop=True)
case_top

case_top data types: 
 ID          object
CHR          int64
EQTL.ID     object
EQTL.Z     float64
TWAS.Z     float64
TWAS.P     float64
dtype: object


Unnamed: 0,ID,CHR,EQTL.ID,EQTL.Z,TWAS.Z,TWAS.P
0,ENST00000649236.1,1,rs10737170,3.36,6.46104,1.04e-10
1,ENST00000416185.2,1,rs1801274,4.66,-6.78723,1.14e-11
2,ENST00000429900.6,3,rs12497850,3.83,-6.42424,1.33e-10
3,ENST00000473285.5,3,rs1450522,-3.69,6.22222,4.9e-10
4,ENST00000514698.5,4,rs1051613,-5.16,-11.15306,6.920000000000001e-29
5,ENST00000510286.1,4,rs1051613,-8.26,-11.15306,6.920000000000001e-29
6,ENST00000413042.3,7,rs1964536,4.87,7.3299,2.3e-13
7,ENST00000523349.5,8,rs2280104,3.32,5.6801,1.35e-08
8,ENST00000256178.8,11,rs7938782,-5.8,-6.0,1.97e-09
9,ENST00000531786.1,11,rs7938782,-5.05,-6.0,1.97e-09


In [23]:
#check if GWAS.hit has 40 yes and 60 no
pd.value_counts(case_top['GWAS.hit'])

yes    24
no     13
Name: GWAS.hit, dtype: int64

In [11]:
#save twas_cont_top and cont_top --run only once
#twas_case_top.to_csv(r'./data_folder/tes_case_top_all.csv',index=False)
sig_tes_case_top.to_csv(r'./data_folder/significant_tes_case_top_all.csv',index=False)
case_top.to_csv(r'./data_folder/tes_case_top.csv', index=False)

Make BESD files to get SMR

In [None]:
#####Dont merge with meta_bim, just calculate Freq, Effect, SE, P-value

In [10]:
meta_bim = pd.read_csv(f"/data/songy4/proteomics_196/data_folder/meta_bim.txt", sep='\t')
meta_bim

Unnamed: 0,MarkerName,Allele1,Allele2,Freq1,Effect,StdErr,P-value,chr,rsid,kb,pos,a1,a2
0,chr1:60320992,g,a,0.0625,-0.0185,0.0210,0.380100,1,rs116406626,0,60320992,G,A
1,chr8:135908647,a,g,0.2112,-0.0054,0.0123,0.659800,8,rs11992603,0,135908647,A,G
2,chr10:120407145,c,t,0.0888,0.0278,0.0179,0.120300,10,rs4336949,0,120407145,C,T
3,chr11:29372878,t,c,0.0592,0.0049,0.0218,0.822100,11,rs72638174,0,29372878,T,C
4,chr3:164053059,a,g,0.0146,0.0607,0.0488,0.214100,3,rs115079612,0,164053059,A,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6478463,chr16:84525085,t,c,0.0268,-0.0387,0.0310,0.211700,16,rs2927754,0,84525085,T,C
6478464,chr11:47906648,c,t,0.0548,0.0062,0.0207,0.765700,11,rs148423893,0,47906648,C,T
6478465,chr3:47855594,c,t,0.3501,-0.0348,0.0105,0.000944,3,rs62263573,0,47855594,C,T
6478466,chr3:194372962,a,g,0.0166,0.0268,0.0500,0.591900,3,rs149847890,0,194372962,A,G


In [17]:
#merge pcs_df into covariate_df
sig_twas_case_meta = sig_twas_case_top.merge(meta_bim, left_on = "EQTL.ID", right_on= "rsid")
sig_twas_case_meta

Unnamed: 0,ID,CHR,P0,P1,HSQ,BEST.GWAS.ID,BEST.GWAS.Z,EQTL.ID,EQTL.R2,EQTL.Z,...,Freq1,Effect,StdErr,P-value,chr,rsid,kb,pos,a1,a2
0,ENSG00000187010,1,25598884,25656936,0.7739,rs35589882,-4.46,rs3091242,0.366,20.48,...,0.457,-0.0386,0.0097,6.383e-05,1,rs3091242,0,25674785,C,T
1,ENSG00000143537,1,155023042,155035252,0.4704,rs12726330,15.55,rs35902694,0.16,13.54,...,0.3824,0.0616,0.0096,1.586e-10,1,rs35902694,0,155042886,T,G
2,ENSG00000160783,1,156182784,156212874,0.0522,rs34372695,11.6,rs2758603,-0.00085,-3.95,...,0.3468,-0.0384,0.0099,0.0001051,1,rs2758603,0,156198994,C,T
3,ENSG00000072694,1,161551101,161648444,0.8061,rs1801274,-6.79,rs7529425,0.182,14.45,...,0.1331,-0.0732,0.0143,3.247e-07,1,rs7529425,0,161479599,A,G
4,ENSG00000117280,1,205737114,205744588,0.1237,rs823114,-11.31,rs7522056,0.079,-9.65,...,0.3128,-0.0808,0.0101,1.502e-15,1,rs7522056,0,205735891,A,G
5,ENSG00000143772,1,226819391,226927024,0.117,rs10495249,-7.98,rs10495249,0.037,7.95,...,0.2802,-0.083,0.0104,1.666e-15,1,rs10495249,0,226919119,G,A
6,ENSG00000128805,10,49654077,49864310,0.66193,rs10857614,-4.74,rs1822861,0.173928,14.15,...,0.4896,-0.042,0.0093,6.678e-06,10,rs1822861,0,49834326,T,G
7,ENSG00000139351,12,102122426,102133250,0.0273,rs17032033,5.13,rs1544922,0.023,5.63,...,0.1259,0.0685,0.0145,2.456e-06,12,rs1544922,0,102109893,C,T
8,ENSG00000255398,12,123199303,123201439,0.2238,rs11060180,-10.55,rs1798192,0.0886,-10.27,...,0.4285,0.0322,0.0095,0.0007096,12,rs1798192,0,123200768,T,G
9,ENSG00000130787,12,123319000,123347507,0.0334,rs11060180,-10.55,rs11060180,0.0178,-5.54,...,0.4388,-0.1034,0.0098,4.4479999999999997e-26,12,rs11060180,0,123303586,G,A


In [24]:
#make .esd file (Chr    SNP Bp  A1  A2  Freq    Beta    se  p)
sig_twas_case_esd = sig_twas_case_meta[['chr', 'rsid', 'pos', 'a1', 'a2', 'Freq1', 'Effect', 'StdErr', 'P-value']]
sig_twas_case_esd.rename(columns={'chr':'Chr', 'rsid':'SNP', 'pos':'Bp', 'a1':'A1', 'a2':'A2', 'Freq1':'Freq', 'Effect':'Beta', 'StdErr':'se', 'P-value':'p'}, inplace=True)
sig_twas_case_esd

Unnamed: 0,Chr,SNP,Bp,A1,A2,Freq,Beta,se,p
0,1,rs3091242,25674785,C,T,0.457,-0.0386,0.0097,6.383e-05
1,1,rs35902694,155042886,T,G,0.3824,0.0616,0.0096,1.586e-10
2,1,rs2758603,156198994,C,T,0.3468,-0.0384,0.0099,0.0001051
3,1,rs7529425,161479599,A,G,0.1331,-0.0732,0.0143,3.247e-07
4,1,rs7522056,205735891,A,G,0.3128,-0.0808,0.0101,1.502e-15
5,1,rs10495249,226919119,G,A,0.2802,-0.083,0.0104,1.666e-15
6,10,rs1822861,49834326,T,G,0.4896,-0.042,0.0093,6.678e-06
7,12,rs1544922,102109893,C,T,0.1259,0.0685,0.0145,2.456e-06
8,12,rs1798192,123200768,T,G,0.4285,0.0322,0.0095,0.0007096
9,12,rs11060180,123303586,G,A,0.4388,-0.1034,0.0098,4.4479999999999997e-26


In [25]:
#save sig_twas_case_esd
sig_twas_case_esd.to_csv(r'./data_folder/sig_twas_case_esd.esd', index=False)

In [26]:
#make .flist file (Chr    ProbeID GeneticDistance ProbeBp Gene    Orientation PathOfEsd)
sig_twas_case_flist = sig_twas_case_meta[['chr', 'ID', 'kb', 'pos']]
sig_twas_case_flist.rename(columns={'chr':'Chr', 'ID':'ProbeID', 'kb':'GeneticDistance', 'pos':'ProbeBp'}, inplace=True)
sig_twas_case_flist

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,Chr,ProbeID,GeneticDistance,ProbeBp
0,1,ENSG00000187010,0,25674785
1,1,ENSG00000143537,0,155042886
2,1,ENSG00000160783,0,156198994
3,1,ENSG00000072694,0,161479599
4,1,ENSG00000117280,0,205735891
5,1,ENSG00000143772,0,226919119
6,10,ENSG00000128805,0,49834326
7,12,ENSG00000139351,0,102109893
8,12,ENSG00000255398,0,123200768
9,12,ENSG00000130787,0,123303586


In [31]:
#merge pcs_df into covariate_df
sig_twas_case_gene = pd.read_csv(f"./data_folder/sig_twas_case_gene.csv")
sig_twas_case_flist = sig_twas_case_flist.merge(sig_twas_case_gene, left_on = "ProbeID", right_on= "ProbeID")
sig_twas_case_flist

Unnamed: 0,Chr,ProbeID,GeneticDistance,ProbeBp,Gene
0,1,ENSG00000187010,0,25674785,RHD
1,1,ENSG00000143537,0,155042886,ADAM15
2,1,ENSG00000160783,0,156198994,PMF1
3,1,ENSG00000072694,0,161479599,FCGR2B
4,1,ENSG00000117280,0,205735891,RAB29
5,1,ENSG00000143772,0,226919119,ITPKB
6,10,ENSG00000128805,0,49834326,ARHGAP22
7,12,ENSG00000139351,0,102109893,SYCP3
8,12,ENSG00000255398,0,123200768,HCAR3
9,12,ENSG00000130787,0,123303586,HIP1R


In [34]:
#add Orientation and PathOfEsd columns and add values
sig_twas_case_flist["Orientation"] = 'NA'
sig_twas_case_flist["PathOfEsd"] = '/data/songy4/twas/data_folder/sig_twas_case_esd.esd'
sig_twas_case_flist

Unnamed: 0,Chr,ProbeID,GeneticDistance,ProbeBp,Gene,Orientation,PathOfEsd
0,1,ENSG00000187010,0,25674785,RHD,,/data/songy4/twas/data_folder/sig_twas_case_es...
1,1,ENSG00000143537,0,155042886,ADAM15,,/data/songy4/twas/data_folder/sig_twas_case_es...
2,1,ENSG00000160783,0,156198994,PMF1,,/data/songy4/twas/data_folder/sig_twas_case_es...
3,1,ENSG00000072694,0,161479599,FCGR2B,,/data/songy4/twas/data_folder/sig_twas_case_es...
4,1,ENSG00000117280,0,205735891,RAB29,,/data/songy4/twas/data_folder/sig_twas_case_es...
5,1,ENSG00000143772,0,226919119,ITPKB,,/data/songy4/twas/data_folder/sig_twas_case_es...
6,10,ENSG00000128805,0,49834326,ARHGAP22,,/data/songy4/twas/data_folder/sig_twas_case_es...
7,12,ENSG00000139351,0,102109893,SYCP3,,/data/songy4/twas/data_folder/sig_twas_case_es...
8,12,ENSG00000255398,0,123200768,HCAR3,,/data/songy4/twas/data_folder/sig_twas_case_es...
9,12,ENSG00000130787,0,123303586,HIP1R,,/data/songy4/twas/data_folder/sig_twas_case_es...


In [35]:
#save sig_twas_case_flist
sig_twas_case_flist.to_csv(r'./data_folder/sig_twas_case_flist.flist', index=False)

In [39]:
#make BESD file

!smr --eqtl-flist /data/songy4/twas/data_folder/sig_twas_case_flist.flist --make-besd --out mybesd 

/bin/bash: smr: command not found
