In [2]:
import subprocess
import sys
import os
import shutil
import pandas as pd

In [3]:
def shell_do(command, log=False, return_log=False):
    """Run a whitespace-separated command line and optionally surface its stdout.

    Parameters
    ----------
    command : str
        Command line. It is tokenised with ``str.split`` and executed without a
        shell, so individual arguments must not contain whitespace or quoting.
    log : bool, default False
        If True, print the captured stdout.
    return_log : bool, default False
        If True, return the captured stdout decoded as UTF-8.

    Returns
    -------
    str or None
        The decoded stdout when ``return_log`` is True, otherwise None.
    """
    argv = command.split()
    print(f'Executing: {" ".join(argv)}', file=sys.stderr)

    res = subprocess.run(argv, stdout=subprocess.PIPE)

    # A failing tool used to go completely unnoticed. Report it on stderr but do
    # not raise, so existing callers keep their best-effort behaviour.
    if res.returncode != 0:
        print(f'WARNING: command exited with status {res.returncode}: {" ".join(argv)}',
              file=sys.stderr)

    # Decode lazily: stdout is only interpreted as text when somebody asked for it.
    if log or return_log:
        output = res.stdout.decode('utf-8')
        if log:
            print(output)
        if return_log:
            return output

In [17]:
# directory layout used by the rest of the notebook
basedir = '/data/songy4/tes'
datadir = basedir + '/data_folder'
fusiondir = '/data/songy4/twas/fusion_twas'

# genotype file prefixes (no extension): case first, control second
geno_path = datadir + '/qc_genotypes_tes_case'
geno_path_1 = datadir + '/qc_genotypes_tes_control'
#tr_list_path = f'{datadir}/transcript_list.txt'
#te_list_path = f'{datadir}/te_list.txt'

#pheno_path = f'{datadir}/expression_matrix_final.txt'
#coord_path = f'{datadir}/twas_coordinate.txt'
#covar_path = f'{datadir}/covariates_case.txt'

In [6]:
# load the case .bim table; the file has no header row, so column names are supplied here
bim_df = pd.read_csv(
    geno_path + '.bim',
    sep='\t',
    header=None,
    names=('chr', 'rsid', 'kb', 'pos', 'a1', 'a2'),
)

# label every variant as chr<chr>:<pos>
bim_df['MarkerName'] = 'chr' + bim_df['chr'].astype(str) + ':' + bim_df['pos'].astype(str)

# dimensions of the bim table (rows, columns)
print("shape of bim data:", bim_df.shape)

# distinct values per identifier column, to compare against the row count above
for column in ('MarkerName', 'rsid'):
    print(f"number of unique {column} in bim data:", bim_df[column].nunique())
bim_df.head()

shape of bim data: (305138, 7)
number of unique MarkerName in bim data: 305138
number of unique rsid in bim data: 305138


Unnamed: 0,chr,rsid,kb,pos,a1,a2,MarkerName
0,1,rs200683566,0,13838,T,C,chr1:13838
1,1,rs370886505,0,14397,C,CTGT,chr1:14397
2,1,rs375086259,0,14653,T,C,chr1:14653
3,1,rs79585140,0,14907,G,A,chr1:14907
4,1,rs199856693,0,14933,A,G,chr1:14933


# Liftover genotype from hg38 to hg19 to match 1kg ref

In [7]:
# Write the case variants as BED rows (chr, pos, end, rsid): the liftOver input for the
# hg38 -> hg19 conversion that matches the 1kG LD reference.
lift_outname = f'{datadir}/geno_hg38_positions.bed'

# Add the 'chr' prefix only where it is missing. This cell overwrites bim_df['chr'], so an
# unconditional prefix would turn 'chr1' into 'chrchr1' if the cell were executed twice.
chrom = bim_df['chr'].astype('str')
bim_df['chr'] = chrom.where(chrom.str.startswith('chr'), 'chr' + chrom)

# NOTE(review): the UCSC FAQ states BED input is treated as 0-based, while pos/pos+1 is
# taken straight from the .bim position here - confirm this does not shift variants by a base.
bim_df['end'] = bim_df['pos'] + 1
lift_out = bim_df[['chr', 'pos', 'end', 'rsid']].copy()
lift_out.to_csv(lift_outname, sep='\t', header=False, index=False)

In [12]:
# pull liftOver from UCSC
#!wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver -P /data/songy4/twas/liftover/
#!wget --timestamping 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz' -O /data/songy4/twas/liftover/hg38ToHg19.over.chain.gz
#!chmod +x /data/songy4/twas/liftover/liftOver

--2021-04-01 21:44:51--  http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver
Resolving dtn06-e0 (dtn06-e0)... 10.1.200.242
Connecting to dtn06-e0 (dtn06-e0)|10.1.200.242|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: 37464040 (36M)
Saving to: '/data/songy4/twas/liftover/liftOver.1'


2021-04-01 21:44:53 (22.1 MB/s) - '/data/songy4/twas/liftover/liftOver.1' saved [37464040/37464040]

for details.

--2021-04-01 21:44:53--  ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz
Resolving dtn06-e0 (dtn06-e0)... 10.1.200.242
Connecting to dtn06-e0 (dtn06-e0)|10.1.200.242|:3128... connected.
Proxy request sent, awaiting response... 200 Gatewaying
Length: 1246411 (1.2M) [text/plain]
Saving to: '/data/songy4/twas/liftover/hg38ToHg19.over.chain.gz'


2021-04-01 21:44:54 (2.45 MB/s) - '/data/songy4/twas/liftover/hg38ToHg19.over.chain.gz' saved [1246411/1246411]



In [9]:
# liftOver binary, hg38 -> hg19 chain file, and the two files liftOver will write
liftOver = '/data/songy4/twas/liftover/liftOver'
chainfile = '/data/songy4/twas/liftover/hg38ToHg19.over.chain.gz'
liftover_output = datadir + '/geno_hg19_positions.bed'
unlifted = datadir + '/unlifted.bed'

# argument order: input BED, chain file, lifted output, unlifted output
liftover_cmd = ' '.join([liftOver, lift_outname, chainfile, liftover_output, unlifted])

shell_do(liftover_cmd)
liftover_cmd

Executing: /data/songy4/twas/liftover/liftOver /data/songy4/tes/data_folder/geno_hg38_positions.bed /data/songy4/twas/liftover/hg38ToHg19.over.chain.gz /data/songy4/tes/data_folder/geno_hg19_positions.bed /data/songy4/tes/data_folder/unlifted.bed


'/data/songy4/twas/liftover/liftOver /data/songy4/tes/data_folder/geno_hg38_positions.bed /data/songy4/twas/liftover/hg38ToHg19.over.chain.gz /data/songy4/tes/data_folder/geno_hg19_positions.bed /data/songy4/tes/data_folder/unlifted.bed'

In [10]:
# read the positions liftOver produced and attach them to the bim rows by rsid
lifted = pd.read_csv(
    liftover_output,
    sep='\t',
    header=None,
    names=['chr', 'pos', 'end', 'rsid'],
)

# right join keeps every lifted record; shared column names get _x (bim_df) / _y (lifted)
bim_lift_merge = pd.merge(bim_df, lifted, how='right', on='rsid')

# bim layout: chromosome label from bim_df (chr_x) combined with the lifted position (pos_y)
lifted_bim = bim_lift_merge[['chr_x', 'rsid', 'kb', 'pos_y', 'a1', 'a2']].rename(
    columns={'chr_x': 'chr', 'pos_y': 'pos'}
)

In [12]:
# keep only the variants that came back from liftOver: write their rsids, then subset with plink
lifted_bim['rsid'].to_csv(geno_path + '_hg19_lifted.snplist', sep='\t', header=False, index=False)

plink_extract_cmd = ' '.join([
    'plink',
    '--bfile', geno_path,
    '--extract', geno_path + '_hg19_lifted.snplist',
    '--make-bed',
    '--out', geno_path + '_hg19_lifted',
])

shell_do(plink_extract_cmd)


Executing: plink --bfile /data/songy4/tes/data_folder/qc_genotypes_tes_case --extract /data/songy4/tes/data_folder/qc_genotypes_tes_case_hg19_lifted.snplist --make-bed --out /data/songy4/tes/data_folder/qc_genotypes_tes_case_hg19_lifted


In [13]:
# Keep the bim with the old positions under a separate name before it is replaced.
# shutil.move raises if the source file is missing, whereas the shell `mv` only printed an
# error and let the notebook carry on; the trailing ';' hides the returned path.
shutil.move(f'{geno_path}_hg19_lifted.bim', f'{geno_path}_hg19_lifted_old_positions.bim');

In [14]:
# save the lifted table as the .bim of the _hg19_lifted fileset (tab-separated, no header, no index)
lifted_bim.to_csv(geno_path + '_hg19_lifted.bim', index=False, header=False, sep='\t')

In [15]:
# count the lines of the lifted case .bim file (the one written by the previous cell)
!cat {datadir}/qc_genotypes_tes_case_hg19_lifted.bim | wc -l

# preview the lifted .bim; columns are chr, rsid, kb, pos, a1, a2
! head {datadir}/qc_genotypes_tes_case_hg19_lifted.bim

297498
chr1	rs200683566	0	13838	T	C
chr1	rs370886505	0	14397	C	CTGT
chr1	rs375086259	0	14653	T	C
chr1	rs79585140	0	14907	G	A
chr1	rs199856693	0	14933	A	G
chr1	rs71252250	0	15118	G	A
chr1	rs201635489	0	15447	G	A
chr1	rs201026389	0	16125	G	T
chr1	rs78588380	0	16257	C	G
chr1	rs200736374	0	16288	G	C


In [16]:
# count the lines of the .fam file that belongs to the lifted case fileset
!cat {datadir}/qc_genotypes_tes_case_hg19_lifted.fam | wc -l

# preview the first rows of that .fam file
! head {datadir}/qc_genotypes_tes_case_hg19_lifted.fam

705
PP-3001 PP-3001 0 0 0 -9
PP-3002 PP-3002 0 0 0 -9
PP-3003 PP-3003 0 0 0 -9
PP-3006 PP-3006 0 0 0 -9
PP-3007 PP-3007 0 0 0 -9
PP-3010 PP-3010 0 0 0 -9
PP-3012 PP-3012 0 0 0 -9
PP-3014 PP-3014 0 0 0 -9
PP-3018 PP-3018 0 0 0 -9
PP-3020 PP-3020 0 0 0 -9


Control liftover

In [18]:
# control genotypes: write BED rows (chr, pos, end, rsid) as liftOver input for hg38 -> hg19
lift_outname_1 = datadir + '/geno_hg38_positions_control.bed'

bim = pd.read_csv(geno_path_1 + '.bim', sep='\t', header=None)
bim.columns = ['chr', 'rsid', 'kb', 'pos', 'a1', 'a2']

# prefix the chromosome with 'chr' and add an end column one past pos
bim = bim.assign(chr='chr' + bim['chr'].astype('str'), end=bim['pos'] + 1)

lift_out_1 = bim.loc[:, ['chr', 'pos', 'end', 'rsid']].copy()
lift_out_1.to_csv(lift_outname_1, index=False, header=False, sep='\t')

In [19]:
# liftOver binary, hg38 -> hg19 chain file, and the two files liftOver will write for the controls
liftOver = '/data/songy4/twas/liftover/liftOver'
chainfile = '/data/songy4/twas/liftover/hg38ToHg19.over.chain.gz'
liftover_output_1 = basedir + '/geno_hg19_positions_control.bed'
unlifted_1 = basedir + '/unlifted_control.bed'

# argument order: input BED, chain file, lifted output, unlifted output
liftover_1_cmd = ' '.join([liftOver, lift_outname_1, chainfile, liftover_output_1, unlifted_1])

shell_do(liftover_1_cmd)
liftover_1_cmd

Executing: /data/songy4/twas/liftover/liftOver /data/songy4/tes/data_folder/geno_hg38_positions_control.bed /data/songy4/twas/liftover/hg38ToHg19.over.chain.gz /data/songy4/tes/geno_hg19_positions_control.bed /data/songy4/tes/unlifted_control.bed


'/data/songy4/twas/liftover/liftOver /data/songy4/tes/data_folder/geno_hg38_positions_control.bed /data/songy4/twas/liftover/hg38ToHg19.over.chain.gz /data/songy4/tes/geno_hg19_positions_control.bed /data/songy4/tes/unlifted_control.bed'

In [20]:
# read the lifted control positions and attach them to the control bim rows by rsid
lifted_1 = pd.read_csv(
    liftover_output_1,
    sep='\t',
    header=None,
    names=['chr', 'pos', 'end', 'rsid'],
)

# right join keeps every lifted record; shared column names get _x (bim) / _y (lifted_1)
bim_lift_merge_1 = pd.merge(bim, lifted_1, how='right', on='rsid')

# bim layout: chromosome label from bim (chr_x) combined with the lifted position (pos_y)
lifted_bim_1 = bim_lift_merge_1[['chr_x', 'rsid', 'kb', 'pos_y', 'a1', 'a2']].rename(
    columns={'chr_x': 'chr', 'pos_y': 'pos'}
)

In [21]:
# keep only the control variants that came back from liftOver: write their rsids, then subset with plink
lifted_bim_1['rsid'].to_csv(geno_path_1 + '_hg19_lifted.snplist', sep='\t', header=False, index=False)

plink_extract_1_cmd = ' '.join([
    'plink',
    '--bfile', geno_path_1,
    '--extract', geno_path_1 + '_hg19_lifted.snplist',
    '--make-bed',
    '--out', geno_path_1 + '_hg19_lifted',
])

shell_do(plink_extract_1_cmd)

Executing: plink --bfile /data/songy4/tes/data_folder/qc_genotypes_tes_control --extract /data/songy4/tes/data_folder/qc_genotypes_tes_control_hg19_lifted.snplist --make-bed --out /data/songy4/tes/data_folder/qc_genotypes_tes_control_hg19_lifted


In [22]:
# Keep the control bim with the old positions under a separate name before it is replaced.
# shutil.move raises if the source file is missing, whereas the shell `mv` only printed an
# error and let the notebook carry on; the trailing ';' hides the returned path.
shutil.move(f'{geno_path_1}_hg19_lifted.bim', f'{geno_path_1}_hg19_lifted_old_positions.bim');

In [23]:
# save the lifted control table as the .bim of the _hg19_lifted fileset (tab-separated, no header, no index)
lifted_bim_1.to_csv(geno_path_1 + '_hg19_lifted.bim', index=False, header=False, sep='\t')

In [24]:
# count the lines of the lifted control .bim file (the one written by the previous cell)
!cat {datadir}/qc_genotypes_tes_control_hg19_lifted.bim | wc -l

# preview the lifted .bim; columns are chr, rsid, kb, pos, a1, a2
! head {datadir}/qc_genotypes_tes_control_hg19_lifted.bim

297496
chr1	rs200683566	0	13838	T	C
chr1	rs370886505	0	14397	C	CTGT
chr1	rs375086259	0	14653	T	C
chr1	rs79585140	0	14907	G	A
chr1	rs199856693	0	14933	A	G
chr1	rs71252250	0	15118	G	A
chr1	rs201635489	0	15447	G	A
chr1	rs201026389	0	16125	G	T
chr1	rs78588380	0	16257	C	G
chr1	rs200736374	0	16288	G	C


In [25]:
# count the lines of the .fam file that belongs to the lifted control fileset
!cat {datadir}/qc_genotypes_tes_control_hg19_lifted.fam | wc -l

# preview the first rows of that .fam file
! head {datadir}/qc_genotypes_tes_control_hg19_lifted.fam

526
PP-3000 PP-3000 0 0 0 -9
PP-3004 PP-3004 0 0 0 -9
PP-3008 PP-3008 0 0 0 -9
PP-3011 PP-3011 0 0 0 -9
PP-3013 PP-3013 0 0 0 -9
PP-3016 PP-3016 0 0 0 -9
PP-3029 PP-3029 0 0 0 -9
PP-3053 PP-3053 0 0 0 -9
PP-3055 PP-3055 0 0 0 -9
PP-3057 PP-3057 0 0 0 -9


Liftover of the coordinate file from hg38 to hg19 ---- IGNORE THIS SECTION, AS THE POSITIONS ARE ALREADY IN gr38.87

If you submit data to the browser in position format (chr#:##-##), the browser assumes this information is 1-based. If you submit data in any other format (BED (chr# ## ##) or otherwise), the browser will assume it is 0-based.
https://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1

In [6]:
# get chrN:start-end positions for liftover of coordinate file from hg38 to hg19 to match 1kG LD ref
#coords_lift_outname = f'{datadir}/coords_hg38_positions.bed'
#coords = pd.read_csv(coord_path, sep='\t')
#coords = coords.rename(columns={'X.Chr':'chr', 'start':'pos', 'end':'end', 'ID':'id'})
#coords['chr'] = 'chr' + coords['chr'].astype('str')
#coords.to_csv(coords_lift_outname, sep='\t', header=False, index=False)

In [7]:
# create command for liftover
#liftOver = f'{basedir}/liftover/liftOver'
#chainfile = f'{basedir}/liftover/hg38ToHg19.over.chain.gz'
#coords_liftover_output = f'{basedir}/coords_hg19_positions.bed'
#coords_unlifted = f'{basedir}/coords_unlifted.bed'

#coords_liftover_cmd = f'\
#{liftOver} {coords_lift_outname} {chainfile} {coords_liftover_output} {coords_unlifted}'

#shell_do(coords_liftover_cmd)
#coords_liftover_cmd

Executing: /data/songy4/twas/liftover/liftOver /data/songy4/twas/data_folder/coords_hg38_positions.bed /data/songy4/twas/liftover/hg38ToHg19.over.chain.gz /data/songy4/twas/coords_hg19_positions.bed /data/songy4/twas/coords_unlifted.bed


'/data/songy4/twas/liftover/liftOver /data/songy4/twas/data_folder/coords_hg38_positions.bed /data/songy4/twas/liftover/hg38ToHg19.over.chain.gz /data/songy4/twas/coords_hg19_positions.bed /data/songy4/twas/coords_unlifted.bed'

In [12]:
#read lifted coordinate file
#lifted = pd.read_csv(coords_liftover_output, sep='\t', header=None, names=['X.Chr', 'start', 'end', 'ID' ])
#lifted

Unnamed: 0,X.Chr,start,end,ID
0,chr1,69091,70008,ENSG00000186092
1,chr1,795640,815335,ENSG00000187634
2,chr1,814964,830069,ENSG00000188976
3,chr1,831347,836475,ENSG00000187961
4,chr1,837257,846625,ENSG00000187583
...,...,...,...,...
17782,chr21,47645446,47658608,ENSG00000184900
17783,chr21,47689414,47713666,ENSG00000183255
17784,chr21,47725782,47771818,ENSG00000160255
17785,chr21,47779840,47816818,ENSG00000160256


In [13]:
#drop chr from X.Chr
#lifted['X.Chr'] = lifted['X.Chr'].str.replace('chr', '')
#lifted

Unnamed: 0,X.Chr,start,end,ID
0,1,69091,70008,ENSG00000186092
1,1,795640,815335,ENSG00000187634
2,1,814964,830069,ENSG00000188976
3,1,831347,836475,ENSG00000187961
4,1,837257,846625,ENSG00000187583
...,...,...,...,...
17782,21,47645446,47658608,ENSG00000184900
17783,21,47689414,47713666,ENSG00000183255
17784,21,47725782,47771818,ENSG00000160255
17785,21,47779840,47816818,ENSG00000160256


In [14]:
#save as text file  --> this is the new coordinate data 
#lifted.to_csv(r'./data_folder/twas_coordinate_lifted.txt', sep='\t' ,index=False)

In [14]:
#create gene_list as list to drop columns in exp_case 
#gene_list = lifted['ID']
#print("number of genes in gene_list:", len(gene_list))

number of genes in gene_list: 17787


In [15]:
# save gene_list as txt -- run only once for download
# gene_list is only created by the preceding cell, which is currently commented out, so
# skip the export instead of failing with NameError when the notebook is run top to bottom.
if 'gene_list' in globals():
    gene_list.to_csv(r'./data_folder/gene_list.txt', sep='\t' ,index=False)
else:
    print('gene_list is not defined; skipping export of gene_list.txt', file=sys.stderr)

Liftover control genotype from hg38 to hg19

In [9]:
# input locations for the control samples, all under datadir
control_geno_path = datadir + '/qc_genotypes_twas_control'
pheno_path = datadir + '/twas_expression_control.txt'
covar_path = datadir + '/covariates_control.txt'

In [10]:
# control genotypes: write BED rows (chr, pos, end, rsid) as liftOver input for hg38 -> hg19
# NOTE(review): the bim is read from geno_path + '_control', not from control_geno_path -
# confirm that prefix is still the intended one.
lift_outname_cont = datadir + '/control_geno_hg38_positions.bed'

bim_cont = pd.read_csv(geno_path + '_control.bim', sep='\t', header=None)
bim_cont.columns = ['chr', 'rsid', 'kb', 'pos', 'a1', 'a2']

# prefix the chromosome with 'chr' and add an end column one past pos
bim_cont = bim_cont.assign(chr='chr' + bim_cont['chr'].astype('str'), end=bim_cont['pos'] + 1)

lift_out_cont = bim_cont.loc[:, ['chr', 'pos', 'end', 'rsid']].copy()
lift_out_cont.to_csv(lift_outname_cont, index=False, header=False, sep='\t')

In [14]:
# create command for liftover
# The liftOver binary and chain file live under /data/songy4/twas/liftover (the location the
# other liftOver cells in this notebook use). Deriving them from basedir stopped pointing
# there once basedir was set to the tes project, so the tool paths are spelled out here.
liftOver = '/data/songy4/twas/liftover/liftOver'
chainfile = '/data/songy4/twas/liftover/hg38ToHg19.over.chain.gz'

# result files stay under basedir, as before
liftover_output_cont = f'{basedir}/control_geno_hg19_positions.bed'
unlifted_cont = f'{basedir}/control_unlifted.bed'

# argument order: input BED, chain file, lifted output, unlifted output
liftover_cont_cmd = f'\
{liftOver} {lift_outname_cont} {chainfile} {liftover_output_cont} {unlifted_cont}'

shell_do(liftover_cont_cmd)
liftover_cont_cmd

Executing: /data/songy4/twas/liftover/liftOver /data/songy4/twas/data_folder/control_geno_hg38_positions.bed /data/songy4/twas/liftover/hg38ToHg19.over.chain.gz /data/songy4/twas/control_geno_hg19_positions.bed /data/songy4/twas/control_unlifted.bed


'/data/songy4/twas/liftover/liftOver /data/songy4/twas/data_folder/control_geno_hg38_positions.bed /data/songy4/twas/liftover/hg38ToHg19.over.chain.gz /data/songy4/twas/control_geno_hg19_positions.bed /data/songy4/twas/control_unlifted.bed'

In [15]:
# read the lifted control positions
lifted_cont = pd.read_csv(
    liftover_output_cont,
    sep='\t',
    header=None,
    names=['chr', 'pos', 'end', 'rsid'],
)

# reload the control bim from disk so the chromosome column is as stored in the file
bim_cont = pd.read_csv(
    geno_path + '_control.bim',
    sep='\t',
    header=None,
    names=['chr', 'rsid', 'kb', 'pos', 'a1', 'a2'],
)

# right join keeps every lifted record; shared column names get _x (bim_cont) / _y (lifted_cont)
bim_lift_merge_cont = pd.merge(bim_cont, lifted_cont, how='right', on='rsid')

# bim layout: chromosome from bim_cont (chr_x) combined with the lifted position (pos_y)
lifted_bim_cont = bim_lift_merge_cont[['chr_x', 'rsid', 'kb', 'pos_y', 'a1', 'a2']].rename(
    columns={'chr_x': 'chr', 'pos_y': 'pos'}
)

In [16]:
# keep only the control variants that came back from liftOver: write their rsids, then subset with plink
lifted_bim_cont['rsid'].to_csv(geno_path + '_control_hg19_lifted.snplist', sep='\t', header=False, index=False)

plink_extract_cont_cmd = ' '.join([
    'plink',
    '--bfile', geno_path + '_control',
    '--extract', geno_path + '_control_hg19_lifted.snplist',
    '--make-bed',
    '--out', geno_path + '_control_hg19_lifted',
])

shell_do(plink_extract_cont_cmd)

Executing: plink --bfile /data/songy4/twas/data_folder/qc_genotypes_twas_control --extract /data/songy4/twas/data_folder/qc_genotypes_twas_control_hg19_lifted.snplist --make-bed --out /data/songy4/twas/data_folder/qc_genotypes_twas_control_hg19_lifted


In [17]:
# Keep the control bim with the old positions under a separate name before it is replaced.
# shutil.move raises if the source file is missing, whereas the shell `mv` only printed an
# error and let the notebook carry on; the trailing ';' hides the returned path.
shutil.move(f'{geno_path}_control_hg19_lifted.bim', f'{geno_path}_control_hg19_lifted_old_positions.bim');

In [19]:
# save the lifted control table as the .bim of the _control_hg19_lifted fileset (tab-separated, no header, no index)
lifted_bim_cont.to_csv(geno_path + '_control_hg19_lifted.bim', index=False, header=False, sep='\t')

In [20]:
# count the lines of the lifted control .bim file
!cat {datadir}/qc_genotypes_twas_control_hg19_lifted.bim | wc -l

# preview the lifted .bim; columns are chr, rsid, kb, pos, a1, a2
! head {datadir}/qc_genotypes_twas_control_hg19_lifted.bim

23060234
1	rs145427775	0	10291	T	C
1	rs55998931	0	10492	T	C
1	rs199896944	0	13504	A	G
1	rs199856693	0	14933	A	G
1	rs201855936	0	14948	A	G
1	rs71252251	0	14976	A	G
1	rs201045431	0	15029	A	G
1	rs368345873	0	15208	A	G
1	rs374029747	0	15774	A	G
1	rs201330479	0	16792	A	G
