In [1]:
import subprocess
import sys
import os
import shutil
import pandas as pd

# set paths
basedir = '/data/songy4/proteomics_196'
datadir = f'{basedir}/data_folder'
twasdir = f'/data/songy4/twas'
fusiondir = f'{twasdir}/fusion_twas'
liftdir = f'{twasdir}/liftover'
geno_path = f'{datadir}/qc_proteomics196_csf_cardio'

In [2]:
def shell_do(command, log=False, return_log=False):
    print(f'Executing: {(" ").join(command.split())}', file=sys.stderr)

    res=subprocess.run(command.split(), stdout=subprocess.PIPE)

    if log:
        print(res.stdout.decode('utf-8'))
    if return_log:
        return(res.stdout.decode('utf-8'))

# Liftover genotype from hg38 to hg19 to match 1kg ref

In [9]:
# get chrN:start-end positions for liftover of genotype from hg38 to hg19 to match 1kG LD ref
bim_df = pd.read_csv(f"{geno_path}_case.bim", sep='\t', header=None,  names=('chr', 'rsid', 'kb', 'pos', 'a1', 'a2'))
print(bim_df)
lift_outname = f'{datadir}/geno_hg38_positions.bed'
bim_df['chr'] = 'chr' + bim_df['chr'].astype('str')
bim_df['end'] = bim_df['pos'] + 1
lift_out = bim_df[['chr', 'pos', 'end', 'rsid' ]].copy()
lift_out.to_csv(lift_outname, sep='\t', header=False, index=False)
print(lift_out)

          chr         rsid  kb       pos a1 a2
0           1  rs145427775   0     10291  T  C
1           1   rs55998931   0     10492  T  C
2           1  rs199896944   0     13504  A  G
3           1  rs199856693   0     14933  A  G
4           1  rs201855936   0     14948  A  G
...       ...          ...  ..       ... .. ..
23858194   24  rs375378036   0  56887099  T  C
23858195   24  rs113496864   0  56887221  T  C
23858196   24   rs77686620   0  56887583  A  G
23858197   24  rs376130607   0  56887631  T  C
23858198   24  rs376828276   0  56887659  G  T

[23858199 rows x 6 columns]
            chr       pos       end         rsid
0          chr1     10291     10292  rs145427775
1          chr1     10492     10493   rs55998931
2          chr1     13504     13505  rs199896944
3          chr1     14933     14934  rs199856693
4          chr1     14948     14949  rs201855936
...         ...       ...       ...          ...
23858194  chr24  56887099  56887100  rs375378036
23858195  chr24

In [12]:
# pull liftOver from UCSC
#!wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver -P /data/songy4/twas/liftover/
#!wget --timestamping 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz' -O /data/songy4/twas/liftover/hg38ToHg19.over.chain.gz
#!chmod +x /data/songy4/twas/liftover/liftOver

--2021-04-01 21:44:51--  http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver
Resolving dtn06-e0 (dtn06-e0)... 10.1.200.242
Connecting to dtn06-e0 (dtn06-e0)|10.1.200.242|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: 37464040 (36M)
Saving to: '/data/songy4/twas/liftover/liftOver.1'


2021-04-01 21:44:53 (22.1 MB/s) - '/data/songy4/twas/liftover/liftOver.1' saved [37464040/37464040]

for details.

--2021-04-01 21:44:53--  ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz
Resolving dtn06-e0 (dtn06-e0)... 10.1.200.242
Connecting to dtn06-e0 (dtn06-e0)|10.1.200.242|:3128... connected.
Proxy request sent, awaiting response... 200 Gatewaying
Length: 1246411 (1.2M) [text/plain]
Saving to: '/data/songy4/twas/liftover/hg38ToHg19.over.chain.gz'


2021-04-01 21:44:54 (2.45 MB/s) - '/data/songy4/twas/liftover/hg38ToHg19.over.chain.gz' saved [1246411/1246411]



In [10]:
# create command for liftover
liftOver = f'/data/songy4/twas/liftover/liftOver'
chainfile = f'/data/songy4/twas/liftover/hg38ToHg19.over.chain.gz'
liftover_output = f'{datadir}/csf_cardio_geno_hg19_positions.bed'
unlifted = f'{datadir}/csf_cardio_unlifted.bed'

liftover_cmd = f'\
{liftOver} {lift_outname} {chainfile} {liftover_output} {unlifted}'

shell_do(liftover_cmd)
liftover_cmd

Executing: /data/songy4/twas/liftover/liftOver /data/songy4/proteomics_196/data_folder/geno_hg38_positions.bed /data/songy4/twas/liftover/hg38ToHg19.over.chain.gz /data/songy4/proteomics_196/data_folder/geno_hg19_positions.bed /data/songy4/proteomics_196/data_folder/unlifted.bed


'/data/songy4/twas/liftover/liftOver /data/songy4/proteomics_196/data_folder/geno_hg38_positions.bed /data/songy4/twas/liftover/hg38ToHg19.over.chain.gz /data/songy4/proteomics_196/data_folder/geno_hg19_positions.bed /data/songy4/proteomics_196/data_folder/unlifted.bed'

In [18]:
# create map update file
lifted = pd.read_csv(liftover_output, sep='\t', header=None, names=['chr', 'pos', 'end', 'rsid' ])
print(lifted)
bim_df = pd.read_csv(f"{geno_path}_case.bim", sep='\t', header=None,  names=('chr', 'rsid', 'kb', 'pos', 'a1', 'a2'))
bim_lift_merge = bim_df.merge(lifted, how='right', on='rsid')
lifted_bim = bim_lift_merge[['chr_x', 'rsid', 'kb', 'pos_y', 'a1', 'a2']].copy()
lifted_bim.columns = ['chr', 'rsid', 'kb', 'pos', 'a1', 'a2']
print(lifted_bim)

            chr       pos       end         rsid
0          chr1     10291     10292  rs145427775
1          chr1     10492     10493   rs55998931
2          chr1     13504     13505  rs199896944
3          chr1     14933     14934  rs199856693
4          chr1     14948     14949  rs201855936
...         ...       ...       ...          ...
23060229  chr22  51239721  51239722  rs374333198
23060230  chr22  51239861  51239862  rs367873634
23060231  chr22  51239953  51239954  rs372413129
23060232  chr22  51240820  51240821  rs202228854
23060233  chr22  51242613  51242614  rs140611932

[23060234 rows x 4 columns]
          chr         rsid  kb       pos a1 a2
0           1  rs145427775   0     10291  T  C
1           1   rs55998931   0     10492  T  C
2           1  rs199896944   0     13504  A  G
3           1  rs199856693   0     14933  A  G
4           1  rs201855936   0     14948  A  G
...       ...          ...  ..       ... .. ..
23060229   22  rs374333198   0  51239721  T  C
2306023

In [19]:
####CASE
# use only lifted snps
lifted_bim['rsid'].to_csv(f'{geno_path}_hg19_lifted.snplist', sep='\t', header=False, index=False)

plink_extract_cmd = f'\
plink --bfile {geno_path}_case\
 --extract {geno_path}_hg19_lifted.snplist\
 --make-bed\
 --out {geno_path}_case_hg19_lifted'

shell_do(plink_extract_cmd)

Executing: plink --bfile /data/songy4/proteomics_196/data_folder/qc_proteomics196_csf_cardio_case --extract /data/songy4/proteomics_196/data_folder/qc_proteomics196_csf_cardio_hg19_lifted.snplist --make-bed --out /data/songy4/proteomics_196/data_folder/qc_proteomics196_csf_cardio_case_hg19_lifted


In [20]:
####CONTROL
# use only lifted snps
lifted_bim['rsid'].to_csv(f'{geno_path}_hg19_lifted.snplist', sep='\t', header=False, index=False)

plink_extract_cmd = f'\
plink --bfile {geno_path}_control\
 --extract {geno_path}_hg19_lifted.snplist\
 --make-bed\
 --out {geno_path}_control_hg19_lifted'

shell_do(plink_extract_cmd)

Executing: plink --bfile /data/songy4/proteomics_196/data_folder/qc_proteomics196_csf_cardio_control --extract /data/songy4/proteomics_196/data_folder/qc_proteomics196_csf_cardio_hg19_lifted.snplist --make-bed --out /data/songy4/proteomics_196/data_folder/qc_proteomics196_csf_cardio_control_hg19_lifted


In [21]:
# move bim with old positions to new file
!mv {geno_path}_case_hg19_lifted.bim {geno_path}_case_hg19_lifted_old_positions.bim
!mv {geno_path}_control_hg19_lifted.bim {geno_path}_control_hg19_lifted_old_positions.bim

In [22]:
# write lifted bim to _hg19_lifted genotype name
lifted_bim.to_csv(f'{geno_path}_case_hg19_lifted.bim', sep='\t', header=False, index=False)
lifted_bim.to_csv(f'{geno_path}_control_hg19_lifted.bim', sep='\t', header=False, index=False)

In [23]:
#check number of IDs in original .bim file
!cat {datadir}/qc_proteomics196_csf_cardio_case_hg19_lifted.bim | wc -l

#look at the bim file (variance, chromosome and position, allele1 and allele2)
! head {datadir}/qc_proteomics196_csf_cardio_case_hg19_lifted.bim

#check number of IDs in original .bim file
!cat {datadir}/qc_proteomics196_csf_cardio_control_hg19_lifted.bim | wc -l

#look at the bim file (variance, chromosome and position, allele1 and allele2)
! head {datadir}/qc_proteomics196_csf_cardio_control_hg19_lifted.bim

23060234
1	rs145427775	0	10291	T	C
1	rs55998931	0	10492	T	C
1	rs199896944	0	13504	A	G
1	rs199856693	0	14933	A	G
1	rs201855936	0	14948	A	G
1	rs71252251	0	14976	A	G
1	rs201045431	0	15029	A	G
1	rs368345873	0	15208	A	G
1	rs374029747	0	15774	A	G
1	rs201330479	0	16792	A	G
23060234
1	rs145427775	0	10291	T	C
1	rs55998931	0	10492	T	C
1	rs199896944	0	13504	A	G
1	rs199856693	0	14933	A	G
1	rs201855936	0	14948	A	G
1	rs71252251	0	14976	A	G
1	rs201045431	0	15029	A	G
1	rs368345873	0	15208	A	G
1	rs374029747	0	15774	A	G
1	rs201330479	0	16792	A	G


In [24]:
#check number of IDs in original .bim file
!cat {datadir}/qc_proteomics196_csf_cardio_case_hg19_lifted.fam | wc -l

#look at the bim file (variance, chromosome and position, allele1 and allele2)
! head {datadir}/qc_proteomics196_csf_cardio_case_hg19_lifted.fam

#check number of IDs in original .bim file
!cat {datadir}/qc_proteomics196_csf_cardio_control_hg19_lifted.fam | wc -l

#look at the bim file (variance, chromosome and position, allele1 and allele2)
! head {datadir}/qc_proteomics196_csf_cardio_control_hg19_lifted.fam

91
PD-PDBE283ED7 PD-PDBE283ED7 0 0 2 2
PD-PDBN655REA PD-PDBN655REA 0 0 1 2
PD-PDCC181AT5 PD-PDCC181AT5 0 0 1 2
PD-PDCF389XTT PD-PDCF389XTT 0 0 2 2
PD-PDCH970MXR PD-PDCH970MXR 0 0 1 2
PD-PDCZ367CD0 PD-PDCZ367CD0 0 0 2 2
PD-PDDM051VVT PD-PDDM051VVT 0 0 1 2
PD-PDDZ137TEG PD-PDDZ137TEG 0 0 1 2
PD-PDEE477WMA PD-PDEE477WMA 0 0 1 2
PD-PDER909CXU PD-PDER909CXU 0 0 2 2
70
PD-PDAB411CTU PD-PDAB411CTU 0 0 1 1
PD-PDAE940EZ5 PD-PDAE940EZ5 0 0 2 1
PD-PDBB001EMM PD-PDBB001EMM 0 0 1 1
PD-PDBD207ERC PD-PDBD207ERC 0 0 2 1
PD-PDBE349TEN PD-PDBE349TEN 0 0 1 1
PD-PDBG800HGQ PD-PDBG800HGQ 0 0 2 1
PD-PDCR944LPD PD-PDCR944LPD 0 0 1 1
PD-PDDB073UG2 PD-PDDB073UG2 0 0 1 1
PD-PDDD218KEH PD-PDDD218KEH 0 0 2 1
PD-PDDL082XHM PD-PDDL082XHM 0 0 2 1
