In [13]:
import numpy as np
import matplotlib.pyplot as plt
import scipy
import pandas as pd
import os


In [14]:

gwas_df = pd.read_csv('gwas_catalog_v1.0.2-associations_e96_r2019-11-21.tsv', delimiter='\t')

print(gwas_df.head())


  DATE ADDED TO CATALOG  PUBMEDID FIRST AUTHOR        DATE   JOURNAL  \
0            2018-08-29  29912962   Feitosa MF  2018-06-18  PLoS One   
1            2018-08-29  29912962   Feitosa MF  2018-06-18  PLoS One   
2            2018-08-29  29912962   Feitosa MF  2018-06-18  PLoS One   
3            2018-08-29  29912962   Feitosa MF  2018-06-18  PLoS One   
4            2018-08-29  29912962   Feitosa MF  2018-06-18  PLoS One   

                                   LINK  \
0  www.ncbi.nlm.nih.gov/pubmed/29912962   
1  www.ncbi.nlm.nih.gov/pubmed/29912962   
2  www.ncbi.nlm.nih.gov/pubmed/29912962   
3  www.ncbi.nlm.nih.gov/pubmed/29912962   
4  www.ncbi.nlm.nih.gov/pubmed/29912962   

                                               STUDY  \
0  Novel genetic associations for blood pressure ...   
1  Novel genetic associations for blood pressure ...   
2  Novel genetic associations for blood pressure ...   
3  Novel genetic associations for blood pressure ...   
4  Novel genetic association

  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
gwas_df.columns.values

array(['DATE ADDED TO CATALOG', 'PUBMEDID', 'FIRST AUTHOR', 'DATE',
       'JOURNAL', 'LINK', 'STUDY', 'DISEASE/TRAIT', 'INITIAL SAMPLE SIZE',
       'REPLICATION SAMPLE SIZE', 'REGION', 'CHR_ID', 'CHR_POS',
       'REPORTED GENE(S)', 'MAPPED_GENE', 'UPSTREAM_GENE_ID',
       'DOWNSTREAM_GENE_ID', 'SNP_GENE_IDS', 'UPSTREAM_GENE_DISTANCE',
       'DOWNSTREAM_GENE_DISTANCE', 'STRONGEST SNP-RISK ALLELE', 'SNPS',
       'MERGED', 'SNP_ID_CURRENT', 'CONTEXT', 'INTERGENIC',
       'RISK ALLELE FREQUENCY', 'P-VALUE', 'PVALUE_MLOG',
       'P-VALUE (TEXT)', 'OR or BETA', '95% CI (TEXT)',
       'PLATFORM [SNPS PASSING QC]', 'CNV', 'MAPPED_TRAIT',
       'MAPPED_TRAIT_URI', 'STUDY ACCESSION', 'GENOTYPING TECHNOLOGY'],
      dtype=object)

In [16]:
gwas_df['STRONGEST SNP-RISK ALLELE']

0          rs2013002-T
1          rs7953257-A
2          rs1579381-C
3          rs3753584-T
4         rs72640287-T
5         rs79211428-T
6           rs988397-T
7          rs7497026-T
8          rs1894400-T
9         rs11099097-T
10        rs66724425-?
11         rs1570204-?
12         rs2797560-?
13        rs10918274-?
14         rs2472496-?
15         rs9913911-A
16         rs6478746-?
17        rs28500712-?
18        rs12377624-?
19         rs2745572-A
20          rs199529-?
21         rs1558225-?
22        rs34952318-?
23         rs6065171-?
24         rs6095946-?
25         rs4629237-?
26        rs76945759-?
27         rs9608740-?
28          rs756481-?
29         rs5756813-?
              ...     
161495    rs13059636-?
161496     rs3850174-?
161497     rs1906508-?
161498     rs4241964-?
161499     rs4992300-?
161500     rs4246036-?
161501     rs2842638-?
161502     rs2653343-?
161503    rs28479767-?
161504     rs6967481-?
161505     rs2189008-?
161506    rs35524253-?
161507     

In [17]:

gwas_bed = gwas_df[['CHR_ID', 'CHR_POS', 'MAPPED_GENE', 'DISEASE/TRAIT', 'STRONGEST SNP-RISK ALLELE', 'P-VALUE', 'PVALUE_MLOG', 'CONTEXT']]
gwas_bed = gwas_bed.rename(columns={
    'CHR_ID' : 'chr',
    'CHR_POS' : 'start',
    'MAPPED_GENE' : 'gene',
    'DISEASE/TRAIT' : 'trait',
    'STRONGEST SNP-RISK ALLELE' : 'snp_id',
    'P-VALUE' : 'p_value',
    'PVALUE_MLOG' : 'p_value_mlog',
    'CONTEXT' : 'molecular_cause'
})

gwas_bed = gwas_bed.loc[~gwas_bed['start'].str.contains(";", na=True)].copy().reset_index(drop=True)
gwas_bed = gwas_bed.loc[~gwas_bed['start'].str.contains("x", na=True)].copy().reset_index(drop=True)
gwas_bed = gwas_bed.loc[gwas_bed['snp_id'].str.len() <= 20].copy().reset_index(drop=True)

gwas_bed = gwas_bed.sort_values(by='p_value_mlog', ascending=False)
gwas_bed = gwas_bed.drop_duplicates(subset='snp_id', keep='first')

gwas_bed['end'] = pd.to_numeric(gwas_bed['start']) + 1
gwas_bed['end'] = gwas_bed['end'].astype(int).astype(str)

gwas_bed = gwas_bed[['chr', 'start', 'end', 'gene', 'trait', 'snp_id', 'p_value', 'p_value_mlog', 'molecular_cause']]

gwas_bed['chr'] = "chr" + gwas_bed['chr']


In [18]:

gwas_bed


Unnamed: 0,chr,start,end,gene,trait,snp_id,p_value,p_value_mlog,molecular_cause
35891,chr19,51125272,51125273,SIGLEC9,Blood protein levels,rs2075803-G,6E-2142,2141.221849,missense_variant
35870,chr1,161509955,161509956,FCGR2A,Blood protein levels,rs1801274-G,1E-2102,2102.000000,missense_variant
37261,chr12,9984073,9984074,CLEC12A,Blood protein levels,rs2961544-A,9E-1973,1972.045757,3_prime_UTR_variant
15759,chr7,121322110,121322111,CPED1 - WNT16,Heel bone mineral density,rs2908007-?,7E-1700,1699.154902,intergenic_variant
37790,chr19,10285007,10285008,"ICAM1, AC011511.2",Blood protein levels,rs5498-G,8E-1683,1682.096910,missense_variant
112100,chr2,210678331,210678332,CPS1,Glycine levels,rs715-C,3E-1632,1631.522879,3_prime_UTR_variant
29227,chr6,43957870,43957871,AL157371.1 - AL109615.3,Vascular endothelial growth factor levels,rs6921438-A,2E-1449,1448.698970,intergenic_variant
38500,chr17,28367840,28367841,"SARM1, AC002094.1, AC002094.3, VTN",Blood protein levels,rs704-A,1E-1442,1442.000000,missense_variant
37904,chr20,1915642,1915643,SIRPA,Blood protein levels,rs6136377-G,8E-1361,1360.096910,intron_variant
37120,chr12,6455044,6455045,TAPBPL,Blood protein levels,rs2532497-A,4E-1297,1296.397940,intron_variant


In [19]:

gwas_bed_narrow = gwas_bed[['chr', 'start', 'end', 'snp_id']]

gwas_bed_narrow.to_csv('gwas_bed.bed', sep='\t', header=False, index=False)

print(gwas_bed_narrow.head())


         chr      start        end       snp_id
35891  chr19   51125272   51125273  rs2075803-G
35870   chr1  161509955  161509956  rs1801274-G
37261  chr12    9984073    9984074  rs2961544-A
15759   chr7  121322110  121322111  rs2908007-?
37790  chr19   10285007   10285008     rs5498-G


In [20]:

!./liftOver gwas_bed.bed hg38ToHg19.over.chain gwas_bed_hg19.bed gwas_bed_hg38_unmapped.bed


Reading liftover chains
Mapping coordinates


In [21]:

gwas_bed_hg19 = pd.read_csv("gwas_bed_hg19.bed", sep='\t', error_bad_lines=False, names=["chr", "start", "end", "snp_id"])

print(gwas_bed_hg19.head())


     chr      start        end       snp_id
0  chr19   51628529   51628530  rs2075803-G
1   chr1  161479745  161479746  rs1801274-G
2  chr12   10136672   10136673  rs2961544-A
3   chr7  120962164  120962165  rs2908007-?
4  chr19   10395683   10395684     rs5498-G


In [22]:

gwas_bed.query("snp_id == 'rs2013002-T'")


Unnamed: 0,chr,start,end,gene,trait,snp_id,p_value,p_value_mlog,molecular_cause
4209,chr12,111762346,111762347,AC002996.1,Diastolic blood pressure x alcohol consumption...,rs2013002-T,4.0000000000000004e-39,38.39794,intron_variant


In [23]:

gwas_hg19 = gwas_bed_hg19.join(gwas_bed.set_index("snp_id"), on="snp_id", how="inner", rsuffix="_old")

gwas_hg19 = gwas_hg19[['chr', 'start', 'end', 'gene', 'snp_id', 'trait', 'p_value', 'p_value_mlog', 'molecular_cause']]

print(gwas_hg19.head())


     chr      start        end               gene       snp_id  \
0  chr19   51628529   51628530            SIGLEC9  rs2075803-G   
1   chr1  161479745  161479746             FCGR2A  rs1801274-G   
2  chr12   10136672   10136673            CLEC12A  rs2961544-A   
3   chr7  120962164  120962165      CPED1 - WNT16  rs2908007-?   
4  chr19   10395683   10395684  ICAM1, AC011511.2     rs5498-G   

                       trait  p_value  p_value_mlog      molecular_cause  
0       Blood protein levels  6E-2142   2141.221849     missense_variant  
1       Blood protein levels  1E-2102   2102.000000     missense_variant  
2       Blood protein levels  9E-1973   1972.045757  3_prime_UTR_variant  
3  Heel bone mineral density  7E-1700   1699.154902   intergenic_variant  
4       Blood protein levels  8E-1683   1682.096910     missense_variant  


In [24]:

gwas_hg19.to_csv('gwas_catalog_hg19.bed', sep='\t', header=False, index=False)
