In [70]:
import os 
import sys
import pybedtools as pbt
import pandas as pd
import numpy as np
import subprocess as sp
import json
import argparse
from itertools import chain
# pybedtools: scratch directory for intermediate files and the bedtools
# installation to call.
pbt.helpers.set_tempdir('/mnt/BioHome/jreyna/tmp/')
pbt.set_bedtools_path('/mnt/BioApps/bedtools/bin/')

# Absolute paths to the tabix-0.2.6 executables.
tabix = '/mnt/BioApps/tabix/tabix-0.2.6/tabix'
bgzip = '/mnt/BioApps/tabix/tabix-0.2.6/bgzip'

# Let pandas render up to 1000 rows/columns before truncating the display.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [71]:
# Work from the project root so that relative paths (e.g. the 'results/...'
# input assigned to params.input in the debug cell) resolve.
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

In [72]:
# Basic column names for BEDPE tables: the six coordinate columns of the two
# anchors (A and B), and the ten-column layout that appends name, score and
# the two strand columns.
bedpe_6cols = ['chrA', 'startA', 'endA', 'chrB', 'startB', 'endB']
bedpe_10cols = bedpe_6cols + ['name', 'score', 'strandA', 'strandB']

# Making a command line interface

In [73]:
# Command line interface for the file-format converter.
parser = argparse.ArgumentParser()
parser.add_argument('--input', type=str, required=True)
parser.add_argument('--convert-from', type=str, choices=['bed', 'bedpe', 'bp'], required=True)
parser.add_argument('--convert-to', type=str, choices=['bed', 'longrange', 'vcf'], required=True)

# Whether the input starts with a header line. The default stays True (as
# before); --no-header is the only way to switch it off, because a store_true
# flag whose default is already True can never produce False on its own.
parser.add_argument('--header', dest='header', action='store_true', default=True,
                    help='the input has a header line to skip (default)')
parser.add_argument('--no-header', dest='header', action='store_false',
                    help='the input has no header line')

# arguments for bed: column numbers of the interval fields
parser.add_argument('--chr', type=int, default=1)
parser.add_argument('--start', type=int, default=2)
parser.add_argument('--end', type=int, default=3)

# arguments for bedpe: column numbers of the second anchor and the score
parser.add_argument('--chrB', type=int, default=4)
parser.add_argument('--startB', type=int, default=5)
parser.add_argument('--endB', type=int, default=6)
parser.add_argument('--score', type=int, default=7)

# arguments for vcf: column of the position, plus optional columns holding
# the remaining VCF fields (None means "not present in the input")
parser.add_argument('--pos', type=int, default=-1)
parser.add_argument('--id', type=int, default=None)
parser.add_argument('--ref', type=int, default=None)
parser.add_argument('--alt', type=int, default=None)
parser.add_argument('--qual', type=int, default=None)
parser.add_argument('--filter', type=int, default=None)
parser.add_argument('--info', type=int, default=None)
parser.add_argument('--format', type=int, default=None)
parser.add_argument('--samples', type=int, default=None)

_StoreAction(option_strings=['--samples'], dest='samples', nargs=None, const=None, default=None, type=<class 'int'>, choices=None, help=None, metavar=None)

In [74]:
# Dummy command-line values for debugging; necessary because this code is
# being developed inside a Jupyter notebook, where there is no real command
# line to parse. `debug_mode` selects which conversion to simulate.
debug_mode = 'bp_to_vcf'

if debug_mode == 'bp_to_vcf':
    debug = [
        '--input A',
        '--convert-from bp',
        '--convert-to vcf',
        '--header',
        '--chr 1',
        '--pos 2',
    ]
elif debug_mode == 'bedpe_to_longrange':
    # The flags follow the branch name; '--input A' is a placeholder that
    # satisfies the required argument.
    debug = [
        '--input A',
        '--convert-from bedpe',
        '--convert-to longrange',
        '--header',
        '--end 3',
    ]
else:
    raise ValueError('unknown debug mode: {!r}'.format(debug_mode))

# Split each "--flag value" string into tokens and flatten into one argv-style
# list. Passing it explicitly keeps parse_args() from reading the notebook
# kernel's own sys.argv.
debug = list(chain.from_iterable(x.split() for x in debug))
params = parser.parse_args(debug)

if debug_mode == 'bp_to_vcf':
    # Point at a real input file and pin the column numbers for this run.
    # NOTE(review): params.pos = -1 overrides the '--pos 2' parsed above -
    # confirm which value is intended.
    params.input = 'results/main/GRCh37/sgls/ImmuNexUT/T1D_32005708/ImmuNexUT/Naive_CD4/eqtls.coloc_filtered.tsv.gz'
    params.chr = 1
    params.pos = -1
    params.start = 2
    params.end = 3
    params.chrB = 4
    params.startB = 5
    params.endB = 6  # previously assigned to startB a second time

# Updating column index fields

In [75]:
# Conversions available are:
# 1) bed intervals to bed intervals (--convert-from bed --convert-to bed)
# 2) single basepair file to variant file (--convert-from bp --convert-to vcf)
# 3) bedpe intervals to longrange (--convert-from bedpe --convert-to longrange)

In [76]:
import gzip

In [None]:
# Convert a gzipped single-basepair table into a minimal VCF file.
if params.convert_from == 'bp' and params.convert_to == 'vcf':

    # --chr/--pos are shifted by one to get list indexes ("convert to
    # zero-based indexing"). Locals are used instead of mutating params so
    # that re-running this cell does not shift the columns a second time.
    chr_idx = params.chr - 1
    pos_idx = params.pos - 1

    # Optional VCF fields in output order: (column index or None, value to
    # write when no column was given).
    # NOTE(review): these indexes are used as given, without the -1 shift
    # applied to --chr/--pos - confirm that is intended.
    # FORMAT and sample columns (--format/--samples) are not emitted.
    optional_cols = [
        (params.id, '.'),
        (params.ref, '.'),
        (params.alt, '.'),
        (params.qual, '.'),
        (params.filter, 'PASS'),
        (params.info, '.'),
    ]

    # Built line by line so the '#CHROM' line carries no leading spaces
    # (a triple-quoted string inside this indented block would include them).
    vcf_header = '\n'.join([
        '##fileformat=VCFv4.3',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO',
    ])

    # Output file; 'my.vcf' is the file that the bgzip/tabix command string
    # in the following cell refers to.
    vcf_path = 'my.vcf'

    # NOTE(review): debugging cap kept from the original loop, which stopped
    # after 101 records - remove for a full conversion.
    max_records = 101

    # Context managers close both handles even if a line fails to parse;
    # text mode replaces the per-line bytes .decode().
    with gzip.open(params.input, 'rt', encoding='utf-8') as fr, open(vcf_path, 'w') as fw:

        # header and records go to the same destination
        print(vcf_header, file=fw)

        # skip the input's header line (tolerating an empty file)
        if params.header:
            next(fr, None)

        n_written = 0
        for line in fr:

            line_info = line.strip().split()
            if not line_info:
                continue  # blank line: nothing to convert

            chrom = line_info[chr_idx].replace('chr', '')
            pos = line_info[pos_idx]
            extras = [line_info[idx] if idx is not None else placeholder
                      for idx, placeholder in optional_cols]

            print('\t'.join([chrom, pos] + extras), file=fw)

            n_written += 1
            if n_written >= max_records:
                break

In [None]:
    # Assemble a shell command string: bgzip-compress my.vcf, then run tabix
    # on the resulting my.vcf.gz.
    # NOTE(review): within the visible code the string is only built, never
    # executed, and it repeats the executable paths already stored in the
    # `bgzip` / `tabix` variables of the first cell.
    # NOTE(review): these lines are indented although they open a new cell -
    # confirm whether they belong inside a conditional.
    msg = '/mnt/BioApps/tabix/tabix-0.2.6/bgzip my.vcf;'
    msg += '/mnt/BioApps/tabix/tabix-0.2.6/tabix my.vcf.gz'