Script adds the distance column to eQTL files that are missing it.

In [114]:
import os
import gzip
import pandas as pd
import re

In [115]:
# Work from the project root: every data path used by this script
# ('results/refs/...', 'results/main/...') is written relative to it.
os.chdir('/mnt/BioHome/jreyna/jreyna/projects/dchallenge/')

In [123]:
def tss_distance(strand, start, end, position):
    """Return the unsigned distance between a variant and a gene's 5' end.

    The 5' end is taken to be ``start`` for '+' strand genes and ``end`` for
    '-' strand genes.

    NOTE(review): BED starts are conventionally 0-based while eQTL positions
    are usually 1-based; no coordinate shift is applied here -- confirm that
    this matches how the annotation file was produced.

    Raises:
        Exception: if ``strand`` is neither '+' nor '-'.
    """
    if strand == '+':
        return abs(start - position)
    if strand == '-':
        return abs(end - position)
    raise Exception('Mistake, {} is not a strand orientation.'.format(strand))


def build_gene_lookup(gencode):
    """Build ``{gene_id: (strand, start, end)}`` from the gencode table.

    ``gencode`` must be indexed by gene id and have 'strand', 'start' and
    'end' columns. Building the dict once replaces a pandas ``.loc`` lookup
    per eQTL row. If a gene id occurs more than once, the first row is kept.
    """
    lookup = {}
    rows = zip(gencode.index, gencode['strand'], gencode['start'], gencode['end'])
    for gene_id, strand, start, end in rows:
        lookup.setdefault(gene_id, (strand, int(start), int(end)))
    return lookup


def add_distance_column(fr, fw, gene_coords, dist_col='dist', max_rows=None):
    """Copy a tab-separated eQTL table from ``fr`` to ``fw`` with a distance column.

    Args:
        fr: text-mode iterable of lines; the first line is the header and must
            contain 'gene_id' and 'position' columns.
        fw: text-mode writable object.
        gene_coords: mapping ``gene_id -> (strand, start, end)``.
        dist_col: name of the appended column.
        max_rows: optional cap on the number of data rows processed (handy for
            quick test runs); ``None`` processes the whole file.

    Returns:
        The number of data rows written.

    Raises:
        ValueError: if a required column is missing or a row has a different
            number of fields than the header.
        KeyError: if a row's gene id is absent from ``gene_coords``.
        Exception: if a gene has an unknown strand (see ``tss_distance``).
    """
    header = next(fr, None)
    if header is None:
        # empty input: nothing to copy
        return 0

    # Split on tabs only, so empty fields keep their place in the row.
    cheader = header.rstrip('\r\n').split('\t')
    for required in ('gene_id', 'position'):
        if required not in cheader:
            raise ValueError('Column {!r} is missing from the eQTL header.'.format(required))
    gene_idx = cheader.index('gene_id')
    pos_idx = cheader.index('position')

    fw.write('\t'.join(cheader + [dist_col]) + '\n')

    n_written = 0
    for line in fr:

        if max_rows is not None and n_written >= max_rows:
            break

        line = line.rstrip('\r\n')
        if not line:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue

        # get eqtl info
        fields = line.split('\t')
        if len(fields) != len(cheader):
            msg = 'Expected {} fields but found {}: {!r}'
            raise ValueError(msg.format(len(cheader), len(fields), line))

        gene_id = fields[gene_idx]
        if gene_id not in gene_coords:
            # NOTE(review): if the annotation ids carry version suffixes and
            # the eQTL ids do not, every lookup lands here -- confirm upstream.
            raise KeyError('Gene {} was not found in the gene annotation.'.format(gene_id))
        strand, start, end = gene_coords[gene_id]

        # calculate distance based on the strand information; the position is
        # read as text and must be converted before doing arithmetic
        dist = tss_distance(strand, start, end, int(fields[pos_idx]))

        # write out the new eQTL line
        fields.append(str(dist))
        fw.write('\t'.join(fields) + '\n')
        n_written += 1

    return n_written


# The guard keeps the helpers importable without touching the file system; in
# a notebook or when run as a script __name__ is '__main__', so this still runs
# and gencode / eqtl / outfn remain module-level names.
if __name__ == '__main__':

    # load gencode
    gencode = pd.read_table('results/refs/gencode/v39/gencode.v39lift37.annotation.bed', header=None)
    gencode.columns = ['chr', 'start', 'end', 'strand', 'gene_name', 'score', 'gene_id']
    gencode.set_index('gene_id', inplace=True)

    eqtl = 'results/main/eqtl/BLUEPRINT/ge/BLUEPRINT_ge_T-cell.all.tsv.gz'
    outfn = 'results/main/eqtl/BLUEPRINT/ge/BLUEPRINT_ge_T-cell.all.dist.tsv.gz'

    in_ext = os.path.splitext(eqtl)[1]
    out_ext = os.path.splitext(outfn)[1]

    # Check both paths up front so a mismatch cannot leave a plain-text file
    # behind a .gz name (or the reverse).
    if in_ext != out_ext or in_ext not in ('.gz', '.tsv'):
        msg = 'Incorrect file types. Both input and output have to use the same extension and '
        msg += 'these extensions must either be .gz or .tsv'
        raise Exception(msg)

    if in_ext == '.gz':
        print('Dealing with .gz files.')
        opener = gzip.open
    else:
        print('Dealing with .tsv files.')
        opener = open

    # Text mode lets one code path serve both formats; newline='' hands line
    # endings through untouched so they are normalised in exactly one place.
    with opener(eqtl, 'rt', encoding='utf-8', newline='') as fr, \
            opener(outfn, 'wt', encoding='utf-8', newline='') as fw:
        add_distance_column(fr, fw, build_gene_lookup(gencode))

Dealing with .gz files.
