In [3]:
import gzip
import sys
from copy import deepcopy

sys.path.insert(0, '../')
from ref_lib.GTF import GTFfile, GTFEntry, get_gtf_contents
from ref_lib.Fasta import FastaEntry, FastaFile

from collections import defaultdict, OrderedDict

**TODO:** Document this notebook in more detail.

In [4]:
# Complement lookup for the four nucleotides; "N" maps to itself.
reverse_complement_dict = dict(zip("ACGTN", "TGCAN"))

In [5]:
# Column names, in file order, used to label the leading columns of each row.
VCF_FIELDS = [
    "CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "CAST_EiJ",
    "CHR_GENOMIC", "POS_GENOMIC", "STRAND", "TRANS_HEADER",
    "KOZAK_START", "KOZAK_END", "KOZAK_MOTIF",
    "DUMMY_SCORE", "DUMMY_STRAND",
]


class VcfEntry:
    """
    A single data row, stored as a mapping from column name to raw string value.

    Only the first ``len(VCF_FIELDS)`` columns of the row are kept; any
    additional trailing columns are ignored.
    """

    def __init__(self, vcf_line_contents):
        """
        vcf_line_contents: sequence of column values for one row. It must
        contain at least as many items as there are names in VCF_FIELDS,
        otherwise an AssertionError is raised.
        """
        expected_columns = len(VCF_FIELDS)
        assert len(vcf_line_contents) >= expected_columns

        # zip stops at the shorter input, i.e. at the last named column.
        self.fields = dict(zip(VCF_FIELDS, vcf_line_contents))

    def __str__(self):
        """
        Return the stored columns joined by tabs, in VCF_FIELDS order.
        """
        return "\t".join(self.fields[name] for name in VCF_FIELDS)


############################################################################
    
class VcfFile:
    '''
    Sequential reader for VCF-like, whitespace-delimited text files.

    Paths ending in ".gz" are opened with gzip; any other non-empty path is
    opened as plain text. An empty / None path reads from standard input.

    Iteration (``for entry in VcfFile(path)``) relies on ``__getitem__``
    raising IndexError at the end of the data, so the reader is single-pass:
    once exhausted it yields nothing more.

    Can be used as a context manager; the underlying file is closed on exit.
    '''

    def __init__(self , file):
        """
        file: path of the file to read, or an empty value to read from stdin.
        """
        if file:
            # Pick the opener from the extension; str() lets path-like objects through.
            opener = gzip.open if str(file).endswith(".gz") else open
            self.f = opener(file , "rt")
            self._owns_handle = True
        else:
            # stdin belongs to the interpreter, so it must never be closed by us.
            self.f = sys.stdin
            self._owns_handle = False

    #####################################################

    def __enter__(self):
        return self

    #####################################################

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    #####################################################

    def close(self):
        """
        Close the underlying file if this reader opened it. Safe to call
        repeatedly, and safe on a partially constructed instance.
        """
        handle = getattr(self, "f", None)
        if handle is None or not getattr(self, "_owns_handle", False):
            return
        if not handle.closed:
            handle.close()

    ######################################################

    def __getitem__(self, index):
        """
        Return the next data row as a VcfEntry; ``index`` is ignored.

        Lines starting with "#" and blank lines are skipped.
        Raises IndexError at end of file, or when a row has fewer than
        9 columns (which therefore also ends iteration).
        """
        while True:
            raw_line = self.f.readline()
            # readline() returns "" only at EOF; a blank line is "\n".
            if raw_line == "":
                raise IndexError
            line = raw_line.strip()
            if line and not line.startswith("#"):
                break

        line_contents = line.split()
        if len(line_contents) < 9:
            raise IndexError
        return VcfEntry(line_contents)

    #########################################################

    def __del__(self):
        self.close()



In [6]:
# Gzip-compressed VCF; read further below to index its entries by genomic chromosome and position.
main_vcf_file = "transcriptomic_variants.vcf.gz"

In [7]:
# Input VCF for the Kozak sequence cell below (its rows are expected to carry the KOZAK_* columns).
vcf_file = "./appris_mouse_v2_riboITP_transcriptomic_variants_Kozak.vcf"

# NOTE: VcfFile reads line by line from an open handle, so this reader can be iterated only once.
my_vcf = VcfFile(vcf_file)

In [8]:
# Input BED file; its rows are read below and written back out with reference/alternate sequences appended.
bed_file = "./rbp_selected_snps_intersect.bed"

In [9]:
# Transcriptome FASTA (gzip-compressed); its sequences are later looked up by header.
fasta_file   = "../../../mouse_itp_reference/transcriptome/varnt_masked_and_filtered_mouse_transcriptome.fa.gz" 
# FastaFile comes from ref_lib.Fasta; presumably it handles the .gz extension itself — TODO confirm.
fasta_reader = FastaFile(fasta_file)

In [10]:
# Materialise every record from the reader so the file is read only once.
fasta_entries = list(fasta_reader)

In [11]:
# header -> sequence lookup, keeping the order of the FASTA file.
# A header that occurs more than once keeps its first position but its last sequence.
fasta_dict = OrderedDict(
    (record.header, record.sequence) for record in fasta_entries
)

In [12]:
# Append, to every row of the Kozak VCF, the Kozak window carrying the REF allele
# and the same window carrying the ALT allele (two extra tab-separated columns).
output_vcf = "appris_mouse_v2_riboITP_transcriptomic_variants_Kozak_maternal_paternal_seq_added.vcf"

# The reader is opened here instead of reusing one created in an earlier cell:
# VcfFile is single-pass, so a shared reader would be exhausted after the first
# run and re-running this cell would silently write an empty file.
with VcfFile(vcf_file) as kozak_vcf,\
     open(output_vcf, "wt") as output_stream:
    for entry in kozak_vcf:

        sequence_start = int(entry.fields["KOZAK_START"] )
        sequence_end   = int(entry.fields["KOZAK_END"] )
        this_gene      = entry.fields["TRANS_HEADER"]

        # Note that VCF is 1-based.
        # So we need to subtract 1 to make it 0 based
        snp_position = int(entry.fields["POS"]) - 1

        # Position of the variant inside the extracted window.
        # NOTE(review): assumes KOZAK_START is 0-based and <= the variant position — confirm.
        snp_offset = snp_position - sequence_start

        this_kozak_sequence_raw = fasta_dict[this_gene][sequence_start: sequence_end]

        ref_allele = entry.fields["REF"]
        alt_allele = entry.fields["ALT"]

        # REF occupies len(REF) bases of the sequence starting at the variant
        # position, so that many bases are replaced (exactly one for a SNP).
        upstream   = this_kozak_sequence_raw[:snp_offset]
        downstream = this_kozak_sequence_raw[snp_offset + len(ref_allele):]

        kozak_sequence_ref = upstream + ref_allele + downstream
        kozak_sequence_alt = upstream + alt_allele + downstream

        print( "{}\t{}\t{}".format(entry, kozak_sequence_ref, kozak_sequence_alt), file=output_stream )



## BED File Sequences

In [14]:
# Column names, in file order, used to label the leading columns of each row.
# NOTE: this rebinds the VCF_FIELDS / VcfEntry names defined earlier in the notebook.
VCF_FIELDS = [
    "CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "CAST_EiJ",
    "CHR_GENOMIC", "POS_GENOMIC", "STRAND",
]


class VcfEntry:
    """
    A single data row, stored as a mapping from column name to raw string value.

    Only the first ``len(VCF_FIELDS)`` columns of the row are kept; any
    additional trailing columns are ignored.
    """

    def __init__(self, vcf_line_contents):
        """
        vcf_line_contents: sequence of column values for one row. It must
        contain at least as many items as there are names in VCF_FIELDS,
        otherwise an AssertionError is raised.
        """
        expected_columns = len(VCF_FIELDS)
        assert len(vcf_line_contents) >= expected_columns

        # zip stops at the shorter input, i.e. at the last named column.
        self.fields = dict(zip(VCF_FIELDS, vcf_line_contents))

    def __str__(self):
        """
        Return the stored columns joined by tabs, in VCF_FIELDS order.
        """
        return "\t".join(self.fields[name] for name in VCF_FIELDS)


############################################################################
    
class VcfFile:
    '''
    Sequential reader for VCF-like, whitespace-delimited text files.

    Paths ending in ".gz" are opened with gzip; any other non-empty path is
    opened as plain text. An empty / None path reads from standard input.

    Iteration (``for entry in VcfFile(path)``) relies on ``__getitem__``
    raising IndexError at the end of the data, so the reader is single-pass:
    once exhausted it yields nothing more.

    Can be used as a context manager; the underlying file is closed on exit.
    '''

    def __init__(self , file):
        """
        file: path of the file to read, or an empty value to read from stdin.
        """
        if file:
            # Pick the opener from the extension; str() lets path-like objects through.
            opener = gzip.open if str(file).endswith(".gz") else open
            self.f = opener(file , "rt")
            self._owns_handle = True
        else:
            # stdin belongs to the interpreter, so it must never be closed by us.
            self.f = sys.stdin
            self._owns_handle = False

    #####################################################

    def __enter__(self):
        return self

    #####################################################

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    #####################################################

    def close(self):
        """
        Close the underlying file if this reader opened it. Safe to call
        repeatedly, and safe on a partially constructed instance.
        """
        handle = getattr(self, "f", None)
        if handle is None or not getattr(self, "_owns_handle", False):
            return
        if not handle.closed:
            handle.close()

    ######################################################

    def __getitem__(self, index):
        """
        Return the next data row as a VcfEntry; ``index`` is ignored.

        Lines starting with "#" and blank lines are skipped.
        Raises IndexError at end of file, or when a row has fewer than
        9 columns (which therefore also ends iteration).
        """
        while True:
            raw_line = self.f.readline()
            # readline() returns "" only at EOF; a blank line is "\n".
            if raw_line == "":
                raise IndexError
            line = raw_line.strip()
            if line and not line.startswith("#"):
                break

        line_contents = line.split()
        if len(line_contents) < 9:
            raise IndexError
        return VcfEntry(line_contents)

    #########################################################

    def __del__(self):
        self.close()



In [15]:

# Two-level index: genomic chromosome -> genomic position (int) -> VCF entry.
# A later entry at the same chromosome/position replaces an earlier one.
vcf_chr_dict = defaultdict(dict)

main_vcf = VcfFile(main_vcf_file)

for entry in main_vcf:
    entries_on_chromosome = vcf_chr_dict[entry.fields["CHR_GENOMIC"]]
    entries_on_chromosome[int(entry.fields["POS_GENOMIC"])] = entry


In [16]:
! head './rbp_selected_snps_intersect.bed'

chr1	42851330	42851337	Tbg97262300	chr1	42851336	42851337
chr1	42851331	42851338	ELAVL1	chr1	42851336	42851337
chr1	42851331	42851338	Rbp9	chr1	42851336	42851337
chr1	42851331	42851338	ZC3H14CONSTRUCT	chr1	42851336	42851337
chr1	42851331	42851338	elav	chr1	42851336	42851337
chr1	42851331	42851338	fne	chr1	42851336	42851337
chr1	42862479	42862486	NOVA1	chr1	42862485	42862486
chr1	42862480	42862487	DAZAP1	chr1	42862485	42862486
chr1	42862480	42862487	MAL8P140	chr1	42862485	42862486
chr1	42862480	42862487	RBM41	chr1	42862485	42862486


In [74]:
# For every BED row, translate the genomic window to transcript coordinates via
# the matching VCF entry, extract that window from the transcript sequence, and
# append the window with the REF allele and with the ALT allele as two extra columns.
output_bed_file = "./rbp_selected_snps_intersect_maternal_paternal_seq_added.bed"

with open(bed_file, "rt") as input_stream,\
     open(output_bed_file, "wt") as output_stream:

    for line in input_stream:

        contents = line.strip().split()

        # Column index 6 is read below, so a usable row needs at least 7 columns.
        if len(contents) < 7:
            continue

        seq_start_genomic = int(contents[1])
        seq_end_genomic   = int(contents[2])
        chr_genomic       = contents[0]
        snp_pos_genomic   = int(contents[6])

        # We need this (-1) adjustment because VCF is 1-based
        left_offset  = (snp_pos_genomic -1 ) - seq_start_genomic
        right_offset = seq_end_genomic - (snp_pos_genomic -1)

        # Raises KeyError if the BED row has no matching entry in the VCF index.
        vcf_entry = vcf_chr_dict[chr_genomic][snp_pos_genomic]

        snp_pos_trans   = int(vcf_entry.fields["POS"])

        # snp_pos_trans is also 1-based so we need -1
        # NOTE(review): the genomic offsets are applied to transcript coordinates
        # unchanged, i.e. STRAND is not consulted here — confirm this is intended
        # for minus-strand transcripts.
        # NOTE(review): seq_start_trans is not checked for being negative (window
        # starting before the transcript start) — confirm this cannot occur.
        seq_start_trans = (snp_pos_trans -1) - left_offset
        seq_end_trans   = (snp_pos_trans -1) + right_offset

        raw_sequence = fasta_dict[ vcf_entry.fields["CHROM"] ][seq_start_trans: seq_end_trans]

        snp_local_offset = left_offset

        ref_allele = vcf_entry.fields["REF"]
        alt_allele = vcf_entry.fields["ALT"]

        # REF occupies len(REF) bases of the sequence starting at the variant
        # position, so that many bases are replaced (exactly one for a SNP).
        upstream   = raw_sequence[:snp_local_offset]
        downstream = raw_sequence[snp_local_offset + len(ref_allele):]

        ref_sequence = upstream + ref_allele + downstream
        alt_sequence = upstream + alt_allele + downstream

        print( "{}\t{}\t{}".format(line.strip(), ref_sequence, alt_sequence) , file = output_stream)