In [1]:
import pandas as pd

import numpy as np
import scipy.sparse as sp
import scipy.io as spio

def reverse_complement(seq) :
    """Return the reverse complement of a DNA sequence.

    'A', 'C', 'G' and 'T' are replaced by their complementary base; every
    other symbol (for example 'N', 'X' or lowercase letters) is carried over
    unchanged. The result is built from the last symbol of `seq` to the first.
    """
    partner = {'A' : 'T', 'C' : 'G', 'G' : 'C', 'T' : 'A'}
    return ''.join(partner.get(base, base) for base in reversed(seq))

In [2]:

# Column accumulators; one value is appended to each list per 'mRNA' record.
emitted_id = []
emitted_chrom = []
emitted_start = []
emitted_end = []
emitted_isoform_start = []
emitted_isoform_end = []
emitted_strand = []
emitted_isoform = []
emitted_search = []
emitted_reads = []

with open('TandemUTR.hg19.gff3') as f:
    for line_no, line in enumerate(f) :

        # The first line of the file is skipped.
        if line_no == 0 :
            continue

        fields = line[:-1].split('\t')

        chrom = fields[0]
        event_type = fields[2]
        start = int(fields[3])
        end = int(fields[4])
        strand = fields[6]
        id_str = fields[8]

        if event_type != 'mRNA' :
            continue

        # 400-nt window around the isoform end: anchored on `end` for '+'
        # records and on `start` for every other strand value.
        if strand == '+' :
            window_start, window_end = end - 225, end + 175
        else :
            window_start, window_end = start - 175, start + 225

        emitted_id.append(chrom + ':' + str(start) + '-' + str(end))
        emitted_chrom.append(chrom)
        emitted_start.append(window_start)
        emitted_end.append(window_end)

        # Same window with the first three characters of the chromosome name dropped.
        emitted_search.append(chrom[3:] + ':' + str(window_start) + '-' + str(window_end))

        emitted_isoform_start.append(start)
        emitted_isoform_end.append(end)
        emitted_strand.append(strand)

        #Prox = B, Dist = A
        # Isoform label = last character of the first ';'-separated attribute.
        emitted_isoform.append(id_str.split(';')[0][-1])

        emitted_reads.append(1)


bed_columns = ['chr', 'start', 'end', 'gene', 'reads', 'strand', 'search_region', 'isoform']

bed_df = pd.DataFrame({
    'chr' : emitted_chrom,
    'start' : emitted_start,
    'end' : emitted_end,
    'gene' : emitted_id,
    'reads' : emitted_reads,
    'strand' : emitted_strand,
    'search_region' : emitted_search,
    'isoform' : emitted_isoform,
})[bed_columns].sort_values(by='gene')

print(bed_df.head())
print(len(bed_df))

# Headerless, tab-separated output consumed by the bedtools cell.
bed_df.to_csv('Emitted_Tandem_UTR_200up_200dn.bed', sep='\t', header=False, index=False)

        chr      start        end                       gene  reads strand  \
3592  chr10  100007273  100007673  chr10:100007448-100008676      1      -   
3593  chr10  100007374  100007774  chr10:100007549-100008676      1      -   
4716  chr10  101909674  101910074  chr10:101909849-101911886      1      -   
4717  chr10  101911390  101911790  chr10:101911565-101911886      1      -   
4946  chr10  101991879  101992279  chr10:101992054-101992982      1      -   

               search_region isoform  
3592  10:100007273-100007673       A  
3593  10:100007374-100007774       B  
4716  10:101909674-101910074       A  
4717  10:101911390-101911790       B  
4946  10:101991879-101992279       A  
5312


In [3]:

# Reference genome paths. `hg19_fai` is assigned here but not referenced in
# the cells visible in this notebook.
hg19_fai     = '../apadb/hg19.fa.fai'
hg19_fa      = '../apadb/hg19.fa'

# fasta
# Destination FASTA; the event-loading cell later reads this same file name.
output_fa = 'Emitted_Tandem_UTR_200up_200dn_Seqs.fa'
# Earlier variant of the command, kept for reference. It relies on an
# `output_bed` variable that is not defined in the visible cells.
#!bedtools getfasta -name -s -fi "$hg19_fa" -bed "$output_bed" -fo | cut -d : -f-4 > "$output_fa"
# Extract one sequence per BED row. Per the bedtools documentation, -name uses
# the BED name column as the FASTA header and -s makes extraction strand-aware.
# NOTE(review): header formatting under -name differs between bedtools
# versions - confirm the headers still match the BED name column exactly.
!bedtools getfasta -name -s -fi "$hg19_fa" -bed "Emitted_Tandem_UTR_200up_200dn.bed" -fo "$output_fa"
    
# file tops
# Preview the first rows of the BED file and the first lines of the FASTA.
!head -5 "Emitted_Tandem_UTR_200up_200dn.bed" | column -t ; echo
!head -10 "$output_fa" ; echo


chr10  100007273  100007673  chr10:100007448-100008676  1  -  10:100007273-100007673  A
chr10  100007374  100007774  chr10:100007549-100008676  1  -  10:100007374-100007774  B
chr10  101909674  101910074  chr10:101909849-101911886  1  -  10:101909674-101910074  A
chr10  101911390  101911790  chr10:101911565-101911886  1  -  10:101911390-101911790  B
chr10  101991879  101992279  chr10:101992054-101992982  1  -  10:101991879-101992279  A

>chr10:100007448-100008676
GAAATCAGAAGGGGGAACAGTCAGTTTAGTTAAGGATGGAACCTGGGAAAGGCCACCATTCCTGCTTGATGGGGCTCTGATTTGCTCTTGCTCAAGTGGAATAAAACCCCATGGTCTTCTTGACATGATTCTTGATCTTTTCTCCACTGAGACACACTTAAGTGATGATCCTTACAGGACTGACACCCTAATGCCAATAAAAGTTGCTCATTATGGACTGCTACAAAGACCAGACCAGCTGTGATTATGTACACACCCCAGAACCACAGACCTACCCTTTTGCCTCCCATGGAGCCAGGGCCTGGGATACACAGGGCACACATGCTTCTCATGACTGCCCATCTACATTTCTCCAATCAGACATAAGTCTGCTTCCCAGTCAGCCTCCCCAGGCTTAGAG
>chr10:100007549-100008676
ATTGAATAAATTTTTGCCATGGAAAGAACATCAAACAAGCCACTCATCTCTACAGAGATAAGAAAACAAGTTTGGCAGAGCAAGAGACAGAAGACCGTGGAGAA

In [3]:
#Inflate sample whitelist
# Keys are the first tab-separated column of every line after the first one.
sample_set = {}
with open('E-GEUV-1.sdrf.txt') as f:
    for row_index, line in enumerate(f) :
        if row_index > 0 :
            sample_set[line[:-1].split('\t')[0]] = True

#Inflate Tandem UTR events
# One record per BED row, keyed by the BED name column (column 3).
event_dict = {}
with open('Emitted_Tandem_UTR_200up_200dn.bed') as f:
    for line in f:
        fields = line[:-1].split('\t')
        event_id = fields[3]

        event_dict[event_id] = {
            'chrom' : fields[0],
            'start' : int(fields[1]),
            'end' : int(fields[2]),
            'strand' : fields[5],
            'isoform' : fields[7],
            # Per-sample containers, filled in by later cells.
            'ref' : {'samples' : {}},
            'var' : {'samples' : {}},
            'seq_map' : {},
        }

# The FASTA is read as strictly alternating lines: even-numbered lines are
# headers (leading character dropped), odd-numbered lines are the sequence of
# the header just before them.
with open('Emitted_Tandem_UTR_200up_200dn_Seqs.fa') as f:
    event_id = ''
    for row_index, line in enumerate(f) :
        text = line[:-1]

        if row_index % 2 == 0 :
            event_id = text[1:]
        else :
            event_dict[event_id]['seq'] = text.upper()


print(len(event_dict))
print(event_dict['chr10:102587376-102589698'])


5312
{'end': 102589873, 'seq': 'GGCGGCGAGGGCGGCGAGGGCGCCGAGGTCCGGCCCATCCCAGTCCTGTGGGGCTGGCCGGGCAGAGACCCCGGACCCAGGCCCAGGCCTAACCTGCTAAATGTCCCCGGACGGTTCTGGTCTCCTCGGCCACTTTCAGTGCGTCGGTTCGTTTTGATTCTTTTTCTTTTGTGCACATAAGAAATAAATAATAATAATAAATAAAGAATAAAATTTTGTATGTCACTCCCCATGGCTCCAAGTTTGTCTCTCCCTGTCTCTGAGATGGGCCTCCCCTCCATTGGTCGATCCCCAAAAGCCCCTTCAATGATCCTCCCAACTACACTCCCGCTGCCACCTCCAACTCCTTTGCCGAGACCCCCTTGGCGGCAGCTGAACCACGGCGAAGGCCGAGACTAGC', 'chrom': 'chr10', 'isoform': 'A', 'start': 102589473, 'seq_map': {}, 'var': {'samples': {}}, 'ref': {'samples': {}}, 'strand': '+'}


In [4]:

def add_snp(seq, strand, var_type, ref, var, start_pos, var_pos) :
    """Apply a single variant call to an event sequence.

    Parameters
    ----------
    seq : str
        Event sequence. For strand '-' it is reverse-complemented before the
        variant is applied and reverse-complemented again before returning.
    strand : str
        '-' triggers the reverse-complement round trip; any other value
        leaves the orientation untouched.
    var_type : str
        'SNP', 'INDEL', or 'OTHER' (the latter only with var == '<DEL>').
    ref, var : str
        Reference and alternate alleles.
    start_pos, var_pos : int
        Window start and variant position; the offset into the sequence is
        var_pos - start_pos - 1.
        NOTE(review): presumably a 0-based BED start and a 1-based variant
        position - confirm against the call files.

    Returns
    -------
    (str, int)
        (mutated sequence cut/padded with 'X' to 400 characters, offset of
        the variant in the returned orientation) on success.
        ('', 0) when the variant lies outside offsets [5, 395] or its type is
        not handled. ('', -1) when a SNP reference base does not match.

    Raises
    ------
    ValueError
        When an INDEL or <DEL> reference allele does not match the sequence.
        The original code aborted here through a deliberate `'' + 1`
        TypeError; the abort is kept but is now an explicit exception.
    """

    rel_pos_start = var_pos - start_pos - 1
    rel_pos_end = rel_pos_start + len(ref)
    # Variants within 5 positions of either window edge are ignored.
    if rel_pos_start < 5 or rel_pos_end > 395 :
        return '', 0

    var_seq = seq[:]
    if strand == '-' :
        var_seq = reverse_complement(var_seq)

    # Same offset for every variant type; the guard above ensures it is >= 5.
    rel_pos = rel_pos_start

    if var_type == 'SNP' :
        if var_seq[rel_pos] == ref :
            var_seq = var_seq[0:rel_pos] + var + var_seq[rel_pos+1:]
        else :
            print(seq)
            print(rel_pos)
            print(strand)
            print(ref)
            print(var)

            print('ERROR (SNP): Sequence not aligned with genome reference.')
            return '', -1
    elif var_type == 'INDEL' :
        if var_seq[rel_pos_start:rel_pos_end] == ref :
            var_seq = var_seq[0:rel_pos_start] + var + var_seq[rel_pos_end:]
        else :
            print(var_seq)
            print(rel_pos_start)
            print(rel_pos_end)
            print(var_seq[rel_pos_start:rel_pos_end])
            print(ref)
            print(var)

            raise ValueError('ERROR (INDEL): Sequence not aligned with genome reference.')
    elif var_type == 'OTHER' and var == '<DEL>' :
        if var_seq[rel_pos_start:rel_pos_end] == ref :
            var_seq = var_seq[0:rel_pos_start] + var_seq[rel_pos_end:]
        else :
            raise ValueError('ERROR (DEL): Sequence not aligned with genome reference.')
    else :
        return '', 0

    # Pad with 'X' so deletions still yield 400 characters, then cut
    # insertions back to 400.
    var_seq += ('X' * 20)
    var_seq = var_seq[:400]

    if strand == '-' :
        var_seq = reverse_complement(var_seq)
        # Mirror the offset into the reverse-complemented orientation.
        rel_pos = 400 - (rel_pos + 1)

    return var_seq, rel_pos


In [5]:

#Inflate sample variant calls

# event_id -> {sample: True} for every whitelisted sample that has a
# two-allele genotype on at least one retained line of the event's call file.
valid_sample_dict = {}

event_i = 0
for event_id in event_dict :
    
    seq = event_dict[event_id]['seq']
    chrom = event_dict[event_id]['chrom']
    start = event_dict[event_id]['start']
    end = event_dict[event_id]['end']
    strand = event_dict[event_id]['strand']
    
    valid_sample_dict[event_id] = {}
    
    # Call file name: event id with ':' replaced by '_', then the chromosome
    # name minus its first three characters, then the window coordinates.
    call_file = 'snps2/' + event_id.replace(':', '_') + '_' + chrom[3:] + '_' + str(start) + '-' + str(end) + '.txt'
    try :
        with open(call_file) as f:
            for line in f:
                # Columns used: 0 = variant type, 2 = position, 3 = ref allele,
                # 4 = alt allele, 5.. = one 'sample=genotype' entry per sample.
                lineparts = line[:-1].split('\t')

                snp_type = lineparts[0]
                snp_pos = int(lineparts[2])

                ref = lineparts[3]
                var = lineparts[4]

                # Alleles longer than 10 characters are skipped entirely.
                if len(ref) > 10 or len(var) > 10 :
                    continue

                for sample_index in range(5, len(lineparts)) :
                    sample_lineparts = lineparts[sample_index].split('=')

                    sample = sample_lineparts[0]

                    # Genotype separator is '|' or, failing that, '/'; entries
                    # with neither are skipped.
                    alleles = sample_lineparts[1].split('|')
                    if len(alleles) == 1 :
                        alleles = sample_lineparts[1].split('/')
                    if len(alleles) == 1 :
                        continue

                    # Note: alleles are parsed before the whitelist check, so a
                    # non-integer allele raises even for non-whitelisted samples.
                    allele1 = int(alleles[0])
                    allele2 = int(alleles[1])

                    if sample not in sample_set :
                        continue

                    valid_sample_dict[event_id][sample] = True

                    # zyg: 2 when both alleles are non-zero, 1 when exactly one
                    # is; samples with both alleles 0 are skipped for this line.
                    zyg = ''
                    if allele1 == 0 and allele2 == 0 :
                        continue
                    elif allele1 > 0 and allele2 > 0 :
                        zyg = 2
                    else :
                        zyg = 1

                    # First non-reference call for this sample: start from the
                    # reference sequence. 'zyg' starts at 2 and is lowered with
                    # min() below as variants are applied.
                    if sample not in event_dict[event_id]['var']['samples'] :
                        event_dict[event_id]['var']['samples'][sample] = {}
                        event_dict[event_id]['var']['samples'][sample]['seq'] = seq
                        event_dict[event_id]['var']['samples'][sample]['zyg'] = 2
                        event_dict[event_id]['var']['samples'][sample]['count'] = 0
                        event_dict[event_id]['var']['samples'][sample]['type'] = ''
                        event_dict[event_id]['var']['samples'][sample]['pos'] = ''
                        event_dict[event_id]['var']['samples'][sample]['snpid'] = ''

                    # A variant is attempted when nothing has been applied yet,
                    # or when it is a SNP and no INDEL/OTHER variant has been
                    # applied to this sample's sequence.
                    add_mut = False
                    if event_dict[event_id]['var']['samples'][sample]['count'] == 0 :
                        add_mut = True
                    if snp_type == 'SNP' and 'OTHER' not in event_dict[event_id]['var']['samples'][sample]['type'] and 'INDEL' not in event_dict[event_id]['var']['samples'][sample]['type'] :
                        add_mut = True
                    
                    if add_mut == True :
                        # add_snp returns '' as the sequence when the variant
                        # was not applied; the record is then left untouched.
                        var_seq, rel_pos = add_snp(event_dict[event_id]['var']['samples'][sample]['seq'], strand, snp_type, ref, var, start, snp_pos)
                        if var_seq != '' :
                            event_dict[event_id]['var']['samples'][sample]['seq'] = var_seq
                            event_dict[event_id]['var']['samples'][sample]['zyg'] = min(zyg, event_dict[event_id]['var']['samples'][sample]['zyg'])
                            event_dict[event_id]['var']['samples'][sample]['count'] += 1
                            # 'type', 'pos' and 'snpid' are comma-separated
                            # lists with one entry per applied variant.
                            if event_dict[event_id]['var']['samples'][sample]['type'] == '' :
                                event_dict[event_id]['var']['samples'][sample]['type'] = snp_type
                            else :
                                event_dict[event_id]['var']['samples'][sample]['type'] += ',' + snp_type
                            if event_dict[event_id]['var']['samples'][sample]['pos'] == '' :
                                event_dict[event_id]['var']['samples'][sample]['pos'] = str(rel_pos)
                            else :
                                event_dict[event_id]['var']['samples'][sample]['pos'] += ',' + str(rel_pos)
                            
                            if event_dict[event_id]['var']['samples'][sample]['snpid'] == '' :
                                event_dict[event_id]['var']['samples'][sample]['snpid'] = str(chrom) + str(strand) + ':' + str(int(start)) + '-' + str(int(end)) + ':' + str(int(snp_pos)) + '/' + str(snp_type) + '/' + str(ref) + '/' + str(var)
                            else :
                                event_dict[event_id]['var']['samples'][sample]['snpid'] += ',' + str(chrom) + str(strand) + ':' + str(int(start)) + '-' + str(int(end)) + ':' + str(int(snp_pos)) + '/' + str(snp_type) + '/' + str(ref) + '/' + str(var)

        #print('Number of variant samples for event ' + event_id + ': ' + str(len(event_dict[event_id]['var']['samples'])))

        # Progress message; it sits inside the try block, so it is not printed
        # for an event whose call file could not be opened.
        if event_i % 1000 == 0 :
            print('Processed ' + str(event_i + 1) + ' events.')
    except IOError :
        # Missing or unreadable call file: report it and move on to the next event.
        print('ERROR: Could not open file: ' + call_file)
        
    event_i += 1


# Split each event's valid samples into reference carriers (no variant record)
# and variant carriers, the latter grouped by their mutated sequence in 'seq_map'.
for event_id in event_dict :
    for sample in sample_set :
        
        if sample not in event_dict[event_id]['var']['samples'] and sample in valid_sample_dict[event_id] :
            event_dict[event_id]['ref']['samples'][sample] = {}
        
        elif sample in valid_sample_dict[event_id] :
            var_event = event_dict[event_id]['var']['samples'][sample]
            
            if var_event['seq'] not in event_dict[event_id]['seq_map'] :
                event_dict[event_id]['seq_map'][var_event['seq']] = {}
            event_dict[event_id]['seq_map'][var_event['seq']][sample] = True

Processed 1 events.
Processed 1001 events.
Processed 2001 events.
Processed 3001 events.
Processed 4001 events.
Processed 5001 events.


In [6]:

# Spot-check: dump the complete record stored for one event id.
print(event_dict['chr5:34019556-34020686'])


{'end': 34019781, 'seq': 'ATAAACTTTAATAGTCATAGAATTGTAAATCACTATGGTTAACAGAAAGTGAAAATATTTTCATGCAGATGATGTGAACAGCCATGTGAATAGGTGACTTGGGCACACAGCAGGGTCATATGACTTCAGAAAACTTCGCTTTTCAGTTATTCCATTGTTATAATGTCAACCCTTTAAGACATTGATGTTTAGAGGGCTCACAAATAAAATCTGAATACCTGTAAGGAAAGAGGTTTTTTATCACATACCTTAAGTCTTTGTAATGTTCATGCTTAAATTCTAAGTTTTCACCTTAGTGACACACAAGGTTTGGTTGTAGGCAACAAGTCCCAGGTGTGTGGGAAATTGATTCACAACAGAGATGGGAAAAGGTGCAGATAATTTCCAATGCCTTCACAAT', 'chrom': 'chr5', 'isoform': 'A', 'start': 34019381, 'seq_map': {}, 'var': {'samples': {}}, 'ref': {'samples': {}}, 'strand': '-'}


In [7]:

#Inflate MISO expression

for sample in sample_set :

    summary_path = 'geuvadis/' + sample + '_summary/summary/geuvadis_output.miso_summary'
    with open(summary_path) as f:
        for row_index, line in enumerate(f) :

            # The first line of each summary file is skipped.
            if row_index == 0 :
                continue

            fields = line[:-1].split('\t')

            psi_mean = float(fields[1])
            psi_low = float(fields[2])
            psi_high = float(fields[3])

            chrom = fields[7]

            start_positions = fields[9].split(',')
            end_positions = fields[10].split(',')

            a_event_id = chrom + ":" + start_positions[0] + '-' + end_positions[0]
            b_event_id = chrom + ":" + start_positions[1] + '-' + end_positions[1]

            # The first isoform takes the reported PSI and interval as-is; the
            # second takes the complement, which swaps the interval bounds.
            psi_by_isoform = (
                (a_event_id, psi_mean, psi_low, psi_high),
                (b_event_id, 1.0 - psi_mean, 1.0 - psi_high, 1.0 - psi_low),
            )

            for target_id, mean, low, high in psi_by_isoform :
                if target_id not in event_dict :
                    continue

                event = event_dict[target_id]

                # A sample's variant record takes precedence over its reference
                # record; samples with neither are ignored.
                if sample in event['var']['samples'] :
                    record = event['var']['samples'][sample]
                elif sample in event['ref']['samples'] :
                    record = event['ref']['samples'][sample]
                else :
                    continue

                record['psi_mean'] = mean
                record['psi_low'] = low
                record['psi_high'] = high
    
    


In [8]:

#Filter variant events

# Maximum accepted width of the PSI interval (psi_high - psi_low).
ci_limit = 0.25#0.25

for event_id in event_dict :

    var_samples = event_dict[event_id]['var']['samples']
    ref_samples = event_dict[event_id]['ref']['samples']

    # Variant samples are dropped when they have no PSI, an interval wider
    # than ci_limit, or no successfully applied variant.
    rejected_var = [
        sample for sample in var_samples
        if 'psi_mean' not in var_samples[sample]
        or var_samples[sample]['psi_high'] - var_samples[sample]['psi_low'] > ci_limit
        or var_samples[sample]['count'] == 0
    ]
    for sample in rejected_var :
        del var_samples[sample]

    # Reference samples are dropped when they have no PSI or too wide an interval.
    rejected_ref = [
        sample for sample in ref_samples
        if 'psi_mean' not in ref_samples[sample]
        or ref_samples[sample]['psi_high'] - ref_samples[sample]['psi_low'] > ci_limit
    ]
    for sample in rejected_ref :
        del ref_samples[sample]


# An event is kept only with MORE than these many samples (the tests use <=).
min_ref_samples = 5
min_var_samples = 1

delete_list = []
for event_id in event_dict :
    event = event_dict[event_id]
    kept_var = event['var']['samples']

    # Remove from every sequence group the samples filtered out above.
    for carriers in event['seq_map'].values() :
        stale = [sample for sample in carriers if sample not in kept_var]
        for sample in stale :
            del carriers[sample]

    if len(kept_var) <= min_var_samples or len(event['ref']['samples']) <= min_ref_samples :
        delete_list.append(event_id)

for event_id in delete_list :
    del event_dict[event_id]

In [9]:

# Number of events still present in event_dict after the filtering cell.
print(len(event_dict))


1945


In [10]:

#Make Valid PAS lookup hierarchy

cano_pas1 = 'AATAAA'
cano_pas2 = 'ATTAAA'

# valid_pas[k] maps hexamer -> True; a lower k is a higher-priority tier.
valid_pas = [
    # Tier 0: the two canonical hexamers.
    {cano_pas1 : True, cano_pas2 : True},

    # Tier 1: four hand-picked variants.
    {'AGTAAA' : True, 'TATAAA' : True, 'CATAAA' : True, 'GATAAA' : True},

    # Tier 2: cano_pas1 with any one position replaced by any base
    # (cano_pas1 itself is therefore included).
    {
        cano_pas1[:pos] + base + cano_pas1[pos+1:] : True
        for pos in range(0, 6)
        for base in ['A', 'C', 'G', 'T']
    },

    # Tier 3: cano_pas1 with any two distinct positions replaced by any bases.
    {
        cano_pas1[:pos1] + base1 + cano_pas1[pos1+1:pos2] + base2 + cano_pas1[pos2+1:] : True
        for pos1 in range(0, 6)
        for pos2 in range(pos1 + 1, 6)
        for base1 in ['A', 'C', 'G', 'T']
        for base2 in ['A', 'C', 'G', 'T']
    },
]


In [11]:

def get_average_ref_psi(event) :
    """Average PSI over the reference samples of one event.

    Only samples whose 'psi_mean' is >= 0 contribute.

    Returns
    -------
    (mean_psi, n_samples, any_member)
        `n_samples` is a float count of contributing samples and `any_member`
        is the record of the last contributing sample visited. When no sample
        contributes, (-1, 0, None) is returned - the same sentinel that
        get_average_var_psi uses - instead of raising ZeroDivisionError.
    """
    psi_mean = 0.0
    psi_mean_count = 0.0

    any_member = None

    for sample in event['ref']['samples'] :
        member = event['ref']['samples'][sample]
        if member['psi_mean'] >= 0 :
            psi_mean += member['psi_mean']
            psi_mean_count += 1.0
            any_member = member

    # Guard against an empty (or fully negative) reference set.
    if psi_mean_count <= 0 :
        return -1, 0, None

    return psi_mean / psi_mean_count, psi_mean_count, any_member

def get_average_var_psi(event_id, event, seq, zyg) :
    """Average PSI over the carriers of one variant sequence at one zygosity.

    The samples listed under event['seq_map'][seq] are looked up in
    event['var']['samples']; those with 'psi_mean' >= 0 and 'zyg' equal to
    `zyg` contribute. `event_id` is accepted but not used.

    Returns (mean_psi, n_samples, any_member), where `n_samples` is a float
    and `any_member` is the record of the last contributing sample visited,
    or (-1, 0, None) when no sample contributes.
    """
    carriers = event['var']['samples']

    matching = [
        carriers[sample]
        for sample in event['seq_map'][seq]
        if carriers[sample]['psi_mean'] >= 0 and carriers[sample]['zyg'] == zyg
    ]

    if not matching :
        return -1, 0, None

    n_matching = float(len(matching))
    psi_total = sum((member['psi_mean'] for member in matching), 0.0)

    return psi_total / n_matching, n_matching, matching[-1]

def align_seqs(ref_seq, var_seq, cut_start, cut_end, before_cut = 35, after_cut = 5) :
    """Crop a reference/variant sequence pair around the best hexamer hit.

    The tiers of the module-level `valid_pas` list are searched in order.
    Within a tier, offsets from cut_start - before_cut up to (but excluding)
    cut_start + after_cut are scanned left to right, and an offset matches
    when the 6-mer of either sequence at that offset is in the tier. The
    first match wins. Without any match the anchor is cut_start - 25 and the
    tier is reported as -1. `cut_end` is accepted but not used.

    Both sequences are cropped to at most 186 characters starting 50
    positions before the anchor.

    Returns (cropped_ref, cropped_var, tier_index, mut_pos), where `mut_pos`
    is the result of get_mut_pos on the two cropped sequences.
    """
    scan_window = range(cut_start - before_cut, cut_start + after_cut)

    first_hit = next(
        (
            (tier, offset)
            for tier, motifs in enumerate(valid_pas)
            for offset in scan_window
            if ref_seq[offset:offset+6] in motifs or var_seq[offset:offset+6] in motifs
        ),
        None
    )

    if first_hit is None :
        aligned, align_j = -1, cut_start - 25
    else :
        aligned, align_j = first_hit

    crop_from = align_j - 50
    aligned_ref_seq = ref_seq[crop_from:][:186]
    aligned_var_seq = var_seq[crop_from:][:186]

    return aligned_ref_seq, aligned_var_seq, aligned, get_mut_pos(aligned_ref_seq, aligned_var_seq)

def get_mut_pos(ref_seq, var_seq) :
    """Return the comma-separated 0-based positions where the sequences differ.

    Every index of `ref_seq` is compared against the same index of `var_seq`;
    characters of `var_seq` beyond the length of `ref_seq` are not examined.
    An empty string means no difference was found.
    """
    return ','.join(
        str(index)
        for index, ref_base in enumerate(ref_seq)
        if ref_base != var_seq[index]
    )

In [15]:

#Deflate data set

# Column order of the exported table (tab-separated despite the .csv suffix).
output_columns = [
    'snp_id', 'snp_type', 'isoform', 'zyg', 'pas', 'ref_seq', 'var_seq',
    'ref_ratio', 'var_ratio', 'diff', 'diff_logodds', 'snp_count', 'snp_pos',
    'ref_samples', 'var_samples',
]

# Acceptance rules per zygosity: (label, zyg code, ((min |dPSI|, min carriers), ...)).
# A row is written once if ANY of its (min |dPSI|, min carriers) pairs is met.
zygosity_rules = (
    ('HET', 1, ((0.15, 5), (0.3, 3))),
    ('HOM', 2, ((0.10, 5), (0.25, 2))),
)

# Minimum number of reference samples required by every rule.
ref_sample_limit = 10

def _psi_logit(psi, eps = 1e-6) :
    """Log-odds of `psi`, clipped to [eps, 1 - eps].

    The clipping keeps the result finite: without it a PSI of exactly 1.0
    raises ZeroDivisionError and a PSI of exactly 0.0 yields -inf.
    """
    psi = min(max(psi, eps), 1.0 - eps)
    return np.log(psi / (1.0 - psi))

with open('APA_Tandem_UTR_GEUV_With_Id.csv', 'w') as out_f :

    out_f.write('\t'.join(output_columns) + '\n')

    for event_id in event_dict :

        event = event_dict[event_id]

        ref_seq = event['seq']
        ref_psi, ref_count, ref_member = get_average_ref_psi(event)

        isoform = event['isoform']

        for var_seq in event['seq_map'] :

            aligned_ref_seq, aligned_var_seq, aligned, mut_pos = align_seqs(ref_seq, var_seq, 225, 225+1)

            for zyg_label, zyg_code, rules in zygosity_rules :

                var_psi, var_count, var_member = get_average_var_psi(event_id, event, var_seq, zyg_code)

                # Needs a valid PSI, a hexamer hit, and at least one visible
                # difference inside the cropped sequences.
                if not (var_psi >= 0.0 and var_psi <= 1.0 and aligned != -1 and mut_pos != '') :
                    continue
                if ref_count < ref_sample_limit :
                    continue

                psi_diff = var_psi - ref_psi
                rule_met = any(
                    np.abs(psi_diff) >= min_diff and var_count >= min_carriers
                    for min_diff, min_carriers in rules
                )
                if not rule_met :
                    continue

                diff_logodds = str(round(_psi_logit(var_psi) - _psi_logit(ref_psi), 2))

                out_f.write('\t'.join([
                    var_member['snpid'],
                    var_member['type'],
                    isoform,
                    zyg_label,
                    str(aligned),
                    aligned_ref_seq,
                    aligned_var_seq,
                    str(ref_psi),
                    str(var_psi),
                    str(psi_diff),
                    diff_logodds,
                    str(var_member['count']),
                    str(mut_pos),
                    str(ref_count),
                    str(var_count),
                ]) + '\n')



In [16]:

# Reload the table written by the export cell (tab-separated despite the
# .csv suffix).
df = pd.read_csv('APA_Tandem_UTR_GEUV_With_Id.csv', sep='\t')

print(df.head())

# Order the rows by the 'diff' column, using the sort_values defaults.
df = df.sort_values(by='diff')

# Save the sorted copy next to the unsorted one, keeping the header row.
df.to_csv('APA_Tandem_UTR_GEUV_With_Id_Sorted.csv', sep='\t', header=True, index=False)


                                         snp_id snp_type isoform  zyg  pas  \
0      chr1-:52818031-52818431:52818188/SNP/G/A      SNP       B  HOM    1   
1  chr11+:10328480-10328880:10328747/INDEL/T/TC    INDEL       B  HOM    2   
2      chr6+:71570645-71571045:71570880/SNP/T/A      SNP       B  HET    0   
3      chr6+:71570645-71571045:71570880/SNP/T/A      SNP       B  HOM    0   
4   chr7+:128589864-128590264:128590132/SNP/T/G      SNP       A  HET    0   

                                             ref_seq  \
0  GGTCTACCCACCCAAGAGAAAAGACTGTTAACTGGAAGAAAAAATA...   
1  AACTATCAATATTTAAGTTTGTTGCTGTCAAGATTTTTTTTGTAAC...   
2  TGTGTTTACATTTTATGGTGCCTAGTATTGACAAAATGTTATTTCC...   
3  TGTGTTTACATTTTATGGTGCCTAGTATTGACAAAATGTTATTTCC...   
4  CCAGGTGCAGATGCCCAATCTTGATGCCCAGCCATCAGAACTGTGA...   

                                             var_seq  ref_ratio  var_ratio  \
0  GGTCTACCCACCCAAGAGAAAAGACTGTTAACTGGAAGAAAAAATA...   0.794158   0.896942   
1  AACTATCAATATTTAAGTTTGTTGCTGTCAAGATT