In [39]:
import shutil
import pysam
import argparse
import os
import itertools
from glob import glob
from collections import *
from tools.fileOps import *
from tools.procOps import *
from tools.intervals import *
from tools.transcripts import *
from tools.mathOps import *
from tools.misc import *
from tools.bio import *
from tools.toilInterface import *
from tools.dataOps import *

In [None]:
# Step 1: cDNA_Cupcake has to be run on every genome/annotation combination we want to compare.
# That was already done for the original primate assemblies, so instead of recomputing,
# the existing results are linked into a local tofu_results/ directory.
# -p: do not fail when tofu_results/ already exists
!mkdir -p tofu_results
# NOTE(review): plain `ln` (no -s) creates hard links, which only works when source and
# destination are on the same filesystem, and it reports errors for files that are already
# present in tofu_results/ if this cell is run a second time.
!ln /hive/groups/recon/projs/primates/susie_indel_corrected/ice_validation/old_assemblies/* tofu_results/

In [45]:
# Inputs for the mapping step: each read set is aligned with GMAP against its reference,
# unmapped reads are dropped, and the result is kept as SAM.

# genome name -> quivered_hq FASTQ to align; the two *_ensembl entries reuse the gorilla
# and chimp read sets
read_map = dict([
    ('Human', 'ice_data/human.all_sizes.quivered_hq.fastq'),
    ('Susie_Gorilla', 'ice_data/gorilla.all_sizes.quivered_hq.fastq'),
    ('Clint_Chimp', 'ice_data/chimp.all_sizes.quivered_hq.fastq'),
    ('Susie_Orangutan', 'ice_data/orangutan.all_sizes.quivered_hq.fastq'),
    ('gorGor4_ensembl', 'ice_data/gorilla.all_sizes.quivered_hq.fastq'),
    ('panTro4_ensembl', 'ice_data/chimp.all_sizes.quivered_hq.fastq'),
])

# genome name -> (GMAP database directory passed to -D, database name passed to -d)
gmap_references = dict([
    ('Human', ('gmap_references/Human/', 'Human')),
    ('Susie_Gorilla', ('gmap_references/Susie_Gorilla/', 'Susie_Gorilla')),
    ('Clint_Chimp', ('gmap_references/Clint_Chimp/', 'clint')),
    ('Susie_Orangutan', ('gmap_references/Susie_Orangutan/', 'susie')),
    ('gorGor4_ensembl', ('gmap_references/gorGor4_ensembl/', 'gorGor4_ensembl')),
    ('panTro4_ensembl', ('gmap_references/panTro4_ensembl/', 'panTro4_ensembl')),
])

# Align every read set with GMAP (SAM output, 20 threads) and pass the alignments through
# `samtools view -F 4`, which drops records flagged as unmapped. The filtered SAM is
# written to tofu_results/<genome>.mapped.sam.
# Presumably run_proc chains the two commands as a pipeline (samtools reads '-', i.e.
# stdin) -- confirm against tools.procOps.
for genome, fq in read_map.iteritems():
    ref, ref_name = gmap_references[genome]
    # map reads
    cmd = [['gmap', '-D', ref, '-d', ref_name, '-f', 'samse', '-t', '20', '-n', '0', fq],
           # argv lists must contain only strings, so the flag mask is '4' rather than the
           # int 4 (an int only works if run_proc happens to stringify its arguments)
           ['samtools', 'view', '-F', '4', '-']]
    run_proc(cmd, stdout=os.path.join('tofu_results', genome + '.mapped.sam'))

In [16]:
# need to sort the SAM
for genome, fq in read_map.iteritems():
    sam = os.path.join('tofu_results', genome + '.mapped.sam')
    !sort -S8G -snk3 -k4 {sam} > {sam}.sorted

In [None]:
for genome, fq in read_map.iteritems():
    sam = os.path.join('tofu_results', genome + '.mapped.sam.sorted')
    out = os.path.join('tofu_results', genome)
    cmd = ['python', '/cluster/home/ifiddes/cDNA_Cupcake/cupcake/tofu/collapse_isoforms_by_sam.py',
          '--input', os.path.abspath(fq), '--fq', '-s', os.path.abspath(sam), '--dun-merge-5-shorter', '-o', os.path.abspath(out)]
    run_proc(cmd)
    !gtfToGenePred {out + '.collapsed.gff'} {out + '.collapsed.gp'} -genePredExt

In [50]:
# Comparisons to run. Each row is
#   [genome, annotation label, collapsed isoform genePred, annotation genePred]
# and is unpacked in exactly that order by the loading cell.
# NOTE(review): the first row is labelled 'Ensembl V90' but points at a file named
# Pongo_abelii.PPYG2.88.gp -- confirm which release is meant.
tx_sets = [
    [
        'ponAbe2',
        'Ensembl V90',
        'tofu_results/ponAbe2.collapsed.gp',
        'Pongo_abelii.PPYG2.88.gp',
    ],
    [
        'Susie_Orangutan',
        'CAT',
        'tofu_results/Susie_Orangutan.collapsed.gp',
        '/hive/groups/recon/projs/primates/primates_indel_corrected_bionano_cut/consensus_gene_set/Susie_Orangutan.gp',
    ],
    [
        'panTro4',
        'Ensembl V91',
        'tofu_results/panTro4_ensembl.collapsed.gp',
        'Pan_troglodytes.Pan_tro_3.0.91.gp',
    ],
    [
        'Susie_Gorilla',
        'CAT',
        'tofu_results/Susie_Gorilla.collapsed.gp',
        '/hive/groups/recon/projs/primates/primates_indel_corrected_bionano_cut/consensus_gene_set/Susie_Gorilla.gp',
    ],
    [
        'Human',
        'GENCODE V27',
        'tofu_results/Human.collapsed.gp',
        '/hive/groups/recon/projs/primates/primates_indel_corrected_bionano_cut/reference/gencode.v27.annotation.no_PAR.gp',
    ],
    [
        'panTro4',
        'CAT',
        'tofu_results/panTro4.collapsed.gp',
        '/hive/groups/recon/projs/primates/original_primates/redo_annotation_indel/consensus_gene_set/Chimp.gp',
    ],
    [
        'Clint_Chimp',
        'CAT',
        'tofu_results/Clint_Chimp.collapsed.gp',
        '/hive/groups/recon/projs/primates/primates_indel_corrected_bionano_cut/consensus_gene_set/Clint_Chimp.gp',
    ],
    [
        'ponAbe2',
        'CAT',
        'tofu_results/ponAbe2.collapsed.gp',
        '/hive/groups/recon/projs/primates/original_primates/redo_annotation_indel/consensus_gene_set/Orangutan.gp',
    ],
    [
        'gorGor4',
        'CAT',
        'tofu_results/gorGor4.collapsed.gp',
        '/hive/groups/recon/projs/primates/original_primates/redo_annotation_indel/consensus_gene_set/Gorilla.gp',
    ],
    [
        'gorGor4',
        'Ensembl V91',
        'tofu_results/gorGor4_ensembl.collapsed.gp',
        'Gorilla_gorilla.gorGor4.91.gp',
    ],
]

In [51]:
# Load every annotation together with its collapsed isoform set, cluster the two together
# and keep all intermediate results under the (genome, annotation) key of data_holder.
data_holder = defaultdict(dict)
for genome, annotation, iso_gp, gp in tx_sets:
    txs = get_gene_pred_dict(gp)
    # only isoforms with more than one exon interval are kept
    iso_txs = {tx_id: tx for tx_id, tx in get_gene_pred_dict(iso_gp).iteritems()
               if len(tx.exon_intervals) > 1}
    # annotated and isoform transcripts are clustered as one pool ...
    clustered = cluster_txs(txs.values() + iso_txs.values())
    # ... and divide_clusters receives the annotation transcript ids to tell the two apart
    divided_clusters = divide_clusters(clustered, txs.viewkeys())
    data_holder[(genome, annotation)].update({
        'txs': txs,
        'iso_txs': iso_txs,
        'clustered': clustered,
        'divided_clusters': divided_clusters,
    })

In [55]:
# For every (genome, annotation) comparison, count how many ICE isoforms are supported by
# the annotation and print a one-line summary:
#   * "exact": all of the isoform's introns (above the size cutoff, strand ignored) are
#     contained in the intron set of at least one annotated transcript of the same cluster
#   * "fuzzy": the number of entries returned by calculate_subset_matches()
MIN_INTRON_SIZE = 30  # introns are only compared when len(intron) > MIN_INTRON_SIZE
FUZZ_DISTANCE = 8     # passed to calculate_subset_matches as fuzz_distance


def _unstranded_introns(tx, min_size):
    """Return the set of strand-less intron intervals of ``tx`` with length > ``min_size``."""
    return {ChromosomeInterval(x.chromosome, x.start, x.stop, '.')
            for x in tx.intron_intervals if len(x) > min_size}


for (genome, annotation), d in data_holder.iteritems():
    iso_txs = d['iso_txs']
    divided_clusters = d['divided_clusters']
    num_exact = 0
    for cluster_id, (ensts, isos) in divided_clusters.iteritems():
        # build the annotated intron sets once per cluster instead of once per
        # isoform/transcript pair
        enst_intron_sets = [_unstranded_introns(enst, MIN_INTRON_SIZE) for enst in ensts]
        for iso in isos:
            iso_introns = _unstranded_introns(iso, MIN_INTRON_SIZE)
            # NOTE(review): an isoform whose introns are all at or below the cutoff yields an
            # empty set, which is a subset of anything and therefore counts as a match
            # whenever the cluster has at least one annotated transcript.
            if any(iso_introns <= enst_introns for enst_introns in enst_intron_sets):
                num_exact += 1
    fuzzy_matches = calculate_subset_matches(divided_clusters, fuzz_distance=FUZZ_DISTANCE)
    num_iso = len(iso_txs)
    num_fuzzy = len(fuzzy_matches)
    # guard against a comparison without any isoforms instead of raising ZeroDivisionError
    percent_exact = 1.0 * num_exact / num_iso if num_iso else 0.0
    percent_fuzzy = 1.0 * num_fuzzy / num_iso if num_iso else 0.0
    print('{}-{}: {:,} ICE isoforms. {:,} ({:.1%}) exact, {:,} ({:.1%}) fuzzy'.format(
        genome, annotation, num_iso, num_exact, percent_exact, num_fuzzy, percent_fuzzy))

gorGor4-CAT: 20,046 ICE isoforms. 12,811 (63.9%) exact, 14,485 (72.3%) fuzzy
Susie_Orangutan-CAT: 14,377 ICE isoforms. 10,273 (71.5%) exact, 11,585 (80.6%) fuzzy
panTro4-Ensembl V91: 18,665 ICE isoforms. 10,258 (55.0%) exact, 12,003 (64.3%) fuzzy
Susie_Gorilla-CAT: 22,383 ICE isoforms. 15,155 (67.7%) exact, 17,223 (76.9%) fuzzy
panTro4-CAT: 17,455 ICE isoforms. 11,180 (64.1%) exact, 12,761 (73.1%) fuzzy
ponAbe2-CAT: 13,102 ICE isoforms. 6,871 (52.4%) exact, 8,315 (63.5%) fuzzy
ponAbe2-Ensembl V90: 13,102 ICE isoforms. 5,009 (38.2%) exact, 6,435 (49.1%) fuzzy
Human-GENCODE V27: 19,271 ICE isoforms. 14,387 (74.7%) exact, 15,834 (82.2%) fuzzy
Clint_Chimp-CAT: 18,863 ICE isoforms. 13,958 (74.0%) exact, 15,495 (82.1%) fuzzy
gorGor4-Ensembl V91: 20,046 ICE isoforms. 10,155 (50.7%) exact, 11,923 (59.5%) fuzzy


In [54]:
# Development helper: reload tools.transcripts so that edits made to the module on disk are
# picked up by the running kernel, then refresh the names pulled in by the star import.
# NOTE(review): this cell carries execution count 54, i.e. it was run before the summary
# cell (In [55]) although it sits at the end of the notebook; presumably it is not needed
# when the notebook is run top to bottom on a fresh kernel.
import tools.transcripts
reload(tools.transcripts)
from tools.transcripts import *