In [1]:
from cat.plots import *
from cat.plots import _generic_histogram

In [2]:
# Load each genome's consensus metrics from plot_data/<genome>/consensus.json, keeping the
# genomes in insertion order.
consensus_data = OrderedDict()
for genome in ['Clint_Chimp', 'Susie_Gorilla', 'Susie_Orangutan']:
    # the context manager closes every file handle as soon as it has been parsed
    with open(os.path.join('plot_data', genome, 'consensus.json')) as consensus_handle:
        consensus_data[genome] = json.load(consensus_handle)

In [3]:
# Genomes in the order their columns are selected and labelled.
ordered_genomes = ['Clint_Chimp', 'Susie_Gorilla', 'Susie_Orangutan']

# Display labels for transcript-mode combinations, in their canonical order. Each label is keyed
# by the frozenset of its '+'-separated parts, so a lookup does not depend on the order in which
# the individual modes are listed.
ordered_groups = OrderedDict()
for group_label in ('transMap', 'transMap+TM', 'transMap+TMR', 'transMap+TM+TMR', 'TM', 'TMR', 'TM+TMR',
                    'CGP', 'PB', 'Other'):
    ordered_groups[frozenset(group_label.split('+'))] = group_label


def split_fn(s):
    """
    Translate a record's comma-separated 'Transcript Modes' value into its display label.

    Every 'aug' substring is removed before the lookup. A combination of modes that is not a key
    of ordered_groups is labelled 'Other'.
    """
    modes = frozenset(s['Transcript Modes'].replace('aug', '').split(','))
    if modes in ordered_groups:
        return ordered_groups[modes]
    return 'Other'

# Build the 'Transcript Modes' table from the consensus metrics and reshape it so the genomes
# become columns (pivot on genome, then transpose).
modes_df = json_biotype_counter_to_df(consensus_data, 'Transcript Modes')
df = modes_df.pivot(index='genome', columns='Transcript Modes').transpose().reset_index()
# collapse every raw mode string into its display label
df['Modes'] = df.apply(split_fn, axis=1)
# explicit copy: a new column is assigned below, and assigning into a bare column selection can
# trigger pandas' SettingWithCopyWarning
df = df[['Modes'] + ordered_genomes].copy()
# keep only the labels that actually occur, in canonical order; the set of observed labels is
# built once instead of once per candidate label
observed_modes = set(df['Modes'])
ordered_values = [x for x in ordered_groups.values() if x in observed_modes]
# an ordered categorical makes sort_values follow the canonical label order, not the alphabet
df['Ordered Modes'] = pd.Categorical(df['Modes'], ordered_values, ordered=True)
df = df.sort_values('Ordered Modes')
df = df[['Ordered Modes'] + ordered_genomes].set_index('Ordered Modes')
# text and labels consumed by the stacked bar plot of this table
title_string = 'Transcript modes in protein coding CAT annotation'
ylabel = 'Number of transcripts'
legend_labels = df.index
names = ordered_genomes
box_label = 'Transcript mode(s)'

In [4]:
# Stacked bar chart: one bar per column of df (genome), one coloured segment per row (mode label).
# The handle is opened in binary mode ('wb') because PdfPages writes bytes; a text-mode handle
# rejects them on Python 3 and is subject to newline translation on Windows.
# NOTE(review): bar_width, choose_palette, set_ticks and multipage_close are not defined in this
# notebook - presumably they come from the `cat.plots` wildcard import.
with open('tx_modes_clean.pdf', 'wb') as outf, PdfPages(outf) as pdf:
    fig, ax = plt.subplots(figsize=(2, 5))
    bars = []
    # running height of each stack; the next segment is drawn on top of it
    cumulative = np.zeros(len(df.columns))
    color_palette = choose_palette(legend_labels)
    for i, (_, d) in enumerate(df.iterrows()):
        bars.append(ax.bar(np.arange(len(df.columns)), d, bar_width, bottom=cumulative,
                           color=color_palette[i], linewidth=0.0))
        cumulative += d
    # handles and labels are both reversed, presumably so the legend reads top-down like the stack
    fig.legend([x[0] for x in bars[::-1]], legend_labels[::-1], bbox_to_anchor=(1.25, 0.7), frameon=True,
               title=box_label)
    ax.set_title(title_string)
    ax.set_ylabel(ylabel)
    set_ticks(names, ax)
    # shift the tick positions by half a bar width
    ax.xaxis.set_ticks(np.arange(0, len(names)) + bar_width / 2.0)
    sns.despine(top=True, right=True)
    ax.set_ylim(0, 80000)
    multipage_close(pdf, tight_layout=False)
    

In [5]:
# 'denovo' table from the consensus metrics.
denovo_df = json_biotype_nested_counter_to_df(consensus_data, 'denovo')
# json_biotype_nested_counter_to_df names its columns based on assumptions that do not hold for
# this table, so the four columns are renamed positionally
denovo_df.columns = ['Result', 'Number of transcripts', 'Augustus mode', 'genome']
# only the rows whose result is 'Novel isoforms' are kept
is_novel_isoform = denovo_df['Result'] == 'Novel isoforms'
df = denovo_df[is_novel_isoform]

In [6]:
# Grouped bar chart of novel-isoform counts: genomes on the x axis, one bar per Augustus mode.
# Binary mode ('wb') because PdfPages writes bytes to the handle.
with open('denovo_clean.pdf', 'wb') as outf, PdfPages(outf) as pdf:
    plt.figure(figsize=(1.5, 3))
    # factorplot returns a FacetGrid rather than an Axes. `order` (not `row_order`, which only
    # applies when a `row` facet is given) pins the x categories to ordered_genomes.
    g = sns.factorplot(data=df, x='genome', y='Number of transcripts', kind='bar',
                       hue='Augustus mode', order=ordered_genomes)
    g.set_xticklabels(rotation=90)
    g.fig.suptitle('Number of transcripts predicted to be novel isoforms')
    # leave headroom for the suptitle
    g.fig.subplots_adjust(top=0.9)
    multipage_close(pdf, tight_layout=False)

In [3]:
# Fine-grained biotype strings grouped by the coarse category they are collapsed into.
# The table is written category-first for readability; name_map below is the flat
# biotype -> category lookup derived from it.
_biotypes_by_category = (
    ('protein coding', (u'protein_coding',)),
    ('lncRNA', (u'bidirectional_promoter_lncrna', u'lincRNA', u'macro_lncRNA')),
    ('ncRNA', (u'3prime_overlapping_ncrna', u'miRNA', u'misc_RNA', u'non_coding', u'ribozyme',
               u'sRNA', u'scaRNA', u'snRNA', u'snoRNA', u'vaultRNA')),
    ('pseudogene', (u'IG_C_pseudogene', u'IG_J_pseudogene', u'IG_V_pseudogene', u'TR_J_pseudogene',
                    u'TR_V_pseudogene', u'polymorphic_pseudogene', u'processed_pseudogene',
                    u'pseudogene', u'transcribed_processed_pseudogene',
                    u'transcribed_unitary_pseudogene', u'transcribed_unprocessed_pseudogene',
                    u'translated_unprocessed_pseudogene', u'unitary_pseudogene',
                    u'unprocessed_pseudogene')),
    ('other', (u'IG_C_gene', u'IG_D_gene', u'IG_J_gene', u'IG_V_gene', u'Mt_rRNA', u'Mt_tRNA',
               u'TEC', u'TR_C_gene', u'TR_J_gene', u'TR_V_gene', u'antisense', u'non_stop_decay',
               u'nonsense_mediated_decay', u'processed_transcript', u'rRNA', u'retained_intron',
               u'sense_intronic', u'sense_overlapping')),
)
name_map = {}
for _category, _members in _biotypes_by_category:
    for _member in _members:
        name_map[_member] = _category

# 'Completeness' table from the consensus metrics. This rebinds `df`; the following cell reads its
# `biotype`, `category` and `genome` columns.
df = json_grouped_biotype_nested_counter_to_df(consensus_data, 'Completeness')

In [4]:
# Collapse every biotype into its coarse category and sum the counts per
# (category of biotype, category, genome).
# NOTE: this cell consumes and then overwrites `df`, so re-run the cell that loads the
# 'Completeness' table before re-running this one.
# The lookup is strict: a biotype that is missing from name_map raises KeyError.
df['fixed biotype'] = [name_map[x] for x in df.biotype]
df = df.groupby(['fixed biotype', 'category', 'genome']).aggregate(sum)
df = df.reset_index()
# keep only the 'Gene' rows; copy so that the column rename and the assignment below act on an
# independent frame rather than on a selection (avoids pandas' SettingWithCopyWarning)
df = df[df.category == 'Gene'].copy()
# order in which the coarse categories are sorted and drawn
order = ['protein coding', 'lncRNA', 'ncRNA', 'pseudogene', 'other']
df.columns = ['biotype', 'category', 'genome', 'count']
# an ordered categorical makes sort_values follow `order` instead of the alphabet
df['biotype'] = pd.Categorical(df.biotype, order, ordered=True)
df = df.sort_values(['genome', 'biotype'])

In [5]:
# NOTE(review): wildcard import in the middle of the notebook; get_gene_biotype_map is presumably
# one of the names it provides.
from tools.sqlInterface import *
# Mapping read from databases/Human.db; it is iterated below as (gene, biotype) pairs.
gene_biotype_map = get_gene_biotype_map('databases/Human.db')

In [14]:
# Number of genes in gene_biotype_map per coarse category; biotypes that are missing from
# name_map are counted as 'other'.
counts = Counter(name_map.get(biotype, 'other') for biotype in gene_biotype_map.values())
# For every row of df, express its count as a fraction of the genes tallied above for the same
# coarse category (the column keeps its historical name 'percent difference').
r = [[biotype, category, genome, 1.0 * count / counts[biotype]]
     for biotype, category, genome, count in zip(df['biotype'], df['category'], df['genome'], df['count'])]
df2 = pd.DataFrame(r, columns=['biotype', 'category', 'genome', 'percent difference'])
df2['biotype'] = pd.Categorical(df2['biotype'], order, ordered=True)
# df2 is read back by row position when the bars are annotated, so its order must be fully
# determined: a single-key sort_values is not stable by default, hence genome is added as a
# tie-breaker within each biotype
df2 = df2.sort_values(['biotype', 'genome'])

In [16]:
# Grouped bar chart of gene counts per genome and coarse biotype; every bar is annotated with the
# matching fraction from df2.
# Binary mode ('wb') because PdfPages writes bytes to the handle.
with open('primate_completeness.pdf', 'wb') as outf, PdfPages(outf) as pdf:
    plt.figure(figsize=(1.75, 4.5))
    g = sns.factorplot(data=df, x='genome', y='count', hue='biotype', kind='bar',
                       hue_order=order)
    # there are no row/col facets, so the grid holds a single Axes
    ax = g.axes[0][0]
    ax.set_ylim(0, 21000)
    ax.set_ylabel('Number of genes')
    g.fig.suptitle('Completeness of comparative annotation')
    # bars and df2 rows are paired purely by position
    # NOTE(review): assumes ax.patches and df2 are in the same order - confirm
    for i, p in enumerate(ax.patches):
        ax.text(p.get_x() + 0.02, p.get_height() + 3500,
                '{:.1%}'.format(df2.iloc[i]['percent difference']),
                size=10, rotation=90)
    g.set_xticklabels(rotation=90)
    g.fig.subplots_adjust(top=0.85, bottom=0.4)
    # the savefig keyword is `bbox_inches`; 'tight' fits the saved page to what was drawn
    pdf.savefig(bbox_inches='tight')
    plt.close('all')

In [None]:
# Imports come first so the cell does not rely on an earlier wildcard import having bound the
# `tools` package name.
import tools.sqlInterface
from tools.transcripts import *

# IsoSeq transcripts stored in the chimp database
iso_txs = tools.sqlInterface.load_isoseq_txs('databases/Clint_Chimp.db')
# transcripts of the consensus gene set, read from its genePred file
txs = list(gene_pred_iterator('consensus_gene_set/Clint_Chimp.gp'))
# cluster both transcript sets together
clustered = cluster_txs(txs + iso_txs)

In [19]:
# Divide the clusters using the names of the consensus gene set transcripts (txs); the result is
# iterated below as cluster_id -> (ensts, isos).
# NOTE(review): presumably the names are what separates annotation from IsoSeq transcripts - confirm
divided_clusters = divide_clusters(clustered, {x.name for x in txs})

In [26]:


def find_exact_match(iso_intervals, enst_intervals):
    """
    Compares intervals produced by construct_start_stop_intervals to each other to find matches
    with the same number of junctions. Used for fuzzy matching of IsoSeq transcripts
    (iso_intervals) to existing annotations (enst_intervals).

    Each argument is a (left, right) pair of interval collections; the intervals must provide an
    overlap() method.

    Returns True when both transcripts have the same number of left intervals, every IsoSeq left
    interval overlaps at least one annotation left interval, and every IsoSeq right interval
    overlaps at least one annotation right interval. Returns False otherwise. Two inputs without
    any intervals compare as a match.
    """
    iso_l, iso_r = iso_intervals
    enst_l, enst_r = enst_intervals
    # unlike a subset match, the number of junctions has to be identical
    if len(iso_l) != len(enst_l):
        return False
    # generator expressions let all()/any() stop at the first decisive interval
    if not all(any(il.overlap(el) for el in enst_l) for il in iso_l):
        return False
    return all(any(ir.overlap(er) for er in enst_r) for ir in iso_r)


# tolerance handed to construct_start_stop_intervals when the junction intervals are built
fuzz_distance = 8
# IsoSeq transcript name -> annotation transcripts whose junction intervals match it
r = collections.defaultdict(list)
for cluster_id, (ensts, isos) in divided_clusters.iteritems():
    # annotation transcripts of this cluster keyed by their junction intervals; transcripts
    # whose keys compare equal share one entry
    txs_by_junctions = collections.defaultdict(list)
    for enst_tx in ensts:
        junction_key = tuple(construct_start_stop_intervals(enst_tx.intron_intervals, fuzz_distance))
        txs_by_junctions[junction_key].append(enst_tx)
    # compare every IsoSeq transcript of the cluster with every annotation junction structure
    for iso_tx in isos:
        iso_junctions = construct_start_stop_intervals(iso_tx.intron_intervals, fuzz_distance)
        for junction_key, matching_txs in txs_by_junctions.iteritems():
            if find_exact_match(iso_junctions, junction_key):
                r[iso_tx.name].extend(matching_txs)

In [28]:
# Names of all annotation transcripts that were matched by at least one IsoSeq transcript.
validated_ids = {tx.name for tx_list in r.itervalues() for tx in tx_list}