In [51]:
import pandas as pd
import os
from collections import *
from tools.sqlInterface import *
from tools.misc import *
from tools.transcripts import *

In [60]:
# Per-genome annotation tables, keyed by genome name in a fixed order: the three ape
# assemblies first, then Human.
base_dir = '/hive/groups/recon/projs/primates/primates_indel_corrected_bionano_cut'
consensus_gp_info = os.path.join(base_dir, 'consensus_gene_set', '{}.fixed.filtered.gp_info')
dfs = OrderedDict(
    (assembly, pd.read_csv(consensus_gp_info.format(assembly), sep='\t', header=0))
    for assembly in ['Clint_Chimp', 'Susie_Gorilla', 'Susie_Orangutan']
)
# load_annotation comes from the project's tools package; the result is used as a DataFrame
# with TranscriptId / GeneId style column names further down.
dfs['Human'] = load_annotation(os.path.join(base_dir, 'databases', 'Human.db'))

In [61]:
# genePred transcript collections per genome, as returned by get_gene_pred_dict
# (looked up by transcript id elsewhere in this notebook).
# NOTE(review): these read '<genome>.filtered.gp' while the gp_info tables are read from
# '<genome>.fixed.filtered.gp_info' -- confirm the two file sets describe the same transcripts.
gp_paths = OrderedDict()
for genome in ['Clint_Chimp', 'Susie_Gorilla', 'Susie_Orangutan']:
    gp_paths[genome] = os.path.join(base_dir, 'consensus_gene_set', genome + '.filtered.gp')
gp_paths['Human'] = os.path.join(base_dir, 'reference', 'gencode.v27.annotation.no_PAR.gp')

txs = OrderedDict()
for genome, gp_path in gp_paths.iteritems():
    txs[genome] = get_gene_pred_dict(gp_path)

In [62]:
# load iPSC expression data
# For every genome, record which transcripts and which genes have kallisto TPM > 0.1.
# Gene-level TPM is the sum of the TPMs of the gene's transcripts.
kallisto_dir = 'kallisto_expression/'
kallisto_expressed = {}
kallisto_expressed_txs = {}
for genome in ['Clint_Chimp', 'Susie_Gorilla', 'Susie_Orangutan', 'Human']:
    if genome != 'Human':
        abundance = pd.read_csv(os.path.join(kallisto_dir, genome + '_CAT', 'abundance.tsv'), sep='\t')
        # drop the strand identifier: keep only the part of the target id before the first '('
        abundance['target_id'] = [target.split('(')[0] for target in abundance['target_id']]
        merged = dfs[genome].merge(abundance, left_on='transcript_id', right_on='target_id')
    else:
        abundance = pd.read_csv(os.path.join(kallisto_dir, 'Human_GENCODE_V27/abundance.tsv'), sep='\t')
        merged = abundance.merge(dfs[genome], left_on='target_id', right_on='TranscriptId')
        # bring the human column names in line with the ape tables
        merged = merged.rename(columns={'TranscriptId': 'transcript_id', 'GeneId': 'gene_id'})
    kallisto_expressed_txs[genome] = set(merged[merged.tpm > 0.1].transcript_id)
    gene_tpm = merged[['gene_id', 'tpm']].groupby('gene_id').aggregate(sum)
    kallisto_expressed[genome] = set(gene_tpm[gene_tpm.tpm > 0.1].reset_index().gene_id)

In [63]:
# add ICE support
# exact_matches[genome] ends up holding the name of every transcript that appears in the
# output of calculate_subset_matches for that genome.
ice_dir = 'isoseq_ice_isoform/tofu_results'
exact_matches = {}
for genome in ['Clint_Chimp', 'Susie_Gorilla', 'Susie_Orangutan']:
    collapsed = get_gene_pred_dict(os.path.join(ice_dir, genome + '.collapsed.gp'))
    # only IsoSeq transcripts with more than one exon interval take part in the comparison
    multi_exon = {name: tx for name, tx in collapsed.iteritems() if len(tx.exon_intervals) > 1}
    clustered = cluster_txs(txs[genome].values() + multi_exon.values())
    divided_clusters = divide_clusters(clustered, txs[genome].viewkeys())
    subset_matches = calculate_subset_matches(divided_clusters)
    # invert the subset_matches to extract all validated tx_ids
    exact_matches[genome] = {tx.name
                             for matched in subset_matches.itervalues()
                             for tx in matched}

In [69]:
# Biotypes that the summary tables below count as "coding".
coding_biotypes = {
    # IG_* gene segments
    'IG_C_gene', 'IG_D_gene', 'IG_J_gene', 'IG_LV_gene', 'IG_V_gene',
    # TR_* gene segments
    'TR_gene', 'TR_C_gene', 'TR_D_gene', 'TR_J_gene', 'TR_V_gene',
    # protein coding and related transcript biotypes
    'protein_coding', 'polymorphic_pseudogene',
    'nonsense_mediated_decay', 'non_stop_decay',
    # biotype carried by the CGP/PB predictions in the tables below
    'unknown_likely_coding',
}


# construct table for great ape paper
# rows:
# number of genes
# number of transcripts
# number of human orthologs
# number of coding genes
# number of coding transcripts
# number of ab-initio transcripts included
# percent of ab-initio predictions with TPM > 0.1
# percent of ab-initio predictions supported by IsoSeq
# number of isoforms with ICE support
d = []
for genome, df in dfs.iteritems():
    if genome == 'Human':
        continue
    tot_genes = len(set(df.gene_id))
    tot_txs = len(df)
    tot_ortho = len(set(df[df.gene_biotype != 'unknown_likely_coding'].gene_id))
    coding_genes = len(set(df[df.gene_biotype.isin(coding_biotypes)].gene_id))
    coding_txs = len(df[df.transcript_biotype.isin(coding_biotypes)].transcript_id)
    cgp_pb_txs = df[df.transcript_biotype == 'unknown_likely_coding']
    cgp_pb_supported = cgp_pb_txs[cgp_pb_txs.transcript_id.isin(kallisto_expressed_txs[genome])]
    cgp_pb_isoseq = cgp_pb_txs[cgp_pb_txs.pacbio_isoform_supported]
    cgp_pb_ice = len(exact_matches[genome] & set(cgp_pb_txs.transcript_id))
    d.append([genome, tot_genes, tot_txs, tot_ortho, coding_genes, coding_txs, len(cgp_pb_txs), 
              100.0 * len(cgp_pb_supported) / len(cgp_pb_txs),
              100.0 * len(cgp_pb_isoseq) / len(cgp_pb_txs),
             100.0 * cgp_pb_ice / len(cgp_pb_txs)])
    
tdf = pd.DataFrame(d, columns=['genome', 'genes', 'transcripts', 'orthologs', 'coding genes', 'coding txs',
                               'CGP/PB transcripts', 'percent CGP/PB with TPM > 0.1',
                               'percent CGP/PB supported by IsoSeq',
                               'percent CGP/PB supported by ICE'])
print tdf.T
tdf.T.to_csv('count_table.tsv', sep='\t')

                                              0              1  \
genome                              Clint_Chimp  Susie_Gorilla   
genes                                     55894          55985   
transcripts                              192725         192734   
orthologs                                 55586          55570   
coding genes                              19153          19311   
coding txs                                92610          92713   
CGP/PB transcripts                         3237           3471   
percent CGP/PB with TPM > 0.1            66.296        67.3293   
percent CGP/PB supported by IsoSeq      66.5431        46.4708   
percent CGP/PB supported by ICE         21.5632        19.6197   

                                                  2  
genome                              Susie_Orangutan  
genes                                         55222  
transcripts                                  190716  
orthologs                                     54898  
cod

In [70]:
# calculate gene totals for CAT paper
d = []
for genome, df in dfs.iteritems():
    if genome == 'Human':
        continue
    tot_genes = len(set(df.gene_id))
    tot_txs = len(df)
    #tot_ortho = len(set(df[df.gene_biotype != 'unknown_likely_coding'].gene_id))
    coding_genes = len(set(df[df.gene_biotype.isin(coding_biotypes)].gene_id))
    coding_txs = len(set(df[df.transcript_biotype == 'protein_coding'].transcript_id))
    novel_genes = len(set(df[(df.transcript_class == 'possible_paralog') | (df.transcript_class == 'putative_novel')].gene_id))
    tot_ortho = tot_genes - novel_genes
    novel_iso_pb =  len(df[(df.transcript_class == 'putative_novel_isoform')
                               & (df.transcript_modes == 'augPB')])
    novel_iso_cgp =  len(df[(df.transcript_class == 'putative_novel_isoform')
                               & (df.transcript_modes == 'augCGP')])
    iso_pb = len((df[df.transcript_modes == 'augPB']))
    cgp = len((df[df.transcript_modes == 'augCGP']))
    d.append([genome, tot_genes, tot_txs, tot_ortho, coding_genes, coding_txs, novel_genes, novel_iso_cgp, novel_iso_pb, iso_pb, cgp])

tdf = pd.DataFrame(d, columns=['genome', 'genes', 'transcripts', 'orthologs', 'coding genes', 'coding txs',
                              'novel genes', 'novel isoforms (cgp)', 'novel isoforms (pb)', 'total PB', 'total CGP'])
print tdf.T

                                0              1                2
genome                Clint_Chimp  Susie_Gorilla  Susie_Orangutan
genes                       55894          55985            55222
transcripts                192725         192734           190716
orthologs                   55594          55570            54900
coding genes                19153          19311            19043
coding txs                  75808          75568            75291
novel genes                   300            415              322
novel isoforms (cgp)          586            644              481
novel isoforms (pb)          2223           2258             1852
total PB                     2536           2689             2140
total CGP                     701            782              601


In [71]:
# how many CGP/PB are fully supported?

d2 = []
for genome, df in dfs.iteritems():
    if genome == 'Human':
        continue
    pb = df[(df.transcript_modes == 'augPB') & (df.pacbio_isoform_supported)]
    cgp = df[(df.transcript_modes == 'augCGP') & (df.pacbio_isoform_supported)]
    pb_iso = pb[pb.transcript_class == 'putative_novel_isoform']
    cgp_iso = cgp[cgp.transcript_class == 'putative_novel_isoform']
    pb_novel = pb[pb.transcript_class == 'possible_paralog']
    cgp_novel = cgp[cgp.transcript_class == 'possible_paralog']
    d2.append([genome, len(pb_iso), len(cgp_iso), len(set(pb_novel.gene_id)), len(set(cgp_novel.gene_id))])
    
d2df = pd.DataFrame(d2, columns=['genome', 'AugustusPB putative novel isoform', 'AugustusCGP putative novel isoform', 'AugustusPB putative paralogous loci', 'AugustusCGP putative paralogous loci'])
print d2df.T

d2df.T.to_csv('novel.tsv', sep='\t')

                                                0              1  \
genome                                Clint_Chimp  Susie_Gorilla   
AugustusPB putative novel isoform            1720           1251   
AugustusCGP putative novel isoform            265            192   
AugustusPB putative paralogous loci            52             48   
AugustusCGP putative paralogous loci           14             11   

                                                    2  
genome                                Susie_Orangutan  
AugustusPB putative novel isoform                1425  
AugustusCGP putative novel isoform                178  
AugustusPB putative paralogous loci                58  
AugustusCGP putative paralogous loci                9  


In [67]:
annotation_df = dfs['Human']
ref_genes = len(set(annotation_df.GeneId))
ref_txs = len(annotation_df)

for genome, df in dfs.iteritems():
    if genome == 'Human':
        continue
    tot_genes = len(set(df[~df.source_transcript_name.isnull()].gene_id))
    tot_txs = len(df[~df.source_transcript_name.isnull()])
    print "{}: {:.1%} genes".format(genome, 1.0 * tot_genes / ref_genes)
    print "{}: {:.1%} transcripts".format(genome, 1.0 * tot_txs / ref_txs)

Clint_Chimp: 95.4% genes
Clint_Chimp: 94.6% transcripts
Susie_Gorilla: 95.4% genes
Susie_Gorilla: 94.5% transcripts
Susie_Orangutan: 94.2% genes
Susie_Orangutan: 93.9% transcripts


In [68]:
from cat.consensus import *
from tools.fileOps import *
# Lookup from human GeneId to GeneName, read by find_common below as a module-level global.
# NOTE: a later cell rebinds name_map to a biotype -> simplified-biotype mapping, so this
# cell has to be re-run before find_common is used again.
name_map = dict(zip(dfs['Human'].GeneId, dfs['Human'].GeneName))

def find_common(s):
    """Translate a row's comma-separated paralogous gene ids into gene names.

    Reads ``s['Paralogous Gene Ids']``, splits it on commas and looks every id up in the
    module-level ``name_map``.

    :param s: mapping-like row (e.g. a pandas Series) with a 'Paralogous Gene Ids' entry.
    :return: comma-joined gene names, or None when the row has no usable id string
             (missing key, non-string value such as NaN) or when any id or name cannot
             be resolved.
    """
    try:
        names = s['Paralogous Gene Ids'].split(',')
        return ','.join([name_map[x] for x in names])
    # Only the data-related failures are treated as "no name available". A bare except
    # would also hide KeyboardInterrupt and genuine programming errors.
    except (AttributeError, KeyError, TypeError):
        return None

# For every non-human genome, write '<genome>_novel_genes.txt': one row per transcript
# classed as possible_paralog or putative_novel, with its alternative (paralogous) gene ids,
# the names find_common resolves for them, and the transcript's chromosome/start/stop.
# The merge is an inner join, so transcripts without a matching AlignmentId in alt_names
# are left out of the report.
# NOTE(review): the database path is relative ('../databases'), unlike the Human.db path
# built from base_dir when dfs is loaded -- confirm the notebook's working directory.
for g, df in dfs.iteritems():
    if g == 'Human':
        continue
    novel_genes = df[(df.transcript_class == 'possible_paralog') | (df.transcript_class == 'putative_novel')]
    alt_names = load_alt_names('../databases/{}.db'.format(g), ['augCGP', 'augPB'])
    m = novel_genes.merge(alt_names, left_on='alignment_id', right_on='AlignmentId')
    # .copy() gives an independent frame, so the relabelling and the new column below do not
    # write into a slice of m (which triggers pandas' SettingWithCopyWarning)
    useful = m[['transcript_id', 'gene_id', 'AlternativeGeneIds', 'transcript_class', 'transcript_modes']].copy()
    useful.columns = ['Transcript Id', 'Gene Id', 'Paralogous Gene Ids', 'Transcript Class', 'Transcript Mode']
    useful['Paralogous Gene Names'] = useful.apply(find_common, axis=1)
    useful = useful[['Transcript Id', 'Gene Id', 'Paralogous Gene Names', 'Paralogous Gene Ids', 'Transcript Class', 'Transcript Mode']]
    useful = useful.set_index('Transcript Id')
    with open('{}_novel_genes.txt'.format(g), 'w') as outf:
        print_row(outf, ['Transcript Id', 'Gene Id', 'Paralogous Gene Names', 'Paralogous Gene Ids',
                         'Transcript Class', 'Transcript Mode', 'chromosome', 'start', 'stop'])
        for tx_id, s in useful.iterrows():
            tx = txs[g][tx_id]
            # the index (transcript id) goes first, then the five remaining columns, then coordinates
            print_row(outf, [tx_id] + list(s) + [tx.chromosome, tx.start, tx.stop])
        

In [81]:
# Collapse detailed biotypes into four display classes:
# 'protein coding', 'pseudogene', 'lncRNA' and 'ncRNA'.
# NOTE: this rebinds name_map, replacing the GeneId -> GeneName lookup defined earlier.
name_map = {
    # ---- protein coding ----
    u'IG_C_gene': 'protein coding',
    u'IG_D_gene': 'protein coding',
    u'IG_J_gene': 'protein coding',
    u'IG_V_gene': 'protein coding',
    u'TR_C_gene': 'protein coding',
    'TR_D_gene': 'protein coding',
    u'TR_J_gene': 'protein coding',
    u'TR_V_gene': 'protein coding',
    u'TEC': 'protein coding',
    u'non_stop_decay': 'protein coding',
    u'nonsense_mediated_decay': 'protein coding',
    u'processed_transcript': 'protein coding',
    u'protein_coding': 'protein coding',
    u'retained_intron': 'protein coding',
    'unknown_likely_coding': 'protein coding',

    # ---- pseudogene ----
    u'IG_C_pseudogene': 'pseudogene',
    u'IG_J_pseudogene': 'pseudogene',
    u'IG_V_pseudogene': 'pseudogene',
    'IG_pseudogene': 'pseudogene',
    u'TR_J_pseudogene': 'pseudogene',
    u'TR_V_pseudogene': 'pseudogene',
    u'polymorphic_pseudogene': 'pseudogene',
    u'processed_pseudogene': 'pseudogene',
    u'pseudogene': 'pseudogene',
    u'transcribed_processed_pseudogene': 'pseudogene',
    u'transcribed_unitary_pseudogene': 'pseudogene',
    u'transcribed_unprocessed_pseudogene': 'pseudogene',
    'translated_processed_pseudogene': 'pseudogene',
    u'translated_unprocessed_pseudogene': 'pseudogene',
    u'unitary_pseudogene': 'pseudogene',
    u'unprocessed_pseudogene': 'pseudogene',

    # ---- lncRNA ----
    u'bidirectional_promoter_lncrna': 'lncRNA',
    'bidirectional_promoter_lncRNA': 'lncRNA',
    u'lincRNA': 'lncRNA',
    u'macro_lncRNA': 'lncRNA',

    # ---- ncRNA ----
    u'3prime_overlapping_ncrna': 'ncRNA',
    '3prime_overlapping_ncRNA': 'ncRNA',
    u'Mt_rRNA': 'ncRNA',
    u'Mt_tRNA': 'ncRNA',
    u'antisense': 'ncRNA',
    u'antisense_RNA': 'ncRNA',
    u'miRNA': 'ncRNA',
    u'misc_RNA': 'ncRNA',
    u'non_coding': 'ncRNA',
    u'rRNA': 'ncRNA',
    u'ribozyme': 'ncRNA',
    u'sRNA': 'ncRNA',
    'scRNA': 'ncRNA',
    u'scaRNA': 'ncRNA',
    u'sense_intronic': 'ncRNA',
    u'sense_overlapping': 'ncRNA',
    u'snRNA': 'ncRNA',
    u'snoRNA': 'ncRNA',
    u'vaultRNA': 'ncRNA',
}

# Add simplified-biotype columns to every non-human table. This mutates the frames held in
# dfs in place. 'simplified biotype' and 'simplified gene biotype' are both derived from
# gene_biotype. A biotype missing from name_map raises KeyError.
simplified_columns = [('simplified biotype', 'gene_biotype'),
                      ('simplified gene biotype', 'gene_biotype'),
                      ('simplified transcript biotype', 'transcript_biotype')]
for g, df in dfs.iteritems():
    if g != 'Human':
        for new_column, source_column in simplified_columns:
            df[new_column] = [name_map[biotype] for biotype in df[source_column]]

In [82]:
# load old data
# gp_info tables of the older annotation set, given the same simplified gene/transcript
# biotype columns as the new tables so the two can be tallied together.
old_consensus_dir = '/hive/groups/recon/projs/primates/original_primates/redo_annotation_indel/consensus_gene_set'
old_dfs = {}
for g in ['Chimp', 'Gorilla', 'Orangutan']:
    old_table = pd.read_csv(os.path.join(old_consensus_dir, g + '.filtered.gp_info'), sep='\t')
    for new_column, source_column in [('simplified gene biotype', 'gene_biotype'),
                                      ('simplified transcript biotype', 'transcript_biotype')]:
        old_table[new_column] = [name_map[biotype] for biotype in old_table[source_column]]
    old_dfs[g] = old_table

In [117]:
for g, df in old_dfs.iteritems():
    print g, len(df)

Gorilla 188403
Chimp 189551
Orangutan 186992


In [118]:
for g, df in dfs.iteritems():
    print g, len(df)

Clint_Chimp 192725
Susie_Gorilla 192734
Susie_Orangutan 190716
Human 200243


In [120]:
print 'gorilla: {}'.format(len(dfs['Susie_Gorilla']) - len(old_dfs['Gorilla']))

print 'orang: {}'.format(len(dfs['Susie_Orangutan']) - len(old_dfs['Orangutan']))

print 'chimp: {}'.format(len(dfs['Clint_Chimp']) - len(old_dfs['Chimp']))

gorilla: 4331
orang: 3724
chimp: 3174


In [122]:
print (4331.0 + 3724 + 3174) / 3

3743.0


In [126]:
print 'gorilla percent: {}'.format(100.0 * (len(dfs['Susie_Gorilla']) - len(old_dfs['Gorilla'])) / len(dfs['Susie_Gorilla']))

print 'orang percent: {}'.format(100.0 * (len(dfs['Susie_Orangutan']) - len(old_dfs['Orangutan'])) / len(dfs['Susie_Orangutan']))

print 'chimp percent: {}'.format(100.0 * (len(dfs['Clint_Chimp']) - len(old_dfs['Chimp'])) / len(dfs['Clint_Chimp']))

gorilla percent: 2.24713854328
orang percent: 1.9526416242
chimp percent: 1.64690621352


In [127]:
print (2.24713854328 + 1.9526416242 + 1.64690621352) / 3

1.94889546033


In [123]:
print 'gorilla gene: {}'.format(len(dfs['Susie_Gorilla'].groupby('gene_id').first()) - len(old_dfs['Gorilla'].groupby('gene_id').first()))

print 'orang gene: {}'.format(len(dfs['Susie_Orangutan'].groupby('gene_id').first()) - len(old_dfs['Orangutan'].groupby('gene_id').first()))

print 'chimp gene: {}'.format(len(dfs['Clint_Chimp'].groupby('gene_id').first()) - len(old_dfs['Chimp'].groupby('gene_id').first()))

gorilla gene: 821
orang gene: 720
chimp gene: 291


In [124]:
print (821 + 720 + 291) / 3.0

610.666666667


In [129]:
g = len(dfs['Susie_Gorilla'].groupby('gene_id').first())
print 'gorilla gene percent: {}'.format((100.0 * g - len(old_dfs['Gorilla'].groupby('gene_id').first())) / g)
o = len(dfs['Susie_Orangutan'].groupby('gene_id').first())
print 'orang gene percent: {}'.format((100.0 * o - len(old_dfs['Orangutan'].groupby('gene_id').first())) / o)
c = len(dfs['Clint_Chimp'].groupby('gene_id').first())
print 'chimp gene percent: {}'.format((100.0 * c - len(old_dfs['Chimp'].groupby('gene_id').first())) / c)

gorilla gene percent: 99.0146646423
orang gene percent: 99.0130382818
chimp gene percent: 99.0052062833


In [137]:
# munge into counts
# Long-format table with one row per (genome, simplified gene biotype) holding the number of
# genes; the biotype of a gene is taken from the first row of its gene_id group.

# display labels: new assemblies by individual/species, old annotations by assembly name
genome_map = {
    'Clint_Chimp': 'Clint/Chimp',
    'Susie_Gorilla': 'Susie/Gorilla',
    'Susie_Orangutan': 'Susie/Orangutan',
    'Chimp': 'panTro4',
    'Gorilla': 'gorGor4',
    'Orangutan': 'ponAbe2',
}

counts = []
# new tables first, then the old ones; 'Human' only occurs in dfs and is skipped
for frames in (dfs, old_dfs):
    for g, df in frames.iteritems():
        if g == 'Human':
            continue
        per_gene = df.groupby('gene_id').first()
        tally = Counter(per_gene['simplified gene biotype'])
        counts.extend([genome_map[g], biotype, n_genes] for biotype, n_genes in tally.iteritems())
count_df = pd.DataFrame(counts, columns=['genome', 'biotype', 'count'])

In [138]:
# delta DF
# order: display order of the simplified biotypes.
# c: number of human genes per simplified biotype (one row per GeneId, the first of its
#    group), used as the denominator when the ape counts are turned into ratios.
order = ['protein coding', 'pseudogene', 'ncRNA', 'lncRNA']
human_genes = dfs['Human'].groupby('GeneId').first()
c = Counter(name_map[biotype] for biotype in human_genes.GeneBiotype)


In [139]:
# Gene count of every (genome, biotype) pair as a fraction of the human gene count for that
# biotype. The column is called 'percent difference' but holds this ratio.
#
# The plotting cell labels bar i with df2.iloc[i], so the row order must follow the order
# in which the bars are drawn: biotype (hue) first, then genome in the plot's x order.
# Sorting on biotype alone leaves the genomes within a biotype in an unrelated order and
# attaches labels to the wrong bars.
# NOTE(review): genome_order must stay identical to the x order used by the plotting cell.
genome_order = ['panTro4', 'Clint/Chimp',
                'gorGor4', 'Susie/Gorilla',
                'ponAbe2', 'Susie/Orangutan']
r = []
for _, s in count_df.iterrows():
    r.append([s.biotype, s.genome, 1.0 * s['count'] / c[s.biotype]])
df2 = pd.DataFrame(r, columns=['biotype', 'genome', 'percent difference'])
df2['biotype'] = pd.Categorical(df2['biotype'], order, ordered=True)
df2['genome'] = pd.Categorical(df2['genome'], genome_order, ordered=True)
df2 = df2.sort_values(['biotype', 'genome'])

In [140]:
from cat.plots import *
# Grouped bar chart of gene counts per genome and simplified biotype, saved as a one-page
# PDF. Each bar is annotated with the matching 'percent difference' value from df2.
# The PDF is binary, so the file is opened with 'wb'.
with open('primate_completeness_old_and_new.pdf', 'wb') as outf, PdfPages(outf) as pdf:
    g = sns.factorplot(data=count_df, x='genome', y='count', hue='biotype', kind='bar',
                       hue_order=order, x_order=['panTro4', 'Clint/Chimp',
                                                 'gorGor4', 'Susie/Gorilla',
                                                 'ponAbe2', 'Susie/Orangutan'], size=6)
    ax = g.axes[0][0]
    ax.set_ylim(0, 21000)
    ax.set_ylabel('Number of genes')
    g.fig.suptitle('Completeness of comparative annotation')
    # Labels are paired with bars purely by position.
    # NOTE(review): this assumes ax.patches is ordered like the rows of df2 and that every
    # (genome, biotype) combination is present -- confirm.
    for i, p in enumerate(ax.patches):
        ax.text(p.get_x() + 0.025, p.get_height() + 2500,
               '{:.1%}'.format(df2.iloc[i]['percent difference']),
               size=10, rotation=90)
    g.set_xticklabels(rotation=90)
    g.fig.subplots_adjust(top=0.85, bottom=0.4)
    # the keyword is bbox_inches; the misspelt 'bboxes_inches' never requested a tight box
    pdf.savefig(bbox_inches='tight')
    plt.close('all')

In [83]:
# Leftover interactive helper, not part of the analysis: prints this kernel's connection
# details (ports, HMAC key, connection-file path) so an external console can attach.
# Clear this cell's output before sharing the notebook.
%connect_info

{
  "stdin_port": 36120, 
  "ip": "127.0.0.1", 
  "control_port": 48150, 
  "hb_port": 43031, 
  "signature_scheme": "hmac-sha256", 
  "key": "06d7fbc4-9d01-4784-9ae4-3630668ae165", 
  "kernel_name": "", 
  "shell_port": 49855, 
  "transport": "tcp", 
  "iopub_port": 41769
}

Paste the above JSON into a file, and connect with:
    $> ipython <app> --existing <file>
or, if you are local, you can connect with just:
    $> ipython <app> --existing /cluster/home/ifiddes/.local/share/jupyter/runtime/kernel-042fc30c-77b5-48eb-9e5e-62a5d0b38eac.json 
or even just:
    $> ipython <app> --existing 
if this is the most recent IPython session you have started.
