In [29]:
import pandas as pd
import os
from collections import *
from tools.sqlInterface import *
from tools.misc import *
from tools.transcripts import *

In [30]:
dfs = OrderedDict()
base_dir = '/hive/groups/recon/projs/primates/primates_indel_corrected_bionano_cut'
for genome in ['Clint_Chimp', 'Susie_Gorilla', 'Susie_Orangutan']:
    dfs[genome] = pd.read_csv(os.path.join(base_dir, 'consensus_gene_set', genome + '.gp_info'), sep='\t', header=0)
dfs['Human'] = load_annotation(os.path.join(base_dir, 'databases', 'Human.db'))

In [31]:
txs = OrderedDict()
for genome in ['Clint_Chimp', 'Susie_Gorilla', 'Susie_Orangutan']:
    txs[genome] = get_gene_pred_dict(os.path.join(base_dir, 'consensus_gene_set', genome + '.gp'))
txs['Human'] = get_gene_pred_dict(os.path.join(base_dir, 'reference', 'gencode.v27.annotation.no_PAR.gp'))

In [32]:
# load iPSC expression data
kallisto_dir = 'kallisto_expression/'
kallisto_expressed = {}
kallisto_expressed_txs = {}
for genome in ['Clint_Chimp', 'Susie_Gorilla', 'Susie_Orangutan', 'Human']:
    if genome == 'Human':
        df = pd.read_csv(os.path.join(kallisto_dir, 'Human_GENCODE_V27/abundance.tsv'), sep='\t')
        df = df.merge(dfs[genome], left_on='target_id', right_on='TranscriptId')
        df.rename(columns={'TranscriptId': 'transcript_id', 'GeneId': 'gene_id'}, inplace=True)
    else:
        df = pd.read_csv(os.path.join(kallisto_dir, genome + '_CAT', 'abundance.tsv'), sep='\t')
        # remove stupid strand identifier
        df['target_id'] = [x.split('(')[0] for x in df['target_id']]
        df = df.merge(dfs[genome], left_on='target_id', right_on='transcript_id')
    kallisto_expressed_txs[genome] = set(df[df.tpm > 0.1].transcript_id)
    df = df[['gene_id', 'tpm']].groupby('gene_id').aggregate(sum)
    expressed = df[df.tpm > 0.1]
    kallisto_expressed[genome] = set(expressed.reset_index().gene_id)

In [33]:
# add ICE support
ice_dir = 'isoseq_ice_isoform/tofu_results'
exact_matches = {}
for genome in ['Clint_Chimp', 'Susie_Gorilla', 'Susie_Orangutan']:
    iso_txs = get_gene_pred_dict(os.path.join(ice_dir, genome + '.collapsed.gp'))
    iso_txs = {x: y for x, y in iso_txs.iteritems() if len(y.exon_intervals) > 1}
    clustered = cluster_txs(txs[genome].values() + iso_txs.values())
    divided_clusters = divide_clusters(clustered, txs[genome].viewkeys())
    subset_matches = calculate_subset_matches(divided_clusters)
    # invert the subset_matches to extract all validated tx_ids
    validated_ids = set()
    for tx_list in subset_matches.itervalues():
        for tx in tx_list:
            validated_ids.add(tx.name)
    exact_matches[genome] = validated_ids

In [34]:
# construct table for great ape paper
# rows:
# number of genes
# number of transcripts
# number of human orthologs
# number of coding genes
# number of coding transcripts
# number of ab-initio transcripts included
# percent of ab-initio predictions with TPM > 0.1
# percent of ab-initio predictions supported by IsoSeq
# number of isoforms with ICE support
d = []
for genome, df in dfs.iteritems():
    if genome == 'Human':
        continue
    tot_genes = len(set(df.gene_id))
    tot_txs = len(df)
    tot_ortho = len(set(df[df.gene_biotype != 'unknown_likely_coding'].gene_id))
    coding_genes = len(set(df[df.gene_biotype == 'protein_coding'].gene_id))
    coding_txs = len(df[df.transcript_biotype == 'protein_coding'].transcript_id)
    cgp_pb_txs = df[df.transcript_biotype == 'unknown_likely_coding']
    cgp_pb_supported = cgp_pb_txs[cgp_pb_txs.transcript_id.isin(kallisto_expressed_txs[genome])]
    cgp_pb_isoseq = cgp_pb_txs[cgp_pb_txs.pacbio_isoform_supported]
    cgp_pb_ice = len(exact_matches[genome] & set(cgp_pb_txs.transcript_id))
    d.append([genome, tot_genes, tot_txs, tot_ortho, coding_genes, coding_txs, len(cgp_pb_txs), 
              100.0 * len(cgp_pb_supported) / len(cgp_pb_txs),
              100.0 * len(cgp_pb_isoseq) / len(cgp_pb_txs),
             100.0 * cgp_pb_ice / len(cgp_pb_txs)])
    
tdf = pd.DataFrame(d, columns=['genome', 'genes', 'transcripts', 'orthologs', 'coding genes', 'coding txs',
                               'CGP/PB transcripts', 'percent CGP/PB with TPM > 0.1',
                               'percent CGP/PB supported by IsoSeq',
                               'percent CGP/PB supported by ICE'])
print tdf.T
tdf.T.to_csv('count_table.tsv', sep='\t')

                                              0              1  \
genome                              Clint_Chimp  Susie_Gorilla   
genes                                     57001          57091   
transcripts                              198988         199203   
orthologs                                 56721          56750   
coding genes                              19568          19591   
coding txs                                79179          79145   
CGP/PB transcripts                         3363           3565   
percent CGP/PB with TPM > 0.1           66.2801        46.9565   
percent CGP/PB supported by IsoSeq      66.2801        46.2272   
percent CGP/PB supported by ICE         21.1418        19.6634   

                                                  2  
genome                              Susie_Orangutan  
genes                                         56414  
transcripts                                  197713  
orthologs                                     56161  
cod

In [35]:
# calculate gene totals for CAT paper
d = []
for genome, df in dfs.iteritems():
    if genome == 'Human':
        continue
    tot_genes = len(set(df.gene_id))
    tot_txs = len(df)
    tot_ortho = len(set(df[df.gene_biotype != 'unknown_likely_coding'].gene_id))
    coding_genes = len(set(df[df.gene_biotype == 'protein_coding'].gene_id))
    coding_txs = len(set(df[df.transcript_biotype == 'protein_coding'].transcript_id))
    
    novel_genes = len(set(df[(df.transcript_class == 'possible_paralog')].gene_id))
    novel_iso_pb =  len(df[(df.transcript_class == 'putative_novel_isoform')
                               & (df.transcript_modes == 'augPB')])
    novel_iso_cgp =  len(df[(df.transcript_class == 'putative_novel_isoform')
                               & (df.transcript_modes == 'augCGP')])
    iso_pb = len((df[df.transcript_modes == 'augPB']))
    cgp = len((df[df.transcript_modes == 'augCGP']))
    d.append([genome, tot_genes, tot_txs, tot_ortho, coding_genes, coding_txs, novel_genes, novel_iso_cgp, novel_iso_pb, iso_pb, cgp])

tdf = pd.DataFrame(d, columns=['genome', 'genes', 'transcripts', 'orthologs', 'coding genes', 'coding txs',
                              'novel genes', 'novel isoforms (cgp)', 'novel isoforms (pb)', 'total PB', 'total CGP'])
print tdf.T

                                0              1                2
genome                Clint_Chimp  Susie_Gorilla  Susie_Orangutan
genes                       57001          57091            56414
transcripts                198988         199203           197713
orthologs                   56721          56750            56161
coding genes                19568          19591            19516
coding txs                  79179          79145            79114
novel genes                   200            233              201
novel isoforms (cgp)          591            646              482
novel isoforms (pb)          2231           2273             1859
total PB                     2630           2768             2193
total CGP                     733            797              617


In [36]:
# how many CGP/PB are fully supported?

d2 = []
for genome, df in dfs.iteritems():
    if genome == 'Human':
        continue
    pb = df[(df.transcript_modes == 'augPB') & (df.pacbio_isoform_supported)]
    cgp = df[(df.transcript_modes == 'augCGP') & (df.pacbio_isoform_supported)]
    pb_iso = pb[pb.transcript_class == 'putative_novel_isoform']
    cgp_iso = cgp[cgp.transcript_class == 'putative_novel_isoform']
    pb_novel = pb[pb.transcript_class == 'possible_paralog']
    cgp_novel = cgp[cgp.transcript_class == 'possible_paralog']
    d2.append([genome, len(pb_iso), len(cgp_iso), len(set(pb_novel.gene_id)), len(set(cgp_novel.gene_id))])
    
d2df = pd.DataFrame(d2, columns=['genome', 'AugustusPB putative novel isoform', 'AugustusCGP putative novel isoform', 'AugustusPB putative paralogous loci', 'AugustusCGP putative paralogous loci'])
print d2df.T

d2df.T.to_csv('novel.tsv', sep='\t')

                                                0              1  \
genome                                Clint_Chimp  Susie_Gorilla   
AugustusPB putative novel isoform            1726           1255   
AugustusCGP putative novel isoform            268            190   
AugustusPB putative paralogous loci            71             63   
AugustusCGP putative paralogous loci           20             12   

                                                    2  
genome                                Susie_Orangutan  
AugustusPB putative novel isoform                1428  
AugustusCGP putative novel isoform                177  
AugustusPB putative paralogous loci                72  
AugustusCGP putative paralogous loci               12  


In [37]:
annotation_df = dfs['Human']
ref_genes = len(set(annotation_df.GeneId))
ref_txs = len(annotation_df)

for genome, df in dfs.iteritems():
    tot_genes = len(set(df[~df.source_transcript_name.isnull()].gene_id))
    tot_txs = len(df[~df.source_transcript_name.isnull()])
    print "{}: {:.1%} genes".format(genome, 1.0 * tot_genes / ref_genes)
    print "{}: {:.1%} transcripts".format(genome, 1.0 * tot_txs / ref_txs)

Clint_Chimp: 97.5% genes
Clint_Chimp: 97.9% transcripts
Susie_Gorilla: 97.5% genes
Susie_Gorilla: 97.9% transcripts
Susie_Orangutan: 96.5% genes
Susie_Orangutan: 97.5% transcripts


AttributeError: 'DataFrame' object has no attribute 'source_transcript_name'

In [None]:
name_map = {u'3prime_overlapping_ncrna': 'ncRNA',
 u'IG_C_gene': 'other',
 u'IG_C_pseudogene': 'pseudogene',
 u'IG_D_gene': 'other',
 u'IG_J_gene': 'other',
 u'IG_J_pseudogene': 'pseudogene',
 u'IG_V_gene': 'other',
 u'IG_V_pseudogene': 'pseudogene',
 u'Mt_rRNA': 'other',
 u'Mt_tRNA': 'other',
 u'TEC': 'other',
 u'TR_C_gene': 'other',
 u'TR_J_gene': 'other',
 u'TR_J_pseudogene': 'pseudogene',
 u'TR_V_gene': 'other',
u'TR_D_gene': 'other',
 u'TR_V_pseudogene': 'pseudogene',
 u'antisense': 'other',
 u'bidirectional_promoter_lncrna': 'lncRNA',
 u'lincRNA': 'lncRNA',
 u'macro_lncRNA': 'lncRNA',
 u'miRNA': 'ncRNA',
 u'misc_RNA': 'ncRNA',
 u'non_coding': 'ncRNA',
 u'non_stop_decay': 'other',
 u'nonsense_mediated_decay': 'other',
 u'polymorphic_pseudogene': 'pseudogene',
 u'processed_pseudogene': 'pseudogene',
 u'processed_transcript': 'other',
 u'protein_coding': 'protein coding',
 u'pseudogene': 'pseudogene',
 u'rRNA': 'other',
 u'retained_intron': 'other',
 u'ribozyme': 'ncRNA',
 u'sRNA': 'ncRNA',
 u'scaRNA': 'ncRNA',
 u'sense_intronic': 'other',
 u'sense_overlapping': 'other',
 u'snRNA': 'ncRNA',
 u'snoRNA': 'ncRNA',
 u'transcribed_processed_pseudogene': 'pseudogene',
 u'transcribed_unitary_pseudogene': 'pseudogene',
 u'transcribed_unprocessed_pseudogene': 'pseudogene',
 u'translated_unprocessed_pseudogene': 'pseudogene',
 u'unitary_pseudogene': 'pseudogene',
 u'unprocessed_pseudogene': 'pseudogene',
 u'vaultRNA': 'ncRNA'}


for g, df in dfs.iteritems():
    df['simplified_biotype'] = [name_map.get(x, x) for x in df.gene_biotype]