In [2]:
# Load the per-genome consensus gene set tables (<genome>.filtered.gp_info, tab-separated).
# NOTE(review): this cell was originally labelled "old data", yet these frames
# feed `new_counts` further down -- confirm which label is intended.
import os  # needed for os.path.join; without it a fresh kernel raises NameError

import pandas as pd

d = '/hive/groups/recon/projs/primates/original_primates/redo_annotation_indel/consensus_gene_set'
dfs = {}
for g in ['Chimp', 'Gorilla', 'Orangutan']:
    df = pd.read_csv(os.path.join(d, g + '.filtered.gp_info'), sep='\t')
    dfs[g] = df

In [33]:
# Paths to the Ensembl genePred (.gp) annotation of each genome.
d = '/hive/groups/recon/projs/primates/primates_indel_corrected_bionano_cut/primate_paper/kallisto_expression'
ensembl_gp_names = [
    ('Gorilla', 'Gorilla_gorilla.gorGor4.91.gp'),
    ('Orangutan', 'Pongo_abelii.PPYG2.88.gp'),
    ('Chimp', 'Pan_troglodytes.Pan_tro_3.0.91.gp'),
]
files = {genome: os.path.join(d, gp_name) for genome, gp_name in ensembl_gp_names}

from tools.transcripts import *
# Parse every Ensembl genePred file into a dict, keyed by genome name.
tx_dicts = {genome: get_gene_pred_dict(gp_path) for genome, gp_path in files.items()}

In [7]:
# Per-genome tallies from the consensus tables:
# [genome, number of rows, number of gene_ids, number of protein_coding gene_ids]
new_counts = []
for genome, tx_table in dfs.items():
    # collapse to one row per gene_id
    genes = tx_table.groupby('gene_id').first()
    coding_genes = genes[genes.gene_biotype == 'protein_coding']
    new_counts.append([genome, len(tx_table), len(genes), len(coding_genes)])

In [17]:
# Per-genome tallies from the Ensembl genePred dicts:
# [genome, number of transcripts, distinct name2 values, distinct name2 values with cds_size > 0]
old_counts = []
for genome, tx_dict in tx_dicts.items():
    transcripts = list(tx_dict.values())
    gene_names = set(tx.name2 for tx in transcripts)
    coding_gene_names = set(tx.name2 for tx in transcripts if tx.cds_size > 0)
    old_counts.append([genome, len(tx_dict), len(gene_names), len(coding_gene_names)])

In [18]:
# Turn both tally lists into DataFrames sharing one column layout.
count_columns = ['genome', 'transcripts', 'genes', 'coding genes']
new_counts = pd.DataFrame(new_counts, columns=count_columns)
old_counts = pd.DataFrame(old_counts, columns=count_columns)

In [19]:
# Show the consensus-set tallies ordered by genome name.
new_counts.sort_values(by='genome')

Unnamed: 0,genome,transcripts,genes,coding genes
1,Chimp,189551,55603,18443
0,Gorilla,188403,55164,18392
2,Orangutan,186992,54502,18259


In [31]:
# Mean number of coding genes across genomes in the consensus sets.
new_counts.loc[:, 'coding genes'].mean()

18364.666666666668

In [20]:
# Show the Ensembl tallies ordered by genome name.
old_counts.sort_values(by='genome')

Unnamed: 0,genome,transcripts,genes,coding genes
0,Chimp,58366,31951,23534
2,Gorilla,53003,29603,21795
1,Orangutan,29447,28443,20424


In [32]:
# Mean number of coding genes across genomes in the Ensembl annotations.
old_counts.loc[:, 'coding genes'].mean()

21917.666666666668

In [23]:
# Per-genome difference (consensus set minus Ensembl) for every count column.
new_by_genome = new_counts.sort_values('genome').set_index('genome')
old_by_genome = old_counts.sort_values('genome').set_index('genome')
t = new_by_genome - old_by_genome

In [28]:
# Average each difference column over the genomes.
t.mean(axis='index')

transcripts     141376.666667
genes            25090.666667
coding genes     -3553.000000
dtype: float64

In [29]:
# Plain-text dump of the per-genome difference table.
print(t)

           transcripts  genes  coding genes
genome                                     
Chimp           131185  23652         -5091
Gorilla         135400  25561         -3403
Orangutan       157545  26059         -2165


In [53]:
# Mean excess of Ensembl coding-gene counts over a fixed reference value.
# NOTE(review): 19836 is an unexplained constant -- presumably a reference
# coding-gene count; confirm its source.
coding_excess = old_counts['coding genes'] - 19836
print(coding_excess.mean())

2081.66666667


In [129]:
from tools.fileOps import *
# For each genome, run clusterGenes on its Ensembl genePred (f) together with its
# filtered consensus genePred, writing the result to <genome>.txt in the working directory.
d = '/hive/groups/recon/projs/primates/original_primates/redo_annotation_indel/consensus_gene_set'
for g, f in files.iteritems():
    # strip chr from cat and then filter later
    cat = os.path.join(d, g + '.filtered.gp')
    if g != 'Orangutan':
        # NOTE(review): TemporaryFilePath comes from the tools.fileOps star import;
        # presumably it yields a scratch path that is removed on exit -- confirm.
        with TemporaryFilePath() as tmp:
            # NOTE(review): this sed expression removes every "chr" substring on a line,
            # not only in the chromosome column -- confirm no names contain "chr".
            !sed 's/chr//g' {cat} > {tmp}
            !clusterGenes {g}.txt no {f} {tmp}
    else:
        # Orangutan: the consensus genePred is passed to clusterGenes unmodified.
        !clusterGenes {g}.txt no {f} {cat}

In [150]:
# Load the clusterGenes output (<genome>.txt) for every genome.
clusters = {}

# Chromosome names "1".."22", as strings.
# NOTE(review): names such as 2A/2B, X and Y are not in this set -- confirm that
# dropping them is intended.
useful_chroms = set(map(str, range(1, 23)))

for g in files:
    # Read chrom as str explicitly. If dtype inference is left to pandas, a column
    # mixing numeric and non-numeric names can come back with integer values, which
    # would never match the string set in isin() below.
    cdf = pd.read_csv(g + '.txt', sep='\t', dtype={'chrom': str})
    if g != 'Orangutan':
        clusters[g] = cdf[cdf.chrom.isin(useful_chroms)]
    else:
        # Orangutan clusters are kept unfiltered.
        clusters[g] = cdf

In [36]:
# biotype plot
# Raw biotype labels, grouped by the simplified category each one collapses into.
biotype_groups = {
    'protein coding': [
        'IG_C_gene',
        'IG_D_gene',
        'IG_J_gene',
        'IG_V_gene',
        u'IG_LV_gene',
        'TR_C_gene',
        'TR_D_gene',
        'TR_J_gene',
        'TR_V_gene',
        'non_stop_decay',
        'nonsense_mediated_decay',
        'polymorphic_pseudogene',  # coding in some humans
        'protein_coding',
        'unknown_likely_coding',
    ],
    'pseudogene': [
        'IG_C_pseudogene',
        'IG_D_pseudogene',
        'IG_J_pseudogene',
        'IG_V_pseudogene',
        'IG_pseudogene',
        'TR_C_pseudogene',
        'TR_J_pseudogene',
        'TR_V_pseudogene',
        'transcribed_processed_pseudogene',
        'transcribed_unitary_pseudogene',
        'transcribed_unprocessed_pseudogene',
        'translated_unprocessed_pseudogene',
        'translated_processed_pseudogene',
        'unprocessed_pseudogene',
    ],
    'lncRNA': [
        'bidirectional_promoter_lncRNA',
        'bidirectional_promoter_lncrna',
        'lincRNA',
    ],
    'ncRNA': [
        '3prime_overlapping_ncRNA',
        '3prime_overlapping_ncrna',
        'Mt_rRNA',
        'Mt_tRNA',
        'TEC',
        'antisense',
        'antisense_RNA',
        'macro_lncRNA',
        'miRNA',
        'misc_RNA',
        'non_coding',
        'processed_pseudogene',
        'processed_transcript',
        'pseudogene',
        'rRNA',
        'retained_intron',
        'ribozyme',
        'sRNA',
        'scaRNA',
        'scRNA',
        'sense_intronic',
        'sense_overlapping',
        'snRNA',
        'snoRNA',
        'unitary_pseudogene',   # only in human
        'vaultRNA',
    ],
}

# Flatten into the raw-label -> simplified-label lookup used by later cells.
name_map = {raw_label: simplified
            for simplified, raw_labels in biotype_groups.items()
            for raw_label in raw_labels}


# Ensembl GTF files, keyed by the label used on the plot axis and by later cells.
gtf_dir = '/hive/groups/recon/projs/primates/primates_indel_corrected_bionano_cut/primate_paper/kallisto_expression'
# The orangutan label must be 'ponAbe2/Ensembl v99': the plotting cell (x_order)
# and gtf_dfs_new_keys both look that exact key up, and a different label here
# makes those lookups fail with KeyError.
# NOTE(review): the file name carries release 88, so the "v99" in the label looks
# inaccurate -- if it is corrected, change every use of the label together.
gtfs = {'panTro4/Ensembl V91': os.path.join(gtf_dir, 'Pan_troglodytes.Pan_tro_3.0.91.gtf'),
        'gorGor4/Ensembl V91': os.path.join(gtf_dir, 'Gorilla_gorilla.gorGor4.91.gtf'),
        'ponAbe2/Ensembl v99': os.path.join(gtf_dir, 'Pongo_abelii.PPYG2.88.gtf')}
from tools.misc import *
def construct_ensembl_map(gtf):
    """Build a gene_id / transcript_id / gene_biotype table from a GTF file.

    Lines starting with '#' are ignored. For every other line, the last
    tab-separated field is passed to parse_gtf_attr_line and the three
    attributes are collected; records missing any of them are skipped.

    :param gtf: path to the GTF file.
    :return: DataFrame with columns gene_id, transcript_id, gene_biotype,
        one row per retained GTF record.
    """
    ensembl_map = []
    # Use a context manager so the handle is closed, and stream the file
    # instead of holding every split line in memory.
    with open(gtf) as gtf_handle:
        for line in gtf_handle:
            if line.startswith('#'):
                continue
            attrs = parse_gtf_attr_line(line.split('\t')[-1])
            try:
                ensembl_map.append([attrs['gene_id'], attrs['transcript_id'], attrs['gene_biotype']])
            except KeyError:
                # record lacks one of the three attributes -- skip it
                continue
    return pd.DataFrame(ensembl_map, columns=['gene_id', 'transcript_id', 'gene_biotype'])

# One gene/transcript/biotype table per labelled GTF.
gtf_dfs = {label: construct_ensembl_map(gtf_path) for label, gtf_path in gtfs.items()}

In [43]:
# Count genes per simplified biotype for every genome, in long format.
counts = []
for genome_label, ensembl_table in gtf_dfs.items():
    # one row per gene_id
    per_gene = ensembl_table.groupby('gene_id').first().reset_index()
    per_gene['simplified gene biotype'] = [name_map[biotype] for biotype in per_gene.gene_biotype]
    tally = Counter(per_gene['simplified gene biotype'])
    counts.extend([genome_label, biotype, n_genes] for biotype, n_genes in tally.items())
count_df = pd.DataFrame(counts, columns=['genome', 'biotype', 'count'])

In [50]:
from cat.plots import *
# Draw the per-genome biotype counts as a grouped bar chart and save it as a PDF.
# NOTE(review): `order` (passed as hue_order) is not defined in the visible cells --
# presumably it comes from a star import or another cell; confirm.
# The PDF is binary data, so the file is opened in 'wb' rather than text mode.
with open('ensembl_primate_biotypes.pdf', 'wb') as outf, PdfPages(outf) as pdf:
    g = sns.factorplot(data=count_df, x='genome', y='count', hue='biotype', kind='bar',
                       hue_order=order, x_order=['panTro4/Ensembl V91', 'gorGor4/Ensembl V91', 'ponAbe2/Ensembl v99'], size=6)
    ax = g.axes[0][0]
    ax.set_ylim(0, 25000)
    ax.set_ylabel('Number of genes')
    g.fig.suptitle('Biotypes in Ensembl annotation of legacy great apes')
    g.set_xticklabels(rotation=90)
    # leave room for the title and the rotated tick labels
    g.fig.subplots_adjust(top=0.85, bottom=0.4)
    # the keyword is bbox_inches; the misspelt 'bboxes_inches' never requested a tight bounding box
    pdf.savefig(bbox_inches='tight')
    plt.close('all')

In [51]:
# Plain-text dump of the long-format biotype counts.
print(count_df)

                genome         biotype  count
0  ponAbe2/Ensembl v99  protein coding  20424
1  ponAbe2/Ensembl v99           ncRNA   8019
2  gorGor4/Ensembl V91  protein coding  21795
3  gorGor4/Ensembl V91           ncRNA   7808
4  panTro4/Ensembl V91  protein coding  23534
5  panTro4/Ensembl V91           ncRNA   8417


In [66]:
# Re-key the Ensembl tables by the short genome names used in dfs / clusters.
label_by_genome = [
    ('Gorilla', 'gorGor4/Ensembl V91'),
    ('Chimp', 'panTro4/Ensembl V91'),
    ('Orangutan', 'ponAbe2/Ensembl v99'),
]
gtf_dfs_new_keys = {genome: gtf_dfs[label] for genome, label in label_by_genome}

In [None]:
# how many clusters are protein coding in non-CAT and non-coding in CAT?
# For each genome, store [Counter of (CAT biotype set, non-CAT biotype set) for
# disagreeing clusters, number of agreeing clusters].
biotype_changes = {}
for genome, cluster_df in clusters.items():
    # attach a biotype to every clustered transcript, drawing on both annotation tables
    cat_biotypes = dfs[genome][['transcript_id', 'gene_biotype']]
    ens_biotypes = gtf_dfs_new_keys[genome][['transcript_id', 'gene_biotype']]
    annotated = cluster_df.merge(pd.concat([cat_biotypes, ens_biotypes]),
                                 left_on='gene', right_on='transcript_id')
    annotated['simplified_biotype'] = [name_map[biotype] for biotype in annotated.gene_biotype]
    annotated = annotated.drop_duplicates()

    mismatches = Counter()
    matches = 0
    for cluster_id, members in annotated.groupby('#cluster'):
        # transcript ids containing '_T0' are treated as the CAT portion
        from_cat = members.transcript_id.str.contains('_T0')
        # only clusters with transcripts from both sources are compared
        if not from_cat.any() or from_cat.all():
            continue
        cat_set = frozenset(members[from_cat]['simplified_biotype'])
        ens_set = frozenset(members[~from_cat]['simplified_biotype'])
        if cat_set == ens_set:
            matches += 1
        else:
            mismatches[(cat_set, ens_set)] += 1
    biotype_changes[genome] = [mismatches, matches]

In [None]:
# List every disagreeing (CAT set, non-CAT set) pair with its cluster count.
for genome, (changes, n_matches) in biotype_changes.items():
    for (cat_set, ens_set), n_clusters in changes.items():
        print('%s %s %s %s' % (genome, cat_set, ens_set, n_clusters))

In [163]:
# Per genome: clusters where the non-CAT side includes 'protein coding' and the CAT side does not.
for genome, (changes, n_matches) in biotype_changes.items():
    tot = sum(n_clusters
              for (cat_set, ens_set), n_clusters in changes.items()
              if 'protein coding' in ens_set and 'protein coding' not in cat_set)
    print('%s %s' % (genome, tot))

Gorilla 1728
Chimp 940
Orangutan 1270
