In [1]:
import pandas as pd
from tools.bio import *
from tools.transcripts import *
from tools.sqlInterface import *
from tools.gff3 import *
from tools.misc import *
from collections import *
from cat.plots import *
from tools.psl import *
from cat.consensus import *

In [5]:
# Transcript modes whose mRNA metrics are loaded from every database.
_TX_MODES = ['transMap', 'augTM', 'augTMR']


def _load_all_tx_modes(db_path):
    """Concatenate the 'mRNA' metrics of every mode in _TX_MODES for one database.

    :param db_path: path handed unchanged to load_metrics_from_db.
    :return: pd.concat of the per-mode results, in _TX_MODES order.
    """
    per_mode = [load_metrics_from_db(db_path, tx_mode, 'mRNA') for tx_mode in _TX_MODES]
    return pd.concat(per_mode)


# "old" = databases of the original primate annotation run, "new" = the local Clint/Susie databases.
old_chimp = _load_all_tx_modes('/hive/groups/recon/projs/primates/original_primates/redo_annotation_indel/databases/Chimp.db')
old_orang = _load_all_tx_modes('/hive/groups/recon/projs/primates/original_primates/redo_annotation_indel/databases/Orangutan.db')
new_chimp = _load_all_tx_modes('../../databases/Clint_Chimp.db')
new_orang = _load_all_tx_modes('../../databases/Susie_Orangutan.db')

In [9]:
# Consensus gene set tables used to filter the transMap metrics, keyed by assembly.
# Each .gp_info file is tab-separated with its column names on the first row.
_gp_info_paths = {
    'chimp': '../../consensus_gene_set/Clint_Chimp.gp_info',
    'chimp_old': '/hive/groups/recon/projs/primates/original_primates/redo_annotation_indel/consensus_gene_set/Chimp.gp_info',
    'orang': '../../consensus_gene_set/Susie_Orangutan.gp_info',
    'orang_old': '/hive/groups/recon/projs/primates/original_primates/redo_annotation_indel/consensus_gene_set/Orangutan.gp_info',
}
gene_sets = {label: pd.read_csv(path, sep='\t', header=0)
             for label, path in _gp_info_paths.items()}

In [10]:
# Join every metrics table to its consensus gene set on the alignment id.
# pd.merge defaults to an inner join, so only alignments present in both
# tables survive.
_join_keys = dict(left_on='AlignmentId', right_on='alignment_id')

chimp_merged = pd.merge(new_chimp, gene_sets['chimp'], **_join_keys)
old_chimp_merged = pd.merge(old_chimp, gene_sets['chimp_old'], **_join_keys)

orang_merged = pd.merge(new_orang, gene_sets['orang'], **_join_keys)
old_orang_merged = pd.merge(old_orang, gene_sets['orang_old'], **_join_keys)

In [11]:
# Reduce each joined table to the transcript key plus the two metrics that are compared.
_kept_columns = ['source_transcript', 'AlnCoverage', 'AlnIdentity']

chimp_merged = chimp_merged[_kept_columns]
old_chimp_merged = old_chimp_merged[_kept_columns]
orang_merged = orang_merged[_kept_columns]
old_orang_merged = old_orang_merged[_kept_columns]

In [12]:
# Pair the new-assembly and old-assembly rows of each source transcript.
# The overlapping metric columns come out as <metric>_new / <metric>_old.
chimp = pd.merge(chimp_merged, old_chimp_merged,
                 on='source_transcript', suffixes=['_new', '_old'])
orang = pd.merge(orang_merged, old_orang_merged,
                 on='source_transcript', suffixes=['_new', '_old'])

In [13]:
# Per-transcript change in each metric, computed as new minus old.
for _paired in (chimp, orang):
    _paired['cov delta'] = _paired['AlnCoverage_new'] - _paired['AlnCoverage_old']
    _paired['ident delta'] = _paired['AlnIdentity_new'] - _paired['AlnIdentity_old']

In [14]:
# Label every row with its genome; a scalar is broadcast down the whole column.
chimp['genome'] = 'Chimpanzee'
orang['genome'] = 'Orangutan'

# Stack both genomes, discard the absolute metrics so only the deltas remain,
# then melt to long form: the remaining non-id columns end up as
# `variable` / `value` pairs.
_absolute_metrics = ['AlnCoverage_new', 'AlnIdentity_new', 'AlnCoverage_old', 'AlnIdentity_old']
merged_df = pd.melt(
    pd.concat([chimp, orang]).drop(_absolute_metrics, axis=1),
    id_vars=['source_transcript', 'genome'],
)

In [18]:
def find_counts(df, genome, variable):
    """Count changed and unchanged rows for one genome/variable combination.

    :param df: frame with `genome`, `variable` and `value` columns.
    :param genome: value of the `genome` column to select.
    :param variable: value of the `variable` column to select.
    :return: tuple (rows whose value != 0, rows whose value == 0) among the
        selected rows.
    """
    selected = df[(df.genome == genome) & (df.variable == variable)]
    is_changed = selected.value != 0
    n_changed = int(is_changed.sum())
    return n_changed, len(selected) - n_changed


# (label shown in the summary table, name of the delta column in merged_df)
_metric_labels = [('identity', 'ident delta'), ('coverage', 'cov delta')]

# One row per genome/metric: how many transcripts changed vs. stayed the same.
counts = []
for _genome in ['Chimpanzee', 'Orangutan']:
    for _label, _delta_column in _metric_labels:
        _n_changed, _n_unchanged = find_counts(merged_df, _genome, _delta_column)
        counts.append([_genome, _label, _n_changed, _n_unchanged])

count_df = pd.DataFrame(counts, columns=['genome', 'variable', 'changed', 'unchanged'])

In [21]:
# Long-form frames restricted to transcripts whose metric actually changed
# (value != 0), indexed by source transcript. `value` is coerced to numeric
# so the box plots and the later group means operate on numbers.
cov_df = (merged_df[(merged_df.variable == 'cov delta') & (merged_df.value != 0)]
          .drop('variable', axis=1)
          .set_index('source_transcript'))
cov_df['value'] = pd.to_numeric(cov_df['value'])
identity_df = (merged_df[(merged_df.variable == 'ident delta') & (merged_df.value != 0)]
               .drop('variable', axis=1)
               .set_index('source_transcript'))
identity_df['value'] = pd.to_numeric(identity_df['value'])

# A PDF is binary data, so the handle is opened with 'wb' (text mode 'w' can
# corrupt the file on some platforms and fails outright on Python 3).
with open('primate_delta_coverage_identity.pdf', 'wb') as outf, PdfPages(outf) as pdf:
    fig, axes = plt.subplots(ncols=2, nrows=2)

    # Left column: distribution of the non-zero deltas per genome, outliers hidden.
    g = sns.boxplot(y='value', x='genome', data=cov_df, ax=axes[0][0], showfliers=False)
    g.set_ylabel(r'$\Delta$ coverage')
    g.set_xlabel('')
    g.set_xticklabels('')  # top panel carries no genome labels; the bottom panel shows them
    g.set_title('Change in transMap metrics\nbetween assemblies')
    g.set_ylim(-30, 30)
    g = sns.boxplot(y='value', x='genome', data=identity_df, ax=axes[1][0], showfliers=False)
    g.set_ylim(-1, 1)
    g.set_xticklabels(g.axes.get_xticklabels(), rotation=90)
    g.set_ylabel(r'$\Delta$ identity')
    g.set_xlabel('')
    sns.despine()

    # Right column: one stacked bar per genome -- solid = changed transcripts,
    # hatched = unchanged transcripts stacked on top.
    # [[...]].values replaces DataFrame.as_matrix(columns=...), which is
    # deprecated and no longer exists in pandas >= 1.0.
    cov_vals = count_df[count_df.variable == 'coverage'][['changed', 'unchanged']].values
    ident_vals = count_df[count_df.variable == 'identity'][['changed', 'unchanged']].values
    palette = sns.color_palette()
    for panel, panel_counts in ((axes[0][1], cov_vals), (axes[1][1], ident_vals)):
        # zip() limits plotting to the first two rows (x positions 1 and 2).
        for x_pos, colour, (n_changed, n_unchanged) in zip((1, 2), palette, panel_counts):
            panel.bar(x_pos, n_changed, 0.7, color=colour, alpha=0.8)
            panel.bar(x_pos, n_unchanged, 0.7, color=colour, bottom=n_changed, hatch='//', alpha=0.35)
        # x + 0.35 is the middle of a 0.7-wide bar whose left edge sits at x.
        # NOTE(review): matplotlib >= 2.0 centre-aligns bars by default -- confirm
        # these tick positions against the matplotlib version in use.
        panel.set_xticks([1.35, 2.35])
    axes[0][1].set_xticklabels(['', ''], rotation=90)
    axes[1][1].set_xticklabels(['Chimpanzee', 'Orangutan'], rotation=90)
    axes[0][1].set_title('Number of transcripts\nwith metric changes')
    plt.subplots_adjust(left=0.125, right=0.9, bottom=0.1, top=0.9, wspace=0.7, hspace=0.2)
    # Project helper; presumably saves the current figure into `pdf` without
    # applying tight_layout (second argument False) -- TODO confirm.
    multipage_close(pdf, False)

In [22]:
# Changed/unchanged transcript counts per genome and metric.
# Parenthesised so the cell also parses on Python 3; output is identical on Python 2.
print(count_df)

       genome  variable  changed  unchanged
0  Chimpanzee  identity    38475      39805
1  Chimpanzee  coverage    23472      54808
2   Orangutan  identity    51559      26268
3   Orangutan  coverage    35487      42340


In [23]:
# Mean coverage delta per genome, over transcripts whose coverage changed.
cov_df.groupby('genome').agg(np.mean)

Unnamed: 0_level_0,value
genome,Unnamed: 1_level_1
Chimpanzee,9.716985
Orangutan,10.198749


In [24]:
# Mean identity delta per genome, over transcripts whose identity changed.
identity_df.groupby('genome').agg(np.mean)

Unnamed: 0_level_0,value
genome,Unnamed: 1_level_1
Chimpanzee,1.447396
Orangutan,1.569747


In [25]:
# Mean absolute coverage/identity for chimpanzee on the new and old assemblies.
# Parenthesised so the cell also parses on Python 3; output is identical on Python 2.
print(chimp[['AlnCoverage_new', 'AlnCoverage_old', 'AlnIdentity_new', 'AlnIdentity_old']].mean())

AlnCoverage_new    98.855413
AlnCoverage_old    95.941807
AlnIdentity_new    98.846267
AlnIdentity_old    98.134865
dtype: float64


In [26]:
# Mean absolute coverage/identity for orangutan on the new and old assemblies.
# Parenthesised so the cell also parses on Python 3; output is identical on Python 2.
print(orang[['AlnCoverage_new', 'AlnCoverage_old', 'AlnIdentity_new', 'AlnIdentity_old']].mean())

AlnCoverage_new    98.083997
AlnCoverage_old    93.433644
AlnIdentity_new    97.491948
AlnIdentity_old    96.452019
dtype: float64
