In [1]:
import os
# Every relative path used later in this notebook (the *.genes.results inputs,
# the output PDFs, '../human/...' and the '../*.tsv' exports) is resolved
# against this directory.
# NOTE(review): this is not idempotent -- re-running the cell without restarting
# the kernel looks for 'orang' inside 'orang'.
os.chdir('orang')

In [2]:
from tools.bio import *
from tools.transcripts import *
from tools.sqlInterface import *
from tools.gff3 import *
from tools.misc import *
from collections import *
from cat.plots import *

In [3]:
# Gene-level result tables for runs SRR306798 (suffix _1) and SRR306799
# (suffix _2), one table per run and annotation set. Each file is
# tab-separated with a header row and is read from the working directory.
def _read_results_table(path):
    """Read one tab-separated '*.genes.results' file into a DataFrame."""
    return pd.read_csv(path, header=0, sep='\t')


# Susie assembly, CAT annotation
susie_expression_1 = _read_results_table('SRR306798_susie_cat.genes.results')
susie_expression_2 = _read_results_table('SRR306799_susie_cat.genes.results')

# ponAbe2 assembly, Ensembl annotation
ensembl_expression_1 = _read_results_table('SRR306798_ponabe_ensembl.genes.results')
ensembl_expression_2 = _read_results_table('SRR306799_ponabe_ensembl.genes.results')

# ponAbe2 assembly, CAT annotation
ponabe_expression_1 = _read_results_table('SRR306798_ponabe_cat.genes.results')
ponabe_expression_2 = _read_results_table('SRR306799_ponabe_cat.genes.results')

# human reference
human_expression_1 = _read_results_table('SRR306798_human.genes.results')
human_expression_2 = _read_results_table('SRR306799_human.genes.results')

In [4]:
# Mean TPM per gene across the two runs for the Susie/CAT quantification.
# The inner merge keeps only gene_ids present in both runs; the two TPM columns
# it produces are averaged row-wise. This replaces the previous trick of giving
# both columns the same name and calling groupby(axis=1), which is deprecated
# in recent pandas releases. Result columns: ['gene_id', 'TPM'].
susie_expression = (
    pd.merge(susie_expression_1[['gene_id', 'TPM']], susie_expression_2[['gene_id', 'TPM']],
             on='gene_id')
    .set_index('gene_id')
    .mean(axis=1)
    .reset_index(name='TPM')
)

In [5]:
# Mean TPM per gene across the two runs for the human-reference quantification.
# The inner merge keeps only gene_ids present in both runs; the two TPM columns
# it produces are averaged row-wise. This replaces the previous trick of giving
# both columns the same name and calling groupby(axis=1), which is deprecated
# in recent pandas releases. Result columns: ['gene_id', 'TPM'].
human_expression = (
    pd.merge(human_expression_1[['gene_id', 'TPM']], human_expression_2[['gene_id', 'TPM']],
             on='gene_id')
    .set_index('gene_id')
    .mean(axis=1)
    .reset_index(name='TPM')
)

In [6]:
# Mean TPM per gene across the two runs for the ponAbe2/Ensembl quantification.
# The inner merge keeps only gene_ids present in both runs; the two TPM columns
# it produces are averaged row-wise. This replaces the previous trick of giving
# both columns the same name and calling groupby(axis=1), which is deprecated
# in recent pandas releases. Result columns: ['gene_id', 'TPM'].
ensembl_expression = (
    pd.merge(ensembl_expression_1[['gene_id', 'TPM']], ensembl_expression_2[['gene_id', 'TPM']],
             on='gene_id')
    .set_index('gene_id')
    .mean(axis=1)
    .reset_index(name='TPM')
)

In [7]:
# Mean TPM per gene across the two runs for the ponAbe2/CAT quantification.
# The inner merge keeps only gene_ids present in both runs; the two TPM columns
# it produces are averaged row-wise. This replaces the previous trick of giving
# both columns the same name and calling groupby(axis=1), which is deprecated
# in recent pandas releases. Result columns: ['gene_id', 'TPM'].
ponabe_expression = (
    pd.merge(ponabe_expression_1[['gene_id', 'TPM']], ponabe_expression_2[['gene_id', 'TPM']],
             on='gene_id')
    .set_index('gene_id')
    .mean(axis=1)
    .reset_index(name='TPM')
)

In [8]:
# load human annotation
# (load_annotation is a project helper; later cells read the GeneName, GeneId
# and GeneBiotype columns of the frame it returns)
ref_df = load_annotation('/hive/groups/recon/projs/primates/susie_indel_corrected/databases/Human.db')

# load ensembl annotations
# Read the GTF inside a `with` block so the file handle is closed as soon as the
# records are in memory. '#' header lines are skipped and every record is split
# on tabs; the last field is the attribute column.
with open('/hive/groups/recon/projs/primates/compare_clint_expression/ponabe_ensembl/Pongo_abelii.PPYG2.88.gtf') as gtf_handle:
    lines = [record.split('\t') for record in gtf_handle if not record.startswith('#')]

# gene_id -> gene_name. Records whose attributes lack either key are skipped on
# purpose, so genes that never carry a gene_name do not appear in the map.
ensembl_map = {}
for fields in lines:
    attrs = parse_gtf_attr_line(fields[-1])
    try:
        ensembl_map[attrs['gene_id']] = attrs['gene_name']
    except KeyError:
        continue
# list(...) makes the constructor input a plain sequence of (gene_id, gene_name)
# pairs regardless of whether dict.items() returns a list or a view.
ensembl_map = pd.DataFrame(list(ensembl_map.items()), columns=['gene_id', 'gene_name'])

# load name maps for susie, ponabe
# (tab-separated tables with a header row; later cells use their gene_id and
# source_gene_common_name columns)
susie_map = pd.read_csv('/hive/groups/recon/projs/primates/susie_indel_corrected/consensus_gene_set/Susie_Orangutan.gp_info',
                       sep='\t', header=0)
ponabe_map = pd.read_csv('/hive/groups/recon/projs/primates/original_primates/annotation_full_primates/consensus_gene_set/Orangutan.gp_info',
                         sep='\t', header=0)

In [9]:
# add names to susie
# Attach the common gene name to each gene_id and sum TPM per gene name.
# The (gene_id, name) pairs are de-duplicated *before* the merge so that a gene
# listed several times in the map is counted once. Previously duplicates were
# dropped after gene_id had been discarded, which also collapsed two different
# genes that happened to share both a name and an identical TPM value.
# Result columns: ['gene_name', 'Susie/Orangutan (CAT)'].
susie = (
    pd.merge(susie_expression[['gene_id', 'TPM']],
             susie_map[['gene_id', 'source_gene_common_name']].drop_duplicates(),
             on='gene_id')
    .rename(columns={'TPM': 'Susie/Orangutan (CAT)', 'source_gene_common_name': 'gene_name'})
    .groupby('gene_name')['Susie/Orangutan (CAT)'].sum()
    .reset_index()
)

In [10]:
# add names to ensembl
# Attach the Ensembl gene name to each gene_id and sum TPM per gene name.
# ensembl_map is built from a dict keyed on gene_id, so the merge is one row per
# gene; the previous drop_duplicates() on (TPM, gene_name) could only merge
# distinct genes that shared a name and an identical TPM, so it is removed.
# NOTE(review): the column label says "Ensembl V87" while the GTF loaded for
# ensembl_map is named '...PPYG2.88.gtf' -- confirm which release is intended.
# Result columns: ['gene_name', 'ponAbe2 (Ensembl V87)'].
ensembl = (
    pd.merge(ensembl_expression[['gene_id', 'TPM']], ensembl_map, on='gene_id')
    .rename(columns={'TPM': 'ponAbe2 (Ensembl V87)'})
    .groupby('gene_name')['ponAbe2 (Ensembl V87)'].sum()
    .reset_index()
)

In [11]:
# add names to ponabe_cat
# Attach the common gene name to each gene_id and sum TPM per gene name.
# The (gene_id, name) pairs are de-duplicated *before* the merge so that a gene
# listed several times in the map is counted once. Previously duplicates were
# dropped after gene_id had been discarded, which also collapsed two different
# genes that happened to share both a name and an identical TPM value.
# Result columns: ['gene_name', 'ponAbe2 (CAT)'].
ponabe = (
    pd.merge(ponabe_expression[['gene_id', 'TPM']],
             ponabe_map[['gene_id', 'source_gene_common_name']].drop_duplicates(),
             on='gene_id')
    .rename(columns={'TPM': 'ponAbe2 (CAT)', 'source_gene_common_name': 'gene_name'})
    .groupby('gene_name')['ponAbe2 (CAT)'].sum()
    .reset_index()
)

In [12]:
# add names to human
# ref_map: GeneName / GeneId columns of the human annotation renamed to the
# gene_name / gene_id convention used by the expression tables. rename() returns
# a new frame, so no column assignment is performed on a slice of ref_df.
ref_map = ref_df[['GeneName', 'GeneId']].rename(columns={'GeneName': 'gene_name', 'GeneId': 'gene_id'})

# Attach the gene name to each gene_id and sum TPM per gene name.
# The (gene_id, gene_name) pairs are de-duplicated *before* the merge so that a
# gene listed several times in the annotation is counted once. Previously
# duplicates were dropped after gene_id had been discarded, which also collapsed
# two different genes that shared both a name and an identical TPM value.
# Result columns: ['gene_name', 'Human (GENCODE V24) TPM'].
human = (
    pd.merge(human_expression[['gene_id', 'TPM']],
             ref_map[['gene_id', 'gene_name']].drop_duplicates(),
             on='gene_id')
    .rename(columns={'TPM': 'Human (GENCODE V24) TPM'})
    .groupby('gene_name')['Human (GENCODE V24) TPM'].sum()
    .reset_index()
)

In [13]:
# Join the four per-gene-name tables with outer joins (a gene name present in
# any table is kept; missing values become NaN), then reshape to long form with
# the human TPM kept alongside each (annotation, TPM) pair.
combined = human.merge(ponabe, how='outer', on='gene_name')
combined = combined.merge(ensembl, how='outer', on='gene_name')
combined = combined.merge(susie, how='outer', on='gene_name')
df = pd.melt(combined,
             id_vars=['gene_name', 'Human (GENCODE V24) TPM'],
             var_name='assembly/annotation',
             value_name='TPM')

In [68]:
# Export the per-gene-name TPM tables for use outside this notebook.
# Both files are tab-separated and written to the parent of the working directory.
combined_df = susie.merge(ensembl, how='outer', on='gene_name')
combined_df.to_csv('../susie_ensembl.tsv', sep='\t')

# Same table with the ponAbe2 (CAT) column joined on as well.
with_ponabe_cat = combined_df.merge(ponabe, how='outer', on='gene_name')
with_ponabe_cat.to_csv('../susie_ensembl_cat.tsv', sep='\t')

In [14]:
# Number of gene names whose TPM is not exactly zero, per annotation set.
def _count_nonzero(table, tpm_column):
    """Return how many rows of `table` have `tpm_column` != 0."""
    return int((table[tpm_column] != 0).sum())


num_human = _count_nonzero(human, 'Human (GENCODE V24) TPM')
num_susie = _count_nonzero(susie, 'Susie/Orangutan (CAT)')
num_ponabe = _count_nonzero(ponabe, 'ponAbe2 (CAT)')
num_ensembl = _count_nonzero(ensembl, 'ponAbe2 (Ensembl V87)')
from scipy.stats import *


In [16]:
# Plot colours: entries 3, 2 and 1 (in that order) of seaborn's current palette.
default_colors = sns.color_palette()
palette = [default_colors[idx] for idx in (3, 2, 1)]

# Panel order (left to right) used by the scatter plots and bar charts.
cols = [
    'Susie/Orangutan (CAT)',
    'ponAbe2 (CAT)',
    'ponAbe2 (Ensembl V87)',
]

In [71]:
# Two-page PDF for the full gene set:
#   page 1 - one scatter panel per annotation (human TPM vs. annotation TPM)
#            with the Pearson correlation in the panel title;
#   page 2 - bar chart of the number of genes with non-zero TPM.
# The file is opened in binary mode because PdfPages writes bytes to the handle.
# NOTE(review): multipage_close comes from cat.plots; it is presumably what
# saves the current figure as a PDF page -- confirm there.
with open('orang_full_expression.pdf', 'wb') as outf, PdfPages(outf) as pdf:
    g = sns.lmplot(data=df, x='Human (GENCODE V24) TPM', y='TPM', col='assembly/annotation',
                  hue='assembly/annotation', fit_reg=False,
                  scatter_kws={'marker': '+', 'alpha': 0.5, 's': 3, 'rasterized': True},
                  col_order=cols,
                  palette=palette)
    x = np.linspace(0, 250)
    for col, ax in zip(cols, g.axes[0]):
        # Correlation uses only genes with a value on both axes.
        tmp = df[df['assembly/annotation'] == col]
        tmp = tmp[(tmp['Human (GENCODE V24) TPM'].notnull()) & (tmp['TPM'].notnull())]
        c, p = pearsonr(tmp['Human (GENCODE V24) TPM'], tmp['TPM'])
        ax.set_title(ax.get_title() + '\nPearson r={:.2f} p={:.2f}'.format(c, p))
        # Both axes are clipped to 0-250 TPM; points outside are not shown.
        ax.set_xlim(0, 250)
        ax.set_ylim(0, 250)
        # y = x reference line. The format string carries only the line style;
        # the colour comes from `color` (the old 'r--' asked for red and was
        # then overridden by color='black').
        ax.plot(x, x, '--', color='black', alpha=0.7, linewidth=1)
    multipage_close(pdf, False)

    # One bar for human plus one per entry of `cols`, looked up by label so the
    # bar heights always line up with the x labels.
    nonzero_by_label = {'Susie/Orangutan (CAT)': num_susie,
                        'ponAbe2 (CAT)': num_ponabe,
                        'ponAbe2 (Ensembl V87)': num_ensembl}
    g = sns.barplot(x=['Human (GENCODE V24)'] + cols,
                    y=[num_human] + [nonzero_by_label[label] for label in cols])
    sns.despine()
    plt.xticks(rotation=70)
    # (a title set via g.set_title just before this call was always overwritten
    # by it, so only this one is kept)
    plt.title('Number of genes with non-zero expression')
    multipage_close(pdf, False)

In [18]:
# coding only
# Restrict the long-form table to gene names annotated as protein_coding in the
# human annotation.
coding_genes = set(ref_df[ref_df.GeneBiotype == 'protein_coding'].GeneName)
coding_df = df[df.gene_name.isin(coding_genes)]

# Number of protein-coding gene names with non-zero TPM, per annotation set.
num_human_coding = len(human[(human['Human (GENCODE V24) TPM'] != 0) & (human.gene_name.isin(coding_genes))])
num_susie_coding = len(susie[(susie['Susie/Orangutan (CAT)'] != 0) & (susie.gene_name.isin(coding_genes))])
num_ponabe_coding = len(ponabe[(ponabe['ponAbe2 (CAT)'] != 0) & (ponabe.gene_name.isin(coding_genes))])
num_ensembl_coding = len(ensembl[(ensembl['ponAbe2 (Ensembl V87)'] != 0) & (ensembl.gene_name.isin(coding_genes))])

# Two-page PDF: scatter panels for coding genes, then the bar chart of counts.
# The file is opened in binary mode because PdfPages writes bytes to the handle.
# NOTE(review): multipage_close comes from cat.plots; it is presumably what
# saves the current figure as a PDF page -- confirm there.
with open('orang_coding_expression.pdf', 'wb') as outf, PdfPages(outf) as pdf:
    g = sns.lmplot(data=coding_df, x='Human (GENCODE V24) TPM', y='TPM', col='assembly/annotation',
                  hue='assembly/annotation', ci=None, fit_reg=False,
                  scatter_kws={'marker': '+', 'alpha': 0.5, 's': 3, 'rasterized': True},
                  col_order=cols,
                  palette=palette)
    x = np.linspace(0, 250)
    for col, ax in zip(cols, g.axes[0]):
        # The correlation must be computed on the same coding-only rows that the
        # panel displays; it was previously taken from the all-genes table `df`,
        # so the title reported the wrong r.
        tmp = coding_df[coding_df['assembly/annotation'] == col]
        tmp = tmp[(tmp['Human (GENCODE V24) TPM'].notnull()) & (tmp['TPM'].notnull())]
        c, p = pearsonr(tmp['Human (GENCODE V24) TPM'], tmp['TPM'])
        ax.set_title(ax.get_title() + '\nPearson r={:.2f} p={:.2f}'.format(c, p))
        # Both axes are clipped to 0-250 TPM; points outside are not shown.
        ax.set_xlim(0, 250)
        ax.set_ylim(0, 250)
        # y = x reference line; colour comes from `color`, the format string
        # only selects the dashed style.
        ax.plot(x, x, '--', color='black', alpha=0.7, linewidth=1)
    multipage_close(pdf, True)

    fig, ax = plt.subplots(figsize=(3,4.5))
    # One bar for human plus one per entry of `cols`, looked up by label so the
    # bar heights always line up with the x labels.
    coding_nonzero_by_label = {'Susie/Orangutan (CAT)': num_susie_coding,
                               'ponAbe2 (CAT)': num_ponabe_coding,
                               'ponAbe2 (Ensembl V87)': num_ensembl_coding}
    g = sns.barplot(x=['Human (GENCODE V24)'] + cols,
                    y=[num_human_coding] + [coding_nonzero_by_label[label] for label in cols], ax=ax)
    sns.despine()
    plt.xticks(rotation=90)
    plt.title('Number of coding genes with non-zero expression')
    multipage_close(pdf, True)

In [22]:
# Inline bar chart of protein-coding gene names with non-zero TPM.
# Each entry pairs a per-gene-name table with the name of its TPM column.
coding_counts = []
for table, tpm_column in [(human, 'Human (GENCODE V24) TPM'),
                          (susie, 'Susie/Orangutan (CAT)'),
                          (ponabe, 'ponAbe2 (CAT)'),
                          (ensembl, 'ponAbe2 (Ensembl V87)')]:
    is_expressed = table[tpm_column] != 0
    is_coding = table.gene_name.isin(coding_genes)
    coding_counts.append(len(table[is_expressed & is_coding]))
num_human_coding, num_susie_coding, num_ponabe_coding, num_ensembl_coding = coding_counts

bar_labels = ['Human (GENCODE V24)', 'Susie (CAT)', 'ponAbe2 (CAT)', 'ponAbe2 (Ensembl V87)']
g = sns.barplot(x=bar_labels, y=coding_counts)
sns.despine()
plt.xticks(rotation=70)
plt.title('Number of coding genes with non-zero expression')

<matplotlib.text.Text at 0x7f4e57d37990>

In [23]:
# now we perform the cross-species comparison
# Gene-level results for three runs quantified against the human reference,
# read from the sibling 'human' directory.
human1 = pd.read_csv('../human/SRR306854_human.genes.results', header=0, sep='\t')
human2 = pd.read_csv('../human/SRR306855_human.genes.results', header=0, sep='\t')
human3 = pd.read_csv('../human/SRR306856_human.genes.results', header=0, sep='\t')

# Mean TPM per gene across the three runs. The inner merges keep only gene_ids
# present in all three; the resulting TPM columns are averaged row-wise (this
# replaces renaming every column to 'TPM' and calling groupby(axis=1), which is
# deprecated in recent pandas releases). Result columns: ['gene_id', 'TPM'].
human_human_expression = pd.merge(human1[['gene_id', 'TPM']], human2[['gene_id', 'TPM']], on='gene_id')
human_human_expression = pd.merge(human_human_expression, human3[['gene_id', 'TPM']], on='gene_id')
human_human_expression = human_human_expression.set_index('gene_id').mean(axis=1).reset_index(name='TPM')

# Attach gene names and sum TPM per gene name, the same way the per-annotation
# tables (human / susie / ponabe / ensembl) are built. Without the per-name sum
# a gene name shared by several gene_ids stayed as several rows, which then
# multiplied rows in the outer merges on gene_name. The (gene_id, gene_name)
# pairs are de-duplicated before the merge so each gene is counted once.
# Result columns: ['gene_name', 'Human (GENCODE V24) TPM'].
human_human = (
    pd.merge(human_human_expression,
             ref_map[['gene_id', 'gene_name']].drop_duplicates(),
             on='gene_id')
    .rename(columns={'TPM': 'Human (GENCODE V24) TPM'})
    .groupby('gene_name')['Human (GENCODE V24) TPM'].sum()
    .reset_index()
)

In [24]:
# Join the cross-species human table with the three orangutan tables using
# outer joins (a gene name present in any table is kept; missing values become
# NaN), then reshape to long form with the human TPM kept alongside each
# (annotation, TPM) pair.
combined_human = human_human.merge(ponabe, how='outer', on='gene_name')
combined_human = combined_human.merge(ensembl, how='outer', on='gene_name')
combined_human = combined_human.merge(susie, how='outer', on='gene_name')
human_df = pd.melt(combined_human,
                   id_vars=['gene_name', 'Human (GENCODE V24) TPM'],
                   var_name='assembly/annotation',
                   value_name='TPM')

In [29]:
# Two-page PDF for the cross-species comparison, coding genes only:
#   page 1 - scatter panels of cross-species human TPM vs. each annotation's TPM;
#   page 2 - bar chart of coding genes with non-zero TPM.
# The file is opened in binary mode because PdfPages writes bytes to the handle.
# NOTE(review): multipage_close comes from cat.plots; it is presumably what
# saves the current figure as a PDF page -- confirm there.
with open('orang_coding_expression_cross.pdf', 'wb') as outf, PdfPages(outf) as pdf:
    # NOTE(review): this rebinds the notebook-level names `df` and `cols`; the
    # earlier plotting cells read `df`, so re-running them after this cell plots
    # the cross-species table. Names are kept so any later use keeps working.
    df = human_df[human_df.gene_name.isin(coding_genes)]
    cols = ['Susie/Orangutan (CAT)', 'ponAbe2 (CAT)', 'ponAbe2 (Ensembl V87)']
    # NOTE(review): the palette here is the seaborn default from entry 1 onwards,
    # not the `palette` list used by the other figures -- confirm that the
    # differing colour assignment is intended.
    g = sns.lmplot(data=df, x='Human (GENCODE V24) TPM', y='TPM', col='assembly/annotation',
                  hue='assembly/annotation', ci=None, fit_reg=False,
                  scatter_kws={'marker': '+', 'alpha': 0.5, 's': 3, 'rasterized': True},
                  col_order=cols,
                  palette=sns.color_palette()[1:])
    x = np.linspace(0, 250)
    for col, ax in zip(cols, g.axes[0]):
        # Correlation uses only genes with a value on both axes.
        tmp = df[df['assembly/annotation'] == col]
        tmp = tmp[(tmp['Human (GENCODE V24) TPM'].notnull()) & (tmp['TPM'].notnull())]
        c, p = pearsonr(tmp['Human (GENCODE V24) TPM'], tmp['TPM'])
        ax.set_title(ax.get_title() + '\nPearson r={:.2f} p={:.2f}'.format(c, p))
        # Both axes are clipped to 0-250 TPM; points outside are not shown.
        ax.set_xlim(0, 250)
        ax.set_ylim(0, 250)
        # y = x reference line; colour comes from `color`, the format string
        # only selects the dashed style (the old 'r--' asked for red and was
        # then overridden by color='black').
        ax.plot(x, x, '--', color='black', alpha=0.7, linewidth=1)
    multipage_close(pdf, False)

    # Coding genes with non-zero TPM per annotation set.
    # NOTE(review): the human bar is counted from `human` (the same-sample
    # table), not from the cross-species `human_human` table plotted on page 1
    # -- confirm this is the intended baseline.
    num_human_coding = len(human[(human['Human (GENCODE V24) TPM'] != 0) & (human.gene_name.isin(coding_genes))])
    num_susie_coding = len(susie[(susie['Susie/Orangutan (CAT)'] != 0) & (susie.gene_name.isin(coding_genes))])
    num_ponabe_coding = len(ponabe[(ponabe['ponAbe2 (CAT)'] != 0) & (ponabe.gene_name.isin(coding_genes))])
    num_ensembl_coding = len(ensembl[(ensembl['ponAbe2 (Ensembl V87)'] != 0) & (ensembl.gene_name.isin(coding_genes))])

    fig, ax = plt.subplots(figsize=(3,4.5))
    g = sns.barplot(x=['Human (GENCODE V24)', 'Susie/Orangutan (CAT)', 'ponAbe2 (CAT)', 'ponAbe2 (Ensembl V87)'],
               y=[num_human_coding, num_susie_coding, num_ponabe_coding, num_ensembl_coding], ax=ax)
    sns.despine()
    plt.xticks(rotation=90)
    plt.title('Number of coding genes with non-zero expression')
    multipage_close(pdf, False)