In [1]:
# Work inside a ./chimp directory so the relative input/output paths used by the
# cells below resolve there.
# NOTE(review): both the mkdir and the chdir are relative to the current directory, so
# re-running this cell in the same kernel creates and enters a nested chimp/chimp.
import os
!mkdir -p chimp
os.chdir('chimp')

In [2]:
from tools.bio import *
from tools.transcripts import *
from tools.sqlInterface import *
from tools.gff3 import *
from tools.misc import *
from collections import *
from cat.plots import *

In [3]:
# Per-gene quantification tables (tab-separated, first row is the header), one pair per
# annotation. Suffix _1 is run SRR306823 and suffix _2 is run SRR306824; later cells
# only use the gene_id and TPM columns.

# Clint assembly, CAT annotation
clint_expression_1, clint_expression_2 = (
    pd.read_csv('SRR306823_clint_cat.genes.results', sep='\t', header=0),
    pd.read_csv('SRR306824_clint_cat.genes.results', sep='\t', header=0),
)

# panTro assembly, Ensembl annotation
ensembl_expression_1, ensembl_expression_2 = (
    pd.read_csv('SRR306823_pantro_ensembl.genes.results', sep='\t', header=0),
    pd.read_csv('SRR306824_pantro_ensembl.genes.results', sep='\t', header=0),
)

# panTro assembly, CAT annotation
pantro_expression_1, pantro_expression_2 = (
    pd.read_csv('SRR306823_pantro_cat.genes.results', sep='\t', header=0),
    pd.read_csv('SRR306824_pantro_cat.genes.results', sep='\t', header=0),
)

# human annotation
human_expression_1, human_expression_2 = (
    pd.read_csv('SRR306823_human.genes.results', sep='\t', header=0),
    pd.read_csv('SRR306824_human.genes.results', sep='\t', header=0),
)

In [4]:
# Average the two runs (SRR306823, SRR306824) into a single TPM per gene_id.
# The inner merge keeps only gene_ids present in both tables and yields two TPM columns
# (suffixed by the merge); a row-wise mean collapses them. This replaces the original
# trick of relabelling both columns 'TPM' and calling groupby(..., axis=1).mean(),
# which newer pandas deprecates; the resulting frame (columns gene_id, TPM) is the same.
clint_runs = pd.merge(clint_expression_1[['gene_id', 'TPM']], clint_expression_2[['gene_id', 'TPM']],
                      on='gene_id').set_index('gene_id')
clint_expression = clint_runs.mean(axis=1).to_frame('TPM').reset_index()

In [5]:
# Average the two runs (SRR306823, SRR306824) into a single TPM per gene_id.
# The inner merge keeps only gene_ids present in both tables and yields two TPM columns
# (suffixed by the merge); a row-wise mean collapses them. This replaces the original
# trick of relabelling both columns 'TPM' and calling groupby(..., axis=1).mean(),
# which newer pandas deprecates; the resulting frame (columns gene_id, TPM) is the same.
human_runs = pd.merge(human_expression_1[['gene_id', 'TPM']], human_expression_2[['gene_id', 'TPM']],
                      on='gene_id').set_index('gene_id')
human_expression = human_runs.mean(axis=1).to_frame('TPM').reset_index()

In [6]:
# Average the two runs (SRR306823, SRR306824) into a single TPM per gene_id.
# The inner merge keeps only gene_ids present in both tables and yields two TPM columns
# (suffixed by the merge); a row-wise mean collapses them. This replaces the original
# trick of relabelling both columns 'TPM' and calling groupby(..., axis=1).mean(),
# which newer pandas deprecates; the resulting frame (columns gene_id, TPM) is the same.
ensembl_runs = pd.merge(ensembl_expression_1[['gene_id', 'TPM']], ensembl_expression_2[['gene_id', 'TPM']],
                        on='gene_id').set_index('gene_id')
ensembl_expression = ensembl_runs.mean(axis=1).to_frame('TPM').reset_index()

In [7]:
# Average the two runs (SRR306823, SRR306824) into a single TPM per gene_id.
# The inner merge keeps only gene_ids present in both tables and yields two TPM columns
# (suffixed by the merge); a row-wise mean collapses them. This replaces the original
# trick of relabelling both columns 'TPM' and calling groupby(..., axis=1).mean(),
# which newer pandas deprecates; the resulting frame (columns gene_id, TPM) is the same.
pantro_runs = pd.merge(pantro_expression_1[['gene_id', 'TPM']], pantro_expression_2[['gene_id', 'TPM']],
                       on='gene_id').set_index('gene_id')
pantro_expression = pantro_runs.mean(axis=1).to_frame('TPM').reset_index()

In [8]:
# load human annotation
# (later cells read the GeneName, GeneId and GeneBiotype columns of this frame)
ref_df = load_annotation('/hive/groups/recon/projs/primates/susie_indel_corrected/databases/Human.db')

# load ensembl annotations: build a gene_id -> gene_name lookup from the last
# (attribute) column of every non-comment GTF line. The file is streamed inside a
# `with` block so the handle is closed and the whole GTF is never held in memory.
ensembl_names = {}
with open('/hive/groups/recon/projs/primates/compare_clint_expression/pantro_ensembl/Pan_troglodytes.CHIMP2.1.4.88.gtf') as gtf_handle:
    for gtf_line in gtf_handle:
        if gtf_line.startswith('#'):
            continue
        attrs = parse_gtf_attr_line(gtf_line.split('\t')[-1])
        try:
            ensembl_names[attrs['gene_id']] = attrs['gene_name']
        except KeyError:
            # records lacking a gene_id or gene_name attribute are skipped
            continue
# list(...) so the constructor receives a concrete sequence of (gene_id, gene_name)
# pairs on Python 3 as well, where dict.items() is a view.
ensembl_map = pd.DataFrame(list(ensembl_names.items()), columns=['gene_id', 'gene_name'])

# load name maps for clint, panTro
# (later cells read the gene_id and source_gene_common_name columns of these tables)
clint_map = pd.read_csv('/hive/groups/recon/projs/primates/susie_indel_corrected/consensus_gene_set/Clint_Chimp.gp_info',
                       sep='\t', header=0)
pantro_map = pd.read_csv('/hive/groups/recon/projs/primates/original_primates/annotation_full_primates/consensus_gene_set/Chimp.gp_info',
                         sep='\t', header=0)

In [9]:
# add names to clint
# De-duplicate the gene_id -> name map BEFORE merging so each gene_id contributes its
# TPM exactly once. (De-duplicating the merged (TPM, gene_name) rows instead would also
# collapse distinct gene_ids that share a name and happen to have equal TPM, and the
# per-name sum below would then under-count them.)
clint_names = clint_map[['gene_id', 'source_gene_common_name']].drop_duplicates()
clint = pd.merge(clint_expression, clint_names, on='gene_id')[['TPM', 'source_gene_common_name']]
clint.columns = ['Clint/Chimp (CAT)', 'gene_name']
# gene_ids sharing a common name are summed into one row per gene_name
clint = clint.groupby('gene_name').sum().reset_index()

In [10]:
# add names to ensembl
# De-duplicate the gene_id -> name map BEFORE merging so each gene_id contributes its
# TPM exactly once. (De-duplicating the merged (TPM, gene_name) rows instead would
# collapse distinct gene_ids that share a name and happen to have equal TPM, and the
# per-name sum below would then under-count them.)
ensembl_names_by_id = ensembl_map[['gene_id', 'gene_name']].drop_duplicates()
ensembl = pd.merge(ensembl_expression, ensembl_names_by_id, on='gene_id')[['TPM', 'gene_name']]
# gene_ids sharing a name are summed into one row per gene_name
ensembl = ensembl.groupby('gene_name').sum().reset_index()
ensembl.columns = ['gene_name', 'panTro2.14 (Ensembl V87)']

In [11]:
# add names to pantro_cat
# De-duplicate the gene_id -> name map BEFORE merging so each gene_id contributes its
# TPM exactly once. (De-duplicating the merged (TPM, gene_name) rows instead would also
# collapse distinct gene_ids that share a name and happen to have equal TPM, and the
# per-name sum below would then under-count them.)
pantro_names = pantro_map[['gene_id', 'source_gene_common_name']].drop_duplicates()
pantro = pd.merge(pantro_expression, pantro_names, on='gene_id')[['TPM', 'source_gene_common_name']]
pantro.columns = ['panTro2.14 (CAT)', 'gene_name']
# gene_ids sharing a common name are summed into one row per gene_name
pantro = pantro.groupby('gene_name').sum().reset_index()

In [12]:
# add names to human
# ref_map keeps the same rows and column names as before (gene_name, gene_id); rename()
# builds a new frame instead of assigning .columns on a slice of ref_df.
ref_map = ref_df[['GeneName', 'GeneId']].rename(columns={'GeneName': 'gene_name', 'GeneId': 'gene_id'})
# De-duplicate the gene_id -> name pairs BEFORE merging so each gene_id contributes its
# TPM exactly once. (De-duplicating the merged (TPM, gene_name) rows instead would also
# collapse distinct gene_ids that share a name and happen to have equal TPM, and the
# per-name sum below would then under-count them.)
human_names = ref_map[['gene_id', 'gene_name']].drop_duplicates()
human = pd.merge(human_expression, human_names, on='gene_id')[['TPM', 'gene_name']]
human.columns = ['Human (GENCODE V24) TPM', 'gene_name']
# gene_ids sharing a name are summed into one row per gene_name
human = human.groupby('gene_name').sum().reset_index()

In [13]:
# Join the four per-gene_name tables. The joins are outer joins, so every gene name
# seen in any table is kept and a table that lacks it contributes NaN.
combined = (
    human
    .merge(pantro, how='outer', on='gene_name')
    .merge(ensembl, how='outer', on='gene_name')
    .merge(clint, how='outer', on='gene_name')
)
# Long format for plotting: one row per (gene, chimp annotation), keeping the human TPM
# alongside as the x-axis value.
df = pd.melt(combined, id_vars=['gene_name', 'Human (GENCODE V24) TPM'],
             var_name='assembly/annotation', value_name='TPM')

In [14]:
# Save the Clint (CAT) and Ensembl per-gene TPM side by side for use outside this
# notebook. The outer join keeps gene names present in either table; the file lands one
# directory above the working directory, tab-separated, with the frame's index included.
combined_df = clint.merge(ensembl, how='outer', on='gene_name')
combined_df.to_csv('../clint_ensembl.tsv', sep='\t')

In [19]:
# Number of genes with non-zero TPM under each annotation (bar chart below).
num_human = int((human['Human (GENCODE V24) TPM'] != 0).sum())
num_clint = int((clint['Clint/Chimp (CAT)'] != 0).sum())
num_pantro = int((pantro['panTro2.14 (CAT)'] != 0).sum())
num_ensembl = int((ensembl['panTro2.14 (Ensembl V87)'] != 0).sum())
from scipy.stats import *
# Panel order for the scatter plots; `cols` and `palette` are reused by later cells.
cols = ['Clint/Chimp (CAT)', 'panTro2.14 (CAT)', 'panTro2.14 (Ensembl V87)']
palette = sns.color_palette()
palette = [palette[3], palette[2], palette[1]]
# 'wb' rather than 'w': PDF output is binary, and a text-mode handle rejects bytes on
# Python 3.
# multipage_close is not defined in this notebook (it arrives via the star imports);
# presumably it saves the current figure as a PDF page and closes it — confirm.
with open('chimp_full_expression.pdf', 'wb') as outf, PdfPages(outf) as pdf:
    # Page 1: human TPM vs TPM under each chimp annotation, one panel per annotation.
    g = sns.lmplot(data=df, x='Human (GENCODE V24) TPM', y='TPM', col='assembly/annotation',
                  hue='assembly/annotation', fit_reg=False,
                  scatter_kws={'marker': '+', 'alpha': 0.5, 's': 3,  'rasterized': True},
                  col_order=cols,
                  palette=palette)
    identity = np.linspace(0, 250)
    for col, ax in zip(cols, g.axes[0]):
        # Pearson correlation over the genes that have a value on both axes.
        paired = df[df['assembly/annotation'] == col]
        paired = paired[(paired['Human (GENCODE V24) TPM'].notnull()) & (paired['TPM'].notnull())]
        c, p = pearsonr(paired['Human (GENCODE V24) TPM'], paired['TPM'])
        ax.set_title(ax.get_title() + '\nPearson r={:.2f} p={:.2f}'.format(c, p))
        ax.set_xlim(0, 250)
        ax.set_ylim(0, 250)
        # y = x reference line; the format string carries only the dash style so it does
        # not conflict with the explicit color keyword.
        ax.plot(identity, identity, '--', color='black', alpha=0.7, linewidth=1)
    multipage_close(pdf, False)
    # Page 2: bar chart of the non-zero gene counts. (The earlier set_title call was
    # dropped: plt.title below sets the title of the same axes and overwrote it.)
    g = sns.barplot(x=['Human (GENCODE V24)'] + cols,
               y=[num_human, num_clint, num_pantro, num_ensembl])
    sns.despine()
    plt.xticks(rotation=70)
    plt.title('Number of genes with non-zero expression')
    multipage_close(pdf, False)

In [20]:
# coding only: restrict to gene names annotated as protein_coding in the human reference
coding_genes = set(ref_df[ref_df.GeneBiotype == 'protein_coding'].GeneName)
coding_df = df[df.gene_name.isin(coding_genes)]

# Number of coding genes with non-zero TPM under each annotation (bar chart below).
num_human_coding = len(human[(human['Human (GENCODE V24) TPM'] != 0) & (human.gene_name.isin(coding_genes))])
num_clint_coding = len(clint[(clint['Clint/Chimp (CAT)'] != 0) & (clint.gene_name.isin(coding_genes))])
num_pantro_coding = len(pantro[(pantro['panTro2.14 (CAT)'] != 0) & (pantro.gene_name.isin(coding_genes))])
num_ensembl_coding = len(ensembl[(ensembl['panTro2.14 (Ensembl V87)'] != 0) & (ensembl.gene_name.isin(coding_genes))])

# 'wb' rather than 'w': PDF output is binary, and a text-mode handle rejects bytes on
# Python 3.
# multipage_close is not defined in this notebook (it arrives via the star imports);
# presumably it saves the current figure as a PDF page and closes it — confirm.
with open('chimp_coding_expression.pdf', 'wb') as outf, PdfPages(outf) as pdf:
    # Page 1: human TPM vs TPM under each chimp annotation, coding genes only.
    g = sns.lmplot(data=coding_df, x='Human (GENCODE V24) TPM', y='TPM', col='assembly/annotation',
                  hue='assembly/annotation', ci=None, fit_reg=False,
                  scatter_kws={'marker': '+', 'alpha': 0.5, 's': 3, 'rasterized': True},
                  col_order=cols,
                  palette=palette)
    identity = np.linspace(0, 250)
    for col, ax in zip(cols, g.axes[0]):
        # The correlation must be computed on the same coding-only rows that the panel
        # shows (previously it was taken from the unfiltered `df`, so the title reported
        # the all-gene correlation).
        paired = coding_df[coding_df['assembly/annotation'] == col]
        paired = paired[(paired['Human (GENCODE V24) TPM'].notnull()) & (paired['TPM'].notnull())]
        c, p = pearsonr(paired['Human (GENCODE V24) TPM'], paired['TPM'])
        ax.set_title(ax.get_title() + '\nPearson r={:.2f} p={:.2f}'.format(c, p))
        ax.set_xlim(0, 250)
        ax.set_ylim(0, 250)
        # y = x reference line; the format string carries only the dash style so it does
        # not conflict with the explicit color keyword.
        ax.plot(identity, identity, '--', color='black', alpha=0.7, linewidth=1)
    multipage_close(pdf, True)
    # Page 2: bar chart of the non-zero coding gene counts.
    fig, ax = plt.subplots(figsize=(3,4.5))
    g = sns.barplot(x=['Human (GENCODE V24)'] + cols,
               y=[num_human_coding, num_clint_coding, num_pantro_coding, num_ensembl_coding], ax=ax)
    sns.despine()
    plt.xticks(rotation=90)
    plt.title('Number of coding genes with non-zero expression')
    multipage_close(pdf, True)

In [21]:
# now we perform the cross-species comparison
# Three quantification tables from ../human (runs SRR306854-SRR306856).
human1 = pd.read_csv('../human/SRR306854_human.genes.results', header=0, sep='\t')
human2 = pd.read_csv('../human/SRR306855_human.genes.results', header=0, sep='\t')
human3 = pd.read_csv('../human/SRR306856_human.genes.results', header=0, sep='\t')

# Average the three runs into one TPM per gene_id. The inner merges keep gene_ids
# present in all three tables; a row-wise mean over the three TPM columns replaces the
# original groupby(..., axis=1) over duplicated column labels, which newer pandas
# deprecates. The result has the same columns (gene_id, TPM) and values.
human_human_runs = pd.merge(human1[['gene_id', 'TPM']], human2[['gene_id', 'TPM']], on='gene_id')
human_human_runs = pd.merge(human_human_runs, human3[['gene_id', 'TPM']], on='gene_id')
human_human_runs = human_human_runs.set_index('gene_id')
human_human_expression = human_human_runs.mean(axis=1).to_frame('TPM').reset_index()

# Attach gene names. The gene_id -> name pairs are de-duplicated before the merge so
# each gene_id contributes its TPM exactly once.
human_human = pd.merge(human_human_expression, ref_map[['gene_id', 'gene_name']].drop_duplicates(),
                 on='gene_id')[['gene_name', 'TPM']]
# Sum gene_ids that share a name into one row per gene_name, as is done for the
# `human`, `clint`, `pantro` and `ensembl` tables; without this, duplicate names would
# multiply rows when this table is merged with those on gene_name.
human_human = human_human.groupby('gene_name').sum().reset_index()
human_human.columns = ['gene_name', 'Human (GENCODE V24) TPM']

In [22]:
# Join the human RNA-seq table with the three chimp annotation tables. The joins are
# outer joins, so every gene name seen in any table is kept and a table that lacks it
# contributes NaN.
combined_human = (
    human_human
    .merge(pantro, how='outer', on='gene_name')
    .merge(ensembl, how='outer', on='gene_name')
    .merge(clint, how='outer', on='gene_name')
)
# Long format for plotting: one row per (gene, chimp annotation), keeping the human TPM
# alongside as the x-axis value.
human_df = pd.melt(combined_human, id_vars=['gene_name', 'Human (GENCODE V24) TPM'],
                   var_name='assembly/annotation', value_name='TPM')

In [23]:
# Gene names annotated as protein_coding in the human reference.
coding_genes = set(ref_df[ref_df.GeneBiotype == 'protein_coding'].GeneName)
# Not used by the plots in this cell; kept so the name stays defined for other cells.
coding_df = df[df.gene_name.isin(coding_genes)]

# 'wb' rather than 'w': PDF output is binary, and a text-mode handle rejects bytes on
# Python 3.
# multipage_close is not defined in this notebook (it arrives via the star imports);
# presumably it saves the current figure as a PDF page and closes it — confirm.
with open('chimp_expression_cross.pdf', 'wb') as outf, PdfPages(outf) as pdf:
    cols = ['Clint/Chimp (CAT)', 'panTro2.14 (CAT)', 'panTro2.14 (Ensembl V87)']
    human_coding_df = human_df[human_df.gene_name.isin(coding_genes)]
    # Page 1: TPM from the ../human runs vs TPM under each chimp annotation, coding
    # genes only, one panel per annotation.
    g = sns.lmplot(data=human_coding_df, x='Human (GENCODE V24) TPM', y='TPM', col='assembly/annotation',
                  hue='assembly/annotation', ci=None, fit_reg=False, col_order=cols,
                  scatter_kws={'marker': '+', 'alpha': 0.5, 's': 3, 'rasterized': True},
                  palette=palette)
    identity = np.linspace(0, 250)
    for col, ax in zip(cols, g.axes[0]):
        # Pearson correlation over the plotted genes that have a value on both axes.
        paired = human_coding_df[human_coding_df['assembly/annotation'] == col]
        paired = paired[(paired['Human (GENCODE V24) TPM'].notnull()) & (paired['TPM'].notnull())]
        c, p = pearsonr(paired['Human (GENCODE V24) TPM'], paired['TPM'])
        ax.set_title(ax.get_title() + '\nPearson r={:.2f} p={:.2f}'.format(c, p))
        ax.set_xlim(0, 250)
        ax.set_ylim(0, 250)
        # y = x reference line; the format string carries only the dash style so it does
        # not conflict with the explicit color keyword.
        ax.plot(identity, identity, '--', color='black', alpha=0.7, linewidth=1)
    multipage_close(pdf, False)
    # NOTE(review): the human count below is taken from `human` (the SRR306823/4
    # table), not from `human_human`, which is what the scatter panels above plot —
    # confirm which table this bar is meant to summarise.
    num_human_coding = len(human[(human['Human (GENCODE V24) TPM'] != 0) & (human.gene_name.isin(coding_genes))])
    num_clint_coding = len(clint[(clint['Clint/Chimp (CAT)'] != 0) & (clint.gene_name.isin(coding_genes))])
    num_pantro_coding = len(pantro[(pantro['panTro2.14 (CAT)'] != 0) & (pantro.gene_name.isin(coding_genes))])
    num_ensembl_coding = len(ensembl[(ensembl['panTro2.14 (Ensembl V87)'] != 0) & (ensembl.gene_name.isin(coding_genes))])

    # Page 2: bar chart of the non-zero coding gene counts.
    fig, ax = plt.subplots(figsize=(3,4.5))
    g = sns.barplot(x=['Human (GENCODE V24)'] + cols,
               y=[num_human_coding, num_clint_coding, num_pantro_coding, num_ensembl_coding], ax=ax)
    sns.despine()
    plt.xticks(rotation=90)
    plt.title('Number of coding genes with non-zero expression')
    multipage_close(pdf, False)