In [1]:
# load dfs
import pandas as pd
from cat.plots import *
import seaborn as sns
from scipy.stats import *
from tools.sqlInterface import *
# Per-species consensus gene-set tables from the indel-corrected / BioNano-cut run,
# keyed by species name.
base_dir = '/hive/groups/recon/projs/primates/primates_indel_corrected_bionano_cut/consensus_gene_set/'
dfs = {}
for species, prefix in (('chimp', 'Clint_Chimp'),
                        ('orangutan', 'Susie_Orangutan'),
                        ('gorilla', 'Susie_Gorilla')):
    gp_info_path = os.path.join(base_dir, prefix + '.fixed.filtered.gp_info')
    dfs[species] = pd.read_csv(gp_info_path, sep='\t')

# The same kind of table from the original-assembly run, keyed by assembly name.
base_dir = '/hive/groups/recon/projs/primates/original_primates/redo_annotation_indel/consensus_gene_set/'
for g, og in (('panTro4', 'Chimp'), ('ponAbe2', 'Orangutan'), ('gorGor4', 'Gorilla')):
    dfs[g] = pd.read_csv(os.path.join(base_dir, '{}.filtered.gp_info'.format(og)), sep='\t')

# The human annotation comes from the CAT database rather than a gp_info file; its columns
# are renamed to the names the later merge cells select on.
human_annotation = load_annotation('/hive/groups/recon/projs/primates/primates_indel_corrected_bionano_cut/databases/Human.db')
human_annotation.columns = ['transcript_id', 'gene_id', 'transcript_name', 'gene_name', 'gene_biotype', 'transcript_biotype']
dfs['human'] = human_annotation

In [2]:
# load expression
# One kallisto abundance table per annotation set, plus the three runs whose key ends in
# '_human' (merged against the human annotation in the next cell).
quant_names = list(dfs) + ['gorilla_human', 'orangutan_human', 'chimp_human']

exp_dfs = {}
for quant_name in quant_names:
    abundance = pd.read_csv('{}_kallisto_output/abundance.tsv'.format(quant_name), header=0, sep='\t')
    # Keep only the part of target_id before the first '(' so it can be joined against
    # transcript_id in the annotation tables.
    abundance['target_id'] = abundance.target_id.map(lambda target: target.split('(')[0])
    exp_dfs[quant_name] = abundance

In [24]:
# merge dfs
# Collapse transcript-level kallisto TPMs to one TPM per protein-coding gene.
# Every result is keyed on the *human* gene ID so the species can be compared directly:
#   - 'human' and the '*_human' runs use the human annotation, whose gene column is 'gene_id';
#   - every other run uses its own annotation table and the 'source_gene' column
#     (presumably the human gene the transcript was projected from -- TODO confirm).
human_mapped = ('gorilla_human', 'orangutan_human', 'chimp_human')

merged_dfs = {}
for g, exp_df in exp_dfs.items():
    if g == 'human' or g in human_mapped:
        annotation, gene_col = dfs['human'], 'gene_id'
    else:
        annotation, gene_col = dfs[g], 'source_gene'

    # drop_duplicates keeps a transcript from being counted once per repeated annotation row
    tx_to_gene = annotation[['transcript_id', gene_col, 'gene_biotype']].drop_duplicates()
    m = exp_df.merge(tx_to_gene, left_on='target_id', right_on='transcript_id')
    m = m[m.gene_biotype == 'protein_coding']
    m = m[[gene_col, 'tpm']]
    m.columns = ['gene_id', 'tpm']

    # .sum() instead of aggregate(sum): passing the Python builtin is deprecated in recent pandas
    m = m.groupby('gene_id').sum()
    merged_dfs[g] = m.reset_index()

    # Total protein-coding TPM per run, as a sanity check.
    # The single-argument parenthesised form prints the same text under Python 2 and 3.
    print('%s %s' % (g, m.tpm.sum()))

gorGor4 715365.918296
ponAbe2 912589.675316
orangutan 930358.716555
panTro4 546909.102316
chimp 508775.855775
gorilla 607275.57229
orangutan_human 905687.0035
human 570790.558337
gorilla_human 584843.975313
chimp_human 517688.186924


In [25]:
# fix names
# Replace the generic 'tpm' column of each gene-level table with the label that is used
# as the facet title / axis name in the plots.
tpm_labels = {
    'human': 'Human (GENCODE V27)',
    'chimp': 'Clint/Chimpanzee (CAT)',
    'gorilla': 'Susie/Gorilla (CAT)',
    'orangutan': 'Susie/Orangutan (CAT)',
    'panTro4': 'panTro4 (CAT)',
    'ponAbe2': 'ponAbe2 (CAT)',
    'gorGor4': 'gorGor4 (CAT)',
    # The three '*_human' runs deliberately share one label; each ends up in a different
    # combined table, so the names never collide.
    'chimp_human': 'GRCh38 (GENCODE V27)',
    'orangutan_human': 'GRCh38 (GENCODE V27)',
    'gorilla_human': 'GRCh38 (GENCODE V27)',
}
for run_name, label in tpm_labels.items():
    merged_dfs[run_name].columns = ['gene_id', label]

In [26]:
# now we merge and group
# For each species, outer-join the human gene-level table with that species' three other
# tables on gene_id. The outer join keeps genes that are missing from some of the tables.
comparison_runs = {
    'chimp': ['chimp', 'panTro4', 'chimp_human'],
    'gorilla': ['gorilla', 'gorGor4', 'gorilla_human'],
    'orangutan': ['orangutan', 'ponAbe2', 'orangutan_human'],
}

combined_dfs = {}
for species, run_names in comparison_runs.items():
    wide = merged_dfs['human']
    for run_name in run_names:
        wide = wide.merge(merged_dfs[run_name], how='outer', on='gene_id')
    combined_dfs[species] = wide

# Reshape to long format: one row per (gene, assembly/annotation) with the human TPM kept
# alongside, which is the layout the faceted scatter plot expects.
melted_dfs = {}
for species, wide in combined_dfs.items():
    long_df = pd.melt(wide, id_vars=['gene_id', 'Human (GENCODE V27)'],
                      var_name='assembly/annotation', value_name='TPM')
    melted_dfs[species] = long_df.drop_duplicates()

In [27]:
def plot_fn(pdf, df, cols, axis_max=250):
    """Scatter each assembly/annotation's TPM against the human TPM and write the figure to a PDF.

    One facet is drawn per entry of ``cols``. Each facet gets a dashed y = x reference line and,
    when at least two genes have both values, the Pearson correlation appended to its title.

    :param pdf: path of the PDF file to write.
    :param df: long-format frame with the columns 'Human (GENCODE V27)', 'assembly/annotation'
        and 'TPM'.
    :param cols: 'assembly/annotation' values to plot, in left-to-right facet order.
    :param axis_max: upper limit of both axes and of the y = x line. Genes above it fall outside
        the view but still contribute to the correlation.
    """
    # Use the current seaborn palette minus its first colour.
    palette = sns.color_palette()[1:]
    # 'wb': a PDF is binary data. The PdfPages handle gets its own name so the path argument
    # is not shadowed.
    with open(pdf, 'wb') as outf, PdfPages(outf) as pages:
        grid = sns.lmplot(data=df, x='Human (GENCODE V27)', y='TPM', col='assembly/annotation',
                          hue='assembly/annotation', fit_reg=False,
                          scatter_kws={'marker': '+', 'alpha': 0.5, 's': 3, 'rasterized': True},
                          col_order=cols,
                          palette=palette)
        diagonal = np.linspace(0, axis_max)
        # grid.axes[0] is the single row of facets, in the same order as cols
        for col, ax in zip(cols, grid.axes[0]):
            facet = df[df['assembly/annotation'] == col]
            # The outer merges leave NaN where a gene is absent from one table; pearsonr needs
            # complete pairs.
            paired = facet[facet['Human (GENCODE V27)'].notnull() & facet['TPM'].notnull()]
            if len(paired) >= 2:
                c, p = pearsonr(paired['Human (GENCODE V27)'], paired['TPM'])
                ax.set_title(ax.get_title() + '\nPearson r={:.2f} p={:.2f}'.format(c, p))
            ax.set_xlim(0, axis_max)
            ax.set_ylim(0, axis_max)
            # Only the line style goes in the format string; the colour is given once, by
            # keyword (the original 'r--' asked for red and was overridden by color='black').
            ax.plot(diagonal, diagonal, '--', color='black', alpha=0.7, linewidth=1)
        # multipage_close comes from cat.plots; presumably it saves the current figure into
        # the PDF and closes it, with False disabling tight_layout -- TODO confirm.
        multipage_close(pages, False)
        

# Facet order per species: human-reference run first, then the two CAT annotation sets.
facet_order = {
    'chimp': ['GRCh38 (GENCODE V27)', 'Clint/Chimpanzee (CAT)', 'panTro4 (CAT)'],
    'gorilla': ['GRCh38 (GENCODE V27)', 'Susie/Gorilla (CAT)', 'gorGor4 (CAT)'],
}
# Any key other than chimp / gorilla is plotted with the orangutan facets.
orangutan_facets = ['GRCh38 (GENCODE V27)', 'Susie/Orangutan (CAT)', 'ponAbe2 (CAT)']

for g, df in melted_dfs.items():
    plot_fn(g + '.kallisto.genes.gene_id.pdf', df, facet_order.get(g, orangutan_facets))

In [28]:
# Number of genes with a summed TPM above 0.1 in the human table and the three chimp-related tables.
human_tpm = merged_dfs['human']['Human (GENCODE V27)']
cat_tpm = merged_dfs['chimp']['Clint/Chimpanzee (CAT)']
old_cat_tpm = merged_dfs['panTro4']['panTro4 (CAT)']
human_chimp_tpm = merged_dfs['chimp_human']['GRCh38 (GENCODE V27)']

# Summing the boolean mask counts the rows that pass the threshold.
num_human = int((human_tpm > 0.1).sum())
num_cat = int((cat_tpm > 0.1).sum())
num_old_cat = int((old_cat_tpm > 0.1).sum())
num_human_chimp = int((human_chimp_tpm > 0.1).sum())

In [29]:
# Order: Clint/Chimpanzee (CAT), panTro4 (CAT), Human, GRCh38 (chimp_human run)
print(' '.join(str(count) for count in (num_cat, num_old_cat, num_human, num_human_chimp)))

15278 15242 15787 16107


In [4]:
# NOTE(review): leftover debugging magic, not part of the analysis. It prints the running
# kernel's connection details (ports and signing key) and was executed out of order
# (In [4] after In [29]). Consider deleting this cell and clearing its output before sharing.
%connect_info

{
  "stdin_port": 34057, 
  "ip": "127.0.0.1", 
  "control_port": 45764, 
  "hb_port": 52767, 
  "signature_scheme": "hmac-sha256", 
  "key": "<redacted>", 
  "kernel_name": "", 
  "shell_port": 47207, 
  "transport": "tcp", 
  "iopub_port": 60142
}

Paste the above JSON into a file, and connect with:
    $> ipython <app> --existing <file>
or, if you are local, you can connect with just:
    $> ipython <app> --existing /cluster/home/ifiddes/.local/share/jupyter/runtime/kernel-c14c9830-ff28-4fae-9981-ccc8c1c80a82.json 
or even just:
    $> ipython <app> --existing 
if this is the most recent IPython session you have started.
