In [1]:
import os
import collections
from tools.fileOps import *
from tools.procOps import *

In [2]:
# constructed the transcript files
m = {}
# first, the CAT annotations
for g in ['Clint_Chimp', 'Susie_Gorilla', 'Susie_Orangutan']:
    gp = '../../consensus_gene_set/{}.filtered.gp'.format(g)
    fa = '../../genome_files/{}.fa'.format(g)
    with TemporaryFilePath() as tmp:
        !genePredToBed {gp} {tmp}
        !bedtools getfasta -fi {fa} -bed {tmp} -fo {g + '.CAT.transcripts.fa'} -name -split -s
    m[(g.replace('_', '/'), 'CAT')] = g + '.CAT.transcripts.fa'

In [3]:
# CAT annotations for original primates, fix names
og_dir = '/hive/groups/recon/projs/primates/original_primates/redo_annotation_indel/'
for g, og in zip(*[['Chimp', 'Gorilla', 'Orangutan'], ['panTro4', 'gorGor4', 'ponAbe2']]):
    gp = os.path.join(og_dir, 'consensus_gene_set', g + '.filtered.gp')
    fa = os.path.join(og_dir, 'genome_files', g + '.fa')
    with TemporaryFilePath() as tmp:
        !genePredToBed {gp} {tmp}
        !bedtools getfasta -fi {fa} -bed {tmp} -fo {og + '.CAT.transcripts.fa'} -name -split -s
    m[(og, 'CAT')] = og + '.CAT.transcripts.fa'

In [4]:
# human, which CAT already produces
# Copy the GENCODE V27 transcript FASTA from the shared reference directory into the
# working directory, then register the local copy in `m` under ('Human', 'GENCODE V27').
!cp ../../reference/gencode.v27.annotation.no_PAR.fa ./
m[('Human', 'GENCODE V27')] = 'gencode.v27.annotation.no_PAR.fa'

In [38]:
# Ensembl V91 for chimp and gorilla
# Step 1: convert each Ensembl GTF to BED by piping gtfToGenePred into genePredToBed.
# Step 2: extract the transcript sequences from the matching genome FASTA with
# bedtools getfasta, writing <assembly>.Ensembl.transcripts.fa.
# NOTE(review): the chimp GTF is named for the Pan_tro_3.0 build while its output is
# labelled panTro4 — confirm the label matches the assembly that was annotated.
!gtfToGenePred -genePredExt Pan_troglodytes.Pan_tro_3.0.91.gtf /dev/stdout | genePredToBed /dev/stdin Pan_troglodytes.Pan_tro_3.0.91.bed
!gtfToGenePred -genePredExt Gorilla_gorilla.gorGor4.91.gtf /dev/stdout | genePredToBed /dev/stdin Gorilla_gorilla.gorGor4.91.bed
!bedtools getfasta -fi chimp_gorilla_fasta/gorilla_gorilla_softmasked_toplevel.fa.fixed -bed Gorilla_gorilla.gorGor4.91.bed -name -split -s -fo gorGor4.Ensembl.transcripts.fa 
!bedtools getfasta -fi chimp_gorilla_fasta/pan_troglodytes_softmasked_toplevel.fa.fixed -bed Pan_troglodytes.Pan_tro_3.0.91.bed -name -split -s -fo panTro4.Ensembl.transcripts.fa

index file chimp_gorilla_fasta/gorilla_gorilla_softmasked_toplevel.fa.fixed.fai not found, generating...
Feature (CABD030151935.1:1777-2008) beyond the length of CABD030151935.1 size (2000 bp).  Skipping.
Feature (CABD030151492.1:1938-2009) beyond the length of CABD030151492.1 size (2000 bp).  Skipping.
Feature (CABD030130636.1:1218-1325) beyond the length of CABD030130636.1 size (1322 bp).  Skipping.
Feature (CABD030130064.1:746-923) beyond the length of CABD030130064.1 size (913 bp).  Skipping.
Feature (CABD030151826.1:800-900) beyond the length of CABD030151826.1 size (894 bp).  Skipping.
Feature (CABD030151588.1:765-872) beyond the length of CABD030151588.1 size (870 bp).  Skipping.
Feature (CABD030161071.1:739-831) beyond the length of CABD030161071.1 size (818 bp).  Skipping.
index file chimp_gorilla_fasta/pan_troglodytes_softmasked_toplevel.fa.fixed.fai not found, generating...
Feature (KV421959.1:10660-10764) beyond the length of KV421959.1 size (10756 bp).  Skipping.
Feature (

In [5]:
# Register the Ensembl V91 transcript FASTAs (gorilla first, then chimp) in `m`.
for _assembly in ('gorGor4', 'panTro4'):
    m[(_assembly, 'Ensembl V91')] = _assembly + '.Ensembl.transcripts.fa'

In [None]:
# finally, we need to load the ensembl original annotations
# for orangutan, things are easier
with TemporaryFilePath() as tmp:
    g = 'Orangutan'
    og = 'ponAbe2'
    fa = os.path.join(og_dir, 'genome_files', g + '.fa')
    !gtfToGenePred -genePredExt Pongo_abelii.PPYG2.88.gtf /dev/stdout | genePredToBed /dev/stdin {tmp}
    !bedtools getfasta -fi {fa} -bed {tmp} -fo {og + '.ensembl.transcripts.fa'} -name -split -s
    

In [6]:
# Register the orangutan Ensembl transcript FASTA. The file name is spelled out
# rather than built from `og`, so this cell does not depend on a variable that
# only exists as a leftover of an earlier cell's loop.
m[('ponAbe2', 'Ensembl V90')] = 'ponAbe2.ensembl.transcripts.fa'

In [7]:
# now, we can construct our kallisto indices

!mkdir indices -p

import os
index_map = {}
with open('cmds.txt', 'w') as outf:
    for (g, a), x in m.iteritems():
        o = '_'.join([g.replace('/', '_'), a.replace(' ', '_')])
        cmd = 'kallisto index -i indices/{}.kallisto {}\n'.format(o, x)
        outf.write(cmd)

In [None]:
%%bash

# Submit the index-building commands in cmds.txt through `para make` on host `ku`.
# ${PWD} sits inside double quotes, so it is expanded locally before ssh runs; the
# remote shell therefore changes into this notebook's working directory first.
cmd="cd ${PWD} && para make -cpu=1 -ram=32g cmds.txt"
ssh ku $cmd

In [9]:
# Map each assembly name to the species prefix of its paired FASTQ files.
fq_map = {
    'panTro4': 'chimp',
    'ponAbe2': 'orang',
    'Human': 'human',
    'gorGor3': 'gorilla',
    'gorGor4': 'gorilla',
    'Susie/Gorilla': 'gorilla',
    'Clint/Chimp': 'chimp',
    'Susie/Orangutan': 'orang',
}

# Write one `kallisto quant` command per (assembly, annotation) pair in `m`.
with open('kallisto_cmds.txt', 'w') as outf:
    for assembly, annotation in m:
        fastq_prefix = 'fastqs/' + fq_map[assembly]
        label = assembly.replace('/', '_') + '_' + annotation.replace(' ', '_')
        outf.write('kallisto quant -t 8 -o {} -i {} {} {}\n'.format(
            label,                              # output directory
            'indices/' + label + '.kallisto',   # index built from the matching FASTA
            fastq_prefix + '.fwd.fq',
            fastq_prefix + '.rev.fq'))

In [10]:
%%bash

# Submit the quantification commands in kallisto_cmds.txt through `para make` on
# host `ku`. Every job in that file runs `kallisto quant -t 8`, so 8 CPUs are
# reserved per job; reserving a single CPU under-declares what each job uses.
# ${PWD} is expanded locally (double quotes), so the remote shell changes into
# this notebook's working directory before launching the batch.
cmd="cd ${PWD} && para make -cpu=8 -ram=32g kallisto_cmds.txt"
ssh ku $cmd

Checking input files
10 jobs written to /hive/groups/recon/projs/primates/primates_indel_corrected_bionano_cut/primate_paper/kallisto_expression/batch
10 jobs in batch
0 jobs (including everybody's) in Parasol queue or running.
Checking finished jobs
updated job database on disk
Pushed Jobs: 10
Checking job status 0 minutes after launch
10 jobs in batch
10 jobs (including everybody's) in Parasol queue or running.
Checking finished jobs
updated job database on disk
Checking job status 1 minutes after launch
10 jobs in batch
10 jobs (including everybody's) in Parasol queue or running.
Checking finished jobs
updated job database on disk
Checking job status 2 minutes after launch
10 jobs in batch
10 jobs (including everybody's) in Parasol queue or running.
Checking finished jobs
updated job database on disk
Checking job status 3 minutes after launch
10 jobs in batch
9 jobs (including everybody's) in Parasol queue or running.
Checking finished jobs
updated job database on disk
Checking job 

In [11]:
# Load every kallisto abundance table, keyed like `m` by (assembly, annotation).
import pandas as pd
dfs = {}
for key in m:
    assembly, annotation = key
    run_dir = assembly.replace('/', '_') + '_' + annotation.replace(' ', '_')
    abundance = pd.read_csv(os.path.join(run_dir, 'abundance.tsv'), sep='\t')
    # Keep only the part of each target ID before the first '(' — presumably the
    # strand suffix added when the FASTA was extracted; verify against the FASTA headers.
    abundance['target_id'] = abundance['target_id'].map(lambda name: name.split('(')[0])
    dfs[key] = abundance

In [13]:
# start combining

from tools.bio import *
from tools.transcripts import *
from tools.sqlInterface import *
from tools.gff3 import *
from tools.misc import *
from collections import *

# Name-map sources for every (assembly, annotation) pair: a GTF for the Ensembl and
# GENCODE sets, a gp_info table for the CAT sets.
_OG_GENE_SET_DIR = '/hive/groups/recon/projs/primates/original_primates/redo_annotation_indel/consensus_gene_set/'
_NEW_ROOT = '/hive/groups/recon/projs/primates/primates_indel_corrected_bionano_cut/'

gtfs = {}
gtfs[('panTro4', 'Ensembl V91')] = 'Pan_troglodytes.Pan_tro_3.0.91.gtf'
gtfs[('gorGor4', 'Ensembl V91')] = 'Gorilla_gorilla.gorGor4.91.gtf'
gtfs[('ponAbe2', 'Ensembl V90')] = 'Pongo_abelii.PPYG2.88.gtf'
# CAT on the original assemblies: files are named by species
for _assembly, _species in [('panTro4', 'Chimp'), ('gorGor4', 'Gorilla'), ('ponAbe2', 'Orangutan')]:
    gtfs[(_assembly, 'CAT')] = _OG_GENE_SET_DIR + _species + '.filtered.gp_info'
# CAT on the new assemblies: keys use '/', file names use '_'
for _stem in ['Clint_Chimp', 'Susie_Gorilla', 'Susie_Orangutan']:
    gtfs[(_stem.replace('_', '/'), 'CAT')] = (
        _NEW_ROOT + 'consensus_gene_set/' + _stem + '.fixed.filtered.gp_info')
gtfs[('Human', 'GENCODE V27')] = _NEW_ROOT + 'reference/gencode.v27.annotation.no_PAR.gtf'

def construct_ensembl_map(gtf):
    """Build a gene_id / transcript_id table from a GTF file.

    Lines starting with '#' are ignored. For every other line the last
    tab-separated field is handed to ``parse_gtf_attr_line``; records that lack
    either a ``gene_id`` or a ``transcript_id`` attribute are skipped.

    :param gtf: path to the GTF file.
    :return: DataFrame with columns ``gene_id`` and ``transcript_id``, one row per
        retained GTF record in file order (duplicates are not removed).
    """
    ensembl_map = []
    # Stream the file inside a context manager so the handle is closed when done
    # and the whole GTF never has to be held in memory at once.
    with open(gtf) as gtf_handle:
        for line in gtf_handle:
            if line.startswith('#'):
                continue
            attrs = parse_gtf_attr_line(line.split('\t')[-1])
            try:
                ensembl_map.append([attrs['gene_id'], attrs['transcript_id']])
            except KeyError:
                # record without a transcript (or gene) ID: nothing to map
                continue
    return pd.DataFrame(ensembl_map, columns=['gene_id', 'transcript_id'])


# Build a transcript_id <-> gene_id table for every (assembly, annotation) pair.
name_dfs = defaultdict(dict)
for key, source in gtfs.iteritems():
    assembly, annotation = key
    if assembly == 'Human':
        # human IDs come from the annotation database instead of the GTF listed in `gtfs`
        human_ids = load_annotation('/hive/groups/recon/projs/primates/susie_indel_corrected/databases/Human.db')
        human_ids = human_ids[['TranscriptId', 'GeneId']].drop_duplicates()
        human_ids.columns = ['transcript_id', 'gene_id']
        name_dfs[key] = human_ids
    elif 'Ensembl' in annotation:
        name_dfs[key] = construct_ensembl_map(source)
    else:
        # CAT sets: `source` is a tab-separated gp_info table with a header row
        cat_ids = pd.read_csv(source, sep='\t', header=0)
        cat_ids = cat_ids[['gene_id', 'transcript_id']].drop_duplicates()
        cat_ids.columns = ['gene_id', 'transcript_id']
        name_dfs[key] = cat_ids
            


In [14]:
# Attach gene IDs to each abundance table by joining on the transcript ID, then
# drop the kallisto columns that are not used downstream.
combined_dfs = {}
for key, id_table in name_dfs.iteritems():
    annotated = dfs[key].merge(id_table, left_on='target_id', right_on='transcript_id')
    unused_columns = ['target_id', 'length', 'eff_length', 'est_counts']
    combined_dfs[key] = annotated.drop(unused_columns, axis=1).drop_duplicates()

In [15]:
# Collapse the transcript-level tables to one row per gene by summing the
# remaining columns within each gene_id.
combined_by_gene = {}
for key, tx_table in combined_dfs.iteritems():
    per_gene = tx_table.drop(['transcript_id'], axis=1).groupby('gene_id')
    combined_by_gene[key] = per_gene.aggregate(sum)

In [16]:
# Flatten the per-gene tables into one row per assembly/annotation with the columns
# Assembly/Annotation, not_expressed, expressed. A gene counts as expressed when
# its tpm value is above 0.1.
import numpy as np
order = [
    'Human (GENCODE V27)',
    'panTro4 (Ensembl V91)', 'panTro4 (CAT)', 'Clint/Chimp (CAT)',
    'gorGor4 (Ensembl V91)', 'gorGor4 (CAT)', 'Susie/Gorilla (CAT)',
    'ponAbe2 (Ensembl V90)', 'ponAbe2 (CAT)', 'Susie/Orangutan (CAT)',
]

r = []
for key, gene_table in combined_by_gene.iteritems():
    n_expressed = len(gene_table[gene_table.tpm > 0.1])
    r.append(['{} ({})'.format(*key), len(gene_table) - n_expressed, n_expressed])
gene_df = pd.DataFrame(r, columns=['Assembly/Annotation', 'not_expressed', 'expressed'])
# an ordered categorical makes sort_values follow the display order in `order`
gene_df['Assembly/Annotation'] = pd.Categorical(gene_df['Assembly/Annotation'], order, ordered=True)
gene_df = gene_df.sort_values('Assembly/Annotation')

gene_data = np.array(gene_df[['expressed', 'not_expressed']])

In [17]:
# Same summary at transcript level: one row per assembly/annotation with the columns
# Assembly/Annotation, not_expressed, expressed (expressed means tpm above 0.1).
import numpy as np

r = []
for key, tx_table in combined_dfs.iteritems():
    n_expressed = len(tx_table[tx_table.tpm > 0.1])
    r.append(['{} ({})'.format(*key), len(tx_table) - n_expressed, n_expressed])
tx_df = pd.DataFrame(r, columns=['Assembly/Annotation', 'not_expressed', 'expressed'])
# reuse the display order defined for the gene-level table
tx_df['Assembly/Annotation'] = pd.Categorical(tx_df['Assembly/Annotation'], order, ordered=True)
tx_df = tx_df.sort_values('Assembly/Annotation')

tx_data = np.array(tx_df[['expressed', 'not_expressed']])

In [18]:
# One colour per row of the summary tables, in row order.
palette = ['#CDBA74',
           '#89bfdd', '#549ece', '#2b7bbb',
           '#fd8262', '#f54f39', '#d62221',
           '#8fd18c', '#57b668', '#2c954c']

from cat.plots import *
# plot it
from matplotlib.ticker import MultipleLocator, FormatStrFormatter

majorLocator = MultipleLocator(10000)
minorLocator = MultipleLocator(2500)


def mkfunc(x, pos):
    """Format a tick value in whole thousands, e.g. 25000 -> '25k'."""
    return '{}k'.format(int(float(x) / 1000))


mkformatter = matplotlib.ticker.FuncFormatter(mkfunc)

with open('expressed_ipsc_kallisto.pdf', 'w') as outf, PdfPages(outf) as pdf:
    bar_width = 0.6
    fig, (ax1, ax2) = plt.subplots(figsize=(8, 3), ncols=2)

    # x ticks: gene panel every 10k (minor 2.5k), transcript panel every 25k (minor 5k)
    for ax, major, minor in ((ax1, majorLocator, minorLocator),
                             (ax2, MultipleLocator(25000), MultipleLocator(5000))):
        ax.xaxis.set_major_locator(major)
        ax.xaxis.set_major_formatter(mkformatter)
        ax.xaxis.set_minor_locator(minor)

    # stacked horizontal bars: solid segment = expressed, hatched segment = not expressed
    for ax, counts in ((ax1, gene_data), (ax2, tx_data)):
        for row, (n_exp, n_not_exp) in enumerate(counts):
            colour = palette[row]
            ax.barh(row, n_exp, left=0, height=bar_width, alpha=0.9, linewidth=0, color=colour)
            ax.barh(row, n_not_exp, left=n_exp, height=bar_width, hatch='//', alpha=0.45,
                    ecolor='black', linewidth=0, color=colour)

    sns.despine()
    tick_positions = np.arange(len(tx_df)) + (bar_width / 2)
    # row labels are drawn on the left panel only; both tables share the same row order
    ax1.set_yticks(tick_positions)
    ax1.set_yticklabels(list(tx_df['Assembly/Annotation']))
    ax2.set_yticks(tick_positions)
    ax2.set_yticklabels('')
    ax1.set_xlabel('Number of genes')
    ax2.set_xlabel('Number of transcripts')
    fig.suptitle('Non-zero expression estimates of species-specific iPSC RNA-seq (Kallisto)')
    multipage_close(pdf)

In [19]:
# Show the gene-level expressed / not-expressed counts.
print(gene_df)

     Assembly/Annotation  not_expressed  expressed
8    Human (GENCODE V27)          29986      27090
1  panTro4 (Ensembl V91)          11595      20353
2          panTro4 (CAT)          27564      28039
3      Clint/Chimp (CAT)          27925      27969
9  gorGor4 (Ensembl V91)          10408      19188
0          gorGor4 (CAT)          27345      27819
7    Susie/Gorilla (CAT)          28418      27567
5  ponAbe2 (Ensembl V90)          13068      15338
4          ponAbe2 (CAT)          34557      19945
6  Susie/Orangutan (CAT)          35179      20043


In [20]:
# Collect the orangutan genes at or below the 0.1 tpm threshold in the Ensembl
# and CAT annotations of ponAbe2, for comparison.
orang_old = combined_by_gene[('ponAbe2', 'Ensembl V90')].reset_index()
orang_new = combined_by_gene[('ponAbe2', 'CAT')].reset_index()
orang_old_not_expressed = set(orang_old.loc[orang_old.tpm <= 0.1, 'gene_id'])
orang_new_not_expressed = set(orang_new.loc[orang_new.tpm <= 0.1, 'gene_id'])

In [21]:


def construct_ensembl_map_gene_name(gtf):
    """Build a gene / transcript table with gene names and biotypes from a GTF file.

    Lines starting with '#' are ignored. For every other line the last
    tab-separated field is handed to ``parse_gtf_attr_line``; records missing any
    of ``gene_id``, ``transcript_id``, ``gene_name`` or ``gene_biotype`` are skipped.

    :param gtf: path to the GTF file.
    :return: DataFrame with columns ``gene_id``, ``transcript_id``, ``gene_name``
        and ``gene_biotype``, one row per retained GTF record in file order
        (duplicates are not removed).
    """
    wanted = ['gene_id', 'transcript_id', 'gene_name', 'gene_biotype']
    ensembl_map = []
    # Stream the file inside a context manager so the handle is closed when done
    # and the whole GTF never has to be held in memory at once.
    with open(gtf) as gtf_handle:
        for line in gtf_handle:
            if line.startswith('#'):
                continue
            attrs = parse_gtf_attr_line(line.split('\t')[-1])
            try:
                ensembl_map.append([attrs[name] for name in wanted])
            except KeyError:
                # record lacks at least one of the wanted attributes
                continue
    return pd.DataFrame(ensembl_map, columns=wanted)

# Lookup tables used to describe the unexpressed orangutan genes:
#   ponabe     - gene/transcript IDs, names and biotypes parsed from the Ensembl GTF
#   human      - the human annotation loaded from the Human.db database
#   ponabe_new - the tab-separated CAT gp_info table for orangutan
# NOTE(review): this reads Orangutan.gp_info, whereas the name map built earlier
# reads Orangutan.filtered.gp_info — confirm the unfiltered table is intended.
ponabe = construct_ensembl_map_gene_name('Pongo_abelii.PPYG2.88.gtf')
human = load_annotation('/hive/groups/recon/projs/primates/susie_indel_corrected/databases/Human.db')
ponabe_new = pd.read_csv('/hive/groups/recon/projs/primates/original_primates/redo_annotation_indel/consensus_gene_set/Orangutan.gp_info', sep='\t')

In [22]:
# Annotation rows for the unexpressed orangutan genes: all columns for the Ensembl
# set, and the distinct (gene_id, gene_biotype) pairs for the CAT set.
old_unexpressed = ponabe.gene_id.isin(orang_old_not_expressed)
orang_m = ponabe[old_unexpressed].drop_duplicates()
new_unexpressed = ponabe_new.gene_id.isin(orang_new_not_expressed)
orang_new_m = ponabe_new.loc[new_unexpressed, ['gene_id', 'gene_biotype']].drop_duplicates()

In [23]:
# Show the transcript-level expressed / not-expressed counts.
print(tx_df)

     Assembly/Annotation  not_expressed  expressed
8    Human (GENCODE V27)          98010      95584
1  panTro4 (Ensembl V91)          23172      35191
2          panTro4 (CAT)          95857      93694
3      Clint/Chimp (CAT)          97870      94855
9  gorGor4 (Ensembl V91)          20924      32072
0          gorGor4 (CAT)          94744      93659
7    Susie/Gorilla (CAT)          98453      94281
5  ponAbe2 (Ensembl V90)          13472      15938
4          ponAbe2 (CAT)         129576      57416
6  Susie/Orangutan (CAT)         132546      58170
