In [3]:
import os

# Enter the per-species working directory only once. os.chdir() with a
# relative path is resolved against the *current* directory, so re-running
# this cell unguarded would try to enter gorilla/gorilla and fail.
if os.path.basename(os.getcwd()) != 'gorilla':
    os.chdir('gorilla')

In [4]:
%%bash
# Run RSEM (using the STAR aligner via --star) for every read set against
# every reference index found under $rsem_dir.
set -e  # stop the cell at the first failing command
rsem_dir=/hive/groups/recon/projs/primates/compare_clint_expression
for sample in SRR306808 SRR306809
do
    for ref in gorgor3_ensembl susie_gorilla_cat
    do
        # arguments: reads file, reference prefix, output name prefix
        rsem-calculate-expression -p 16 "${sample}.fastq" "${rsem_dir}/${ref}/${ref}" "${sample}_${ref}" --star
    done
done

STAR --genomeDir /hive/groups/recon/projs/primates/compare_clint_expression/gorgor3_ensembl  --outSAMunmapped Within  --outFilterType BySJout  --outSAMattributes NH HI AS NM MD  --outFilterMultimapNmax 20  --outFilterMismatchNmax 999  --outFilterMismatchNoverLmax 0.04  --alignIntronMin 20  --alignIntronMax 1000000  --alignMatesGapMax 1000000  --alignSJoverhangMin 8  --alignSJDBoverhangMin 1  --sjdbScore 1  --runThreadN 16  --genomeLoad NoSharedMemory  --outSAMtype BAM Unsorted  --quantMode TranscriptomeSAM  --outSAMheaderHD \@HD VN:1.4 SO:unsorted  --outFileNamePrefix SRR306808_gorgor3_ensembl.temp/SRR306808_gorgor3_ensembl  --readFilesIn SRR306808.fastq 
Jul 07 10:07:32 ..... started STAR run
Jul 07 10:07:32 ..... loading genome
Jul 07 10:08:17 ..... started mapping
Jul 07 10:16:15 ..... finished successfully

rsem-parse-alignments /hive/groups/recon/projs/primates/compare_clint_expression/gorgor3_ensembl/gorgor3_ensembl SRR306808_gorgor3_ensembl.temp/SRR306808_gorgor3_ensembl SRR3068

In [17]:
%%bash
# Dry run only: print (do not execute) the RSEM command line for each read
# set against the gorgor4_cat reference.
set -e
ref=gorgor4_cat
rsem_dir=/hive/groups/recon/projs/primates/compare_clint_expression
for p in SRR306808 SRR306809
do
    printf '%s\n' "rsem-calculate-expression -p 20 ${p}.fastq ${rsem_dir}/${ref}/${ref} ${p}_${ref} --star"
done

rsem-calculate-expression -p 20 SRR306808.fastq /hive/groups/recon/projs/primates/compare_clint_expression/gorgor4_cat/gorgor4_cat SRR306808_gorgor4_cat --star
rsem-calculate-expression -p 20 SRR306809.fastq /hive/groups/recon/projs/primates/compare_clint_expression/gorgor4_cat/gorgor4_cat SRR306809_gorgor4_cat --star


In [20]:
from tools.bio import *
from tools.transcripts import *
from tools.sqlInterface import *
from tools.gff3 import *
from tools.misc import *
from collections import *
from cat.plots import *

In [22]:
def _read_gene_results(path):
    """Read one tab-separated ``*.genes.results`` table (first row is the header)."""
    return pd.read_csv(path, sep='\t', header=0)


# replicate 1 = SRR306808, replicate 2 = SRR306809; one table per reference
susie_gorilla_expression_1 = _read_gene_results('SRR306808_susie_gorilla_cat.genes.results')
susie_gorilla_expression_2 = _read_gene_results('SRR306809_susie_gorilla_cat.genes.results')
ensembl_expression_1 = _read_gene_results('SRR306808_gorgor3_ensembl.genes.results')
ensembl_expression_2 = _read_gene_results('SRR306809_gorgor3_ensembl.genes.results')
gorgor_expression_1 = _read_gene_results('SRR306808_gorgor4_cat.genes.results')
gorgor_expression_2 = _read_gene_results('SRR306809_gorgor4_cat.genes.results')

In [23]:
# Average the TPM of the two replicates per gene. pd.merge defaults to an
# inner join, so only gene_ids present in both runs are kept.
susie_gorilla_replicates = pd.merge(
    susie_gorilla_expression_1[['gene_id', 'TPM']],
    susie_gorilla_expression_2[['gene_id', 'TPM']],
    on='gene_id', suffixes=('_rep1', '_rep2')).set_index('gene_id')
# A plain row-wise mean gives the same numbers as the previous "rename both
# columns to TPM, then groupby(axis=1).mean()" trick without relying on the
# deprecated column-wise groupby.
susie_gorilla_expression = susie_gorilla_replicates.mean(axis=1).to_frame('TPM').reset_index()

In [24]:
# Average the TPM of the two replicates per gene. pd.merge defaults to an
# inner join, so only gene_ids present in both runs are kept.
ensembl_replicates = pd.merge(
    ensembl_expression_1[['gene_id', 'TPM']],
    ensembl_expression_2[['gene_id', 'TPM']],
    on='gene_id', suffixes=('_rep1', '_rep2')).set_index('gene_id')
# A plain row-wise mean gives the same numbers as the previous "rename both
# columns to TPM, then groupby(axis=1).mean()" trick without relying on the
# deprecated column-wise groupby.
ensembl_expression = ensembl_replicates.mean(axis=1).to_frame('TPM').reset_index()

In [25]:
# Average the TPM of the two replicates per gene. pd.merge defaults to an
# inner join, so only gene_ids present in both runs are kept.
gorgor_replicates = pd.merge(
    gorgor_expression_1[['gene_id', 'TPM']],
    gorgor_expression_2[['gene_id', 'TPM']],
    on='gene_id', suffixes=('_rep1', '_rep2')).set_index('gene_id')
# A plain row-wise mean gives the same numbers as the previous "rename both
# columns to TPM, then groupby(axis=1).mean()" trick without relying on the
# deprecated column-wise groupby.
gorgor_expression = gorgor_replicates.mean(axis=1).to_frame('TPM').reset_index()

In [27]:
# load human annotation
ref_df = load_annotation('/hive/groups/recon/projs/primates/susie_indel_corrected/databases/Human.db')

# load ensembl annotations: build a gene_id -> gene_name table from the
# attribute column (last tab-separated field) of every non-comment GTF line
with open('/hive/groups/recon/projs/primates/compare_clint_expression/gorgor3_ensembl/Gorilla_gorilla.gorGor3.1.89.gtf') as gtf_handle:
    # the context manager closes the file as soon as the lines are read
    lines = [row.split('\t') for row in gtf_handle if not row.startswith('#')]
gene_name_by_id = {}
for fields in lines:
    attrs = parse_gtf_attr_line(fields[-1])
    try:
        gene_name_by_id[attrs['gene_id']] = attrs['gene_name']
    except KeyError:
        # records lacking a gene_id or gene_name attribute are skipped on purpose
        continue
# list(...) hands the constructor a concrete sequence of (id, name) pairs,
# which works on both Python 2 and Python 3
ensembl_map = pd.DataFrame(list(gene_name_by_id.items()), columns=['gene_id', 'gene_name'])

# load name maps for susie_gorilla
susie_gorilla_map = pd.read_csv('/hive/groups/recon/projs/primates/susie_indel_corrected/consensus_gene_set/Susie_Gorilla.gp_info',
                       sep='\t', header=0)
# load name maps for gorgor
gorgor_map = pd.read_csv('/hive/groups/recon/projs/primates/original_primates/annotation_full_primates/consensus_gene_set/Gorilla.gp_info',
                       sep='\t', header=0)

In [28]:
# add names to susie_gorilla
susie_gorilla = pd.merge(susie_gorilla_expression[['gene_id', 'TPM']], susie_gorilla_map[['gene_id', 'source_gene_common_name']], 
                 on='gene_id')[['TPM', 'source_gene_common_name']]
susie_gorilla.columns = ['susie_gorilla (CAT)', 'gene_name']
susie_gorilla = susie_gorilla.drop_duplicates()
susie_gorilla = susie_gorilla.groupby('gene_name').aggregate(sum).reset_index()

In [29]:
# add names to ensembl
ensembl = pd.merge(ensembl_expression[['gene_id', 'TPM']], ensembl_map, on='gene_id')[['TPM', 'gene_name']]
ensembl = ensembl.drop_duplicates()
ensembl = ensembl.groupby('gene_name').aggregate(sum).reset_index()
ensembl.columns = ['gene_name', 'gorGor3 (Ensembl V87)']

In [36]:
# add names to gorgor
gorgor = pd.merge(gorgor_expression[['gene_id', 'TPM']], gorgor_map, on='gene_id')[['TPM', 'source_gene_common_name']]
gorgor = gorgor.drop_duplicates()
gorgor.columns = ['gorGor4 (CAT)', 'gene_name']
gorgor = gorgor.groupby('gene_name').aggregate(sum).reset_index()

In [37]:
# write to disk for other analysis
combined_df = pd.merge(susie_gorilla, ensembl, on='gene_name', how='outer')
combined_df.to_csv('../susie_gorilla_ensembl.tsv', sep='\t')
combined_df.merge(gorgor, on='gene_name', how='outer').to_csv('../gorgor_susie_gorilla_ensembl.tsv', sep='\t')

In [32]:
# preview the replicate-averaged gorGor4 table (gene_id, TPM)
gorgor_expression.head()

Unnamed: 0,gene_id,TPM
0,Gorilla_G0000001,0.0
1,Gorilla_G0000002,0.0
2,Gorilla_G0000003,0.0
3,Gorilla_G0000004,0.0
4,Gorilla_G0000005,0.0


In [33]:
# preview the gorGor4 gp_info table; gene_id and source_gene_common_name are
# the columns used for the name join
gorgor_map.head()

Unnamed: 0,gene_id,transcript_id,alignment_id,alternative_source_transcripts,exon_annotation_support,exon_rna_support,failed_gene,frameshift,gene_alternate_contigs,gene_biotype,...,paralog_status,paralogy,score,source_gene,source_gene_common_name,source_transcript,source_transcript_name,transcript_biotype,transcript_class,transcript_modes
0,Gorilla_G0000001,Gorilla_T0000001,ENST00000492842.1-2,,1,0,True,,,unprocessed_pseudogene,...,NotConfident,2.0,921.0,ENSG00000240361.1,OR4G11P,ENST00000492842.1,OR4G11P-001,unprocessed_pseudogene,failing,transMap
1,Gorilla_G0000002,Gorilla_T0000002,ENST00000588632.1-2,,1,0,True,,,unprocessed_pseudogene,...,NotConfident,2.0,922.0,ENSG00000267310.1,OR4G1P,ENST00000588632.1,OR4G1P-001,unprocessed_pseudogene,failing,transMap
2,Gorilla_G0000003,Gorilla_T0000003,ENST00000328113.3-2,,1,0,True,,,unprocessed_pseudogene,...,NotConfident,2.0,921.0,ENSG00000183909.5,OR4G2P,ENST00000328113.3,OR4G2P-001,unprocessed_pseudogene,failing,transMap
3,Gorilla_G0000004,Gorilla_T0000004,ENST00000559612.1-1,,1,0,False,,,unprocessed_pseudogene,...,,,0.0,ENSG00000259646.1,AC140725.7,ENST00000559612.1,AC140725.7-001,unprocessed_pseudogene,passing,transMap
4,Gorilla_G0000005,Gorilla_T0000005,ENST00000438217.1-1,,0,0,True,,,lincRNA,...,SyntenyHeuristic,3.0,845.0,ENSG00000213863.2,XX-C2158C12.2,ENST00000438217.1,XX-C2158C12.2-001,lincRNA,failing,transMap


In [35]:
# preview the named gorGor4 table
# NOTE(review): the stored output was produced by execution 35, i.e. before
# the "add names to gorgor" cell (execution 36) was last run, so it still
# shows the pre-rename columns -- re-run top to bottom to refresh it.
gorgor.head()

Unnamed: 0,TPM,source_gene_common_name
0,0.0,OR4G11P
1,0.0,OR4G1P
2,0.0,OR4G2P
3,0.0,AC140725.7
4,0.0,XX-C2158C12.2
