In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
#%matplotlib inline 

import warnings
# both loading plastid and one of the lines below provoke some warnings
warnings.filterwarnings('ignore')

from plastid import *
from twobitreader import TwoBitFile
from collections import defaultdict
import six

#to make executable
import os

In [1]:
# Show the working directory: the relative paths used below (target_gene_list.txt,
# ./references/..., ./counts/...) are resolved against it.
# os.getcwd() is used instead of shelling out to `pwd` so that no subprocess is
# spawned and the text goes through sys.stdout like every other print here.
print(os.getcwd())

/home/jmr/targeted_capture/updated


In [2]:
# Look up the Ensembl ids of the genes named in target_gene_list.txt.
# The list is read as a single headerless column called 'gene_name'.
requested_gene_names = pd.read_csv('target_gene_list.txt', header=None, names=['gene_name'])

# the feature table is read as three headerless, tab-separated columns
feature_columns = ['gene_id', 'gene_name', 'feature_type']
feature_names = pd.read_csv('./references/features.tsv.gz', sep='\t', header=None, names=feature_columns)

# keep only the feature rows whose gene name appears in the requested list
is_targeted = feature_names['gene_name'].isin(requested_gene_names['gene_name'])
targeted_genes = feature_names[is_targeted]

In [None]:
# Report the requested gene names that matched no row of the feature table,
# i.e. the genes that were not assigned an Ensembl id.
tmp = pd.read_csv('target_gene_list.txt', header=None, names=['gene_name'])
was_matched = tmp['gene_name'].isin(targeted_genes['gene_name'])
print('Missing:')
print(tmp[~was_matched])
# The names of any genes listed here need to be updated: search the Ensembl
# website for the desired gene name, append it MANUALLY to the end of
# target_gene_list.txt, and rerun this script.
# The names must go into target_gene_list.txt itself because that list is also
# used by the pick_targets.ipynb script.


In [3]:
# Write the targeted Ensembl gene ids to a text file, formatted as plain strings.
targeted_gene_ids = targeted_genes['gene_id'].values
np.savetxt('targeted_gene_ids.csv', targeted_gene_ids, fmt='%s')

In [4]:
# Extract the relevant transcripts from the genome reference: grep -f takes its
# patterns from targeted_gene_ids.csv and the matching GTF lines are redirected
# into ./references/targeted_genes.gtf.
grep_status = os.system('grep -f targeted_gene_ids.csv /nvme/indices/refdata-cellranger-GRCh38-1.2.0/genes/genes.gtf > ./references/targeted_genes.gtf')

# os.system reports failure only through its return value. Stop here instead of
# letting the next cells parse an empty or partial GTF; grep also exits non-zero
# when no line matched at all, which is just as fatal for the steps below.
if grep_status != 0:
    raise RuntimeError('grep for targeted gene ids failed (status {0})'.format(grep_status))


In [5]:
# Convert the reference genome FASTA to 2bit format with the external
# faToTwoBit tool, writing ./references/genome.2bit.
two_bit_status = os.system('faToTwoBit /nvme/indices/refdata-cellranger-GRCh38-1.2.0/fasta/genome.fa ./references/genome.2bit')

# os.system reports failure (tool missing, unreadable input, unwritable output)
# only through its return value, so check it explicitly rather than carrying on
# with a missing or truncated 2bit file.
if two_bit_status != 0:
    raise RuntimeError('faToTwoBit failed (status {0})'.format(two_bit_status))


In [6]:
# Read the targeted GTF as Transcript objects and index them by transcript id.
# (reading this file produces a lot of warnings about duplicate tags...)
annotated_transcripts = dict()
transcript_reader = GTF2_TranscriptAssembler('./references/targeted_genes.gtf', return_type=Transcript)
for parsed_transcript in transcript_reader:
    annotated_transcripts[parsed_transcript.attr['transcript_id']] = parsed_transcript

In [7]:
# Table of all the unique transcripts, one row each, with the gene they belong to.
gene_id_by_transcript = dict()
for transcript_id, transcript in annotated_transcripts.iteritems():
    gene_id_by_transcript[transcript_id] = transcript.get_gene()
targeted_transcripts = pd.Series(gene_id_by_transcript).reset_index()
targeted_transcripts.columns = ['transcript_id', 'gene_id']

# lookups in both directions between Ensembl gene id and gene name
gene_id_column = targeted_genes['gene_id']
gene_name_column = targeted_genes['gene_name']
name_mapper = dict(zip(gene_id_column, gene_name_column))
id_mapper = dict(zip(gene_name_column, gene_id_column))

# label each transcript with its gene name, then order the table by that name
targeted_transcripts['gene_name'] = targeted_transcripts['gene_id'].map(name_mapper)
targeted_transcripts = targeted_transcripts.sort_values('gene_name').reset_index(drop=True)

In [10]:
# Group the transcript models by gene name:
# transcripts_by_gene[gene_name][transcript_id] -> entry of annotated_transcripts
transcripts_by_gene = defaultdict(dict)

gene_transcript_pairs = zip(targeted_transcripts['gene_name'], targeted_transcripts['transcript_id'])
for gene_name, transcript_id in gene_transcript_pairs:
    transcripts_by_gene[gene_name][transcript_id] = annotated_transcripts[transcript_id]

In [11]:
# Find, for each gene, one spanning segment that covers all of its transcripts.
spanning_segments = dict()

# we will add some extra space at the end just in case 3' UTRs aren't fully annotated
utr3_offset = 200

for gene, transcripts in transcripts_by_gene.items():
    models = list(transcripts.values())

    # chromosome and strand are taken from one transcript of the gene
    # NOTE(review): assumes every transcript of a gene shares chrom and strand - confirm
    representative = models[0]
    sense = representative.strand

    # note the spanning segments include annotated 3' UTRs
    start = np.min([x.spanning_segment.start for x in models])
    end = np.max([x.spanning_segment.end for x in models])

    # the padding goes on the high-coordinate side for '+' genes and on the
    # low-coordinate side for '-' genes
    if sense == '+':
        end = end + utr3_offset
    elif sense == '-':
        # clamp at 0 so a gene within utr3_offset of the chromosome start
        # does not get a negative start coordinate
        start = max(start - utr3_offset, 0)

    spanning_segments[gene] = SegmentChain(GenomicSegment(representative.chrom,
                                                          start, end,
                                                          sense),
                                          gene_id=id_mapper[gene])

In [43]:
import cPickle as pickle

def save_obj(obj, name ):
    """Pickle `obj` into the file `name + '.pkl'` with the highest pickle protocol.

    Note that the '.pkl' suffix is always appended to `name`, whatever
    extension it already carries.
    """
    target_path = name + '.pkl'
    handle = open(target_path, 'wb')
    try:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)
    finally:
        handle.close()

def load_obj(name ):
    """Unpickle and return the object stored in the file `name`.

    `name` is opened exactly as given: unlike save_obj, no '.pkl' suffix is
    appended, so pass the full file name.

    Unpickling can execute arbitrary code, so only load files you trust.
    """
    handle = open(name, 'rb')
    try:
        return pickle.load(handle)
    finally:
        handle.close()

In [93]:
# The pickled outputs of this notebook are written under ./counts, and nothing
# creates that directory; make sure it exists so the write below does not fail
# on a fresh checkout.
if not os.path.isdir('./counts'):
    os.makedirs('./counts')

# save_obj appends '.pkl', so this writes ./counts/spanning_segments.pkl
save_obj(spanning_segments, './counts/spanning_segments')

In [17]:
# Map each sample name CR2_l1 ... CR2_l6 to the path of its BAM file.
sample_name_template = 'CR2_l{0}'
bam_path_template = '/home/jmr/direct_capture/doubles/perturbseq/count/CR2_l{0}/outs/possorted_genome_bam.bam'

bam_list = dict()
for sample_number in range(1, 7):
    bam_list[sample_name_template.format(sample_number)] = bam_path_template.format(sample_number)


In [None]:
# For every BAM file, accumulate read density over each gene's spanning segment
# under two read mappings, and pickle each result with save_obj.
separator = '================================================'

for pretty_name, bam_file in bam_list.iteritems():
    alignments = BAMGenomeArray(bam_file)
    print(bam_file)
    print(separator)

    # first pass: this mapping puts weight 1/N on each nucleotide covered by a
    # read, where N is the length of the read
    alignments.set_mapping(CenterMapFactory())

    count_vectors = dict()
    for gene in spanning_segments:
        print('\t{0}'.format(gene))
        count_vectors[gene] = spanning_segments[gene].get_counts(alignments)

    center_output = './counts/count_vectors_{0}.count'.format(pretty_name)
    save_obj(count_vectors, center_output)

    # second pass: this mapping puts weight 1 at the 5' end of the mapped read
    alignments.set_mapping(FivePrimeMapFactory())
    print(separator)

    start_count_vectors = dict()
    for gene in spanning_segments:
        print('\t{0}'.format(gene))
        start_count_vectors[gene] = spanning_segments[gene].get_counts(alignments)

    start_output = './counts/start_count_vectors_{0}.count'.format(pretty_name)
    save_obj(start_count_vectors, start_output)

/home/jmr/direct_capture/doubles/perturbseq/count/CR2_l4/outs/possorted_genome_bam.bam
	HSF1
	HSPA5
	HSPA9
	HSPA8
	UQCRH
	POLR3A
	TOMM7
	RAN
	RPL18
	SMG9
	ANAPC13
	DBR1
	HSP90AB1
	ZNF574
	GPI
	SOD1
	HK1
	CREB1
	C1orf131
	C12orf45
	GATA1
	SNRPD2
	ORMDL1
	HBG2
	XRN2
	ATR
	YBX1
	ATP6V1A
	FUS
	RPL9
	PABPC1
	GAPDH
	POLR1D
	TOMM20
	CLTC
	RICTOR
	EXOSC4
	CHCHD2
	APOE
	CDK1
	JUN
	H3F3A
	C14orf178
	CLUH
	CDK6
	MRPL17
	SGTA
	FTL
	PEX1
	TFAP4
	BTF3
	TUBB
	EMC1
	ASNA1
	ZNF236
	ATM
	RPS5
	UPF1
	VDAC1
	HINT1
	RPL41
	PRAME
	NPM1
	ATP5A1
	ZNF622
	POLR2H
	C2orf15
	PSMA5
	FTH1
	RANBP1
	HMGCS1
	RPS25
	MINA
	HNRNPA1
	EEF1B2
	MTOR
	C15orf41
	LEO1
	RPL10A
	HSPE1
	PSMB5
	RPS15
	RPS14
	EEF1A1
	RPS18
	PRDX1
	EIF2S1
	TXN
	CDK12
	RPL22
	HSPD1
	PFKM
	C22orf15
	HNRNPA2B1
	LDHA
	LONP1
	PIGF
	VCP
	SEC61B
	UPF3B
	UPF3A
	ATF4
	SEC61A1
	EEF2
	HSF1
	HSPA5
	HSPA9
	HSPA8
	UQCRH
	POLR3A
	TOMM7
	RAN
	RPL18
	SMG9
	ANAPC13
	DBR1
	HSP90AB1
	ZNF574
	GPI
	SOD1
	HK1
	CREB1
	C1orf131
	C12orf45
	GATA1
	SNRPD2
	ORMDL1
	HBG2
	XRN2
	A

In [None]:
# Transcript-resolved 5' end mappings. With short reads, many of these will end
# up randomly assigned among the transcripts.
# NOTE(review): another cell of this notebook runs the same loop with
# CenterMapFactory() and saves to the same
# './counts/transcript_count_vectors_{0}.count' path, so whichever cell runs
# last overwrites the other's output - confirm which mapping is intended.
separator = '================================================'

for pretty_name, bam_file in bam_list.iteritems():
    alignments = BAMGenomeArray(bam_file)
    alignments.set_mapping(FivePrimeMapFactory())

    print(bam_file)
    print(separator)

    # transcript_count_vectors[gene][transcript_id] -> counts for that transcript
    transcript_count_vectors = defaultdict(dict)

    for gene in transcripts_by_gene:
        print('\t{0}'.format(gene))
        gene_models = transcripts_by_gene[gene]
        for transcript_id in gene_models:
            print('\t\t{0}'.format(transcript_id))
            transcript_count_vectors[gene][transcript_id] = gene_models[transcript_id].get_counts(alignments)

    output_name = './counts/transcript_count_vectors_{0}.count'.format(pretty_name)
    save_obj(transcript_count_vectors, output_name)

/home/jmr/direct_capture/doubles/perturbseq/count/CR2_l4/outs/possorted_genome_bam.bam
	HSF1
		ENST00000528842
		ENST00000530661
		ENST00000614796
		ENST00000528988
		ENST00000400780
		ENST00000531447
		ENST00000528199
		ENST00000533130
		ENST00000533240
		ENST00000532338
		ENST00000527328
		ENST00000534314
		ENST00000529630
		ENST00000528838
	HSPA5
		ENST00000324460
	HSPA9
		ENST00000524109
		ENST00000504902
		ENST00000508003
		ENST00000507097
		ENST00000506477
		ENST00000501917
		ENST00000523929
		ENST00000507886
		ENST00000505110
		ENST00000297185
		ENST00000507115
		ENST00000504810
		ENST00000512328
	HSPA8
		ENST00000526686
		ENST00000532167
		ENST00000524552
		ENST00000526862
		ENST00000527983
		ENST00000532636
		ENST00000534624
		ENST00000530391
		ENST00000227378
		ENST00000532091
		ENST00000525463
		ENST00000533238
		ENST00000453788
		ENST00000532780
		ENST00000534319
		ENST00000526110
		ENST00000525624
		ENST00000532182
		ENST00000534567
		ENST00000531063
		ENST00000524590
		EN

In [None]:
# Transcript-resolved counts under the CenterMapFactory() read mapping.
# NOTE(review): another cell of this notebook runs the same loop with
# FivePrimeMapFactory() and saves to the same
# './counts/transcript_count_vectors_{0}.count' path, so whichever cell runs
# last overwrites the other's output - confirm which mapping is intended.
separator = '================================================'

for pretty_name, bam_file in bam_list.iteritems():
    alignments = BAMGenomeArray(bam_file)
    alignments.set_mapping(CenterMapFactory())

    print(bam_file)
    print(separator)

    # transcript_count_vectors[gene][transcript_id] -> counts for that transcript
    transcript_count_vectors = defaultdict(dict)

    for gene in transcripts_by_gene:
        print('\t{0}'.format(gene))
        gene_models = transcripts_by_gene[gene]
        for transcript_id in gene_models:
            print('\t\t{0}'.format(transcript_id))
            transcript_count_vectors[gene][transcript_id] = gene_models[transcript_id].get_counts(alignments)

    output_name = './counts/transcript_count_vectors_{0}.count'.format(pretty_name)
    save_obj(transcript_count_vectors, output_name)

In [26]:
import glob

In [84]:
def get_files(pattern):
    """Load every pickled file whose path matches the glob `pattern`.

    Returns a dict mapping each matched path to the object load_obj returns
    for it. The dict is empty when nothing matches.
    """
    return {file_name: load_obj(file_name) for file_name in glob.glob(pattern)}

def merge_count_files(pattern):
    """Sum the per-gene count vectors of all pickled files matching `pattern`.

    Every matched file is loaded with get_files and treated as a mapping
    gene_name -> values, where `values` supports .copy() and in-place `+=`
    (presumably numpy arrays of equal length per gene - confirm against the
    cells that write these files).

    Returns a dict gene_name -> sum of that gene's values over all files. The
    first occurrence of a gene is copied, so the loaded objects themselves are
    never modified. A gene missing from some of the files is summed over the
    files that do contain it.

    Raises ValueError when no file matches `pattern`, instead of failing with
    an unrelated indexing error.
    """
    data = get_files(pattern)
    if not data:
        raise ValueError('no count files match pattern {0!r}'.format(pattern))

    merged = dict()
    for arrays in data.values():
        for gene_name, values in arrays.items():
            if gene_name in merged:
                merged[gene_name] += values
            else:
                # first time this gene is seen: start from a private copy
                merged[gene_name] = values.copy()

    return merged

def merge_transcript_count_files(pattern):
    """Sum the per-transcript count vectors of all pickled files matching `pattern`.

    Every matched file is loaded with get_files and treated as a nested mapping
    gene_name -> transcript_id -> values, where `values` supports .copy() and
    in-place `+=` (presumably numpy arrays of equal length per transcript -
    confirm against the cells that write these files).

    Returns a defaultdict(dict) with the same nesting holding the sums over all
    files. The first occurrence of a transcript is copied, so the loaded
    objects themselves are never modified. A gene or transcript missing from
    some of the files is summed over the files that do contain it.

    Raises ValueError when no file matches `pattern`, instead of failing with
    an unrelated indexing error.
    """
    data = get_files(pattern)
    if not data:
        raise ValueError('no transcript count files match pattern {0!r}'.format(pattern))

    merged = defaultdict(dict)
    for arrays in data.values():
        for gene_name, transcript_dict in arrays.items():
            for transcript_id, values in transcript_dict.items():
                gene_totals = merged[gene_name]
                if transcript_id in gene_totals:
                    gene_totals[transcript_id] += values
                else:
                    # first time this transcript is seen: start from a private copy
                    gene_totals[transcript_id] = values.copy()

    return merged

In [77]:
# Merge the per-BAM gene count vectors and pickle the sum
# (save_obj appends '.pkl' to the output name).
merged_count_vectors = merge_count_files('./counts/count_vectors*')
save_obj(merged_count_vectors, './counts/merged_count_vectors.count')

In [78]:
# Merge the per-BAM 5'-end gene count vectors and pickle the sum
# (save_obj appends '.pkl' to the output name).
merged_start_count_vectors = merge_count_files('./counts/start_count_vectors*')
save_obj(merged_start_count_vectors, './counts/merged_start_count_vectors.count')

In [91]:
# Merge the per-BAM transcript-level count vectors and pickle the sum
# (save_obj appends '.pkl' to the output name).
merged_transcript_count_vectors = merge_transcript_count_files('./counts/transcript_count_vectors*')
save_obj(merged_transcript_count_vectors, './counts/merged_transcript_count_vectors.count')