In [1]:
%load_ext autoreload
%autoreload 2
%pylab inline

# `op` is the path helper used by the later cells (op.expanduser / op.join).
import os.path as op
import findspark
import os
# findspark.init() is called before `import pyspark` below; presumably it makes
# the local Spark installation importable -- confirm against the findspark docs.
findspark.init()

import pyspark
# Spark context handed as `sc` to the genome_utils loaders in the later cells.
sc = pyspark.SparkContext()

Populating the interactive namespace from numpy and matplotlib


In [2]:
# shortuuid supplies the {uid} column of the TSV rows written in later cells.
import shortuuid
# Generate one id and show it as the cell output.
shortuuid.uuid()

'Q6CVr8VqHJs4fswJfn8DQB'

## Utility functions

In [17]:
assembly = 'hg19'
data_dir = op.expanduser("~/data")
# where all of the files downloaded from UCSC will be stored
base_ucsc_dir = op.join(data_dir, 'ucsc-data/{}'.format(assembly))


def get_outfile(x):
    """Return gu.get_outfile(x, data_dir, assembly).

    `data_dir` and `assembly` are read from the notebook globals at call time,
    so a later cell that reassigns them changes the result.
    """
    return gu.get_outfile(x, data_dir, assembly)

## Load the chromosome lengths

In [14]:
import genome_utils as gu

# Mapping from chromosome name to a cumulative offset (chr1 starts at 0 in the
# captured output); it is passed to the refGene loader in the next section.
cum_chrom_lengths = gu.get_chrom_lengths(sc, base_ucsc_dir)
print(' '.join(str(cum_chrom_lengths[chrom]) for chrom in ('chr1', 'chr2', 'chr3')))

0 249250621 492449994


## Loading the refgene data

In [15]:
# Load the refGene records. The chromosome offsets are passed in, and the
# records shown below carry genome-wide positions (genomeTxStart / genomeTxEnd).
refGene = gu.load_refgene_data(sc, base_ucsc_dir, cum_chrom_lengths)
print(refGene.take(1))

[{'exonEnds': u'15469389,15471514,15473730,15476045,15478082,15484120', 'geneName': u'EAF1', 'chromOffset': 492449994, 'name': u'NM_033083', 'txStart': u'15469063', 'exonCount': u'6', 'strand': u'+', 'cdsEnd': u'15480662', 'genomeTxStart': 507919057, 'geneLength': 15057, 'cdsStart': u'15469286', 'chrom': u'chr3', 'genomeTxEnd': 507934114, 'txEnd': u'15484120', 'exonStarts': u'15469063,15471419,15473593,15475854,15477848,15480615'}]


## Count the references

For this exercise, the result should be a tsv with the following columns:

```
<taxId> <geneId> <refSeqID> <citation count>
```

In [19]:
# Citation counts keyed by (taxId, geneId), as shown in the sample output.
gene_counts_dir = op.join(data_dir, 'genbank-data/')
gene_counts_outfile = get_outfile('taxid-geneid-count')
taxid_geneid_count = gu.load_gene_counts(sc, gene_counts_dir, outfile=gene_counts_outfile)
print(taxid_geneid_count.take(1))

[((9, 1246500), {'count': 1})]
[((6279, 6095747), {'count': 1})]


## Load the gene2refseq annotations

In [22]:
# RefSeq id -> (taxId, geneId) pairs, as shown in the sample output.
refseq2gene_dir = op.join(data_dir, 'genbank-data')
refseq2gene_outfile = get_outfile('refseq-taxid-geneid')
refseqid_taxid_geneid = gu.load_refseq2gene(sc, refseq2gene_dir, outfile=refseq2gene_outfile)

[(u'-', (9, 1246500)), (u'-', (9, 1246501)), (u'-', (9, 1246502)), (u'-', (9, 1246503)), (u'-', (9, 1246504)), (u'-', (9, 1246505)), (u'-', (9, 1246509)), (u'-', (9, 1246510)), (u'-', (9, 3722426)), (u'-', (9, 8655732))]
[(u'XM_009054708', (225164, 20243866))]


## Join the reference counts and refseq to geneid translations

In [23]:
# Peek at one (refseq id, (taxId, geneId)) pair before joining.
print(refseqid_taxid_geneid.take(1))


[(u'XM_009054708', (225164, 20243866))]


In [24]:
# Combine the RefSeq-id translations with the per-gene citation counts.
joined_counts_outfile = get_outfile('taxid-geneid-refseqid-count')
taxid_geneid_count_refseq = gu.join_counts_and_ids(
    sc, refseqid_taxid_geneid, taxid_geneid_count, outfile=joined_counts_outfile)

count1: 9519075
count2: 7364241
[((4577, 103654904), u'XR_566429')]
taxid_geneid_count [((6279, 6095747), {'count': 1})]
1. taxid_geneid_count.count(): 8516870
2. taxid_geneid_count.count(): 8516870
[((6279, 6095747), ({'count': 1}, u'XM_001892259'))]
[((1233873, 14509061), {'count': 1})]


## Join the refgene data with the count data

In [15]:


# Pair every refGene record with its citation count; each element looks like
# (refseq id, (refGene dict, {'count': n})), as the sample output shows.
refseqid_refgene_count = gu.join_refgene_and_counts(refGene, taxid_geneid_count_refseq)

outfile = get_outfile('refgene-count')

# One tab-separated line per record: the refGene fields, then the citation
# count and a freshly generated short UUID.
refgene_count_row = "{name}\t{chrom}\t{strand}\t{txStart}\t{txEnd}\t{genomeTxStart}\t{genomeTxEnd}\t{cdsStart}\t{cdsEnd}\t{exonCount}\t{exonStarts}\t{exonEnds}\t{geneName}\t{count}\t{uid}"
refgene_count_rows = refseqid_refgene_count.map(
    lambda rec: refgene_count_row.format(count=rec[1][1]['count'],
                                         uid=shortuuid.uuid(),
                                         **rec[1][0]))
refgene_count_rows.saveAsTextFile(outfile)

[(u'NM_033083', {'exonEnds': u'15469389,15471514,15473730,15476045,15478082,15484120', 'geneName': u'EAF1', 'chromOffset': 492449994, 'name': u'NM_033083', 'txStart': u'15469063', 'exonCount': u'6', 'strand': u'+', 'cdsEnd': u'15480662', 'genomeTxStart': 507919057, 'geneLength': 15057, 'cdsStart': u'15469286', 'chrom': u'chr3', 'genomeTxEnd': 507934114, 'txEnd': u'15484120', 'exonStarts': u'15469063,15471419,15473593,15475854,15477848,15480615'})]
[(u'XM_001892259', {'count': 1})]
[(u'NM_001466', ({'exonEnds': u'42638630', 'geneName': u'FZD2', 'chromOffset': 2655442424, 'name': u'NM_001466', 'txStart': u'42634811', 'exonCount': u'1', 'strand': u'+', 'cdsEnd': u'42636754', 'genomeTxStart': 2698077235, 'exonStarts': u'42634811', 'cdsStart': u'42635056', 'chrom': u'chr17', 'genomeTxEnd': 2698081054, 'txEnd': u'42638630', 'geneLength': 3819}, {'count': 23}))]


In [16]:
# Peek at one joined (refseq id, (refGene dict, count dict)) record.
print(refseqid_refgene_count.take(1))

[(u'NM_001466', ({'exonEnds': u'42638630', 'geneName': u'FZD2', 'chromOffset': 2655442424, 'name': u'NM_001466', 'txStart': u'42634811', 'exonCount': u'1', 'strand': u'+', 'cdsEnd': u'42636754', 'genomeTxStart': 2698077235, 'exonStarts': u'42634811', 'cdsStart': u'42635056', 'chrom': u'chr17', 'genomeTxEnd': 2698081054, 'txEnd': u'42638630', 'geneLength': 3819}, {'count': 23}))]


In [17]:
# Keep only the records whose refGene entry is on the '+' strand.
refseqid_refgene_count_plus = refseqid_refgene_count.filter(
    lambda rec: rec[1][0]['strand'] == '+')

outfile = get_outfile('refgene-count-plus')

# Same tab-separated layout as the full refgene-count output.
plus_row = "{name}\t{chrom}\t{strand}\t{txStart}\t{txEnd}\t{genomeTxStart}\t{genomeTxEnd}\t{cdsStart}\t{cdsEnd}\t{exonCount}\t{exonStarts}\t{exonEnds}\t{geneName}\t{count}\t{uid}"
plus_rows = refseqid_refgene_count_plus.map(
    lambda rec: plus_row.format(count=rec[1][1]['count'],
                                uid=shortuuid.uuid(),
                                **rec[1][0]))
plus_rows.saveAsTextFile(outfile)

In [18]:
# Keep only the records whose refGene entry is on the '-' strand.
refseqid_refgene_count_minus = refseqid_refgene_count.filter(
    lambda rec: rec[1][0]['strand'] == '-')

outfile = get_outfile('refgene-count-minus')

# Same tab-separated layout as the full refgene-count output.
minus_row = "{name}\t{chrom}\t{strand}\t{txStart}\t{txEnd}\t{genomeTxStart}\t{genomeTxEnd}\t{cdsStart}\t{cdsEnd}\t{exonCount}\t{exonStarts}\t{exonEnds}\t{geneName}\t{count}\t{uid}"
minus_rows = refseqid_refgene_count_minus.map(
    lambda rec: minus_row.format(count=rec[1][1]['count'],
                                 uid=shortuuid.uuid(),
                                 **rec[1][0]))
minus_rows.saveAsTextFile(outfile)

## Run the entire pipeline

In [19]:
# End-to-end run: load the chromosome offsets, refGene and GenBank data, join
# them, and write one TSV line per refGene record with its citation count.
assembly = 'hg19'
output_dir = op.join(data_dir, assembly)    # where all of the intermediate output will be stored
base_ucsc_dir = op.join(data_dir, 'ucsc-data/{}'.format(assembly))  # where all of the files downloaded from UCSC will be stored

# Every stage goes through genome_utils (`gu`) with the same arguments as the
# step-by-step cells earlier in this notebook. The bare function names used
# here before (get_chrom_lengths, load_refgene_data, ...) are not defined
# anywhere in the notebook, so the cell raised NameError on a fresh kernel.
# NOTE(review): the `outfile=` arguments mirror the earlier cells; confirm
# whether genome_utils treats them as optional.
cum_chrom_lengths = gu.get_chrom_lengths(sc, base_ucsc_dir)
refGene = gu.load_refgene_data(sc, base_ucsc_dir, cum_chrom_lengths)
taxid_geneid_count = gu.load_gene_counts(sc, op.join(data_dir, 'genbank-data/'),
                                         outfile=get_outfile('taxid-geneid-count'))
refseqid_taxid_geneid = gu.load_refseq2gene(sc, op.join(data_dir, 'genbank-data'),
                                            outfile=get_outfile('refseq-taxid-geneid'))
taxid_geneid_count_refseq = gu.join_counts_and_ids(sc, refseqid_taxid_geneid, taxid_geneid_count,
                                                   outfile=get_outfile('taxid-geneid-refseqid-count'))
refseqid_refgene_count = gu.join_refgene_and_counts(refGene, taxid_geneid_count_refseq)

# Each element is (refseq id, (refGene dict, {'count': n})); write the refGene
# fields followed by the citation count and a freshly generated short UUID.
outfile = get_outfile('refgene-count')
(refseqid_refgene_count.map(lambda x: "{name}\t{chrom}\t{strand}\t{txStart}\t{txEnd}\t{genomeTxStart}\t{genomeTxEnd}\t{cdsStart}\t{cdsEnd}\t{exonCount}\t{exonStarts}\t{exonEnds}\t{geneName}\t{count}\t{uid}"
                            .format(count=x[1][1]['count'],uid=shortuuid.uuid(), **x[1][0]))
 .saveAsTextFile(outfile))

[((9, 1246500), {'count': 1})]
[(u'-', (9, 1246500)), (u'-', (9, 1246501)), (u'-', (9, 1246502)), (u'-', (9, 1246503)), (u'-', (9, 1246504)), (u'-', (9, 1246505)), (u'-', (9, 1246509)), (u'-', (9, 1246510)), (u'-', (9, 3722426)), (u'-', (9, 8655732))]
[(u'XM_009054708', (225164, 20243866))]
count1: 9519075
count2: 7364241
[((4577, 103654904), u'XR_566429')]
taxid_geneid_count [((6279, 6095747), {'count': 1})]
1. taxid_geneid_count.count(): 8516870
2. taxid_geneid_count.count(): 8516870
[((6279, 6095747), ({'count': 1}, u'XM_001892259'))]
[((6239, 191243), {'count': 1})]
[(u'NM_033083', {'exonEnds': u'15469389,15471514,15473730,15476045,15478082,15484120', 'geneName': u'EAF1', 'chromOffset': 492449994, 'name': u'NM_033083', 'txStart': u'15469063', 'exonCount': u'6', 'strand': u'+', 'cdsEnd': u'15480662', 'genomeTxStart': 507919057, 'geneLength': 15057, 'cdsStart': u'15469286', 'chrom': u'chr3', 'genomeTxEnd': 507934114, 'txEnd': u'15484120', 'exonStarts': u'15469063,15471419,15473593,15

In [38]:
# Plus-strand subset of the joined refGene/count records.
refseqid_refgene_count_plus = refseqid_refgene_count.filter(
    lambda rec: rec[1][0]['strand'] == '+')

outfile = get_outfile('refgene-count-plus')
# Show the destination and one sample record before writing.
print(outfile)
print(refseqid_refgene_count_plus.take(1))

plus_strand_row = "{name}\t{chrom}\t{strand}\t{txStart}\t{txEnd}\t{genomeTxStart}\t{genomeTxEnd}\t{cdsStart}\t{cdsEnd}\t{exonCount}\t{exonStarts}\t{exonEnds}\t{geneName}\t{count}\t{uid}"
plus_strand_rows = refseqid_refgene_count_plus.map(
    lambda rec: plus_strand_row.format(count=rec[1][1]['count'],
                                       uid=shortuuid.uuid(),
                                       **rec[1][0]))
plus_strand_rows.saveAsTextFile(outfile)

/Users/peter/data/hg19/genbank-output/refgene-count-plus
[(u'NM_001466', ({'exonEnds': u'42638630', 'geneName': u'FZD2', 'chromOffset': 2655442424, 'name': u'NM_001466', 'txStart': u'42634811', 'exonCount': u'1', 'strand': u'+', 'cdsEnd': u'42636754', 'genomeTxStart': 2698077235, 'exonStarts': u'42634811', 'cdsStart': u'42635056', 'chrom': u'chr17', 'genomeTxEnd': 2698081054, 'txEnd': u'42638630', 'geneLength': 3819}, {'count': 23}))]


In [39]:
# Minus-strand subset of the joined refGene/count records. It is stored under
# its own `_minus` name so the plus-strand RDD built in the previous cell is
# not overwritten with minus-strand data.
refseqid_refgene_count_minus = refseqid_refgene_count.filter(lambda x: x[1][0]['strand'] == '-')

outfile = get_outfile('refgene-count-minus')
# Show the destination and one sample record before writing.
print(outfile)
print(refseqid_refgene_count_minus.take(1))
# One tab-separated line per record: the refGene fields, then the citation
# count and a freshly generated short UUID.
(refseqid_refgene_count_minus.map(lambda x: "{name}\t{chrom}\t{strand}\t{txStart}\t{txEnd}\t{genomeTxStart}\t{genomeTxEnd}\t{cdsStart}\t{cdsEnd}\t{exonCount}\t{exonStarts}\t{exonEnds}\t{geneName}\t{count}\t{uid}"
                            .format(count=x[1][1]['count'],uid=shortuuid.uuid(), **x[1][0]))
 .saveAsTextFile(outfile))

/Users/peter/data/hg19/genbank-output/refgene-count-minus
[(u'NR_031592', ({'exonEnds': u'10514214', 'geneName': u'MIR1181', 'chromOffset': 2937113968, 'name': u'NR_031592', 'txStart': u'10514133', 'exonCount': u'1', 'strand': u'-', 'cdsEnd': u'10514214', 'genomeTxStart': 2947628101, 'exonStarts': u'10514133', 'cdsStart': u'10514214', 'chrom': u'chr19', 'genomeTxEnd': 2947628182, 'txEnd': u'10514214', 'geneLength': 81}, {'count': 3}))]
