<a href="https://colab.research.google.com/github/jioffe502/kmerkounters/blob/main/kmercount.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#install mrjob
!pip install mrjob

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mrjob
  Downloading mrjob-0.7.4-py2.py3-none-any.whl (439 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m439.6/439.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: mrjob
Successfully installed mrjob-0.7.4


In [2]:
!pip install boto3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting boto3
  Downloading boto3-1.26.121-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.30.0,>=1.29.121
  Downloading botocore-1.29.121-py3-none-any.whl (10.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m69.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.6/79.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.26.121 botocore-1.29.121 jmespath-1.0.1 s3transfer-0.6.0


In [3]:
from google.colab import files

# Open Colab's upload dialog and report the name and byte size of every file received.
# NOTE(review): the recorded run uploaded ioffe.pem here (presumably the EC2 key
# pair used for the EMR run), not a dataset - confirm where the input FASTA comes from.
uploaded = files.upload()
for fn, payload in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      length=len(payload), name=fn))

Saving ioffe.pem to ioffe.pem
User uploaded file "ioffe.pem" with length 1674 bytes


In [4]:
from google.colab import files

# Upload mrjob.conf into the current directory (/content/mrjob.conf); the EMR
# run later in this notebook points --conf-path at that location.
uploaded = files.upload()
for fn, payload in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      length=len(payload), name=fn))

Saving mrjob.conf to mrjob.conf
User uploaded file "mrjob.conf" with length 651 bytes


In [5]:
# Make the uploaded key file readable by its owner only.
# NOTE(review): presumably needed because SSH refuses private keys with looser permissions - confirm.
!chmod 400 ioffe.pem

In [None]:
%%file kmercount.py
from mrjob.job import MRJob
import json
import re


'''
Count how often each k-mer occurs in a FASTA file. Baseline MapReduce job: k is fixed at 1 and there are no optimizations (no combiner).
'''

class KmerCount(MRJob):
    """Baseline MapReduce job that tallies 1-mers (single characters) in FASTA input.

    Every input line is handled on its own: header lines (starting with ">")
    emit nothing, and each character of any other line is emitted with a
    count of 1. The reducer adds up the counts per character.
    """

    def mapper(self, _, line):
        """Emit ``(character, 1)`` for every character of a sequence line.

        Args:
            _: Input key; unused.
            line: One line of the input file.

        Yields:
            ``(kmer, 1)`` pairs, where ``kmer`` is a single character of the
            whitespace-stripped line.
        """
        # Header lines carry no sequence data, so they contribute nothing.
        if line.startswith(">"):
            return
        # A 1-mer is exactly one character, so walk the stripped line directly.
        for base in line.strip():
            yield base, 1

    def reducer(self, kmer, counts):
        """Yield ``(kmer, total)``, where ``total`` is the sum of ``counts``."""
        total = sum(counts)
        yield kmer, total


# Script entry point: start the job when this file is executed directly
# (as the `!python kmercount.py ...` cells do) rather than imported.
if __name__ == '__main__':
    KmerCount.run()

Overwriting kmercount.py


In [8]:
%%file kmercount.py
from mrjob.job import MRJob
import json
import re


'''
Count how often each k-mer occurs in a FASTA file. MapReduce job with a combiner and a configurable k-mer length (--kmer-length).
'''

class KmerCount(MRJob):
    """MapReduce job that counts k-mers in FASTA input, using a combiner.

    The k-mer length is chosen with ``--kmer-length`` (default 1). Every input
    line is processed independently, so k-mers that would span two lines of a
    line-wrapped sequence are not counted.
    """

    def configure_args(self):
        """Register ``--kmer-length`` as an option that is passed through to tasks."""
        super(KmerCount, self).configure_args()
        self.add_passthru_arg('--kmer-length', type=int, default=1, help='Length of the kmer')

    def mapper_init(self):
        """Validate the requested k-mer length before any input is processed.

        A length of 0 would make the mapper emit empty strings, and a negative
        length would produce meaningless slices, so both are rejected up front
        instead of silently producing garbage counts.

        Raises:
            ValueError: if ``--kmer-length`` is smaller than 1.
        """
        kmer_length = self.options.kmer_length
        if kmer_length < 1:
            raise ValueError(
                '--kmer-length must be a positive integer, got %r' % (kmer_length,))

    def mapper(self, _, line):
        """Emit ``(kmer, 1)`` for every k-mer on a sequence line.

        Args:
            _: Input key; unused.
            line: One line of the input file.

        Yields:
            ``(kmer, 1)`` for each substring of length ``--kmer-length`` of the
            whitespace-stripped line. Header lines (starting with ">") and
            lines shorter than the k-mer length yield nothing.
        """
        # Header lines carry no sequence data.
        if line.startswith(">"):
            return
        seq = line.strip()
        kmer_length = self.options.kmer_length
        # The last valid start index is len(seq) - kmer_length; the range is
        # empty when the line is shorter than the k-mer length.
        for start in range(len(seq) - kmer_length + 1):
            yield seq[start:start + kmer_length], 1

    def reducer(self, kmer, counts):
        """Yield ``(kmer, total)``, where ``total`` is the sum of ``counts``."""
        yield kmer, sum(counts)

    # Summing partial counts is associative, so the same function can serve as
    # the combiner and pre-aggregate mapper output before it reaches the reducer.
    combiner = reducer



# Script entry point: start the job when this file is executed directly
# (as the `!python kmercount.py ...` cells do) rather than imported.
if __name__ == '__main__':
    KmerCount.run()


Overwriting kmercount.py


In [9]:
%%time
# Run the job with mrjob's local runner and k = 3; the job's output is redirected to kmercount.out.
# NOTE(review): no cell visible above uploads NA24385_illumina_hg19_chr1_first1000.fa -
# it must already exist under /content for this cell to work on a fresh runtime.
!python kmercount.py --kmer-length 3 -r local /content/NA24385_illumina_hg19_chr1_first1000.fa > kmercount.out

No configs found; falling back on auto-configuration
No configs specified for local runner
Creating temp directory /tmp/kmercount.root.20230427.200752.056919
Running step 1 of 1...
job output is in /tmp/kmercount.root.20230427.200752.056919/output
Streaming final output from /tmp/kmercount.root.20230427.200752.056919/output...
Removing temp directory /tmp/kmercount.root.20230427.200752.056919...
CPU times: user 49.8 ms, sys: 10 ms, total: 59.8 ms
Wall time: 5.38 s


In [11]:
#run the program. Capture output
%%time
!python kmercount.py -r emr s3://kmer-finalproject/kmer/final.fa --cloud-tmp-dir=s3://kmer-finalproject/tmp --cluster-id=j-3DZTKWVCU4YML --conf-path /content/mrjob.conf > kmercountfinal.out

Creating temp directory /tmp/kmercount.root.20230427.200936.914624
uploading working dir files to s3://kmer-finalproject/tmp/kmercount.root.20230427.200936.914624/files/wd...
Copying other local files to s3://kmer-finalproject/tmp/kmercount.root.20230427.200936.914624/files/
Adding our job to existing cluster j-3DZTKWVCU4YML
  master node is ec2-23-20-26-185.compute-1.amazonaws.com
Waiting for Step 1 of 1 (s-1VHXWQPAVYNZT) to complete...
  PENDING (cluster is RUNNING: Running step)
  RUNNING for 0:00:36
     5.0% complete
  RUNNING for 0:01:10
     5.3% complete
  RUNNING for 0:01:43
     5.7% complete
  RUNNING for 0:02:17
     6.2% complete
  RUNNING for 0:02:50
     6.7% complete
  RUNNING for 0:03:24
     7.1% complete
  RUNNING for 0:03:57
     7.6% complete
  RUNNING for 0:04:31
     8.0% complete
  RUNNING for 0:05:04
     8.4% complete
  RUNNING for 0:05:37
     8.7% complete
  RUNNING for 0:06:11
     8.8% complete
  RUNNING for 0:06:44
     8.9% complete
  RUNNING for 0:07:18