From c5cc47790e2e7320c4eae47630bd6d3258b09c16 Mon Sep 17 00:00:00 2001
From: John Andrew Fingerhut <andy_fingerhut@alum.wustl.edu>
Date: Wed, 29 Jul 2009 22:45:16 -0700
Subject: [PATCH] Update RESULTS, add CHANGELOG, new Clojure versions of
 reverse-complement benchmark

---
 CHANGELOG               |  11 +++
 RESULTS                 |  51 ++++++++-----
 rcomp/README            |  30 ++++++++
 rcomp/clj-run.sh        |   2 +-
 rcomp/revcomp.clj-2.clj |  70 +++++++++++++++---
 rcomp/revcomp.clj-4.clj | 159 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 293 insertions(+), 30 deletions(-)
 create mode 100644 CHANGELOG
 create mode 100644 rcomp/README
 create mode 100644 rcomp/revcomp.clj-4.clj

diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 0000000..3041c65
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,11 @@
+Jul 29, 2009
+
+Added some Clojure programs for the reverse-complement benchmark.  See
+rcomp/README for some memory issues I'm still trying to solve.
+
+Found a significant speedup for the Clojure program for the
+k-nucleotide problem.  Lesson: If you want to use the same constant
+map like a function many times, assign it to a Var once, and let-bind
+it where it is used to a name.  Don't just have the map there in the
+first element of a form -- Clojure will recreate a new map each time
+before calling it.
diff --git a/RESULTS b/RESULTS
index ae89d92..aa2e65b 100644
--- a/RESULTS
+++ b/RESULTS
@@ -18,26 +18,20 @@ Times are real / user / sys on my iMac
 -----------------------------------------------------
            noT      noT            |  noT
 long test on my iMac:
-rcomp   |   8.3  |  11.9  |        |   8.5  | no implementation yet
-        |   4.8  |   7.9  |        |   3.6  |
-        |   2.1  |   2.3  |        |   1.4  |
+rcomp   |   8.3  |  11.9  |        |   8.5  | current
+        |   4.8  |   7.9  |        |   3.6  | implementation
+        |   2.1  |   2.3  |        |   1.4  | crashes on long test
 
 long test on my iMac:
-mand-   | wrong  | out of |  32.7  |  28.6  | 340.4
-elbrot  | output | mem    |  59.3  |  54.4  | 350.5
-        |        | (?)    |   0.8  |   0.4  |   4.7
+mand-   | wrong  | out of |  32.7  |  28.6  | 325.0
+elbrot  | output | mem    |  59.3  |  54.4  | 326.1  clj = 
+        |        | (?)    |   0.8  |   0.4  |   3.2  6.01 x java
 
            noT      noT            |    T
 long test on my iMac:
-k-nuc-  | 190.9  | 306.0  |  90.5  |  52.4  | 1677.6 (27m 57.6s)
-leotide | 187.9  | 302.7  | 130.8  |  89.6  | 2245.1 (37m 25.1s)
-        |   2.4  |   1.9  |   4.6  |   1.8  |   24.2 (    24.2s)
-
-Disappointing result: Replacing dna-char-to-code-val with a macro did
-not speed things up, and may have slowed them down:
-                                            | 1800.0 (30m  0.0s)
-                                            | 2317.0 (38m 37.0s)
-                                            |   30.6 (    30.6s)
+k-nuc-  | 190.9  | 306.0  |  90.5  |  53.5  | 1286.4 (21m 26.4s)
+leotide | 187.9  | 302.7  | 130.8  |  91.8  | 1118.1 (18m 38.1s) clj =
+        |   2.4  |   1.9  |   4.6  |   2.1  |   21.8 (    21.8s) 12.1 x java
 
 k-nucleotide long test on benchmark shootout machine:
         | 164.9  | 249.8  |  52.0  |  20.6  |
@@ -45,12 +39,14 @@ k-nucleotide long test on benchmark shootout machine:
         |     ?  |     ?  |     ?  |     ?  |
 
 k-nucleotide medium test on my iMac:
-        |   8.6  |  12.7  |   3.9  |   3.9  |   64.2 / 69.6 / 69.1 / 67.1
-        |   7.9  |  12.5  |   5.4  |   5.7  |   98.4 / 92.9 / 93.1 / 88.1
-        |   0.6  |   0.1  |   0.3  |   0.2  |    1.5 /  1.6 /  1.6 /  1.7
+        |   8.6  |  12.7  |   3.9  |   3.9  |   43.8
+        |   7.9  |  12.5  |   5.4  |   5.7  |   46.5  clj =
+        |   0.6  |   0.1  |   0.3  |   0.2  |    1.5  8.1 x java
 
 k-nucleotide medium test, all clj, modified-pmap with specified number
-of parallel threads, on my iMac:
+of parallel threads, on my iMac (note, these are from
+knucleotide.clj-5.clj, not the slightly improved knucleotide.clj-6.clj
+version):
         |   1    |   2    |   3    |   4    |   5    |   6    |
         |  74.9  |  70.7  |  77.2  |  76.8  |  82.5  |  77.8  |
         | 125.9  | 122.1  | 134.6  | 134.0  | 143.4  | 134.0  |
@@ -61,6 +57,23 @@ fasta
 thread-ring
 
 
+The longest few benchmark run on my iMac are:
+
+mins benchmark test language
+
+127 mandelbrot long perl
+ 21       knuc long clj (21m 26.4s / 18m 38.1s / 21.8s)
+  9      fasta long perl
+  5 mandelbrot long clj
+  5       knuc long perl
+  3       knuc long sbcl
+  2 mandelbrot long sbcl
+  1.5     knuc long ghc
+  0.9     knuc long java
+  ?      rcomp long clj (runs out of memory or crahes with IOException)
+
+
+
 mandelbrot notes:
 
 sbcl version implemented with threads.  I run it with only 1 thread.
diff --git a/rcomp/README b/rcomp/README
new file mode 100644
index 0000000..f4b4002
--- /dev/null
+++ b/rcomp/README
@@ -0,0 +1,30 @@
+July 29, 2009
+
+revcomp.clj-4.clj is the best I've got so far, but it runs out of
+memory on the full size benchmark.
+
+long-input.txt contains 3 DNA sequences in FASTA format.  The first is
+50,000,000 characters long, the second is 75,000,000 characters long,
+and the third is 125,000,000 characters long.  Each needs to be
+reversed, have each character replaced with a different one, and
+printed out, so we need to store each of the strings one at a time,
+but it is acceptable to deallocate/garbage-collect the previous one
+when starting on the next.  I think my code should be doing that, but
+I don't know how to verify that.
+
+I've read that a Java String takes 2 bytes per character, plus about
+38 bytes of overhead per string.  That is about 250 Mbytes for the
+longest one.  I also read in a seq of lines, and these long strings
+are split into lines with 60 characters (plus a newline) each.  Thus
+the string's data needs to be stored at least twice temporarily --
+once for the many 60-character strings, plus the final long one.
+Also, the Java StringBuilder that Clojure's (str ...) function uses
+probably needs to be copied and reallocated periodically as it
+outgrows its current allocation.  So I could imagine needing about 3 *
+250 Mbytes temporarily, but that doesn't explain why my 1536 Mbytes of
+JVM memory are being exhausted.
+
+It would be possible to improve things by not creating all of the
+separate strings, one for each line, and then concatenating them
+together.  But first I'd like to explain why it is using so much,
+because I must be missing something.
diff --git a/rcomp/clj-run.sh b/rcomp/clj-run.sh
index 8063c2d..2aca14d 100755
--- a/rcomp/clj-run.sh
+++ b/rcomp/clj-run.sh
@@ -5,4 +5,4 @@
 
 source ../env.sh
 
-$JAVA -server -Xmx1280m ${JAVA_PROFILING} -cp ${CLOJURE_CLASSPATH} clojure.main revcomp.clj-2.clj "$@"
+$JAVA -server -Xmx1536m ${JAVA_PROFILING} -cp ${CLOJURE_CLASSPATH} clojure.main revcomp.clj-4.clj "$@"
diff --git a/rcomp/revcomp.clj-2.clj b/rcomp/revcomp.clj-2.clj
index 11e162e..93ae09b 100644
--- a/rcomp/revcomp.clj-2.clj
+++ b/rcomp/revcomp.clj-2.clj
@@ -10,20 +10,41 @@
   (= \> (first (seq l))))
 
 
+;; TBD: Try avoiding the use of when-let, in case it might be causing
+;; me to hold on to a head of a sequence when I don't want to.
+
 (defn fasta-desc-dna-str-pairs
   "Take a sequence of lines from a FASTA format file.  Return a lazy sequence of 2-element vectors [desc dna-seq], where desc is a FASTA description line, and sdna-seq is the concatenated FASTA DNA sequence with that description.  If input file is big, you'll save lots of memory if you call this function in a with-open for the file, and don't hold on to the head of the lines parameter."
   [lines]
-  (when-let [x (drop-while (fn [l]
-                             (not (fasta-description-line l)))
-                           lines)]
-    (when-let [x (seq x)]
-      (lazy-seq
+  (lazy-seq
+    (let [lines (seq (drop-while (fn [l] (not (fasta-description-line l)))
+				 lines))]
+      (when lines
 	(let [[lines-before-next-desc next-desc-line-onwards]
-	      (split-with (fn [l] (not (fasta-description-line l))) (rest x))]
-	  (cons [(first x) (apply str lines-before-next-desc)]
+	      (split-with (fn [l] (not (fasta-description-line l)))
+			  (rest lines))]
+	  (cons [(first lines) (apply str lines-before-next-desc)]
 		(fasta-desc-dna-str-pairs next-desc-line-onwards)))))))
 
 
+(comment
+
+(defn fasta-desc-dna-str-pairs
+  "Take a sequence of lines from a FASTA format file.  Return a lazy sequence of 2-element vectors [desc dna-seq], where desc is a FASTA description line, and sdna-seq is the concatenated FASTA DNA sequence with that description.  If input file is big, you'll save lots of memory if you call this function in a with-open for the file, and don't hold on to the head of the lines parameter."
+  [lines]
+  (lazy-seq
+    (when-let [lines (drop-while (fn [l]
+				   (not (fasta-description-line l)))
+				 lines)]
+      (when-let [lines (seq lines)]
+	(let [[lines-before-next-desc next-desc-line-onwards]
+	      (split-with (fn [l] (not (fasta-description-line l)))
+			  (rest lines))]
+	  (cons [(first lines) (apply str lines-before-next-desc)]
+		(fasta-desc-dna-str-pairs next-desc-line-onwards)))))))
+)
+
+
 (def complement-of-dna-char
      {\w \W, \W \W,
       \s \S, \S \S,
@@ -75,11 +96,40 @@
 
 
 (let [max-dna-chars-per-line 60]
-  (with-open [br (java.io.BufferedReader. *in*)
-	      bw (java.io.BufferedWriter. *out*)]
+;;  (with-open [br (java.io.BufferedReader. *in*)
+;;	      bw (java.io.BufferedWriter. *out*)]
+  (let [br (java.io.BufferedReader. *in*)
+	bw (java.io.BufferedWriter. *out*)]
     (doseq [[desc dna-seq] (fasta-desc-dna-str-pairs (line-seq br))]
       (println-string-to-buffered-writer bw desc)
+      (println-string-to-buffered-writer bw dna-seq)
+;;      (print-reverse-complement-of-str-in-lines bw dna-seq
+;;						max-dna-chars-per-line)
+      (. bw flush))
+    ))
+
+
+(comment
+
+(let [max-dna-chars-per-line 60]
+;;  (with-open [br (java.io.BufferedReader. *in*)
+;;	      bw (java.io.BufferedWriter. *out*)]
+  (let [br (java.io.BufferedReader. *in*)
+	bw (java.io.BufferedWriter. *out*)]
+    (doseq [[dna-seq-num [desc dna-seq]]
+	    (map (fn [x y] [x y])
+		 (iterate inc 1)
+		 (fasta-desc-dna-str-pairs (line-seq br)))]
+      (println-string-to-buffered-writer bw (format "%d" dna-seq-num))
+      (println-string-to-buffered-writer bw desc)
+      ;; (. bw flush)
       (print-reverse-complement-of-str-in-lines bw dna-seq
-						max-dna-chars-per-line))))
+						max-dna-chars-per-line)
+      ;; (. bw flush)
+      )
+    (. bw flush)
+    ))
+)
+
 
 (. System (exit 0))
diff --git a/rcomp/revcomp.clj-4.clj b/rcomp/revcomp.clj-4.clj
new file mode 100644
index 0000000..09d5f55
--- /dev/null
+++ b/rcomp/revcomp.clj-4.clj
@@ -0,0 +1,159 @@
+;; Author: Andy Fingerhut (andy_fingerhut@alum.wustl.edu)
+;; Date: Jul 29, 2009
+
+;; I don't see why revcomp.clj-2.clj is using so much memory.  My next
+;; idea is to memory map the input file, and only copy small parts of
+;; it at a time into other buffers, reversed and complemented, for
+;; writing.  Not necessarily very Clojure-ish, but the input data is
+;; too big for the most straightforward kinds of solutions.
+
+;; Unfortunately, I don't think we can memory-map standard input.
+
+;; The next most memory-efficient thing I can think of is to slurp in
+;; part of the input file at a time -- only one DNA sequence's worth
+;; at a time, not a as a sequence of lines, but as a string or byte
+;; array.
+
+
+;;(set! *warn-on-reflection* true)
+
+(defn fasta-description-line
+  "Return true when the line l is a FASTA description line"
+  [l]
+  (= \> (first (seq l))))
+
+
+(defn split-with-2
+  [pred coll]
+  (loop [s (seq coll)
+	 reversed-take-while '()]
+    (if s
+      (if (pred (first s))
+	(recur (next s) (cons (first s) reversed-take-while))
+	[(reverse reversed-take-while) s])
+      [(reverse reversed-take-while) '()])))
+
+
+;; TBD: Try avoiding the use of when-let, in case it might be causing
+;; me to hold on to a head of a sequence when I don't want to.
+
+(defn fasta-desc-dna-str-pairs
+  "Take a sequence of lines from a FASTA format file.  Return a lazy sequence of 2-element vectors [desc dna-seq], where desc is a FASTA description line, and sdna-seq is the concatenated FASTA DNA sequence with that description.  If input file is big, you'll save lots of memory if you call this function in a with-open for the file, and don't hold on to the head of the lines parameter."
+  [lines]
+  (lazy-seq
+    (let [lines (seq (drop-while (fn [l] (not (fasta-description-line l)))
+				 lines))]
+      (when lines
+	(let [[lines-before-next-desc next-desc-line-onwards]
+	      (split-with-2 (fn [l] (not (fasta-description-line l)))
+		            (rest lines))]
+	  (cons [(first lines) (apply str lines-before-next-desc)]
+		(fasta-desc-dna-str-pairs next-desc-line-onwards)))))))
+
+
+(comment
+
+(defn fasta-desc-dna-str-pairs
+  "Take a sequence of lines from a FASTA format file.  Return a lazy sequence of 2-element vectors [desc dna-seq], where desc is a FASTA description line, and sdna-seq is the concatenated FASTA DNA sequence with that description.  If input file is big, you'll save lots of memory if you call this function in a with-open for the file, and don't hold on to the head of the lines parameter."
+  [lines]
+  (lazy-seq
+    (when-let [lines (drop-while (fn [l]
+				   (not (fasta-description-line l)))
+				 lines)]
+      (when-let [lines (seq lines)]
+	(let [[lines-before-next-desc next-desc-line-onwards]
+	      (split-with-2 (fn [l] (not (fasta-description-line l)))
+		            (rest lines))]
+	  (cons [(first lines) (apply str lines-before-next-desc)]
+		(fasta-desc-dna-str-pairs next-desc-line-onwards)))))))
+)
+
+
+(def complement-of-dna-char
+     {\w \W, \W \W,
+      \s \S, \S \S,
+      \a \T, \A \T,
+      \t \A, \T \A,
+      \u \A, \U \A,
+      \g \C, \G \C,
+      \c \G, \C \G,
+      \y \R, \Y \R,
+      \r \Y, \R \Y,
+      \k \M, \K \M,
+      \m \K, \M \K,
+      \b \V, \B \V,
+      \d \H, \D \H,
+      \h \D, \H \D,
+      \v \B, \V \B,
+      \n \N, \N \N })
+
+
+(defn print-reverse-complement-of-str-in-lines [#^java.io.BufferedWriter bw
+						#^java.lang.String s
+						max-len]
+  (let [comp complement-of-dna-char
+	len (int (count s))
+	max-len (int max-len)]
+    (when (> len 0)
+      (loop [start (int (dec len))
+	     to-print-before-nl (int max-len)]
+	(let [next-start (int (dec start))
+	      next-to-print-before-nl (int (dec to-print-before-nl))]
+	  (. bw write (int (comp (. s charAt start))))
+	  (when (zero? next-to-print-before-nl)
+	    (. bw newLine))
+	  (when (not (zero? start))
+	    (if (zero? next-to-print-before-nl)
+	      (recur next-start max-len)
+	      (recur next-start next-to-print-before-nl)))))
+      ;; Need one more newline at the end if the string was not a
+      ;; multiple of max-len characters.
+      (when (not= 0 (rem len max-len))
+	(. bw newLine))
+      )))
+
+
+(defn println-string-to-buffered-writer [#^java.io.BufferedWriter bw
+					 #^java.lang.String s]
+  (. bw write (.toCharArray s) 0 (count s))
+  (. bw newLine))
+
+
+(let [max-dna-chars-per-line 60]
+;;  (with-open [br (java.io.BufferedReader. *in*)
+;;	      bw (java.io.BufferedWriter. *out*)]
+  (let [br (java.io.BufferedReader. *in*)
+	bw (java.io.BufferedWriter. *out*)]
+    (doseq [[desc dna-seq] (fasta-desc-dna-str-pairs (line-seq br))]
+      (println-string-to-buffered-writer bw desc)
+;;      (println-string-to-buffered-writer bw dna-seq)
+      (print-reverse-complement-of-str-in-lines bw dna-seq
+						max-dna-chars-per-line)
+      (. bw flush))
+    ))
+
+
+(comment
+
+(let [max-dna-chars-per-line 60]
+;;  (with-open [br (java.io.BufferedReader. *in*)
+;;	      bw (java.io.BufferedWriter. *out*)]
+  (let [br (java.io.BufferedReader. *in*)
+	bw (java.io.BufferedWriter. *out*)]
+    (doseq [[dna-seq-num [desc dna-seq]]
+	    (map (fn [x y] [x y])
+		 (iterate inc 1)
+		 (fasta-desc-dna-str-pairs (line-seq br)))]
+      (println-string-to-buffered-writer bw (format "%d" dna-seq-num))
+      (println-string-to-buffered-writer bw desc)
+      ;; (. bw flush)
+      (print-reverse-complement-of-str-in-lines bw dna-seq
+						max-dna-chars-per-line)
+      ;; (. bw flush)
+      )
+    (. bw flush)
+    ))
+)
+
+
+(. System (exit 0))