From ba508d879e2517eeacda22e0324948c211ab47f9 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Thu, 29 Jun 2017 20:41:04 +0000 Subject: [PATCH 01/20] pkg moves, sorted/either iterators --- build.sbt | 5 +- .../iterator/BufferedTakeWhileIterator.scala | 17 -- .../hammerlab/iterator/EitherIterator.scala | 78 ++++++ .../iterator/GroupWithIterator.scala | 33 ++- .../iterator/RangeAccruingIterator.scala | 5 +- .../iterator/SortedZipIterator.scala | 71 ++++++ .../iterator/bulk/BufferedBulkIterator.scala | 39 +++ .../range/OverlappingRangesIterator.scala | 71 ++++++ .../org/hammerlab/iterator/range/Range.scala | 42 +++ .../{ => sliding}/Sliding2Iterator.scala | 4 +- .../{ => sliding}/Sliding3Iterator.scala | 4 +- .../{ => sliding}/SlidingIterator.scala | 4 +- .../BufferedTakeWhileIteratorTest.scala | 25 -- .../iterator/EitherIteratorTest.scala | 147 +++++++++++ .../iterator/GroupWithIteratorTest.scala | 10 +- .../iterator/RunLengthIteratorTest.scala | 17 ++ .../iterator/SimpleBufferedIteratorTest.scala | 3 +- .../iterator/SortedZipIteratorTest.scala | 240 ++++++++++++++++++ .../org/hammerlab/iterator/TestIterator.scala | 20 -- .../bulk/BufferedCollectWhileTest.scala | 52 ++++ .../iterator/bulk/BufferedDropWhileTest.scala | 85 +++++++ .../iterator/bulk/BufferedTakeWhileTest.scala | 92 +++++++ .../range/OverlappingRangesIteratorTest.scala | 114 +++++++++ .../{ => sliding}/Sliding2OptTest.scala | 4 +- .../{ => sliding}/Sliding2PadTest.scala | 4 +- .../{ => sliding}/Sliding2PrevTest.scala | 4 +- .../iterator/{ => sliding}/Sliding2Test.scala | 4 +- .../{ => sliding}/Sliding3NextOptsTest.scala | 4 +- .../{ => sliding}/Sliding3OptTest.scala | 4 +- .../iterator/{ => sliding}/Sliding3Test.scala | 4 +- .../{ => sliding}/SlidingIteratorTest.scala | 4 +- 31 files changed, 1118 insertions(+), 92 deletions(-) delete mode 100644 src/main/scala/org/hammerlab/iterator/BufferedTakeWhileIterator.scala create mode 100644 src/main/scala/org/hammerlab/iterator/EitherIterator.scala create mode 100644 src/main/scala/org/hammerlab/iterator/SortedZipIterator.scala create mode 100644 src/main/scala/org/hammerlab/iterator/bulk/BufferedBulkIterator.scala create mode 100644 src/main/scala/org/hammerlab/iterator/range/OverlappingRangesIterator.scala create mode 100644 src/main/scala/org/hammerlab/iterator/range/Range.scala rename src/main/scala/org/hammerlab/iterator/{ => sliding}/Sliding2Iterator.scala (94%) rename src/main/scala/org/hammerlab/iterator/{ => sliding}/Sliding3Iterator.scala (94%) rename src/main/scala/org/hammerlab/iterator/{ => sliding}/SlidingIterator.scala (91%) delete mode 100644 src/test/scala/org/hammerlab/iterator/BufferedTakeWhileIteratorTest.scala create mode 100644 src/test/scala/org/hammerlab/iterator/EitherIteratorTest.scala create mode 100644 src/test/scala/org/hammerlab/iterator/SortedZipIteratorTest.scala delete mode 100644 src/test/scala/org/hammerlab/iterator/TestIterator.scala create mode 100644 src/test/scala/org/hammerlab/iterator/bulk/BufferedCollectWhileTest.scala create mode 100644 src/test/scala/org/hammerlab/iterator/bulk/BufferedDropWhileTest.scala create mode 100644 src/test/scala/org/hammerlab/iterator/bulk/BufferedTakeWhileTest.scala create mode 100644 src/test/scala/org/hammerlab/iterator/range/OverlappingRangesIteratorTest.scala rename src/test/scala/org/hammerlab/iterator/{ => sliding}/Sliding2OptTest.scala (82%) rename src/test/scala/org/hammerlab/iterator/{ => sliding}/Sliding2PadTest.scala (81%) rename src/test/scala/org/hammerlab/iterator/{ => sliding}/Sliding2PrevTest.scala (82%) rename src/test/scala/org/hammerlab/iterator/{ => sliding}/Sliding2Test.scala (79%) rename src/test/scala/org/hammerlab/iterator/{ => sliding}/Sliding3NextOptsTest.scala (92%) rename src/test/scala/org/hammerlab/iterator/{ => sliding}/Sliding3OptTest.scala (92%) rename src/test/scala/org/hammerlab/iterator/{ => sliding}/Sliding3Test.scala (89%) rename src/test/scala/org/hammerlab/iterator/{ => sliding}/SlidingIteratorTest.scala (92%) diff --git a/build.sbt b/build.sbt index 4cd6c26..7d30574 100644 --- a/build.sbt +++ b/build.sbt @@ -1,12 +1,13 @@ name := "iterator" -version := "1.2.2" +version := "1.3.0-SNAPSHOT" addScala212 deps ++= Seq( libs.value('commons_math), - kryo.value, "com.chuusai" %% "shapeless" % "2.3.2", libs.value('spire) ) + +testDeps += kryo.value diff --git a/src/main/scala/org/hammerlab/iterator/BufferedTakeWhileIterator.scala b/src/main/scala/org/hammerlab/iterator/BufferedTakeWhileIterator.scala deleted file mode 100644 index e3b823f..0000000 --- a/src/main/scala/org/hammerlab/iterator/BufferedTakeWhileIterator.scala +++ /dev/null @@ -1,17 +0,0 @@ -package org.hammerlab.iterator - -case class BufferedTakeWhileIterator[T](it: BufferedIterator[T]) { - def takewhile(fn: T ⇒ Boolean): BufferedIterator[T] = - new SimpleBufferedIterator[T] { - override protected def _advance: Option[T] = - if (it.hasNext && fn(it.head)) - Some(it.next) - else - None - } -} - -object BufferedTakeWhileIterator { - implicit def makeBufferedTakeWhileIterator[T](it: BufferedIterator[T]): BufferedTakeWhileIterator[T] = - BufferedTakeWhileIterator(it) -} diff --git a/src/main/scala/org/hammerlab/iterator/EitherIterator.scala b/src/main/scala/org/hammerlab/iterator/EitherIterator.scala new file mode 100644 index 0000000..865ed42 --- /dev/null +++ b/src/main/scala/org/hammerlab/iterator/EitherIterator.scala @@ -0,0 +1,78 @@ +package org.hammerlab.iterator + +import org.hammerlab.iterator.bulk.BufferedBulkIterator._ + +import scala.collection.mutable.ArrayBuffer + +case class EitherIterator[T, U](it: BufferedIterator[Either[T, U]]) { + + def findLeft: Option[T] = + it + .collect { + case Left(t) ⇒ t + } + .buffered + .headOption + + def groupByLeft: BufferedIterator[(T, Iterator[U])] = { + + // Clear out any leading Rights + it + .collectwhile { + case Right(_) ⇒ null + } + .toList + + new SimpleBufferedIterator[(T, BufferedIterator[U])] { + var curLeft: Option[T] = None + var curRights = Iterator[U]().buffered + override protected def _advance: Option[(T, BufferedIterator[U])] = { + + // Clear any unused elements from the previous rights/U's/"values" iterator + curRights.toList + + it + .nextOption + .collect { + case Left(t) ⇒ + curRights = + it + .collectwhile { + case Right(u) ⇒ u + } + + t → curRights + case Right(u) ⇒ + throw new IllegalStateException( + s"nextOption should not be a Right" + ) + } + } + } + } + + def roundUpRight: BufferedIterator[(Seq[T], U)] = + new SimpleBufferedIterator[(Seq[T], U)] { + override protected def _advance: Option[(Seq[T], U)] = { + val lefts = ArrayBuffer[T]() + while (true) { + it.headOption match { + case Some(Left(t)) ⇒ + it.next + lefts += t + case Some(Right(u)) ⇒ + it.next + return Some(lefts → u) + case None ⇒ + return None + } + } + ??? + } + } +} + +object EitherIterator { + implicit def makeEitherIterator[T, U](it: Iterator[Either[T, U]]): EitherIterator[T, U] = + EitherIterator(it.buffered) +} diff --git a/src/main/scala/org/hammerlab/iterator/GroupWithIterator.scala b/src/main/scala/org/hammerlab/iterator/GroupWithIterator.scala index 5931ea9..4f563a5 100644 --- a/src/main/scala/org/hammerlab/iterator/GroupWithIterator.scala +++ b/src/main/scala/org/hammerlab/iterator/GroupWithIterator.scala @@ -1,6 +1,6 @@ package org.hammerlab.iterator -import BufferedTakeWhileIterator._ +import org.hammerlab.iterator.bulk.BufferedBulkIterator._ /** * Group one sorted iterator with another, emitting an iterator of the latter's elements for each of the former's @@ -27,6 +27,37 @@ case class GroupWithIterator[T](it: BufferedIterator[T]) { ) ) } + + def sortedZip[U, V: Ordering](other: Iterator[U])( + implicit + tv: T ⇒ V, + uv: U ⇒ V + ): Iterator[Either[T, U]] = { + val o = other.buffered + val ord = implicitly[Ordering[V]] + new SimpleBufferedIterator[Either[T, U]] { + override protected def _advance: Option[Either[T, U]] = { + (it.headOption, o.headOption) match { + case (None, None) ⇒ None + case (Some(t), None) ⇒ + it.next + Some(Left(t)) + case (None, Some(u)) ⇒ + o.next + Some(Right(u)) + case (Some(t), Some(u)) ⇒ + ord.compare(tv(t), uv(u)) match { + case x if x > 0 ⇒ + o.next + Some(Right(u)) + case _ ⇒ + it.next + Some(Left(t)) + } + } + } + } + } } object GroupWithIterator { diff --git a/src/main/scala/org/hammerlab/iterator/RangeAccruingIterator.scala b/src/main/scala/org/hammerlab/iterator/RangeAccruingIterator.scala index 9a78de8..f61e775 100644 --- a/src/main/scala/org/hammerlab/iterator/RangeAccruingIterator.scala +++ b/src/main/scala/org/hammerlab/iterator/RangeAccruingIterator.scala @@ -1,5 +1,7 @@ package org.hammerlab.iterator +import scala.Range + /** * Given an [[Iterator]] of [[Int]]s, collapse contiguous "ranges" of integers that are each 1 greater than their * predecessor. @@ -9,7 +11,8 @@ package org.hammerlab.iterator * * See RangeAccruingIteratorTest for more examples. */ -class RangeAccruingIterator(it: Iterator[Int]) extends Iterator[Range] { +class RangeAccruingIterator(it: Iterator[Int]) + extends Iterator[Range] { var anchor = -1 diff --git a/src/main/scala/org/hammerlab/iterator/SortedZipIterator.scala b/src/main/scala/org/hammerlab/iterator/SortedZipIterator.scala new file mode 100644 index 0000000..111789e --- /dev/null +++ b/src/main/scala/org/hammerlab/iterator/SortedZipIterator.scala @@ -0,0 +1,71 @@ +package org.hammerlab.iterator + +import shapeless.Lazy + +case class SortedZipIterator[T](l: BufferedIterator[T]) { + + def sortedZip[V: Ordering](other: Iterator[T])( + implicit + tv: T ⇒ V + ): SimpleBufferedIterator[T] = { + val r = other.buffered + val ≤ = implicitly[Ordering[V]].lteq _ + new SimpleBufferedIterator[T] { + override protected def _advance: Option[T] = + (l.headOption, r.headOption) match { + case (Some(t), Some(u)) ⇒ + if (≤(t, u)) { + l.next + Some(t) + } else { + r.next + Some(u) + } + case (Some(t), _) ⇒ + l.next + Some(t) + case (_, Some(u)) ⇒ + r.next + Some(u) + case _ ⇒ + None + } + } + } + + def sortedEitherZip[U, V](other: Iterator[U])( + implicit + tv: T ⇒ V, + uv: U ⇒ V, + ord: Ordering[V] + ): SimpleBufferedIterator[Either[T, U]] = { + val r = other.buffered + val ≤ = ord.lteq _ + new SimpleBufferedIterator[Either[T, U]] { + override protected def _advance: Option[Either[T, U]] = + (l.headOption, r.headOption) match { + case (Some(t), Some(u)) ⇒ + if (≤(t, u)) { + l.next + Some(Left(t)) + } else { + r.next + Some(Right(u)) + } + case (Some(t), _) ⇒ + l.next + Some(Left(t)) + case (_, Some(u)) ⇒ + r.next + Some(Right(u)) + case _ ⇒ + None + } + } + } +} + +object SortedZipIterator { + implicit def makeSortedZipIterator[T](it: Iterator[T]): SortedZipIterator[T] = + SortedZipIterator(it.buffered) +} diff --git a/src/main/scala/org/hammerlab/iterator/bulk/BufferedBulkIterator.scala b/src/main/scala/org/hammerlab/iterator/bulk/BufferedBulkIterator.scala new file mode 100644 index 0000000..8265b02 --- /dev/null +++ b/src/main/scala/org/hammerlab/iterator/bulk/BufferedBulkIterator.scala @@ -0,0 +1,39 @@ +package org.hammerlab.iterator.bulk + +import org.hammerlab.iterator.SimpleBufferedIterator + +/** + * Some smarter bulk operations on [[BufferedIterator]]s + */ +case class BufferedBulkIterator[T](it: BufferedIterator[T]) { + def takewhile(fn: T ⇒ Boolean): SimpleBufferedIterator[T] = + new SimpleBufferedIterator[T] { + override protected def _advance: Option[T] = + if (it.hasNext && fn(it.head)) + Some(it.next) + else + None + } + + def dropwhile(fn: T ⇒ Boolean): Unit = + while (it.hasNext && fn(it.head)) + it.next + + def collectwhile[U](pf: PartialFunction[T, U]): BufferedIterator[U] = + new SimpleBufferedIterator[U] { + override protected def _advance: Option[U] = + if (it.hasNext && pf.isDefinedAt(it.head)) + Some( + pf( + it.next + ) + ) + else + None + } +} + +object BufferedBulkIterator { + implicit def makeBufferedBulkWhileIterator[T](it: BufferedIterator[T]): BufferedBulkIterator[T] = + BufferedBulkIterator(it) +} diff --git a/src/main/scala/org/hammerlab/iterator/range/OverlappingRangesIterator.scala b/src/main/scala/org/hammerlab/iterator/range/OverlappingRangesIterator.scala new file mode 100644 index 0000000..d5e288f --- /dev/null +++ b/src/main/scala/org/hammerlab/iterator/range/OverlappingRangesIterator.scala @@ -0,0 +1,71 @@ +package org.hammerlab.iterator.range + +import org.hammerlab.iterator.SimpleBufferedIterator + +import scala.collection.mutable + +case class OverlappingRangesIterator[T: Ordering](it: BufferedIterator[Range[T]]) { + + type RangeT = (T, Option[T]) + + val ≤ = implicitly[Ordering[T]].lteq _ + + implicit val orderZippedRangeByEndOpt: Ordering[(Range[T], Int)] = + Ordering + .by[(Range[T], Int), Option[T]](_._1.endOpt) + .reverse + + def joinOverlaps(other: Iterable[Range[T]]): Iterator[(Range[T], Vector[(Range[T], Int)])] = + joinOverlaps(other.iterator.buffered) + + def joinOverlaps(other: BufferedIterator[Range[T]]): Iterator[(Range[T], Vector[(Range[T], Int)])] = { + val queue = mutable.PriorityQueue[(Range[T], Int)]() + + val zippedOther = + other + .zipWithIndex + .buffered + + new SimpleBufferedIterator[(Range[T], Vector[(Range[T], Int)])] { + override protected def _advance: Option[(Range[T], Vector[(Range[T], Int)])] = + it + .nextOption + .map { + elem ⇒ + while (queue.headOption.exists(!_._1.∩(elem))) { + queue.dequeue() + } + + while ( + zippedOther + .headOption + .flatMap(_._1.endOpt) + .exists(≤(_, elem.start)) + ) { + zippedOther.next + } + + while ( + zippedOther + .headOption + .exists(_._1.∩(elem)) + ) { + queue.enqueue(zippedOther.next) + } + + elem → + queue + .toVector + .sortBy(_._2) + } + } + } +} + +object OverlappingRangesIterator { + implicit def makeOverlappingRangesIteratorFromIterable[T: Ordering](it: Iterable[Range[T]]): OverlappingRangesIterator[T] = + OverlappingRangesIterator(it.iterator.buffered) + + implicit def makeOverlappingRangesIterator[T: Ordering](it: Iterator[Range[T]]): OverlappingRangesIterator[T] = + OverlappingRangesIterator(it.buffered) +} diff --git a/src/main/scala/org/hammerlab/iterator/range/Range.scala b/src/main/scala/org/hammerlab/iterator/range/Range.scala new file mode 100644 index 0000000..75dce05 --- /dev/null +++ b/src/main/scala/org/hammerlab/iterator/range/Range.scala @@ -0,0 +1,42 @@ +package org.hammerlab.iterator.range + +case class Range[T](start: T, endOpt: Option[T]) { + def ∩(right: Range[T])(implicit ord: Ordering[T]): Boolean = { + val ≤ = ord.lteq _ + val Range(rightStart, rightEndOpt) = right + if (≤(start, rightStart)) + !endOpt.exists(≤(_, rightStart)) + else + !rightEndOpt.exists(≤(_, start)) + } + + override def toString: String = + s"[$start,${endOpt.getOrElse("∞")})" +} + +object Range { + def apply[T](start: T, end: T): Range[T] = Range(start, Some(end)) + def apply[T](start: T): Range[T] = Range(start, None) + + implicit def endOptOrdering[T](implicit ord: Ordering[T]): Ordering[Option[T]] = + new Ordering[Option[T]] { + override def compare(x: Option[T], y: Option[T]): Int = + (x, y) match { + case (None, None) ⇒ 0 + case (None, _) ⇒ -1 + case (_, None) ⇒ 1 + case (Some(x), Some(y)) ⇒ ord.compare(x, y) + } + } + + implicit def orderByStartThenEnd[T: Ordering]: Ordering[Range[T]] = { + + implicit val tupleOrdering = + Ordering.Tuple2[T, Option[T]] + + Ordering.by[Range[T], (T, Option[T])] { + case Range(start, endOpt) ⇒ + start → endOpt + } + } +} diff --git a/src/main/scala/org/hammerlab/iterator/Sliding2Iterator.scala b/src/main/scala/org/hammerlab/iterator/sliding/Sliding2Iterator.scala similarity index 94% rename from src/main/scala/org/hammerlab/iterator/Sliding2Iterator.scala rename to src/main/scala/org/hammerlab/iterator/sliding/Sliding2Iterator.scala index 23d6195..9d7fa08 100644 --- a/src/main/scala/org/hammerlab/iterator/Sliding2Iterator.scala +++ b/src/main/scala/org/hammerlab/iterator/sliding/Sliding2Iterator.scala @@ -1,4 +1,6 @@ -package org.hammerlab.iterator +package org.hammerlab.iterator.sliding + +import org.hammerlab.iterator.SimpleBufferedIterator case class Sliding2Iterator[T](it: BufferedIterator[T]) { def sliding2Prev: Iterator[(Option[T], T)] = diff --git a/src/main/scala/org/hammerlab/iterator/Sliding3Iterator.scala b/src/main/scala/org/hammerlab/iterator/sliding/Sliding3Iterator.scala similarity index 94% rename from src/main/scala/org/hammerlab/iterator/Sliding3Iterator.scala rename to src/main/scala/org/hammerlab/iterator/sliding/Sliding3Iterator.scala index a6becb0..60b1f96 100644 --- a/src/main/scala/org/hammerlab/iterator/Sliding3Iterator.scala +++ b/src/main/scala/org/hammerlab/iterator/sliding/Sliding3Iterator.scala @@ -1,4 +1,6 @@ -package org.hammerlab.iterator +package org.hammerlab.iterator.sliding + +import org.hammerlab.iterator.{ NextOptionIterator, SimpleBufferedIterator } /** * Given an [[Iterator[T]]], emit each element sandwiched between its preceding and succeeding elements. diff --git a/src/main/scala/org/hammerlab/iterator/SlidingIterator.scala b/src/main/scala/org/hammerlab/iterator/sliding/SlidingIterator.scala similarity index 91% rename from src/main/scala/org/hammerlab/iterator/SlidingIterator.scala rename to src/main/scala/org/hammerlab/iterator/sliding/SlidingIterator.scala index 79bdfd2..ea4214b 100644 --- a/src/main/scala/org/hammerlab/iterator/SlidingIterator.scala +++ b/src/main/scala/org/hammerlab/iterator/sliding/SlidingIterator.scala @@ -1,4 +1,6 @@ -package org.hammerlab.iterator +package org.hammerlab.iterator.sliding + +import org.hammerlab.iterator.SimpleBufferedIterator import scala.collection.mutable.ArrayBuffer diff --git a/src/test/scala/org/hammerlab/iterator/BufferedTakeWhileIteratorTest.scala b/src/test/scala/org/hammerlab/iterator/BufferedTakeWhileIteratorTest.scala deleted file mode 100644 index b4b1c10..0000000 --- a/src/test/scala/org/hammerlab/iterator/BufferedTakeWhileIteratorTest.scala +++ /dev/null @@ -1,25 +0,0 @@ -package org.hammerlab.iterator - -import org.hammerlab.iterator.BufferedTakeWhileIterator._ -import org.hammerlab.test.Suite - -class BufferedTakeWhileIteratorTest - extends Suite { - test("simple") { - val it = TestIterator(1 to 10: _*).buffered - it.takewhile(_ < 5).toList should be(1 to 4) - it.toList should be(5 to 10) - } - - test("take none") { - val it = TestIterator(1 to 10: _*).buffered - it.takewhile(_ < 1).toList should be(Nil) - it.toList should be(1 to 10) - } - - test("take all") { - val it = TestIterator(1 to 10: _*).buffered - it.takewhile(_ < 11).toList should be(1 to 10) - it.toList should be(Nil) - } -} diff --git a/src/test/scala/org/hammerlab/iterator/EitherIteratorTest.scala b/src/test/scala/org/hammerlab/iterator/EitherIteratorTest.scala new file mode 100644 index 0000000..4d8d137 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/EitherIteratorTest.scala @@ -0,0 +1,147 @@ +package org.hammerlab.iterator + +import org.hammerlab.iterator.EitherIterator._ +import org.hammerlab.iterator.EitherIteratorTest._ +import org.hammerlab.test.Suite + +class FindLeftTest + extends Suite { + + implicit def intOpt(n: Int): Option[Int] = Some(n) + + def check(elems: Either[Int, String]*)(expected: Option[Int] = None): Unit = { + eithers(elems).findLeft should be(expected) + } + + test("findleft") { + check(4, 5, "abc", 6)(4) + check("abc", 6, "def", 7)(6) + check("abc", "def", 8)(8) + check()() + check("abc")() + check("abc", "def")() + check(4)(4) + } +} + +class GroupByLeftTest + extends Suite { + + def check(elems: Either[Int, String]*)(expected: (Int, String)*): Unit = + eithers(elems) + .groupByLeft + .map { + case (num, strings) ⇒ + num → + strings.mkString("") + } + .toList should be( + expected + ) + + test("simple") { + check( + 1, "a", "b", "c", + 2, + 3, + 4, "d", + 5 + )( + 1 → "abc", + 2 → "", + 3 → "", + 4 → "d", + 5 → "" + ) + } + + test("rights first and last") { + check( + "a", "b", "c", + 1, "d", "e", + 2, + 3, "f", "g" + )( + 1 → "de", + 2 → "", + 3 → "fg" + ) + } + + test("rights not consumed") { + eithers( + Seq( + 1, "a", "b", "c", + 2, + 3, "d", + 4, "e", "f", + 5 + ) + ) + .groupByLeft + .map { + case (num, strings) ⇒ + num → + strings + .buffered + .headOption + .getOrElse("???") + } + .toList should be( + Seq( + 1 → "a", + 2 → "???", + 3 → "d", + 4 → "e", + 5 → "???" + ) + ) + } +} + +class RoundUpRightTest + extends Suite { + + def check(elems: Either[Int, String]*)(expected: (Seq[Int], String)*): Unit = { + eithers(elems).roundUpRight.toList should be(expected) + } + + test("mixed") { + check( + 1, 2, 3, "abc", + "def", + 4, "ghi", + 5 + )( + Seq(1, 2, 3) → "abc", + Nil → "def", + Seq(4) → "ghi" + ) + } + + test("rights first") { + check( + "abc", + "def", + 1, "ghi", + "jkl", + 2, 3, "mno", + "pqr" + )( + Nil → "abc", + Nil → "def", + Seq(1) → "ghi", + Nil → "jkl", + Seq(2, 3) → "mno", + Nil → "pqr" + ) + } + +} + +object EitherIteratorTest { + implicit def leftInt(n: Int): Left[Int, String] = Left(n) + implicit def rightString(s: String): Right[Int, String] = Right(s) + + def eithers(elems: Seq[Either[Int, String]]): BufferedIterator[Either[Int, String]] = Iterator(elems: _*).buffered +} diff --git a/src/test/scala/org/hammerlab/iterator/GroupWithIteratorTest.scala b/src/test/scala/org/hammerlab/iterator/GroupWithIteratorTest.scala index cca220b..82f279e 100644 --- a/src/test/scala/org/hammerlab/iterator/GroupWithIteratorTest.scala +++ b/src/test/scala/org/hammerlab/iterator/GroupWithIteratorTest.scala @@ -1,7 +1,7 @@ package org.hammerlab.iterator +import org.hammerlab.iterator.GroupWithIterator._ import org.hammerlab.test.Suite -import GroupWithIterator._ class GroupWithIteratorTest extends Suite { @@ -9,9 +9,9 @@ class GroupWithIteratorTest implicit def stringToInt(s: String): Int = augmentString(s).toInt test("mixed") { - TestIterator(2, 4, 6, 8, 10) + Iterator(2, 4, 6, 8, 10) .groupWith[String, Int]( - TestIterator("1", "2", "3", "4", "5", "5", "7", "11") + Iterator("1", "2", "3", "4", "5", "5", "7", "11") ) .toList .map { @@ -29,9 +29,9 @@ class GroupWithIteratorTest } test("left empty") { - TestIterator[Int]() + Iterator[Int]() .groupWith[String, Int]( - TestIterator("1", "2", "3", "4", "5", "5", "7", "11") + Iterator("1", "2", "3", "4", "5", "5", "7", "11") ) .toList .map { diff --git a/src/test/scala/org/hammerlab/iterator/RunLengthIteratorTest.scala b/src/test/scala/org/hammerlab/iterator/RunLengthIteratorTest.scala index c60d842..d794d9f 100644 --- a/src/test/scala/org/hammerlab/iterator/RunLengthIteratorTest.scala +++ b/src/test/scala/org/hammerlab/iterator/RunLengthIteratorTest.scala @@ -70,4 +70,21 @@ class RunLengthIteratorTest extends Suite { } } + { + test("re-encode") { + reencode( + Iterator('a' → 2, 'a' → 1, 'b' → 3, 'a' → 4, 'c' → 1, 'c' → 1, 'c' → 2, 'a' → 1) + ) + .toList should be( + List( + 'a' → 3, + 'b' → 3, + 'a' → 4, + 'c' → 4, + 'a' → 1 + ) + ) + } + } + } diff --git a/src/test/scala/org/hammerlab/iterator/SimpleBufferedIteratorTest.scala b/src/test/scala/org/hammerlab/iterator/SimpleBufferedIteratorTest.scala index e193b20..fd23f17 100644 --- a/src/test/scala/org/hammerlab/iterator/SimpleBufferedIteratorTest.scala +++ b/src/test/scala/org/hammerlab/iterator/SimpleBufferedIteratorTest.scala @@ -1,9 +1,8 @@ package org.hammerlab.iterator +import org.hammerlab.iterator.SimpleBufferedIterator._ import org.hammerlab.test.Suite -import SimpleBufferedIterator._ - class SimpleBufferedIteratorTest extends Suite { diff --git a/src/test/scala/org/hammerlab/iterator/SortedZipIteratorTest.scala b/src/test/scala/org/hammerlab/iterator/SortedZipIteratorTest.scala new file mode 100644 index 0000000..84375bc --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/SortedZipIteratorTest.scala @@ -0,0 +1,240 @@ +package org.hammerlab.iterator + +import org.hammerlab.iterator.SortedZipIterator._ +import org.hammerlab.test.Suite + +import scala.collection.immutable.StringOps + +abstract class SortedZipIteratorTest + extends Suite { + + type Result + + def check(left: Int*)(right: Int*)(expected: Result*): Unit +} + +trait SortedZipIntsTest { + + self: SortedZipIteratorTest ⇒ + + def L(t: Int): Result + def R(u: Int): Result + + test("135 246") { + check( + 1, 3, 5 + )( + 2, 4, 6 + )( + L(1), + R(2), + L(3), + R(4), + L(5), + R(6) + ) + } + + test("123 456") { + check( + 1, 2, 3 + )( + 4, 5, 6 + )( + L(1), + L(2), + L(3), + R(4), + R(5), + R(6) + ) + } + + test("123 123") { + check( + 1, 2, 3 + )( + 1, 2, 3 + )( + L(1), + R(1), + L(2), + R(2), + L(3), + R(3) + ) + } + + test("both empty") { + check()()() + } + + test("one empty one 1") { + check()(1)(R(1)) + } + + test("one empty one 3") { + check( + + )( + 1, 10, 100 + )( + R(1), + R(10), + R(100) + ) + } + + test("L 1 R empty") { + check(1)()(L(1)) + } + + test("L 3 R empty") { + check( + 1, 10, 100 + )( + + )( + L(1), + L(10), + L(100) + ) + } +} + +abstract class SortedEitherZip + extends SortedZipIteratorTest { + + type L + type R + type Result = Either[L, R] + + override def check(left: Int*)(right: Int*)(expected: Result*): Unit = { + left + .iterator + .sortedEitherZip(right.iterator) + .toList should be( + expected + ) + } + + def L(l: L): Result = Left(l) + def R(r: R): Result = Right(r) +} + +class SortedEitherInts + extends SortedEitherZip + with SortedZipIntsTest { + + type L = Int + type R = Int +} + +trait IntStringEitherTest + extends SortedEitherZip { + + /** + * Workaround [[strlen]] making [[augmentString]] implicit (for accessing + * [[scala.collection.immutable.StringLike.*]]) ambiguous. + */ + implicit class StringMult(val s: String) { + def x(n: Int): String = (s: StringOps) * n + } + + case class WrappedInt(n: Int) + implicit val wrapInt: Int ⇒ WrappedInt = WrappedInt + implicit val unwrapWrappedInt: WrappedInt ⇒ Int = _.n + + val wrappedInts = + Seq[WrappedInt]( + 1, + 2, + 4, + 7, + 10, + 15 + ) + + val strings = + Seq( + "", + "a", + "a", + "bb", + "c" x 3, + "e" x 5, + "f" x 6, + "k" x 11, + "n" x 14 + ) + + implicit def strlen(s: String): Int = s.length +} + +class SortedEitherIntStringTest + extends IntStringEitherTest { + + override type L = WrappedInt + override type R = String + + test("different types") { + wrappedInts + .iterator + .sortedEitherZip[String, Int]( + strings.iterator + ) + .toList should be( + Seq[Either[WrappedInt, String]]( + R(""), + L(1), + R("a"), + R("a"), + L(2), + R("bb"), + R("ccc"), + L(4), + R("eeeee"), + R("ffffff"), + L(7), + L(10), + R("kkkkkkkkkkk"), + R("nnnnnnnnnnnnnn"), + L(15) + ) + ) + } +} + +class SortedEitherStringIntTest + extends IntStringEitherTest { + + override type L = String + override type R = WrappedInt + + test("different types") { + strings + .iterator + .sortedEitherZip[WrappedInt, Int]( + wrappedInts.iterator + ) + .toList should be( + Seq[Either[String, WrappedInt]]( + L(""), + L("a"), + L("a"), + R(1), + L("bb"), + R(2), + L("ccc"), + R(4), + L("eeeee"), + L("ffffff"), + R(7), + R(10), + L("kkkkkkkkkkk"), + L("nnnnnnnnnnnnnn"), + R(15) + ) + ) + } +} diff --git a/src/test/scala/org/hammerlab/iterator/TestIterator.scala b/src/test/scala/org/hammerlab/iterator/TestIterator.scala deleted file mode 100644 index 71e2e35..0000000 --- a/src/test/scala/org/hammerlab/iterator/TestIterator.scala +++ /dev/null @@ -1,20 +0,0 @@ -package org.hammerlab.iterator - -/** - * [[Iterator]]s constructed via [[Iterator.apply]] or [[scala.collection.IndexedSeqLike.iterator]] are tricky about - * what state they are left in after various operations are applied, cf. https://github.com/scala/bug/issues/9274. - * - * This class allows easy creation of vanilla [[Iterator]]s with no funny-business, for testing extended iterator - * functionality like [[BufferedTakeWhileIteratorTest]], which passes erroneously with naively-created [[Iterator]]s. - */ -case class TestIterator[T](elems: T*) - extends Iterator[T] { - var idx = 0 - override def hasNext: Boolean = idx < elems.size - - override def next(): T = { - val elem = elems(idx) - idx += 1 - elem - } -} diff --git a/src/test/scala/org/hammerlab/iterator/bulk/BufferedCollectWhileTest.scala b/src/test/scala/org/hammerlab/iterator/bulk/BufferedCollectWhileTest.scala new file mode 100644 index 0000000..c948ff1 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/bulk/BufferedCollectWhileTest.scala @@ -0,0 +1,52 @@ +package org.hammerlab.iterator.bulk + +import org.hammerlab.iterator.EitherIteratorTest._ +import org.hammerlab.iterator.bulk.BufferedBulkIterator._ +import org.hammerlab.test.Suite + +class BufferedCollectWhileTest + extends Suite { + + def check(elems: Either[Int, String]*)(expectedInts: Int*): Unit = { + eithers(elems) + .collectwhile { + case Left(n) ⇒ n + } + .toList should be( + expectedInts + ) + } + + test("two") { + check( + 4, 5, "abc", 6 + )( + 4, 5 + ) + } + + test("none") { + check( + "abc", 4, 5, 6 + )( + + ) + } + + test("all") { + check( + 4, 5, 6 + )( + 4, 5, 6 + ) + } + + test("empty") { + check( + + )( + + ) + } + +} diff --git a/src/test/scala/org/hammerlab/iterator/bulk/BufferedDropWhileTest.scala b/src/test/scala/org/hammerlab/iterator/bulk/BufferedDropWhileTest.scala new file mode 100644 index 0000000..5e84c03 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/bulk/BufferedDropWhileTest.scala @@ -0,0 +1,85 @@ +package org.hammerlab.iterator.bulk + +import org.hammerlab.iterator.bulk.BufferedBulkIterator._ +import org.hammerlab.test.Suite + +class BufferedDropWhileTest + extends Suite { + + def check(fn: Int ⇒ Boolean, + expectedRemainingElems: Seq[Int])( + implicit elems: Seq[Int] + ): Unit = { + //Seq(1,2,3).iterator.take(3) + val it = Iterator(elems: _*).buffered + it.dropwhile(fn) + it.toList should be(expectedRemainingElems) + } + + { + implicit val elems = 1 to 10 + + test("[1,10] half") { + check( + _ < 5, + 5 to 10 + ) + } + + test("[1,10] none") { + check( + _ < 1, + 1 to 10 + ) + } + + test("[1,10] one") { + check( + _ == 1, + 2 to 10 + ) + } + + test("[1,10] almost all") { + check( + _ < 10, + Seq(10) + ) + } + + test("[1,10] all") { + check( + _ < 11, + Nil + ) + } + } + + test("empty") { + check( + _ ⇒ true, + Nil + )( + Nil + ) + } + + { + implicit val elems = Seq(1) + + test("one, all") { + check( + _ ⇒ true, + Nil + ) + } + + test("one, none") { + check( + _ ⇒ false, + Seq(1) + ) + } + } + +} diff --git a/src/test/scala/org/hammerlab/iterator/bulk/BufferedTakeWhileTest.scala b/src/test/scala/org/hammerlab/iterator/bulk/BufferedTakeWhileTest.scala new file mode 100644 index 0000000..74e0c7b --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/bulk/BufferedTakeWhileTest.scala @@ -0,0 +1,92 @@ +package org.hammerlab.iterator.bulk + +import org.hammerlab.iterator.bulk.BufferedBulkIterator._ +import org.hammerlab.test.Suite + +class BufferedTakeWhileTest + extends Suite { + + def check(fn: Int ⇒ Boolean, + expectedElems: Seq[Int], + expectedRemainingElems: Seq[Int])( + implicit elems: Seq[Int] + ): Unit = { + val it = Iterator(elems: _*).buffered + it.takewhile(fn).toList should be(expectedElems) + it.toList should be(expectedRemainingElems) + } + + { + implicit val elems = 1 to 10 + + test("[1,10] half") { + check( + _ < 5, + 1 to 4, + 5 to 10 + ) + } + + test("[1,10] none") { + check( + _ < 1, + Nil, + 1 to 10 + ) + } + + test("[1,10] one") { + check( + _ == 1, + Seq(1), + 2 to 10 + ) + } + + test("[1,10] almost all") { + check( + _ < 10, + 1 to 9, + Seq(10) + ) + } + + test("[1,10] all") { + check( + _ < 11, + 1 to 10, + Nil + ) + } + } + + test("empty") { + check( + _ ⇒ true, + Nil, + Nil + )( + Nil + ) + } + + { + implicit val elems = Seq(1) + + test("one, all") { + check( + _ ⇒ true, + Seq(1), + Nil + ) + } + + test("one, none") { + check( + _ ⇒ false, + Nil, + Seq(1) + ) + } + } +} diff --git a/src/test/scala/org/hammerlab/iterator/range/OverlappingRangesIteratorTest.scala b/src/test/scala/org/hammerlab/iterator/range/OverlappingRangesIteratorTest.scala new file mode 100644 index 0000000..d007de1 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/range/OverlappingRangesIteratorTest.scala @@ -0,0 +1,114 @@ +package org.hammerlab.iterator.range + +import org.hammerlab.iterator.range.OverlappingRangesIterator._ +import org.hammerlab.test.Suite + +class OverlappingRangesIteratorTest + extends Suite { + + def Ranges(ranges: (Range[Int], Int)*): Seq[(Range[Int], Int)] = ranges + + implicit def intToRange(n: Int): Range[Int] = Range(n, None) + implicit def pairToIntRange(t: (Int, Int)): Range[Int] = Range(t._1, t._2) + implicit def indexedPairToRange(t: ((Int, Int), Int)): (Range[Int], Int) = (t._1, t._2) + + implicit def makeJoinedRangesElem(t: (Int, Seq[Range[Int]])): (Range[Int], Seq[Range[Int]]) = + (t._1: Range[Int]) → t._2 + + implicit def makeHalfOpenJoinedRangesElem(t: ((Int, Int), Seq[Range[Int]])): (Range[Int], Seq[Range[Int]]) = + (t._1: Range[Int]) → t._2 + + def check(left: Range[Int]*)(right: Range[Int]*)(expected: Seq[(Range[Int], Int)]*): Unit = { + left + .iterator + .joinOverlaps( + right + .iterator + .buffered + ) + .toList should be( + left.zip(expected) + ) + } + + test("trivial case") { + check( + 1 → 3 + )( + 1 → 3 + )( + Seq(Range(1, 3) → 0) + ) + } + + test("stable right-side ordering") { + check( + 2 → 4, + 5 → 7, + 8 + )( + 0 → 2, + 1 → 3, + 2 → 5, + 4 → 5, + 4 → 6, + 4 → 9, + 4 → 8, + 6 → 7, + 7 → 11, + 13 → 14, + 13 + )( + Seq(Range(1, 3) → 1, Range(2, 5) → 2), + Seq(Range(4, 6) → 4, Range(4, 9) → 5, Range( 4, 8) → 6, Range( 6, 7) → 7), + Seq(Range(4, 9) → 5, Range(7, 11) → 8, Range(13, 14) → 9, Range(13) → 10) + ) + } + + test("all rights before") { + check( + 5 → 10, + 7 → 9 + )( + 0 → 2, + 0 → 3, + 1 → 5 + )( + Nil, + Nil + ) + } + + test("all rights after") { + check( + 5 → 10, + 7 → 9 + )( + 10 → 12, + 10 → 13, + 11 → 15 + )( + Nil, + Nil + ) + } + + test("fully skipped rights") { + check( + 2 → 4, + 3 → 5, + 10 → 15 + )( + 1 → 2, + 3 → 4, + 5 → 10, + 6 → 9, + 11 → 12, + 15 → 20 + )( + Seq(Range( 3, 4) → 1), + Seq(Range( 3, 4) → 1), + Seq(Range(11, 12) → 4) + ) + } +} diff --git a/src/test/scala/org/hammerlab/iterator/Sliding2OptTest.scala b/src/test/scala/org/hammerlab/iterator/sliding/Sliding2OptTest.scala similarity index 82% rename from src/test/scala/org/hammerlab/iterator/Sliding2OptTest.scala rename to src/test/scala/org/hammerlab/iterator/sliding/Sliding2OptTest.scala index 3316899..1513ea8 100644 --- a/src/test/scala/org/hammerlab/iterator/Sliding2OptTest.scala +++ b/src/test/scala/org/hammerlab/iterator/sliding/Sliding2OptTest.scala @@ -1,7 +1,7 @@ -package org.hammerlab.iterator +package org.hammerlab.iterator.sliding +import org.hammerlab.iterator.sliding.Sliding2Iterator._ import org.hammerlab.test.Suite -import Sliding2Iterator._ class Sliding2OptTest extends Suite { test("empty") { diff --git a/src/test/scala/org/hammerlab/iterator/Sliding2PadTest.scala b/src/test/scala/org/hammerlab/iterator/sliding/Sliding2PadTest.scala similarity index 81% rename from src/test/scala/org/hammerlab/iterator/Sliding2PadTest.scala rename to src/test/scala/org/hammerlab/iterator/sliding/Sliding2PadTest.scala index 4a07e3f..d8199a3 100644 --- a/src/test/scala/org/hammerlab/iterator/Sliding2PadTest.scala +++ b/src/test/scala/org/hammerlab/iterator/sliding/Sliding2PadTest.scala @@ -1,7 +1,7 @@ -package org.hammerlab.iterator +package org.hammerlab.iterator.sliding +import org.hammerlab.iterator.sliding.Sliding2Iterator._ import org.hammerlab.test.Suite -import Sliding2Iterator._ class Sliding2PadTest extends Suite { diff --git a/src/test/scala/org/hammerlab/iterator/Sliding2PrevTest.scala b/src/test/scala/org/hammerlab/iterator/sliding/Sliding2PrevTest.scala similarity index 82% rename from src/test/scala/org/hammerlab/iterator/Sliding2PrevTest.scala rename to src/test/scala/org/hammerlab/iterator/sliding/Sliding2PrevTest.scala index 50481c5..63a46b9 100644 --- a/src/test/scala/org/hammerlab/iterator/Sliding2PrevTest.scala +++ b/src/test/scala/org/hammerlab/iterator/sliding/Sliding2PrevTest.scala @@ -1,7 +1,7 @@ -package org.hammerlab.iterator +package org.hammerlab.iterator.sliding +import org.hammerlab.iterator.sliding.Sliding2Iterator._ import org.hammerlab.test.Suite -import Sliding2Iterator._ class Sliding2PrevTest extends Suite { test("empty") { diff --git a/src/test/scala/org/hammerlab/iterator/Sliding2Test.scala b/src/test/scala/org/hammerlab/iterator/sliding/Sliding2Test.scala similarity index 79% rename from src/test/scala/org/hammerlab/iterator/Sliding2Test.scala rename to src/test/scala/org/hammerlab/iterator/sliding/Sliding2Test.scala index db0a3f9..9507be1 100644 --- a/src/test/scala/org/hammerlab/iterator/Sliding2Test.scala +++ b/src/test/scala/org/hammerlab/iterator/sliding/Sliding2Test.scala @@ -1,7 +1,7 @@ -package org.hammerlab.iterator +package org.hammerlab.iterator.sliding +import org.hammerlab.iterator.sliding.Sliding2Iterator._ import org.hammerlab.test.Suite -import Sliding2Iterator._ class Sliding2Test extends Suite { diff --git a/src/test/scala/org/hammerlab/iterator/Sliding3NextOptsTest.scala b/src/test/scala/org/hammerlab/iterator/sliding/Sliding3NextOptsTest.scala similarity index 92% rename from src/test/scala/org/hammerlab/iterator/Sliding3NextOptsTest.scala rename to src/test/scala/org/hammerlab/iterator/sliding/Sliding3NextOptsTest.scala index 87ce787..1283e36 100644 --- a/src/test/scala/org/hammerlab/iterator/Sliding3NextOptsTest.scala +++ b/src/test/scala/org/hammerlab/iterator/sliding/Sliding3NextOptsTest.scala @@ -1,6 +1,6 @@ -package org.hammerlab.iterator +package org.hammerlab.iterator.sliding -import org.hammerlab.iterator.Sliding3Iterator._ +import org.hammerlab.iterator.sliding.Sliding3Iterator._ import org.hammerlab.test.Suite class Sliding3NextOptsTest extends Suite { diff --git a/src/test/scala/org/hammerlab/iterator/Sliding3OptTest.scala b/src/test/scala/org/hammerlab/iterator/sliding/Sliding3OptTest.scala similarity index 92% rename from src/test/scala/org/hammerlab/iterator/Sliding3OptTest.scala rename to src/test/scala/org/hammerlab/iterator/sliding/Sliding3OptTest.scala index df72a10..7d01e54 100644 --- a/src/test/scala/org/hammerlab/iterator/Sliding3OptTest.scala +++ b/src/test/scala/org/hammerlab/iterator/sliding/Sliding3OptTest.scala @@ -1,6 +1,6 @@ -package org.hammerlab.iterator +package org.hammerlab.iterator.sliding -import org.hammerlab.iterator.Sliding3Iterator._ +import org.hammerlab.iterator.sliding.Sliding3Iterator._ import org.hammerlab.test.Suite class Sliding3OptTest extends Suite { diff --git a/src/test/scala/org/hammerlab/iterator/Sliding3Test.scala b/src/test/scala/org/hammerlab/iterator/sliding/Sliding3Test.scala similarity index 89% rename from src/test/scala/org/hammerlab/iterator/Sliding3Test.scala rename to src/test/scala/org/hammerlab/iterator/sliding/Sliding3Test.scala index 0c7801a..3ab293c 100644 --- a/src/test/scala/org/hammerlab/iterator/Sliding3Test.scala +++ b/src/test/scala/org/hammerlab/iterator/sliding/Sliding3Test.scala @@ -1,6 +1,6 @@ -package org.hammerlab.iterator +package org.hammerlab.iterator.sliding -import org.hammerlab.iterator.Sliding3Iterator._ +import org.hammerlab.iterator.sliding.Sliding3Iterator._ import org.hammerlab.test.Suite class Sliding3Test extends Suite { diff --git a/src/test/scala/org/hammerlab/iterator/SlidingIteratorTest.scala b/src/test/scala/org/hammerlab/iterator/sliding/SlidingIteratorTest.scala similarity index 92% rename from src/test/scala/org/hammerlab/iterator/SlidingIteratorTest.scala rename to src/test/scala/org/hammerlab/iterator/sliding/SlidingIteratorTest.scala index 0291979..f37671c 100644 --- a/src/test/scala/org/hammerlab/iterator/SlidingIteratorTest.scala +++ b/src/test/scala/org/hammerlab/iterator/sliding/SlidingIteratorTest.scala @@ -1,7 +1,7 @@ -package org.hammerlab.iterator +package org.hammerlab.iterator.sliding +import org.hammerlab.iterator.sliding.SlidingIterator._ import org.hammerlab.test.Suite -import SlidingIterator._ class SlidingIteratorTest extends Suite { From 163db345462d50eb0d98941538a994fabc30ecf7 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Thu, 6 Jul 2017 22:57:57 +0000 Subject: [PATCH 02/20] rework sorted/zip iterators --- .../hammerlab/iterator/EitherIterator.scala | 1 - .../iterator/GroupWithIterator.scala | 31 --- .../iterator/SortedZipIterator.scala | 71 ------ .../iterator/sorted/EitherZipIterator.scala | 52 ++++ .../iterator/sorted/OrZipIterator.scala | 86 +++++++ .../iterator/sorted/ZipIterator.scala | 48 ++++ src/main/scala/org/hammerlab/types/Or.scala | 21 ++ .../iterator/SortedZipIteratorTest.scala | 240 ------------------ .../iterator/sorted/EitherIntStringTest.scala | 40 +++ .../iterator/sorted/EitherInts.scala | 67 +++++ .../hammerlab/iterator/sorted/EitherOr.scala | 11 + .../iterator/sorted/EitherStringIntTest.scala | 40 +++ .../hammerlab/iterator/sorted/EitherZip.scala | 21 ++ .../iterator/sorted/IntStringEitherTest.scala | 44 ++++ .../org/hammerlab/iterator/sorted/OrZip.scala | 86 +++++++ .../iterator/sorted/ZipIntsTest.scala | 79 ++++++ .../iterator/sorted/ZipIteratorTest.scala | 13 + 17 files changed, 608 insertions(+), 343 deletions(-) delete mode 100644 src/main/scala/org/hammerlab/iterator/SortedZipIterator.scala create mode 100644 src/main/scala/org/hammerlab/iterator/sorted/EitherZipIterator.scala create mode 100644 src/main/scala/org/hammerlab/iterator/sorted/OrZipIterator.scala create mode 100644 src/main/scala/org/hammerlab/iterator/sorted/ZipIterator.scala create mode 100644 src/main/scala/org/hammerlab/types/Or.scala delete mode 100644 src/test/scala/org/hammerlab/iterator/SortedZipIteratorTest.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/EitherIntStringTest.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/EitherInts.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/EitherOr.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/EitherStringIntTest.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/EitherZip.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/IntStringEitherTest.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/OrZip.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/ZipIntsTest.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/ZipIteratorTest.scala diff --git a/src/main/scala/org/hammerlab/iterator/EitherIterator.scala b/src/main/scala/org/hammerlab/iterator/EitherIterator.scala index 865ed42..c5fecf6 100644 --- a/src/main/scala/org/hammerlab/iterator/EitherIterator.scala +++ b/src/main/scala/org/hammerlab/iterator/EitherIterator.scala @@ -1,7 +1,6 @@ package org.hammerlab.iterator import org.hammerlab.iterator.bulk.BufferedBulkIterator._ - import scala.collection.mutable.ArrayBuffer case class EitherIterator[T, U](it: BufferedIterator[Either[T, U]]) { diff --git a/src/main/scala/org/hammerlab/iterator/GroupWithIterator.scala b/src/main/scala/org/hammerlab/iterator/GroupWithIterator.scala index 4f563a5..3ea028a 100644 --- a/src/main/scala/org/hammerlab/iterator/GroupWithIterator.scala +++ b/src/main/scala/org/hammerlab/iterator/GroupWithIterator.scala @@ -27,37 +27,6 @@ case class GroupWithIterator[T](it: BufferedIterator[T]) { ) ) } - - def sortedZip[U, V: Ordering](other: Iterator[U])( - implicit - tv: T ⇒ V, - uv: U ⇒ V - ): Iterator[Either[T, U]] = { - val o = other.buffered - val ord = implicitly[Ordering[V]] - new SimpleBufferedIterator[Either[T, U]] { - override protected def _advance: Option[Either[T, U]] = { - (it.headOption, o.headOption) match { - case (None, None) ⇒ None - case (Some(t), None) ⇒ - it.next - Some(Left(t)) - case (None, Some(u)) ⇒ - o.next - Some(Right(u)) - case (Some(t), Some(u)) ⇒ - ord.compare(tv(t), uv(u)) match { - case x if x > 0 ⇒ - o.next - Some(Right(u)) - case _ ⇒ - it.next - Some(Left(t)) - } - } - } - } - } } object GroupWithIterator { diff --git a/src/main/scala/org/hammerlab/iterator/SortedZipIterator.scala b/src/main/scala/org/hammerlab/iterator/SortedZipIterator.scala deleted file mode 100644 index 111789e..0000000 --- a/src/main/scala/org/hammerlab/iterator/SortedZipIterator.scala +++ /dev/null @@ -1,71 +0,0 @@ -package org.hammerlab.iterator - -import shapeless.Lazy - -case class SortedZipIterator[T](l: BufferedIterator[T]) { - - def sortedZip[V: Ordering](other: Iterator[T])( - implicit - tv: T ⇒ V - ): SimpleBufferedIterator[T] = { - val r = other.buffered - val ≤ = implicitly[Ordering[V]].lteq _ - new SimpleBufferedIterator[T] { - override protected def _advance: Option[T] = - (l.headOption, r.headOption) match { - case (Some(t), Some(u)) ⇒ - if (≤(t, u)) { - l.next - Some(t) - } else { - r.next - Some(u) - } - case (Some(t), _) ⇒ - l.next - Some(t) - case (_, Some(u)) ⇒ - r.next - Some(u) - case _ ⇒ - None - } - } - } - - def sortedEitherZip[U, V](other: Iterator[U])( - implicit - tv: T ⇒ V, - uv: U ⇒ V, - ord: Ordering[V] - ): SimpleBufferedIterator[Either[T, U]] = { - val r = other.buffered - val ≤ = ord.lteq _ - new SimpleBufferedIterator[Either[T, U]] { - override protected def _advance: Option[Either[T, U]] = - (l.headOption, r.headOption) match { - case (Some(t), Some(u)) ⇒ - if (≤(t, u)) { - l.next - Some(Left(t)) - } else { - r.next - Some(Right(u)) - } - case (Some(t), _) ⇒ - l.next - Some(Left(t)) - case (_, Some(u)) ⇒ - r.next - Some(Right(u)) - case _ ⇒ - None - } - } - } -} - -object SortedZipIterator { - implicit def makeSortedZipIterator[T](it: Iterator[T]): SortedZipIterator[T] = - SortedZipIterator(it.buffered) -} diff --git a/src/main/scala/org/hammerlab/iterator/sorted/EitherZipIterator.scala b/src/main/scala/org/hammerlab/iterator/sorted/EitherZipIterator.scala new file mode 100644 index 0000000..d3c8d30 --- /dev/null +++ b/src/main/scala/org/hammerlab/iterator/sorted/EitherZipIterator.scala @@ -0,0 +1,52 @@ +package org.hammerlab.iterator.sorted + +import org.hammerlab.iterator.{ HeadOptionIterator, SimpleBufferedIterator } + +case class EitherZipIterator[T](l: BufferedIterator[T]) { + def sortedEitherZip[U, V](other: Iterable[U])( + implicit + ord: Ordering[V], + tv: T ⇒ V, + uv: U ⇒ V + ): SimpleBufferedIterator[Either[T, U]] = + sortedEitherZip(other.iterator) + + def sortedEitherZip[U, V](other: Iterator[U])( + implicit + ord: Ordering[V], + tv: T ⇒ V, + uv: U ⇒ V + ): SimpleBufferedIterator[Either[T, U]] = { + val r = other.buffered + val ≤ = ord.lteq _ + new SimpleBufferedIterator[Either[T, U]] { + override protected def _advance: Option[Either[T, U]] = + (l.headOption, r.headOption) match { + case (Some(t), Some(u)) ⇒ + if (≤(t, u)) { + l.next + Some(Left(t)) + } else { + r.next + Some(Right(u)) + } + case (Some(t), _) ⇒ + l.next + Some(Left(t)) + case (_, Some(u)) ⇒ + r.next + Some(Right(u)) + case _ ⇒ + None + } + } + } +} + +object EitherZipIterator { + implicit def makeEitherZipIterator[T](it: Iterator[T]): EitherZipIterator[T] = + EitherZipIterator(it.buffered) + + implicit def makeEitherZipIteratorFromIterable[T](it: Iterable[T]): EitherZipIterator[T] = + EitherZipIterator(it.iterator.buffered) +} diff --git a/src/main/scala/org/hammerlab/iterator/sorted/OrZipIterator.scala b/src/main/scala/org/hammerlab/iterator/sorted/OrZipIterator.scala new file mode 100644 index 0000000..4dfd4fb --- /dev/null +++ b/src/main/scala/org/hammerlab/iterator/sorted/OrZipIterator.scala @@ -0,0 +1,86 @@ +package org.hammerlab.iterator.sorted + +import org.hammerlab.iterator.{ HeadOptionIterator, SimpleBufferedIterator } +import org.hammerlab.types.{ Both, LO, Or, RO } + +case class OrZipIterator[T](l: BufferedIterator[T]) { +// def sortedOrZip[U, V](other: Iterable[U])( +// implicit +// tv: T ⇒ V, +// uv: U ⇒ V, +// ord: Ordering[V] +// ): SimpleBufferedIterator[Or[T, U]] = +// sortedOrZip(other.iterator)(tv, uv, ord) + +// type Aux[U, V0] = CmpType[T, U] { type V = V0 } + + def sortedOrZip[U, V](other: Iterator[U])( + implicit + ord: Ordering[V], + tv: T ⇒ V, + uv: U ⇒ V + ): SimpleBufferedIterator[Or[T, U]] = { + val r = other.buffered + new SimpleBufferedIterator[Or[T, U]] { + override protected def _advance: Option[Or[T, U]] = + (l.headOption, r.headOption) match { + case (Some(t), Some(u)) ⇒ + Some( + ord.compare(t, u) match { + case 0 ⇒ + l.next + r.next + Both(t, u) + case x if x < 0 ⇒ + l.next + LO(t) + case _ ⇒ + r.next + RO(u) + } + ) + case (Some(t), _) ⇒ + l.next + Some(LO(t)) + case (_, Some(u)) ⇒ + r.next + Some(RO(u)) + case _ ⇒ + None + } + } + } +} + +trait CmpType[T, U] { + type V + implicit def tv: T ⇒ V + implicit def uv: U ⇒ V + def ord: Ordering[V] +} + +object CmpType { + + type Aux[T, U, V0] = CmpType[T, U] { type V = V0 } + + implicit def makeCmp[T, U, V0]( + implicit + tv0: T ⇒ V0, + uv0: U ⇒ V0, + ord0: Ordering[V0] + ): CmpType.Aux[T, U, V0] = + new CmpType[T, U] { + type V = V0 + override implicit def tv: (T) ⇒ V0 = tv0 + override implicit def uv: (U) ⇒ V0 = uv0 + override def ord: Ordering[V0] = ord0 + } +} + +object OrZipIterator { + implicit def makeOrZipIterator[T](it: Iterator[T]): OrZipIterator[T] = + OrZipIterator(it.buffered) + + implicit def makeOrZipIteratorFromIterable[T](it: Iterable[T]): OrZipIterator[T] = + OrZipIterator(it.iterator.buffered) +} diff --git a/src/main/scala/org/hammerlab/iterator/sorted/ZipIterator.scala b/src/main/scala/org/hammerlab/iterator/sorted/ZipIterator.scala new file mode 100644 index 0000000..21f7ebb --- /dev/null +++ b/src/main/scala/org/hammerlab/iterator/sorted/ZipIterator.scala @@ -0,0 +1,48 @@ +package org.hammerlab.iterator.sorted + +import org.hammerlab.iterator.{ HeadOptionIterator, SimpleBufferedIterator } + +case class ZipIterator[T](l: BufferedIterator[T]) { + def sortedZip[V: Ordering](other: Iterable[T])( + implicit + tv: T ⇒ V + ): SimpleBufferedIterator[T] = + sortedZip[V](other) + + def sortedZip[V: Ordering](other: Iterator[T])( + implicit + tv: T ⇒ V + ): SimpleBufferedIterator[T] = { + val r = other.buffered + val ≤ = implicitly[Ordering[V]].lteq _ + new SimpleBufferedIterator[T] { + override protected def _advance: Option[T] = + (l.headOption, r.headOption) match { + case (Some(t), Some(u)) ⇒ + if (≤(t, u)) { + l.next + Some(t) + } else { + r.next + Some(u) + } + case (Some(t), _) ⇒ + l.next + Some(t) + case (_, Some(u)) ⇒ + r.next + Some(u) + case _ ⇒ + None + } + } + } +} + +object ZipIterator { + implicit def makeZipIterator[T](it: Iterator[T]): ZipIterator[T] = + ZipIterator(it.buffered) + + implicit def makeZipIteratorFromIterable[T](it: Iterable[T]): ZipIterator[T] = + ZipIterator(it.iterator.buffered) +} diff --git a/src/main/scala/org/hammerlab/types/Or.scala b/src/main/scala/org/hammerlab/types/Or.scala new file mode 100644 index 0000000..c1b2642 --- /dev/null +++ b/src/main/scala/org/hammerlab/types/Or.scala @@ -0,0 +1,21 @@ +package org.hammerlab.types + +sealed trait Or[+L, +R] + +object Or { + def apply[L, R](l: L, r: Option[R]): Or[L, R] = + r match { + case Some(r) ⇒ Both(l, r) + case None ⇒ LO(l) + } + + def apply[L, R](l: Option[L], r: R): Or[L, R] = + l match { + case Some(l) ⇒ Both(l, r) + case None ⇒ RO(r) + } +} + +final case class LO[+L, +R](l: L) extends Or[L, R] +final case class RO[+L, +R](r: R) extends Or[L, R] +final case class Both[+L, +R](l: L, r: R) extends Or[L, R] diff --git a/src/test/scala/org/hammerlab/iterator/SortedZipIteratorTest.scala b/src/test/scala/org/hammerlab/iterator/SortedZipIteratorTest.scala deleted file mode 100644 index 84375bc..0000000 --- a/src/test/scala/org/hammerlab/iterator/SortedZipIteratorTest.scala +++ /dev/null @@ -1,240 +0,0 @@ -package org.hammerlab.iterator - -import org.hammerlab.iterator.SortedZipIterator._ -import org.hammerlab.test.Suite - -import scala.collection.immutable.StringOps - -abstract class SortedZipIteratorTest - extends Suite { - - type Result - - def check(left: Int*)(right: Int*)(expected: Result*): Unit -} - -trait SortedZipIntsTest { - - self: SortedZipIteratorTest ⇒ - - def L(t: Int): Result - def R(u: Int): Result - - test("135 246") { - check( - 1, 3, 5 - )( - 2, 4, 6 - )( - L(1), - R(2), - L(3), - R(4), - L(5), - R(6) - ) - } - - test("123 456") { - check( - 1, 2, 3 - )( - 4, 5, 6 - )( - L(1), - L(2), - L(3), - R(4), - R(5), - R(6) - ) - } - - test("123 123") { - check( - 1, 2, 3 - )( - 1, 2, 3 - )( - L(1), - R(1), - L(2), - R(2), - L(3), - R(3) - ) - } - - test("both empty") { - check()()() - } - - test("one empty one 1") { - check()(1)(R(1)) - } - - test("one empty one 3") { - check( - - )( - 1, 10, 100 - )( - R(1), - R(10), - R(100) - ) - } - - test("L 1 R empty") { - check(1)()(L(1)) - } - - test("L 3 R empty") { - check( - 1, 10, 100 - )( - - )( - L(1), - L(10), - L(100) - ) - } -} - -abstract class SortedEitherZip - extends SortedZipIteratorTest { - - type L - type R - type Result = Either[L, R] - - override def check(left: Int*)(right: Int*)(expected: Result*): Unit = { - left - .iterator - .sortedEitherZip(right.iterator) - .toList should be( - expected - ) - } - - def L(l: L): Result = Left(l) - def R(r: R): Result = Right(r) -} - -class SortedEitherInts - extends SortedEitherZip - with SortedZipIntsTest { - - type L = Int - type R = Int -} - -trait IntStringEitherTest - extends SortedEitherZip { - - /** - * Workaround [[strlen]] making [[augmentString]] implicit (for accessing - * [[scala.collection.immutable.StringLike.*]]) ambiguous. - */ - implicit class StringMult(val s: String) { - def x(n: Int): String = (s: StringOps) * n - } - - case class WrappedInt(n: Int) - implicit val wrapInt: Int ⇒ WrappedInt = WrappedInt - implicit val unwrapWrappedInt: WrappedInt ⇒ Int = _.n - - val wrappedInts = - Seq[WrappedInt]( - 1, - 2, - 4, - 7, - 10, - 15 - ) - - val strings = - Seq( - "", - "a", - "a", - "bb", - "c" x 3, - "e" x 5, - "f" x 6, - "k" x 11, - "n" x 14 - ) - - implicit def strlen(s: String): Int = s.length -} - -class SortedEitherIntStringTest - extends IntStringEitherTest { - - override type L = WrappedInt - override type R = String - - test("different types") { - wrappedInts - .iterator - .sortedEitherZip[String, Int]( - strings.iterator - ) - .toList should be( - Seq[Either[WrappedInt, String]]( - R(""), - L(1), - R("a"), - R("a"), - L(2), - R("bb"), - R("ccc"), - L(4), - R("eeeee"), - R("ffffff"), - L(7), - L(10), - R("kkkkkkkkkkk"), - R("nnnnnnnnnnnnnn"), - L(15) - ) - ) - } -} - -class SortedEitherStringIntTest - extends IntStringEitherTest { - - override type L = String - override type R = WrappedInt - - test("different types") { - strings - .iterator - .sortedEitherZip[WrappedInt, Int]( - wrappedInts.iterator - ) - .toList should be( - Seq[Either[String, WrappedInt]]( - L(""), - L("a"), - L("a"), - R(1), - L("bb"), - R(2), - L("ccc"), - R(4), - L("eeeee"), - L("ffffff"), - R(7), - R(10), - L("kkkkkkkkkkk"), - L("nnnnnnnnnnnnnn"), - R(15) - ) - ) - } -} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/EitherIntStringTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/EitherIntStringTest.scala new file mode 100644 index 0000000..32b0bf0 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/EitherIntStringTest.scala @@ -0,0 +1,40 @@ +package org.hammerlab.iterator.sorted + +import org.hammerlab.iterator.sorted.EitherZipIterator._ + +class EitherIntStringTest + extends IntStringEitherTest { + + override type L = WrappedInt + override type R = String + + override implicit def tv = unwrapWrappedInt + override implicit def uv = strlen + + test("different types") { + wrappedInts + .iterator + .sortedEitherZip[String, Int]( + strings.iterator + ) + .toList should be( + Seq[Either[WrappedInt, String]]( + R(""), + L(1), + R("a"), + R("a"), + L(2), + R("bb"), + R("ccc"), + L(4), + R("eeeee"), + R("ffffff"), + L(7), + L(10), + R("kkkkkkkkkkk"), + R("nnnnnnnnnnnnnn"), + L(15) + ) + ) + } +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/EitherInts.scala b/src/test/scala/org/hammerlab/iterator/sorted/EitherInts.scala new file mode 100644 index 0000000..d9c7a30 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/EitherInts.scala @@ -0,0 +1,67 @@ +package org.hammerlab.iterator.sorted + +class EitherInts + extends EitherZip + with ZipIntsTest { + + override def expected: Map[String, Seq[Either[Int, Int]]] = + Map( + "1,2,3 4,5,6" → + Seq( + L(1), + L(2), + L(3), + R(4), + R(5), + R(6) + ), + "1,3,5 2,4,6" → + Seq( + L(1), + R(2), + L(3), + R(4), + L(5), + R(6) + ), + "1,2,3 1,2,3" → + Seq( + L(1), + R(1), + L(2), + R(2), + L(3), + R(3) + ), + "1,2,4,7,9 1,3,5,6,7,8" → + Seq( + L(1), + R(1), + L(2), + R(3), + L(4), + R(5), + R(6), + L(7), + R(7), + R(8), + L(9) + ), + "empty empty" → Nil, + "empty 1" → Seq(R(1)), + "empty 1,10,100" → + Seq( + R(1), + R(10), + R(100) + ), + "1 empty" → Seq(L(1)), + "1,10,100 empty" → + Seq( + L(1), + L(10), + L(100) + ) + ) +} + diff --git a/src/test/scala/org/hammerlab/iterator/sorted/EitherOr.scala b/src/test/scala/org/hammerlab/iterator/sorted/EitherOr.scala new file mode 100644 index 0000000..48f171f --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/EitherOr.scala @@ -0,0 +1,11 @@ +package org.hammerlab.iterator.sorted + +trait EitherOr { + self: ZipIteratorTest ⇒ + + type V = Int + + implicit def tv: L ⇒ V + implicit def uv: R ⇒ V + implicit def ord: Ordering[V] = Ordering.Int +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/EitherStringIntTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/EitherStringIntTest.scala new file mode 100644 index 0000000..20d7822 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/EitherStringIntTest.scala @@ -0,0 +1,40 @@ +package org.hammerlab.iterator.sorted + +import org.hammerlab.iterator.sorted.EitherZipIterator._ + +class EitherStringIntTest + extends IntStringEitherTest { + + override type L = String + override type R = WrappedInt + + override implicit def tv = strlen + override implicit def uv = unwrapWrappedInt + + test("different types") { + strings + .iterator + .sortedEitherZip[WrappedInt, Int]( + wrappedInts.iterator + ) + .toList should be( + Seq[Either[String, WrappedInt]]( + L(""), + L("a"), + L("a"), + R(1), + L("bb"), + R(2), + L("ccc"), + R(4), + L("eeeee"), + L("ffffff"), + R(7), + R(10), + L("kkkkkkkkkkk"), + L("nnnnnnnnnnnnnn"), + R(15) + ) + ) + } +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/EitherZip.scala b/src/test/scala/org/hammerlab/iterator/sorted/EitherZip.scala new file mode 100644 index 0000000..4dbb8a3 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/EitherZip.scala @@ -0,0 +1,21 @@ +package org.hammerlab.iterator.sorted + +import org.hammerlab.iterator.sorted.EitherZipIterator._ + +abstract class EitherZip + extends ZipIteratorTest + with EitherOr { + + type Result = Either[L, R] + + override def check(left: L*)(right: R*)(expected: Result*): Unit = { + left + .sortedEitherZip(right) + .toList should be( + expected + ) + } + + def L(l: L): Result = Left(l) + def R(r: R): Result = Right(r) +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/IntStringEitherTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/IntStringEitherTest.scala new file mode 100644 index 0000000..ff5054e --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/IntStringEitherTest.scala @@ -0,0 +1,44 @@ +package org.hammerlab.iterator.sorted + +import scala.collection.immutable.StringOps + +trait IntStringEitherTest + extends EitherZip { + + /** + * Workaround [[strlen]] making [[augmentString]] implicit (for accessing + * [[scala.collection.immutable.StringLike.*]]) ambiguous. + */ + implicit class StringMult(val s: String) { + def x(n: Int): String = (s: StringOps) * n + } + + case class WrappedInt(n: Int) + implicit val wrapInt: Int ⇒ WrappedInt = WrappedInt + val unwrapWrappedInt: WrappedInt ⇒ Int = _.n + + val wrappedInts = + Seq[WrappedInt]( + 1, + 2, + 4, + 7, + 10, + 15 + ) + + val strings = + Seq( + "", + "a", + "a", + "b" x 2, + "c" x 3, + "e" x 5, + "f" x 6, + "k" x 11, + "n" x 14 + ) + + def strlen(s: String): Int = s.length +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/OrZip.scala b/src/test/scala/org/hammerlab/iterator/sorted/OrZip.scala new file mode 100644 index 0000000..3f236ae --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/OrZip.scala @@ -0,0 +1,86 @@ +package org.hammerlab.iterator.sorted + +import org.hammerlab.iterator.sorted.OrZipIterator._ +import org.hammerlab.types.{ Both, LO, Or, RO } + +abstract class OrZip + extends ZipIteratorTest + with EitherOr { + + type Result = Or[L, R] + + override def check(left: L*)(right: R*)(expected: Result*): Unit = { + left + .iterator + .sortedOrZip(right.iterator) + .toList should be( + expected + ) + } + + def B(l: L)(implicit ev: L =:= R): Result = Both(l, l) + def B(l: L, r: R): Result = Both(l, r) + + def L(l: L): Result = LO(l) + def R(r: R): Result = RO(r) +} + +class OrInts + extends OrZip + with ZipIntsTest { + + override def expected: Map[String, Seq[Or[Int, Int]]] = + Map( + "1,2,3 4,5,6" → + Seq( + L(1), + L(2), + L(3), + R(4), + R(5), + R(6) + ), + "1,3,5 2,4,6" → + Seq( + L(1), + R(2), + L(3), + R(4), + L(5), + R(6) + ), + "1,2,3 1,2,3" → + Seq( + B(1), + B(2), + B(3) + ), + "1,2,4,7,9 1,3,5,6,7,8" → + Seq( + B(1), + L(2), + R(3), + L(4), + R(5), + R(6), + B(7), + R(8), + L(9) + ), + "empty empty" → Nil, + "empty 1" → Seq(R(1)), + "empty 1,10,100" → + Seq( + R(1), + R(10), + R(100) + ), + "1 empty" → Seq(L(1)), + "1,10,100 empty" → + Seq( + L(1), + L(10), + L(100) + ) + ) +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/ZipIntsTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/ZipIntsTest.scala new file mode 100644 index 0000000..bad3ac6 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/ZipIntsTest.scala @@ -0,0 +1,79 @@ +package org.hammerlab.iterator.sorted + +trait ZipIntsTest { + + self: ZipIteratorTest with EitherOr ⇒ + + type L = Int + type R = Int + + override implicit def tv: Int ⇒ Int = identity + override implicit def uv: Int ⇒ Int = identity + + def L(t: L): Result + def R(u: R): Result + + def expected: Map[String, Seq[Result]] + + def test(l: Int*)(r: Int*): Unit = { + def str(n: Seq[Int]): String = + if (n.isEmpty) + "empty" + else + n.mkString(",") + + val name = s"${str(l)} ${str(r)}" + + test(name) { + check( + l: _* + )( + r: _* + )( + expected(name): _* + ) + } + } + + test( + 1, 3, 5 + )( + 2, 4, 6 + ) + + test( + 1, 2, 3 + )( + 4, 5, 6 + ) + + test( + 1, 2, 3 + )( + 1, 2, 3 + ) + + test( + 1, 2, 4, 7, 9 + )( + 1, 3, 5, 6, 7, 8 + ) + + test()() + + test()(1) + + test( + + )( + 1, 10, 100 + ) + + test(1)() + + test( + 1, 10, 100 + )( + + ) +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/ZipIteratorTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/ZipIteratorTest.scala new file mode 100644 index 0000000..ac8d3eb --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/ZipIteratorTest.scala @@ -0,0 +1,13 @@ +package org.hammerlab.iterator.sorted + +import org.hammerlab.test.Suite + +abstract class ZipIteratorTest + extends Suite { + + type L + type R + type Result + + def check(left: L*)(right: R*)(expected: Result*): Unit +} From 243d59f658f09c7a6c006fa9765ee6630ea12710 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Thu, 6 Jul 2017 23:23:41 +0000 Subject: [PATCH 03/20] either/or sorted-zip tests --- .../iterator/sorted/OrZipIterator.scala | 41 ++------- .../iterator/sorted/ZipIterator.scala | 10 ++- .../iterator/sorted/ConvertToInt.scala | 21 +++++ .../iterator/sorted/EitherIntStringTest.scala | 40 --------- .../hammerlab/iterator/sorted/EitherOr.scala | 11 --- .../iterator/sorted/EitherStringIntTest.scala | 40 --------- .../hammerlab/iterator/sorted/EitherZip.scala | 21 ----- .../iterator/sorted/IntStringTest.scala | 18 ++++ .../{ZipIntsTest.scala => IntsTest.scala} | 15 +--- .../iterator/sorted/StringIntTest.scala | 18 ++++ .../org/hammerlab/iterator/sorted/Suite.scala | 13 +++ ...itherTest.scala => WrappedIntString.scala} | 7 +- .../iterator/sorted/ZipIteratorTest.scala | 13 --- .../sorted/either/IntStringTest.scala | 28 +++++++ .../IntsTest.scala} | 12 ++- .../sorted/either/StringIntTest.scala | 29 +++++++ .../iterator/sorted/either/Suite.scala | 23 +++++ .../iterator/sorted/or/IntStringTest.scala | 26 ++++++ .../sorted/{OrZip.scala => or/IntsTest.scala} | 36 ++------ .../iterator/sorted/or/StringIntTest.scala | 26 ++++++ .../hammerlab/iterator/sorted/or/Suite.scala | 29 +++++++ .../iterator/sorted/zip/IntsTest.scala | 83 +++++++++++++++++++ 22 files changed, 351 insertions(+), 209 deletions(-) create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/ConvertToInt.scala delete mode 100644 src/test/scala/org/hammerlab/iterator/sorted/EitherIntStringTest.scala delete mode 100644 src/test/scala/org/hammerlab/iterator/sorted/EitherOr.scala delete mode 100644 src/test/scala/org/hammerlab/iterator/sorted/EitherStringIntTest.scala delete mode 100644 src/test/scala/org/hammerlab/iterator/sorted/EitherZip.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/IntStringTest.scala rename src/test/scala/org/hammerlab/iterator/sorted/{ZipIntsTest.scala => IntsTest.scala} (71%) create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/StringIntTest.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/Suite.scala rename src/test/scala/org/hammerlab/iterator/sorted/{IntStringEitherTest.scala => WrappedIntString.scala} (90%) delete mode 100644 src/test/scala/org/hammerlab/iterator/sorted/ZipIteratorTest.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/either/IntStringTest.scala rename src/test/scala/org/hammerlab/iterator/sorted/{EitherInts.scala => either/IntsTest.scala} (81%) create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/either/StringIntTest.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/either/Suite.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/or/IntStringTest.scala rename src/test/scala/org/hammerlab/iterator/sorted/{OrZip.scala => or/IntsTest.scala} (60%) create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/or/StringIntTest.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/or/Suite.scala create mode 100644 src/test/scala/org/hammerlab/iterator/sorted/zip/IntsTest.scala diff --git a/src/main/scala/org/hammerlab/iterator/sorted/OrZipIterator.scala b/src/main/scala/org/hammerlab/iterator/sorted/OrZipIterator.scala index 4dfd4fb..33bf3d9 100644 --- a/src/main/scala/org/hammerlab/iterator/sorted/OrZipIterator.scala +++ b/src/main/scala/org/hammerlab/iterator/sorted/OrZipIterator.scala @@ -4,15 +4,13 @@ import org.hammerlab.iterator.{ HeadOptionIterator, SimpleBufferedIterator } import org.hammerlab.types.{ Both, LO, Or, RO } case class OrZipIterator[T](l: BufferedIterator[T]) { -// def sortedOrZip[U, V](other: Iterable[U])( -// implicit -// tv: T ⇒ V, -// uv: U ⇒ V, -// ord: Ordering[V] -// ): SimpleBufferedIterator[Or[T, U]] = -// sortedOrZip(other.iterator)(tv, uv, ord) - -// type Aux[U, V0] = CmpType[T, U] { type V = V0 } + def sortedOrZip[U, V](other: Iterable[U])( + implicit + ord: Ordering[V], + tv: T ⇒ V, + uv: U ⇒ V + ): SimpleBufferedIterator[Or[T, U]] = + sortedOrZip(other.iterator) def sortedOrZip[U, V](other: Iterator[U])( implicit @@ -52,31 +50,6 @@ case class OrZipIterator[T](l: BufferedIterator[T]) { } } -trait CmpType[T, U] { - type V - implicit def tv: T ⇒ V - implicit def uv: U ⇒ V - def ord: Ordering[V] -} - -object CmpType { - - type Aux[T, U, V0] = CmpType[T, U] { type V = V0 } - - implicit def makeCmp[T, U, V0]( - implicit - tv0: T ⇒ V0, - uv0: U ⇒ V0, - ord0: Ordering[V0] - ): CmpType.Aux[T, U, V0] = - new CmpType[T, U] { - type V = V0 - override implicit def tv: (T) ⇒ V0 = tv0 - override implicit def uv: (U) ⇒ V0 = uv0 - override def ord: Ordering[V0] = ord0 - } -} - object OrZipIterator { implicit def makeOrZipIterator[T](it: Iterator[T]): OrZipIterator[T] = OrZipIterator(it.buffered) diff --git a/src/main/scala/org/hammerlab/iterator/sorted/ZipIterator.scala b/src/main/scala/org/hammerlab/iterator/sorted/ZipIterator.scala index 21f7ebb..ae32c01 100644 --- a/src/main/scala/org/hammerlab/iterator/sorted/ZipIterator.scala +++ b/src/main/scala/org/hammerlab/iterator/sorted/ZipIterator.scala @@ -3,18 +3,20 @@ package org.hammerlab.iterator.sorted import org.hammerlab.iterator.{ HeadOptionIterator, SimpleBufferedIterator } case class ZipIterator[T](l: BufferedIterator[T]) { - def sortedZip[V: Ordering](other: Iterable[T])( + def sortedZip[V](other: Iterable[T])( implicit + ord: Ordering[V], tv: T ⇒ V ): SimpleBufferedIterator[T] = - sortedZip[V](other) + sortedZip[V](other.iterator) - def sortedZip[V: Ordering](other: Iterator[T])( + def sortedZip[V](other: Iterator[T])( implicit + ord: Ordering[V], tv: T ⇒ V ): SimpleBufferedIterator[T] = { val r = other.buffered - val ≤ = implicitly[Ordering[V]].lteq _ + val ≤ = ord.lteq _ new SimpleBufferedIterator[T] { override protected def _advance: Option[T] = (l.headOption, r.headOption) match { diff --git a/src/test/scala/org/hammerlab/iterator/sorted/ConvertToInt.scala b/src/test/scala/org/hammerlab/iterator/sorted/ConvertToInt.scala new file mode 100644 index 0000000..c18afc0 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/ConvertToInt.scala @@ -0,0 +1,21 @@ +package org.hammerlab.iterator.sorted + +trait VInt { + self: Suite ⇒ + type V = Int + implicit def ord: Ordering[V] = Ordering.Int +} + +trait ConvertToInt + extends VInt { + self: Suite ⇒ + implicit def tv: L ⇒ V + implicit def uv: R ⇒ V +} + +trait IdentityIntConversions + extends ConvertToInt { + self: Suite with IntsTest ⇒ + override implicit val tv: L ⇒ V = identity + override implicit val uv: R ⇒ R = identity +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/EitherIntStringTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/EitherIntStringTest.scala deleted file mode 100644 index 32b0bf0..0000000 --- a/src/test/scala/org/hammerlab/iterator/sorted/EitherIntStringTest.scala +++ /dev/null @@ -1,40 +0,0 @@ -package org.hammerlab.iterator.sorted - -import org.hammerlab.iterator.sorted.EitherZipIterator._ - -class EitherIntStringTest - extends IntStringEitherTest { - - override type L = WrappedInt - override type R = String - - override implicit def tv = unwrapWrappedInt - override implicit def uv = strlen - - test("different types") { - wrappedInts - .iterator - .sortedEitherZip[String, Int]( - strings.iterator - ) - .toList should be( - Seq[Either[WrappedInt, String]]( - R(""), - L(1), - R("a"), - R("a"), - L(2), - R("bb"), - R("ccc"), - L(4), - R("eeeee"), - R("ffffff"), - L(7), - L(10), - R("kkkkkkkkkkk"), - R("nnnnnnnnnnnnnn"), - L(15) - ) - ) - } -} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/EitherOr.scala b/src/test/scala/org/hammerlab/iterator/sorted/EitherOr.scala deleted file mode 100644 index 48f171f..0000000 --- a/src/test/scala/org/hammerlab/iterator/sorted/EitherOr.scala +++ /dev/null @@ -1,11 +0,0 @@ -package org.hammerlab.iterator.sorted - -trait EitherOr { - self: ZipIteratorTest ⇒ - - type V = Int - - implicit def tv: L ⇒ V - implicit def uv: R ⇒ V - implicit def ord: Ordering[V] = Ordering.Int -} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/EitherStringIntTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/EitherStringIntTest.scala deleted file mode 100644 index 20d7822..0000000 --- a/src/test/scala/org/hammerlab/iterator/sorted/EitherStringIntTest.scala +++ /dev/null @@ -1,40 +0,0 @@ -package org.hammerlab.iterator.sorted - -import org.hammerlab.iterator.sorted.EitherZipIterator._ - -class EitherStringIntTest - extends IntStringEitherTest { - - override type L = String - override type R = WrappedInt - - override implicit def tv = strlen - override implicit def uv = unwrapWrappedInt - - test("different types") { - strings - .iterator - .sortedEitherZip[WrappedInt, Int]( - wrappedInts.iterator - ) - .toList should be( - Seq[Either[String, WrappedInt]]( - L(""), - L("a"), - L("a"), - R(1), - L("bb"), - R(2), - L("ccc"), - R(4), - L("eeeee"), - L("ffffff"), - R(7), - R(10), - L("kkkkkkkkkkk"), - L("nnnnnnnnnnnnnn"), - R(15) - ) - ) - } -} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/EitherZip.scala b/src/test/scala/org/hammerlab/iterator/sorted/EitherZip.scala deleted file mode 100644 index 4dbb8a3..0000000 --- a/src/test/scala/org/hammerlab/iterator/sorted/EitherZip.scala +++ /dev/null @@ -1,21 +0,0 @@ -package org.hammerlab.iterator.sorted - -import org.hammerlab.iterator.sorted.EitherZipIterator._ - -abstract class EitherZip - extends ZipIteratorTest - with EitherOr { - - type Result = Either[L, R] - - override def check(left: L*)(right: R*)(expected: Result*): Unit = { - left - .sortedEitherZip(right) - .toList should be( - expected - ) - } - - def L(l: L): Result = Left(l) - def R(r: R): Result = Right(r) -} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/IntStringTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/IntStringTest.scala new file mode 100644 index 0000000..dcfc505 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/IntStringTest.scala @@ -0,0 +1,18 @@ +package org.hammerlab.iterator.sorted + +trait IntStringTest + extends WrappedIntString { + self: Suite ⇒ + + override type L = WrappedInt + override type R = String + + override implicit def tv = unwrapWrappedInt + override implicit val uv = strlen _ + + def expected: Seq[Result] + + test("different types") { + check(wrappedInts)(strings)(expected) + } +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/ZipIntsTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/IntsTest.scala similarity index 71% rename from src/test/scala/org/hammerlab/iterator/sorted/ZipIntsTest.scala rename to src/test/scala/org/hammerlab/iterator/sorted/IntsTest.scala index bad3ac6..93e3497 100644 --- a/src/test/scala/org/hammerlab/iterator/sorted/ZipIntsTest.scala +++ b/src/test/scala/org/hammerlab/iterator/sorted/IntsTest.scala @@ -1,15 +1,12 @@ package org.hammerlab.iterator.sorted -trait ZipIntsTest { +trait IntsTest { - self: ZipIteratorTest with EitherOr ⇒ + self: Suite with VInt ⇒ type L = Int type R = Int - override implicit def tv: Int ⇒ Int = identity - override implicit def uv: Int ⇒ Int = identity - def L(t: L): Result def R(u: R): Result @@ -25,13 +22,7 @@ trait ZipIntsTest { val name = s"${str(l)} ${str(r)}" test(name) { - check( - l: _* - )( - r: _* - )( - expected(name): _* - ) + check(l)(r)(expected(name)) } } diff --git a/src/test/scala/org/hammerlab/iterator/sorted/StringIntTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/StringIntTest.scala new file mode 100644 index 0000000..874be6a --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/StringIntTest.scala @@ -0,0 +1,18 @@ +package org.hammerlab.iterator.sorted + +trait StringIntTest + extends WrappedIntString { + self: Suite ⇒ + + override type L = String + override type R = WrappedInt + + override implicit def tv = strlen + override implicit val uv = unwrapWrappedInt + + def expected: Seq[Result] + + test("different types") { + check(strings)(wrappedInts)(expected) + } +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/Suite.scala b/src/test/scala/org/hammerlab/iterator/sorted/Suite.scala new file mode 100644 index 0000000..e0206d4 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/Suite.scala @@ -0,0 +1,13 @@ +package org.hammerlab.iterator.sorted + +import org.hammerlab.test + +abstract class Suite + extends test.Suite { + + type L + type R + type Result + + def check(left: Seq[L])(right: Seq[R])(expected: Seq[Result]): Unit +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/IntStringEitherTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/WrappedIntString.scala similarity index 90% rename from src/test/scala/org/hammerlab/iterator/sorted/IntStringEitherTest.scala rename to src/test/scala/org/hammerlab/iterator/sorted/WrappedIntString.scala index ff5054e..ba1ed45 100644 --- a/src/test/scala/org/hammerlab/iterator/sorted/IntStringEitherTest.scala +++ b/src/test/scala/org/hammerlab/iterator/sorted/WrappedIntString.scala @@ -2,8 +2,9 @@ package org.hammerlab.iterator.sorted import scala.collection.immutable.StringOps -trait IntStringEitherTest - extends EitherZip { +trait WrappedIntString + extends ConvertToInt { + self: Suite ⇒ /** * Workaround [[strlen]] making [[augmentString]] implicit (for accessing @@ -24,6 +25,8 @@ trait IntStringEitherTest 4, 7, 10, + 11, + 11, 15 ) diff --git a/src/test/scala/org/hammerlab/iterator/sorted/ZipIteratorTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/ZipIteratorTest.scala deleted file mode 100644 index ac8d3eb..0000000 --- a/src/test/scala/org/hammerlab/iterator/sorted/ZipIteratorTest.scala +++ /dev/null @@ -1,13 +0,0 @@ -package org.hammerlab.iterator.sorted - -import org.hammerlab.test.Suite - -abstract class ZipIteratorTest - extends Suite { - - type L - type R - type Result - - def check(left: L*)(right: R*)(expected: Result*): Unit -} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/either/IntStringTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/either/IntStringTest.scala new file mode 100644 index 0000000..41e33fa --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/either/IntStringTest.scala @@ -0,0 +1,28 @@ +package org.hammerlab.iterator.sorted.either + +import org.hammerlab.iterator.sorted + +class IntStringTest + extends Suite + with sorted.IntStringTest { + override def expected: Seq[Either[L, R]] = + Seq( + R(""), + L(1), + R("a"), + R("a"), + L(2), + R("bb"), + R("ccc"), + L(4), + R("eeeee"), + R("ffffff"), + L(7), + L(10), + L(11), + L(11), + R("kkkkkkkkkkk"), + R("nnnnnnnnnnnnnn"), + L(15) + ) +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/EitherInts.scala b/src/test/scala/org/hammerlab/iterator/sorted/either/IntsTest.scala similarity index 81% rename from src/test/scala/org/hammerlab/iterator/sorted/EitherInts.scala rename to src/test/scala/org/hammerlab/iterator/sorted/either/IntsTest.scala index d9c7a30..bb8581e 100644 --- a/src/test/scala/org/hammerlab/iterator/sorted/EitherInts.scala +++ b/src/test/scala/org/hammerlab/iterator/sorted/either/IntsTest.scala @@ -1,8 +1,12 @@ -package org.hammerlab.iterator.sorted +package org.hammerlab.iterator.sorted.either -class EitherInts - extends EitherZip - with ZipIntsTest { +import org.hammerlab.iterator.sorted +import org.hammerlab.iterator.sorted.IdentityIntConversions + +class IntsTest + extends Suite + with sorted.IntsTest + with IdentityIntConversions { override def expected: Map[String, Seq[Either[Int, Int]]] = Map( diff --git a/src/test/scala/org/hammerlab/iterator/sorted/either/StringIntTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/either/StringIntTest.scala new file mode 100644 index 0000000..3420aa1 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/either/StringIntTest.scala @@ -0,0 +1,29 @@ +package org.hammerlab.iterator.sorted.either + +import org.hammerlab.iterator.sorted + +class StringIntTest + extends Suite + with sorted.StringIntTest { + + override def expected: Seq[Either[L, R]] = + Seq( + L(""), + L("a"), + L("a"), + R(1), + L("bb"), + R(2), + L("ccc"), + R(4), + L("eeeee"), + L("ffffff"), + R(7), + R(10), + L("kkkkkkkkkkk"), + R(11), + R(11), + L("nnnnnnnnnnnnnn"), + R(15) + ) +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/either/Suite.scala b/src/test/scala/org/hammerlab/iterator/sorted/either/Suite.scala new file mode 100644 index 0000000..7ef4c90 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/either/Suite.scala @@ -0,0 +1,23 @@ +package org.hammerlab.iterator.sorted.either + +import org.hammerlab.iterator.sorted +import org.hammerlab.iterator.sorted.ConvertToInt +import org.hammerlab.iterator.sorted.EitherZipIterator._ + +abstract class Suite + extends sorted.Suite + with ConvertToInt { + + type Result = Either[L, R] + + override def check(left: Seq[L])(right: Seq[R])(expected: Seq[Result]): Unit = { + left + .sortedEitherZip(right) + .toList should be( + expected + ) + } + + def L(l: L): Result = Left(l) + def R(r: R): Result = Right(r) +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/or/IntStringTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/or/IntStringTest.scala new file mode 100644 index 0000000..9f6f741 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/or/IntStringTest.scala @@ -0,0 +1,26 @@ +package org.hammerlab.iterator.sorted.or + +import org.hammerlab.iterator.sorted +import org.hammerlab.types.Or + +class IntStringTest + extends Suite + with sorted.IntStringTest { + override def expected: Seq[Or[L, R]] = + Seq( + R(""), + B(1, "a"), + R("a"), + B(2, "bb"), + R("ccc"), + L(4), + R("eeeee"), + R("ffffff"), + L(7), + L(10), + B(11, "kkkkkkkkkkk"), + L(11), + R("nnnnnnnnnnnnnn"), + L(15) + ) +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/OrZip.scala b/src/test/scala/org/hammerlab/iterator/sorted/or/IntsTest.scala similarity index 60% rename from src/test/scala/org/hammerlab/iterator/sorted/OrZip.scala rename to src/test/scala/org/hammerlab/iterator/sorted/or/IntsTest.scala index 3f236ae..266361b 100644 --- a/src/test/scala/org/hammerlab/iterator/sorted/OrZip.scala +++ b/src/test/scala/org/hammerlab/iterator/sorted/or/IntsTest.scala @@ -1,33 +1,13 @@ -package org.hammerlab.iterator.sorted +package org.hammerlab.iterator.sorted.or -import org.hammerlab.iterator.sorted.OrZipIterator._ -import org.hammerlab.types.{ Both, LO, Or, RO } +import org.hammerlab.iterator.sorted +import org.hammerlab.iterator.sorted.IdentityIntConversions +import org.hammerlab.types.Or -abstract class OrZip - extends ZipIteratorTest - with EitherOr { - - type Result = Or[L, R] - - override def check(left: L*)(right: R*)(expected: Result*): Unit = { - left - .iterator - .sortedOrZip(right.iterator) - .toList should be( - expected - ) - } - - def B(l: L)(implicit ev: L =:= R): Result = Both(l, l) - def B(l: L, r: R): Result = Both(l, r) - - def L(l: L): Result = LO(l) - def R(r: R): Result = RO(r) -} - -class OrInts - extends OrZip - with ZipIntsTest { +class IntsTest + extends Suite + with sorted.IntsTest + with IdentityIntConversions { override def expected: Map[String, Seq[Or[Int, Int]]] = Map( diff --git a/src/test/scala/org/hammerlab/iterator/sorted/or/StringIntTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/or/StringIntTest.scala new file mode 100644 index 0000000..d805821 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/or/StringIntTest.scala @@ -0,0 +1,26 @@ +package org.hammerlab.iterator.sorted.or + +import org.hammerlab.iterator.sorted + +class StringIntTest + extends Suite + with sorted.StringIntTest { + + override def expected: Seq[Result] = + Seq( + L(""), + B("a", 1), + L("a"), + B("bb", 2), + L("ccc"), + R(4), + L("eeeee"), + L("ffffff"), + R(7), + R(10), + B("kkkkkkkkkkk", 11), + R(11), + L("nnnnnnnnnnnnnn"), + R(15) + ) +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/or/Suite.scala b/src/test/scala/org/hammerlab/iterator/sorted/or/Suite.scala new file mode 100644 index 0000000..92a0584 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/or/Suite.scala @@ -0,0 +1,29 @@ +package org.hammerlab.iterator.sorted.or + +import org.hammerlab.iterator.sorted +import org.hammerlab.iterator.sorted.ConvertToInt +import org.hammerlab.iterator.sorted.OrZipIterator._ +import org.hammerlab.test.matchers.seqs.SeqMatcher.seqMatch +import org.hammerlab.types.{ Both, LO, Or, RO } + +abstract class Suite + extends sorted.Suite + with ConvertToInt { + + type Result = Or[L, R] + + override def check(left: Seq[L])(right: Seq[R])(expected: Seq[Result]): Unit = { + left + .iterator + .sortedOrZip(right.iterator) + .toList should seqMatch( + expected + ) + } + + def B(l: L)(implicit ev: L =:= R): Result = Both(l, l) + def B(l: L, r: R): Result = Both(l, r) + + def L(l: L): Result = LO(l) + def R(r: R): Result = RO(r) +} diff --git a/src/test/scala/org/hammerlab/iterator/sorted/zip/IntsTest.scala b/src/test/scala/org/hammerlab/iterator/sorted/zip/IntsTest.scala new file mode 100644 index 0000000..59f0564 --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/sorted/zip/IntsTest.scala @@ -0,0 +1,83 @@ +package org.hammerlab.iterator.sorted.zip + +import org.hammerlab.iterator.sorted +import org.hammerlab.iterator.sorted.VInt +import org.hammerlab.iterator.sorted.ZipIterator._ + +class IntsTest + extends sorted.Suite + with VInt + with sorted.IntsTest { + + override type Result = Int + + override def check(left: Seq[L])(right: Seq[R])(expected: Seq[Int]): Unit = + left + .sortedZip(right) + .toList should be( + expected + ) + + override def L(t: L): Result = t + override def R(u: R): Result = u + + override def expected: Map[String, Seq[Result]] = + Map( + "1,2,3 4,5,6" → + Seq( + L(1), + L(2), + L(3), + R(4), + R(5), + R(6) + ), + "1,3,5 2,4,6" → + Seq( + L(1), + R(2), + L(3), + R(4), + L(5), + R(6) + ), + "1,2,3 1,2,3" → + Seq( + L(1), + R(1), + L(2), + R(2), + L(3), + R(3) + ), + "1,2,4,7,9 1,3,5,6,7,8" → + Seq( + L(1), + R(1), + L(2), + R(3), + L(4), + R(5), + R(6), + L(7), + R(7), + R(8), + L(9) + ), + "empty empty" → Nil, + "empty 1" → Seq(R(1)), + "empty 1,10,100" → + Seq( + R(1), + R(10), + R(100) + ), + "1 empty" → Seq(L(1)), + "1,10,100 empty" → + Seq( + L(1), + L(10), + L(100) + ) + ) +} From 529362f9b6e44daf052ca959597673f478561b54 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Fri, 7 Jul 2017 16:04:00 +0000 Subject: [PATCH 04/20] bump test-utils --- build.sbt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/build.sbt b/build.sbt index 7d30574..ee78cb4 100644 --- a/build.sbt +++ b/build.sbt @@ -10,4 +10,6 @@ deps ++= Seq( libs.value('spire) ) +testUtilsVersion := "1.2.4-SNAPSHOT" + testDeps += kryo.value From bdb74e583ad0fd26e3610982c566f0a71eb2c3f3 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Sun, 9 Jul 2017 16:11:12 +0000 Subject: [PATCH 05/20] Steps use SortedSet --- src/main/scala/org/hammerlab/math/Steps.scala | 55 +++++++++++-------- .../scala/org/hammerlab/math/StepsTest.scala | 14 +++-- 2 files changed, 42 insertions(+), 27 deletions(-) diff --git a/src/main/scala/org/hammerlab/math/Steps.scala b/src/main/scala/org/hammerlab/math/Steps.scala index c5465e3..f4d457b 100644 --- a/src/main/scala/org/hammerlab/math/Steps.scala +++ b/src/main/scala/org/hammerlab/math/Steps.scala @@ -1,6 +1,7 @@ package org.hammerlab.math -import math.{exp, log, max, min} +import math.{ exp, log, max, min } +import scala.collection.immutable.SortedSet /** * Some utilities for generating exponential sequences of integers that can be used as e.g. histogram-bucket boundaries. @@ -12,23 +13,28 @@ object Steps { * * Until the k-th step is bigger than k, the whole number k is used in its stead. */ - def geometricEvenSteps(maxDepth: Int, N: Int = 100): Set[Int] = { + def geometricEvenSteps(maxDepth: Int, N: Int = 100): SortedSet[Int] = { val logMaxDepth = log(maxDepth) - Set(0) ++ - (for { - i ← 1 until N - } yield - min( - maxDepth, - max( - i, - exp( - (i - 1) * logMaxDepth / (N - 2) - ).toInt - ) + SortedSet( + 0 :: + ( + for { + i ← 1 until N + } yield + min( + maxDepth, + max( + i, + exp( + (i - 1) * logMaxDepth / (N - 2) + ) + .toInt + ) + ) ) - ).toSet + .toList: _* + ) } /** @@ -60,11 +66,16 @@ object Steps { * * …etc. */ - def roundNumbers(maxDepth: Int): Set[Int] = - (0 until 10).toSet ++ - RoundNumbers( - (10 until 20) ++ (20 until 50 by 2) ++ (50 until 100 by 5), - maxDepth, - 10 - ).toSet + def roundNumbers(maxDepth: Int): SortedSet[Int] = + SortedSet( + ( + (0 until 10) ++ + RoundNumbers( + (10 until 20) ++ (20 until 50 by 2) ++ (50 until 100 by 5), + maxDepth, + 10 + ) + .toSeq + ): _* + ) } diff --git a/src/test/scala/org/hammerlab/math/StepsTest.scala b/src/test/scala/org/hammerlab/math/StepsTest.scala index 3ef6505..8248994 100644 --- a/src/test/scala/org/hammerlab/math/StepsTest.scala +++ b/src/test/scala/org/hammerlab/math/StepsTest.scala @@ -4,16 +4,20 @@ import org.hammerlab.math.Steps._ import org.hammerlab.test.Suite import org.hammerlab.test.matchers.seqs.SeqMatcher.seqMatch -class StepsTest extends Suite { +class StepsTest + extends Suite { test("roundNumbers") { - roundNumbers(200).toVector.sorted should seqMatch( - (0 until 20) ++ (20 until 50 by 2) ++ (50 until 100 by 5) ++ (100 to 200 by 10) + roundNumbers(200) should seqMatch( + ( 0 until 20) ++ + ( 20 until 50 by 2) ++ + ( 50 until 100 by 5) ++ + (100 to 200 by 10) ) } test("geometricEvenSteps") { - geometricEvenSteps(1000, 20).toVector.sorted should ===( - Vector( + geometricEvenSteps(1000, 20) should seqMatch( + Seq( 0, 1, 2, 3, 4, 5, 6, 9, 14, 21, 31, 46, 68, 99, 146, 215, 316, 464, 681, 999 ) ) From e54c3bc5eafccf526e6285cbb0e4033b7d497d97 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Mon, 10 Jul 2017 21:10:34 +0000 Subject: [PATCH 06/20] upgrade plugin --- build.sbt | 10 +++++----- project/plugins.sbt | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/build.sbt b/build.sbt index ee78cb4..bca1b61 100644 --- a/build.sbt +++ b/build.sbt @@ -5,11 +5,11 @@ version := "1.3.0-SNAPSHOT" addScala212 deps ++= Seq( - libs.value('commons_math), - "com.chuusai" %% "shapeless" % "2.3.2", - libs.value('spire) + commons_math, + shapeless, + spire ) -testUtilsVersion := "1.2.4-SNAPSHOT" +testDeps += kryo -testDeps += kryo.value +testUtilsVersion := "1.2.4-SNAPSHOT" diff --git a/project/plugins.sbt b/project/plugins.sbt index 1dd7897..ba64db2 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1 +1 @@ -addSbtPlugin("org.hammerlab" % "sbt-parent" % "2.0.1") +addSbtPlugin("org.hammerlab" % "sbt-parent" % "3.0.0-SNAPSHOT") From 4e17cc64f14c6ab8aff2a64c53471b6676ca7701 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Mon, 17 Jul 2017 15:39:40 +0000 Subject: [PATCH 07/20] add .sliceOpt iterator --- .../iterator/DropEagerIterator.scala | 17 +++++++++++++++++ .../hammerlab/iterator/SliceIterator.scala | 17 +++++++++++++++++ .../scala/org/hammerlab/math/Monoid.scala | 7 ++++--- .../iterator/SliceIteratorTest.scala | 19 +++++++++++++++++++ 4 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 src/main/scala/org/hammerlab/iterator/DropEagerIterator.scala create mode 100644 src/main/scala/org/hammerlab/iterator/SliceIterator.scala create mode 100644 src/test/scala/org/hammerlab/iterator/SliceIteratorTest.scala diff --git a/src/main/scala/org/hammerlab/iterator/DropEagerIterator.scala b/src/main/scala/org/hammerlab/iterator/DropEagerIterator.scala new file mode 100644 index 0000000..fa1e4cd --- /dev/null +++ b/src/main/scala/org/hammerlab/iterator/DropEagerIterator.scala @@ -0,0 +1,17 @@ +package org.hammerlab.iterator + +case class DropEagerIterator[T](it: Iterator[T]) { + def dropEager(n: Int): Iterator[T] = { + var idx = 0 + while (it.hasNext && idx < n) { + it.next + idx += 1 + } + it + } +} + +object DropEagerIterator { + implicit def makeDropEagerIterator[T](it: Iterator[T]): DropEagerIterator[T] = + DropEagerIterator(it) +} diff --git a/src/main/scala/org/hammerlab/iterator/SliceIterator.scala b/src/main/scala/org/hammerlab/iterator/SliceIterator.scala new file mode 100644 index 0000000..e035a7a --- /dev/null +++ b/src/main/scala/org/hammerlab/iterator/SliceIterator.scala @@ -0,0 +1,17 @@ +package org.hammerlab.iterator + +import org.hammerlab.iterator.DropEagerIterator._ + +case class SliceIterator[T](it: Iterator[T]) { + def sliceOpt(start: Option[Int], length: Option[Int]): Iterator[T] = { + start.foreach(it.dropEager) + length.map(it.take).getOrElse(it) + } + def sliceOpt(start: Int, length: Int): Iterator[T] = sliceOpt(Some(start), Some(length)) + def sliceOpt(start: Option[Int], length: Int): Iterator[T] = sliceOpt(start, Some(length)) + def sliceOpt(start: Int, length: Option[Int] = None): Iterator[T] = sliceOpt(Some(start), length) +} + +object SliceIterator { + implicit def makeSliceIterator[T](it: Iterator[T]): SliceIterator[T] = SliceIterator(it) +} diff --git a/src/main/scala/org/hammerlab/math/Monoid.scala b/src/main/scala/org/hammerlab/math/Monoid.scala index f895529..da864a1 100644 --- a/src/main/scala/org/hammerlab/math/Monoid.scala +++ b/src/main/scala/org/hammerlab/math/Monoid.scala @@ -12,9 +12,10 @@ trait MonoidSyntax[T] { } object MonoidSyntax { - implicit def monoidSyntax[T](a: T)(implicit mt: Monoid[T]): MonoidSyntax[T] = new MonoidSyntax[T] { - def |+|(b: T) = mt.append(a, b) - } + implicit def monoidSyntax[T](a: T)(implicit mt: Monoid[T]): MonoidSyntax[T] = + new MonoidSyntax[T] { + def |+|(b: T) = mt.append(a, b) + } } trait Monoid[T] { diff --git a/src/test/scala/org/hammerlab/iterator/SliceIteratorTest.scala b/src/test/scala/org/hammerlab/iterator/SliceIteratorTest.scala new file mode 100644 index 0000000..79bc92a --- /dev/null +++ b/src/test/scala/org/hammerlab/iterator/SliceIteratorTest.scala @@ -0,0 +1,19 @@ +package org.hammerlab.iterator + +import org.hammerlab.test.Suite +import SliceIterator._ + +class SliceIteratorTest + extends Suite { + test("10") { + (0 to 9 iterator).sliceOpt(0, 10).toList should be(0 to 9) + (0 to 9 iterator).sliceOpt(0, 1).toList should be(0 to 0) + (0 to 9 iterator).sliceOpt(0, 5).toList should be(0 to 4) + (0 to 9 iterator).sliceOpt(0, 11).toList should be(0 to 9) + (0 to 9 iterator).sliceOpt(2, 10).toList should be(2 to 9) + (0 to 9 iterator).sliceOpt(2, 1).toList should be(2 to 2) + (0 to 9 iterator).sliceOpt(2, 5).toList should be(2 to 6) + (0 to 9 iterator).sliceOpt(2 ).toList should be(2 to 9) + (0 to 9 iterator).sliceOpt(2, 11).toList should be(2 to 9) + } +} From 232d18e2410f9d81b3d08f99136c4f9de8bb82f8 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Mon, 31 Jul 2017 01:50:28 +0000 Subject: [PATCH 08/20] add median to Stats --- src/main/scala/org/hammerlab/stats/Stats.scala | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main/scala/org/hammerlab/stats/Stats.scala b/src/main/scala/org/hammerlab/stats/Stats.scala index 37cb6d6..2ffaeef 100644 --- a/src/main/scala/org/hammerlab/stats/Stats.scala +++ b/src/main/scala/org/hammerlab/stats/Stats.scala @@ -12,8 +12,6 @@ import scala.math.{log10, floor, ceil, abs, sqrt} * Wrapper for some computed statistics about a dataset of [[Numeric]] elements. * * @param n number of elements in the dataset. - * @param mean mean. - * @param stddev stddev. * @param mad median absolute deviation (from the median). * @param samplesOpt "sample" elements; the start and end of the data. * @param sortedSamplesOpt "sample" elements; the least and greatest elements. If the dataset is already sorted, meaning @@ -25,6 +23,7 @@ import scala.math.{log10, floor, ceil, abs, sqrt} case class Stats[K: Numeric, V: Integral](n: V, mean: Double, stddev: Double, + median: Double, mad: Double, samplesOpt: Option[Samples[K, V]], sortedSamplesOpt: Option[Samples[K, V]], @@ -192,7 +191,8 @@ object Stats { Stats( n, - mean, stddev, mad, + mean, stddev, + median, mad, samplesOpt, sortedSamplesOpt, ps @@ -278,7 +278,8 @@ object Stats { new Stats( n, - mean, stddev, mad, + mean, stddev, + median, mad, samplesOpt, sortedSamplesOpt, percentiles(sorted) @@ -293,6 +294,7 @@ object Stats { n = Integral[V].zero, mean = 0, stddev = 0, + median = 0, mad = 0, samplesOpt = None, sortedSamplesOpt = None, From f97a8217f6e892d97cb6c4cf8e1c0ca2cb6bdbc7 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Tue, 1 Aug 2017 19:31:20 +0000 Subject: [PATCH 09/20] use Rationals for percentiles, support values larger than ints --- .../scala/org/hammerlab/stats/Stats.scala | 135 ++++++++++-------- .../scala/org/hammerlab/stats/StatsTest.scala | 63 ++++++-- 2 files changed, 125 insertions(+), 73 deletions(-) diff --git a/src/main/scala/org/hammerlab/stats/Stats.scala b/src/main/scala/org/hammerlab/stats/Stats.scala index 2ffaeef..c89a82f 100644 --- a/src/main/scala/org/hammerlab/stats/Stats.scala +++ b/src/main/scala/org/hammerlab/stats/Stats.scala @@ -2,11 +2,11 @@ package org.hammerlab.stats import org.hammerlab.iterator.RunLengthIterator._ import spire.implicits._ -import spire.math.{ Integral, Numeric } +import spire.math.{ Integral, Numeric, Rational } import scala.collection.mutable import scala.collection.mutable.ArrayBuffer -import scala.math.{log10, floor, ceil, abs, sqrt} +import scala.math.{ abs, ceil, floor, sqrt } /** * Wrapper for some computed statistics about a dataset of [[Numeric]] elements. @@ -27,19 +27,19 @@ case class Stats[K: Numeric, V: Integral](n: V, mad: Double, samplesOpt: Option[Samples[K, V]], sortedSamplesOpt: Option[Samples[K, V]], - percentiles: Seq[(Double, Double)]) { + percentiles: Seq[(Rational, Double)]) { def prettyDouble(d: Double): String = - if (floor(d).toInt == ceil(d).toInt) - d.toInt.toString + if (floor(d).toLong == ceil(d).toLong) + d.toLong.toString else "%.1f".format(d) - def prettyPercentile(d: Double): String = - if (floor(d).toInt == ceil(d).toInt) - d.toInt.toString + def prettyPercentile(r: Rational): String = + if (r.isWhole()) + r.toLong.toString else - d.toString + r.toDouble.toString override def toString: String = { if (n == 0) @@ -53,7 +53,8 @@ case class Stats[K: Numeric, V: Integral](n: V, s"mean:\t${prettyDouble(mean)}", s"stddev:\t${prettyDouble(stddev)}", s"mad:\t${prettyDouble(mad)}" - ).mkString(",\t") + ) + .mkString(",\t") for { samples ← samplesOpt @@ -155,7 +156,7 @@ object Stats { getRunPercentiles( medianDeviations, Seq( - 50.0 → + Rational(50) → ((n.toDouble() - 1) / 2.0) ) ) @@ -305,7 +306,7 @@ object Stats { * Compute percentiles listed in `ps` of the data in `values`; wrapper for implementation below. */ private def getRunPercentiles[K: Numeric, V: Integral](values: Seq[(K, V)], - ps: Seq[(Double, Double)]): Vector[(Double, Double)] = + ps: Seq[(Rational, Double)]): Vector[(Rational, Double)] = getRunPercentiles( values .iterator @@ -325,15 +326,15 @@ object Stats { * @return pairs of (percentile, value). */ private def getRunPercentiles[K: Numeric, V: Integral](values: BufferedIterator[(K, V)], - percentiles: BufferedIterator[(Double, Double)]): Iterator[(Double, Double)] = - new Iterator[(Double, Double)] { + percentiles: BufferedIterator[(Rational, Double)]): Iterator[(Rational, Double)] = + new Iterator[(Rational, Double)] { var elemsPast = 0.0 var curK: Option[Double] = None override def hasNext: Boolean = percentiles.hasNext - override def next(): (Double, Double) = { + override def next(): (Rational, Double) = { val (percentile, idx) = percentiles.next() while(elemsPast <= idx) { val (k, v) = values.next() @@ -357,7 +358,8 @@ object Stats { * Compute some relevant percentiles based on the number of elements present. * @return pairs of (percentile, value). */ - private def histPercentiles[K: Numeric, V: Integral](N: V, values: IndexedSeq[(K, V)]): Vector[(Double, Double)] = { + private def histPercentiles[K: Numeric, V: Integral](N: V, + values: IndexedSeq[(K, V)]): Vector[(Rational, Double)] = { val n = N - 1 val denominators: Iterator[Int] = Iterator(2, 4, 10, 20, 100, 1000, 10000) @@ -365,21 +367,22 @@ object Stats { val percentileIdxs = denominators .takeWhile(d ⇒ d <= n || d == 2) // Always take the median (denominator 2 aka 50th percentile). - .flatMap(d ⇒ { - val loPercentile = 100.0 / d - val hiPercentile = 100.0 - loPercentile + .flatMap { + d ⇒ + val loPercentile = Rational(100, d) + val hiPercentile = 100 - loPercentile - val loIdx = nd / d - val hiIdx = nd - loIdx + val loIdx = nd / d + val hiIdx = nd - loIdx - if (d == 2) - // Median (50th percentile, denominator 2) only emits one tuple. - Iterator(loPercentile → loIdx) - else - // In general, we emit two tuples per "denominator", one on the high side and one on the low. For example, for - // denominator 4, we emit the 25th and 75th percentiles. - Iterator(loPercentile → loIdx, hiPercentile → hiIdx) - }) + if (d == 2) + // Median (50th percentile, denominator 2) only emits one tuple. + Iterator(loPercentile → loIdx) + else + // In general, we emit two tuples per "denominator", one on the high side and one on the low. For example, for + // denominator 4, we emit the 25th and 75th percentiles. + Iterator(loPercentile → loIdx, hiPercentile → hiIdx) + } .toArray .sortBy(_._1) @@ -388,48 +391,60 @@ object Stats { /** * Compute some relevant percentiles based on the number of elements present. + * * @return pairs of (percentile, value). */ - private def percentiles[T: Numeric](values: IndexedSeq[T]): Vector[(Double, Double)] = { + private def percentiles[T: Numeric](values: IndexedSeq[T]): Vector[(Rational, Double)] = { val n = values.length - 1 val denominators: Iterator[Int] = { lazy val pow10s: Stream[Int] = 100 #:: pow10s.map(_ * 10) - Iterator(2, 4, 10, 20) ++ pow10s.iterator + Iterator( + 2, // 50 + 4, // 25/75 + 10, // 10/90 + 20 // 5/95 + ) ++ pow10s.iterator // 1/99, .1/99.9, .01/99.99, … } val nd = n.toDouble - denominators.takeWhile(_ <= n).flatMap(d ⇒ { - val loPercentile = 100.0 / d - val hiPercentile = 100.0 - loPercentile - - val loFrac = nd / d - val loFloor = floor(loFrac).toInt - val loCeil = ceil(loFrac).toInt + denominators + .takeWhile(_ <= n) + .flatMap { + d ⇒ + val loPercentile = Rational(100, d) + val hiPercentile = 100 - loPercentile + + val loFloor = n / d + val loRemainder = n % d + + val hiCeil = n - loFloor + + val (lo, hi) = + if (loRemainder == 0) + ( + values(loFloor).toDouble(), + values( hiCeil).toDouble() + ) + else { + val floorWeight = loRemainder.toDouble() / d + ( + values(loFloor).toDouble() * floorWeight + values(loFloor + 1).toDouble() * (1 - floorWeight), + values( hiCeil).toDouble() * floorWeight + values( hiCeil - 1).toDouble() * (1 - floorWeight) + ) + } - val hiFloor = n - loFloor - val hiCeil = n - loCeil - - val loRemainder = loFrac - loFloor - val (lo, hi) = - if (loFloor == loCeil) - (values(loFloor).toDouble(), values(hiFloor).toDouble()) - else - ( - values(loFloor).toDouble() * loRemainder + values(loCeil).toDouble() * (1 - loRemainder), - values(hiCeil).toDouble() * loRemainder + values(hiFloor).toDouble() * (1 - loRemainder) - ) - - if (d == 2) - // Median (50th percentile, denominator 2) only emits one tuple. - Iterator(loPercentile → lo) - else - // In general, we emit two tuples per "denominator", one on the high side and one on the low. For example, for - // denominator 4, we emit the 25th and 75th percentiles. - Iterator(loPercentile → lo, hiPercentile → hi) - - }).toVector.sortBy(_._1) + if (d == 2) + // Median (50th percentile, denominator 2) only emits one tuple. + Iterator(loPercentile → lo) + else + // In general, we emit two tuples per "denominator", one on the high side and one on the low. For example, for + // denominator 4, we emit the 25th and 75th percentiles. + Iterator(loPercentile → lo, hiPercentile → hi) + } + .toVector + .sortBy(_._1) } private def getMedian[T: Numeric](sorted: Vector[T]): Double = { diff --git a/src/test/scala/org/hammerlab/stats/StatsTest.scala b/src/test/scala/org/hammerlab/stats/StatsTest.scala index 82a47be..768350d 100644 --- a/src/test/scala/org/hammerlab/stats/StatsTest.scala +++ b/src/test/scala/org/hammerlab/stats/StatsTest.scala @@ -2,7 +2,7 @@ package org.hammerlab.stats import org.hammerlab.test.Suite import spire.implicits._ -import spire.math.Integral +import spire.math.{ Integral, Numeric } import scala.util.Random import scala.util.Random.shuffle @@ -14,20 +14,36 @@ class StatsTest extends Suite { Random.setSeed(123L) - def check(input: Seq[Int], lines: String*): Unit = { - Stats(input).toString should be(lines.mkString("\n")) - } + def check[K: Numeric: Ordering](input: Seq[K], lines: String*): Unit = + Stats(input) + .toString should be( + lines.mkString("\n") + ) - def check(input: Seq[Int], numToSample: Int, lines: String*): Unit = { - Stats(input, numToSample).toString should be(lines.mkString("\n")) - } + def check[K: Numeric: Ordering](input: Seq[K], numToSample: Int, lines: String*): Unit = + Stats( + input, + numToSample + ) + .toString should be( + lines.mkString("\n") + ) - def check(input: Seq[Int], numToSample: Int, onlySampleSorted: Boolean, lines: String*): Unit = { - Stats(input, numToSample, onlySampleSorted).toString should be(lines.mkString("\n")) - } + def check[K: Numeric: Ordering](input: Seq[K], + numToSample: Int, + onlySampleSorted: Boolean, + lines: String*): Unit = + Stats( + input, + numToSample, + onlySampleSorted + ) + .toString should be( + lines.mkString("\n") + ) test("empty") { - check( + check[Int]( Nil, "(empty)" ) @@ -357,6 +373,27 @@ class StatsTest extends Suite { "95: 9" ) } -} - + test("values over Int.MAX_VALUE") { + check( + Seq( + 10000000000L, + 100000000000L, + 100000000000L, + 1000000000000L, + 1000000000000L, + 10000000000L, + 1000000000000L, + 100000000000L, + 10000000000L, + 10000000000L + ), + "num: 10, mean: 334000000000, stddev: 437588848121.2, mad: 90000000000", + "elems: 10000000000, 100000000000×2, 1000000000000×2, 10000000000, 1000000000000, 100000000000, 10000000000×2", + "sorted: 10000000000×4, 100000000000×3, 1000000000000×3", + "25: 10000000000", + "50: 100000000000", + "75: 325000000000" + ) + } +} From 56280a9afe30ee6f02ceda513e64d9f089832f15 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Tue, 1 Aug 2017 20:49:36 +0000 Subject: [PATCH 10/20] use cats.Show in Stats --- build.sbt | 1 + src/main/scala/org/hammerlab/stats/Runs.scala | 28 ++-- .../scala/org/hammerlab/stats/Samples.scala | 53 +++++--- .../scala/org/hammerlab/stats/Stats.scala | 128 ++++++++++-------- .../scala/org/hammerlab/stats/StatsTest.scala | 25 ++-- 5 files changed, 141 insertions(+), 94 deletions(-) diff --git a/build.sbt b/build.sbt index bca1b61..948a049 100644 --- a/build.sbt +++ b/build.sbt @@ -5,6 +5,7 @@ version := "1.3.0-SNAPSHOT" addScala212 deps ++= Seq( + cats, commons_math, shapeless, spire diff --git a/src/main/scala/org/hammerlab/stats/Runs.scala b/src/main/scala/org/hammerlab/stats/Runs.scala index d667d23..84181d9 100644 --- a/src/main/scala/org/hammerlab/stats/Runs.scala +++ b/src/main/scala/org/hammerlab/stats/Runs.scala @@ -1,22 +1,30 @@ package org.hammerlab.stats +import cats.Show +import cats.Show.show +import cats.implicits._ import spire.math.Integral /** * Convenience class wrapping a sequence of key-number pairs, used in run-length-encoding in [[Stats]]. */ -case class Runs[K, V: Integral](elems: Seq[(K, V)]) { - override def toString: String = - ( - for ((elem, count) ← elems) yield - if (count == 1) - elem.toString - else - s"$elem×$count" - ).mkString(", ") -} +case class Runs[K, V: Integral](elems: Seq[(K, V)]) object Runs { implicit def runsToSeq[K, V: Integral](runs: Runs[K, V]): Seq[(K, V)] = runs.elems implicit def seqToRuns[K, V: Integral](elems: Seq[(K, V)]): Runs[K, V] = Runs(elems) + + implicit def makeShow[K, V: Integral](implicit elemShow: Show[K], countShow: Show[V]): Show[Runs[K, V]] = + show { + case Runs(elems) ⇒ + elems + .map { + case (elem, count) ⇒ + if (count == 1) + elem.show + else + s"${elem.show}×${count.show}" + } + .mkString(", ") + } } diff --git a/src/main/scala/org/hammerlab/stats/Samples.scala b/src/main/scala/org/hammerlab/stats/Samples.scala index 48ddd7c..158d68f 100644 --- a/src/main/scala/org/hammerlab/stats/Samples.scala +++ b/src/main/scala/org/hammerlab/stats/Samples.scala @@ -1,7 +1,10 @@ package org.hammerlab.stats -import spire.math.Integral +import cats.Show +import cats.Show.show +import cats.implicits._ import spire.implicits._ +import spire.math.Integral /** * Used by [[Stats]] to wrap some [[Runs]] of elements from the start and end of a dataset. @@ -13,30 +16,42 @@ import spire.implicits._ * @tparam K arbitrary element type * @tparam V [[Integral]] type, e.g. [[Int]] or [[Long]]. */ -case class Samples[K, V: Integral](n: V, first: Runs[K, V], numFirst: V, last: Runs[K, V], numLast: V) { +case class Samples[K, V: Integral](n: V, + first: Runs[K, V], + numFirst: V, + last: Runs[K, V], + numLast: V) { def isEmpty: Boolean = first.isEmpty def nonEmpty: Boolean = first.nonEmpty +} + +object Samples { + implicit def makeShow[K, V: Integral](implicit showRuns: Show[Runs[K, V]]): Show[Samples[K, V]] = + show { + case Samples(n, first, numFirst, last, numLast) ⇒ + val numSampled = numFirst + numLast + val numSkipped = n - numSampled + if (numSkipped > 0) + s"${first.show}, …, ${last.show}" + else + removeOverlap(-numSkipped, first, last).show + } - def removeOverlap(num: V, first: Runs[K, V], last: Runs[K, V]): Runs[K, V] = { + def removeOverlap[K, V: Integral](num: V, + first: Runs[K, V], + last: Runs[K, V]): Runs[K, V] = { val lastIt = last.iterator.buffered var dropped = Integral[V].zero Runs( - first ++ lastIt.dropWhile(t ⇒ { - val (_, count) = t - val drop = dropped < num - dropped += count - drop - }) + first ++ + lastIt + .dropWhile { + t ⇒ + val (_, count) = t + val drop = dropped < num + dropped += count + drop + } ) } - - override def toString: String = { - val numSampled = numFirst + numLast - val numSkipped = n - numSampled - if (numSkipped > 0) { - s"$first, …, $last" - } else { - removeOverlap(-numSkipped, first, last).toString - } - } } diff --git a/src/main/scala/org/hammerlab/stats/Stats.scala b/src/main/scala/org/hammerlab/stats/Stats.scala index c89a82f..fdfa782 100644 --- a/src/main/scala/org/hammerlab/stats/Stats.scala +++ b/src/main/scala/org/hammerlab/stats/Stats.scala @@ -1,5 +1,9 @@ package org.hammerlab.stats +import cats.Show +import cats.Show.show +import cats.instances.all.catsStdShowForString +import cats.syntax.all._ import org.hammerlab.iterator.RunLengthIterator._ import spire.implicits._ import spire.math.{ Integral, Numeric, Rational } @@ -27,59 +31,7 @@ case class Stats[K: Numeric, V: Integral](n: V, mad: Double, samplesOpt: Option[Samples[K, V]], sortedSamplesOpt: Option[Samples[K, V]], - percentiles: Seq[(Rational, Double)]) { - - def prettyDouble(d: Double): String = - if (floor(d).toLong == ceil(d).toLong) - d.toLong.toString - else - "%.1f".format(d) - - def prettyPercentile(r: Rational): String = - if (r.isWhole()) - r.toLong.toString - else - r.toDouble.toString - - override def toString: String = { - if (n == 0) - "(empty)" - else { - val strings = ArrayBuffer[String]() - - strings += - List( - s"num:\t$n", - s"mean:\t${prettyDouble(mean)}", - s"stddev:\t${prettyDouble(stddev)}", - s"mad:\t${prettyDouble(mad)}" - ) - .mkString(",\t") - - for { - samples ← samplesOpt - if samples.nonEmpty - } { - strings += s"elems:\t$samples" - } - - for { - sortedSamples ← sortedSamplesOpt - if sortedSamples.nonEmpty - } { - strings += s"sorted:\t$sortedSamples" - } - - strings ++= - percentiles.map { - case (k, v) ⇒ - s"${prettyPercentile(k)}:\t${prettyDouble(v)}" - } - - strings.mkString("\n") - } - } -} + percentiles: Seq[(Rational, Double)]) /** * Helpers for constructing [[Stats]] / computing the statistics that populate a [[Stats]] instance. @@ -480,4 +432,74 @@ object Stats { } runs → sum } + + implicit def makeShow[ + K : Numeric : Show, + V: Integral : Show + ]( + implicit + percentileShow: Show[Rational] = showPercentile, + statShow: Show[Double] = showDouble + ): Show[Stats[K, V]] = + show { + case Stats(n, mean, stddev, median, mad, samplesOpt, sortedSamplesOpt, percentiles) ⇒ + if (n == 0) + "(empty)" + else { + + def pair[L: Show, R: Show](l: L, r: R): String = + s"${l.show}:\t${r.show}" + + val strings = ArrayBuffer[String]() + + strings += + List( + pair("num", n), + pair("mean", mean), + pair("stddev", stddev), + pair("mad", mad) + ) + .mkString(",\t") + + for { + samples ← samplesOpt + if samples.nonEmpty + } { + strings += pair("elems", samples) + } + + for { + sortedSamples ← sortedSamplesOpt + if sortedSamples.nonEmpty + } { + strings += pair("sorted", sortedSamples) + } + + strings ++= + percentiles.map { + case (k, v) ⇒ + pair(k, v) + } + + strings.mkString("\n") + } + } + + def showDouble: Show[Double] = + show( + d ⇒ + if (floor(d).toLong == ceil(d).toLong) + d.toLong.toString + else + "%.1f".format(d) + ) + + def showPercentile: Show[Rational] = + show( + r ⇒ + if (r.isWhole()) + r.toLong.toString + else + r.toDouble.toString + ) } diff --git a/src/test/scala/org/hammerlab/stats/StatsTest.scala b/src/test/scala/org/hammerlab/stats/StatsTest.scala index 768350d..53c1757 100644 --- a/src/test/scala/org/hammerlab/stats/StatsTest.scala +++ b/src/test/scala/org/hammerlab/stats/StatsTest.scala @@ -1,8 +1,10 @@ package org.hammerlab.stats +import cats.Show +import cats.instances.all.{ catsStdShowForInt, catsStdShowForLong } +import cats.syntax.all._ import org.hammerlab.test.Suite -import spire.implicits._ -import spire.math.{ Integral, Numeric } +import spire.math.Numeric import scala.util.Random import scala.util.Random.shuffle @@ -14,31 +16,30 @@ class StatsTest extends Suite { Random.setSeed(123L) - def check[K: Numeric: Ordering](input: Seq[K], lines: String*): Unit = - Stats(input) - .toString should be( + def check[K : Numeric : Ordering : Show](input: Seq[K], lines: String*): Unit = + Stats(input).show should be( lines.mkString("\n") ) - def check[K: Numeric: Ordering](input: Seq[K], numToSample: Int, lines: String*): Unit = + def check[K : Numeric : Ordering : Show](input: Seq[K], numToSample: Int, lines: String*): Unit = Stats( input, numToSample ) - .toString should be( + .show should be( lines.mkString("\n") ) - def check[K: Numeric: Ordering](input: Seq[K], - numToSample: Int, - onlySampleSorted: Boolean, - lines: String*): Unit = + def check[K : Numeric : Ordering : Show](input: Seq[K], + numToSample: Int, + onlySampleSorted: Boolean, + lines: String*): Unit = Stats( input, numToSample, onlySampleSorted ) - .toString should be( + .show should be( lines.mkString("\n") ) From a4e7903819a63c8e5e28b2505d43dd3627c8aaf7 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Tue, 1 Aug 2017 22:06:50 +0000 Subject: [PATCH 11/20] add scientific notation formatting for integral types --- .../scala/org/hammerlab/math/Format.scala | 48 ++++++ .../scala/org/hammerlab/math/FormatTest.scala | 138 ++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 src/main/scala/org/hammerlab/math/Format.scala create mode 100644 src/test/scala/org/hammerlab/math/FormatTest.scala diff --git a/src/main/scala/org/hammerlab/math/Format.scala b/src/main/scala/org/hammerlab/math/Format.scala new file mode 100644 index 0000000..0903527 --- /dev/null +++ b/src/main/scala/org/hammerlab/math/Format.scala @@ -0,0 +1,48 @@ +package org.hammerlab.math + +import cats.Show +import cats.Show.show +import spire.implicits._ +import spire.math.Integral + +import scala.math.round + +object Format { + def scientific[I: Integral](n: I, precision: Int): String = + if (n < 0) + s"-${scientific(-n, precision)}" + else { + assert(precision >= 2) + val digits = n.toString + val numDigits = digits.length + if (numDigits > precision + 3) { + + val integral = implicitly[Integral[I]] + import integral.fromDouble + + val roundedDigits = + fromDouble( + round( + s"${digits.substring(0, precision)}.${digits(precision)}" + .toDouble + ) + ) + .toString + + val first = roundedDigits.head + val rest = roundedDigits.substring(1, precision) + + s"$first.${rest}e${numDigits - 1 + roundedDigits.length - precision}" + } else + digits + } + + def scientific[I: Integral](precision: Int): Show[I] = + show(scientific(_, precision)) + + object scientific { + implicit def digits2[I: Integral] = scientific[I](2) + implicit def digits3[I: Integral] = scientific[I](3) + implicit def digits4[I: Integral] = scientific[I](3) + } +} diff --git a/src/test/scala/org/hammerlab/math/FormatTest.scala b/src/test/scala/org/hammerlab/math/FormatTest.scala new file mode 100644 index 0000000..6ad75da --- /dev/null +++ b/src/test/scala/org/hammerlab/math/FormatTest.scala @@ -0,0 +1,138 @@ +package org.hammerlab.math + +import cats.Show +import cats.syntax.all._ +import org.hammerlab.test.Suite +import spire.math._ +import Format.scientific + +class FormatTest + extends Suite { + + def check[I : Integral : Show](i: I, expected: String): Unit = { + i.show should be(expected) + } + + test("2-digit ints") { + import scientific.digits2 + + (-20 to 20).foreach(n ⇒ check(n, n.toString)) + + check( -100, "-100") + + check( 99, "99") + check( 100, "100") + check( 101, "101") + + check( 999, "999") + check( 1000, "1000") + check( 1001, "1001") + + check( 9999, "9999") + check( 10000, "10000") + check( 10001, "10001") + + check( 99999, "99999") + check( 100000, "1.0e5") + check( 100001, "1.0e5") + + check( 104999, "1.0e5") + check( 105000, "1.1e5") + check( 105001, "1.1e5") + + check( 144999, "1.4e5") + check( 145000, "1.5e5") + check( 145001, "1.5e5") + + check( 149999, "1.5e5") + check( 150000, "1.5e5") + check( 150001, "1.5e5") + + check( 199999, "2.0e5") + check( 200000, "2.0e5") + check( 200001, "2.0e5") + + check( 449999, "4.5e5") + check( 450000, "4.5e5") + check( 450001, "4.5e5") + + check(-494999, "-4.9e5") + check(-495000, "-5.0e5") + check(-495001, "-5.0e5") + + check( 494999, "4.9e5") + check( 495000, "5.0e5") + check( 495001, "5.0e5") + + check( 994999, "9.9e5") + check( 995000, "1.0e6") + check( 995001, "1.0e6") + + check( 999999, "1.0e6") + check(1000000, "1.0e6") + check(1000001, "1.0e6") + + check(1049999, "1.0e6") + check(1050000, "1.1e6") + } + + test("3-digit ints") { + import scientific.digits3 + + (-20 to 20).foreach(n ⇒ check(n, n.toString)) + + check( -100, "-100") + + check( 99, "99") + check( 100, "100") + check( 101, "101") + + check( 999, "999") + check( 1000, "1000") + check( 1001, "1001") + + check( 9999, "9999") + check( 10000, "10000") + check( 10001, "10001") + + check( 99999, "99999") + check( 100000, "100000") + check( 100001, "100001") + + check( 999999, "999999") + check( 1000000, "1.00e6") + check( 1000001, "1.00e6") + + check( 1004999, "1.00e6") + check( 1005000, "1.01e6") + check( 1005001, "1.01e6") + + check( 1044999, "1.04e6") + check( 1045000, "1.05e6") + check( 1045001, "1.05e6") + + check( 1049999, "1.05e6") + check( 1050000, "1.05e6") + check( 1050001, "1.05e6") + + check( 1944999, "1.94e6") + check( 1945000, "1.95e6") + check( 1945001, "1.95e6") + + check( 1994999, "1.99e6") + check( 1995000, "2.00e6") + check( 1995001, "2.00e6") + + check( 1999999, "2.00e6") + check( 2000000, "2.00e6") + check( 2000001, "2.00e6") + + check( 9994999, "9.99e6") + check(10000000, "1.00e7") + check(10000001, "1.00e7") + + check(10049999, "1.00e7") + check(10050000, "1.01e7") + check(10050001, "1.01e7") + } +} From 3762d36357040c2818e69b0433a22419d2949e5d Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Wed, 2 Aug 2017 02:23:46 +0000 Subject: [PATCH 12/20] use ShowInterpolator, fix StatsHistTest --- .../scala/org/hammerlab/stats/Stats.scala | 3 +- .../org/hammerlab/stats/StatsHistTest.scala | 38 ++++++++++++++----- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/src/main/scala/org/hammerlab/stats/Stats.scala b/src/main/scala/org/hammerlab/stats/Stats.scala index fdfa782..2b6e311 100644 --- a/src/main/scala/org/hammerlab/stats/Stats.scala +++ b/src/main/scala/org/hammerlab/stats/Stats.scala @@ -5,6 +5,7 @@ import cats.Show.show import cats.instances.all.catsStdShowForString import cats.syntax.all._ import org.hammerlab.iterator.RunLengthIterator._ +import org.hammerlab.stats.Stats.makeShow import spire.implicits._ import spire.math.{ Integral, Numeric, Rational } @@ -448,7 +449,7 @@ object Stats { else { def pair[L: Show, R: Show](l: L, r: R): String = - s"${l.show}:\t${r.show}" + show"$l:\t$r" val strings = ArrayBuffer[String]() diff --git a/src/test/scala/org/hammerlab/stats/StatsHistTest.scala b/src/test/scala/org/hammerlab/stats/StatsHistTest.scala index 2acda94..91db115 100644 --- a/src/test/scala/org/hammerlab/stats/StatsHistTest.scala +++ b/src/test/scala/org/hammerlab/stats/StatsHistTest.scala @@ -1,5 +1,9 @@ package org.hammerlab.stats +import cats.Show +import cats.implicits.{ catsStdShowForInt, catsStdShowForLong } +import cats.syntax.all._ +import org.hammerlab.stats.Stats.fromHist import org.hammerlab.test.Suite import spire.implicits._ import spire.math.Integral @@ -15,17 +19,33 @@ class StatsHistTest extends Suite { Random.setSeed(123L) - def check[V: Integral](input: Seq[(Int, V)], lines: String*): Unit = { - Stats.fromHist(input).toString should be(lines.mkString("\n")) - } + def check[V: Integral : Show](input: Seq[(Int, V)], + lines: String*): Unit = + fromHist(input).show should be(lines.mkString("\n")) - def check[V: Integral](input: Seq[(Int, V)], numToSample: Int, lines: String*): Unit = { - Stats.fromHist(input, numToSample).toString should be(lines.mkString("\n")) - } + def check[V: Integral : Show](input: Seq[(Int, V)], + numToSample: Int, + lines: String*): Unit = + fromHist( + input, + numToSample + ) + .show should be( + lines.mkString("\n") + ) - def check[V: Integral](input: Seq[(Int, V)], numToSample: Int, onlySampleSorted: Boolean, lines: String*): Unit = { - Stats.fromHist(input, numToSample, onlySampleSorted).toString should be(lines.mkString("\n")) - } + def check[V: Integral : Show](input: Seq[(Int, V)], + numToSample: Int, + onlySampleSorted: Boolean, + lines: String*): Unit = + fromHist( + input, + numToSample, + onlySampleSorted + ) + .show should be( + lines.mkString("\n") + ) test("empty") { check( From 67624f8298760c09cc3f77ace6ccf961b167c10f Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Wed, 2 Aug 2017 21:42:23 +0000 Subject: [PATCH 13/20] Stats.show, test improvements --- src/main/scala/org/hammerlab/stats/Runs.scala | 17 +- .../scala/org/hammerlab/stats/Samples.scala | 8 +- .../scala/org/hammerlab/stats/Stats.scala | 88 +++- .../scala/org/hammerlab/stats/ShowTest.scala | 399 +++++++++++++++ .../scala/org/hammerlab/stats/StatsTest.scala | 456 +++--------------- 5 files changed, 553 insertions(+), 415 deletions(-) create mode 100644 src/test/scala/org/hammerlab/stats/ShowTest.scala diff --git a/src/main/scala/org/hammerlab/stats/Runs.scala b/src/main/scala/org/hammerlab/stats/Runs.scala index 84181d9..cb053dc 100644 --- a/src/main/scala/org/hammerlab/stats/Runs.scala +++ b/src/main/scala/org/hammerlab/stats/Runs.scala @@ -3,27 +3,36 @@ package org.hammerlab.stats import cats.Show import cats.Show.show import cats.implicits._ +import spire.implicits._ import spire.math.Integral /** * Convenience class wrapping a sequence of key-number pairs, used in run-length-encoding in [[Stats]]. */ -case class Runs[K, V: Integral](elems: Seq[(K, V)]) +case class Runs[K, V: Integral](elems: Seq[(K, V)], num: V) object Runs { implicit def runsToSeq[K, V: Integral](runs: Runs[K, V]): Seq[(K, V)] = runs.elems - implicit def seqToRuns[K, V: Integral](elems: Seq[(K, V)]): Runs[K, V] = Runs(elems) + implicit def seqToRuns[K, V: Integral](elems: Seq[(K, V)]): Runs[K, V] = apply[K, V](elems) + + def apply[K, V: Integral](elems: Seq[(K, V)]): Runs[K, V] = + Runs( + elems, + elems + .map(_._2) + .reduce(_ + _) + ) implicit def makeShow[K, V: Integral](implicit elemShow: Show[K], countShow: Show[V]): Show[Runs[K, V]] = show { - case Runs(elems) ⇒ + case Runs(elems, _) ⇒ elems .map { case (elem, count) ⇒ if (count == 1) elem.show else - s"${elem.show}×${count.show}" + show"$elem×$count" } .mkString(", ") } diff --git a/src/main/scala/org/hammerlab/stats/Samples.scala b/src/main/scala/org/hammerlab/stats/Samples.scala index 158d68f..b7d3379 100644 --- a/src/main/scala/org/hammerlab/stats/Samples.scala +++ b/src/main/scala/org/hammerlab/stats/Samples.scala @@ -18,9 +18,7 @@ import spire.math.Integral */ case class Samples[K, V: Integral](n: V, first: Runs[K, V], - numFirst: V, - last: Runs[K, V], - numLast: V) { + last: Runs[K, V]) { def isEmpty: Boolean = first.isEmpty def nonEmpty: Boolean = first.nonEmpty } @@ -28,8 +26,8 @@ case class Samples[K, V: Integral](n: V, object Samples { implicit def makeShow[K, V: Integral](implicit showRuns: Show[Runs[K, V]]): Show[Samples[K, V]] = show { - case Samples(n, first, numFirst, last, numLast) ⇒ - val numSampled = numFirst + numLast + case Samples(n, first, last) ⇒ + val numSampled = first.num + last.num val numSkipped = n - numSampled if (numSkipped > 0) s"${first.show}, …, ${last.show}" diff --git a/src/main/scala/org/hammerlab/stats/Stats.scala b/src/main/scala/org/hammerlab/stats/Stats.scala index 2b6e311..b684b11 100644 --- a/src/main/scala/org/hammerlab/stats/Stats.scala +++ b/src/main/scala/org/hammerlab/stats/Stats.scala @@ -5,10 +5,10 @@ import cats.Show.show import cats.instances.all.catsStdShowForString import cats.syntax.all._ import org.hammerlab.iterator.RunLengthIterator._ -import org.hammerlab.stats.Stats.makeShow import spire.implicits._ import spire.math.{ Integral, Numeric, Rational } +import scala.Double.NaN import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.math.{ abs, ceil, floor, sqrt } @@ -75,9 +75,8 @@ object Stats { vBuilder.result() } - if (values.isEmpty) { - return empty - } + if (values.isEmpty) + return Empty val sorted = if (alreadySorted) @@ -122,24 +121,30 @@ object Stats { val samplesOpt = if (alreadySorted || !onlySampleSorted) { val firstElems = values.take(numToSample) - val numFirstElems = firstElems.map(_._2).reduce(_ + _) - val lastElems = values.takeRight(numToSample) - val numLastElems = lastElems.map(_._2).reduce(_ + _) - Some(Samples[K, V](n, firstElems, numFirstElems, lastElems, numLastElems)) + Some( + Samples[K, V]( + n, + firstElems, + lastElems + ) + ) } else None val sortedSamplesOpt = if (!alreadySorted) { val leastElems = sorted.take(numToSample) - val numLeastElems = leastElems.map(_._2).reduce(_ + _) - val greatestElems = sorted.takeRight(numToSample) - val numGreatestElems = greatestElems.map(_._2).reduce(_ + _) - Some(Samples(n, leastElems, numLeastElems, greatestElems, numGreatestElems)) + Some( + Samples( + n, + leastElems, + greatestElems + ) + ) } else None @@ -179,6 +184,9 @@ object Stats { val values = vBuilder.result() + if (values.isEmpty) + return Empty + val n = values.length val sorted = @@ -208,25 +216,56 @@ object Stats { val samplesOpt: Option[Samples[K, Int]] = if (alreadySorted || !onlySampleSorted) { + // Count occurrences of the first N distinct values. - val (firstElems, numFirstElems) = runLengthEncodeWithSum(values.iterator, numToSample) + val (firstElems, numFirstElems) = + runLengthEncodeWithSum( + values.iterator, + numToSample + ) // Count occurrences of the last N distinct values. - val (lastElems, numLastElems) = runLengthEncodeWithSum(values.reverseIterator, numToSample, reverse = true) + val (lastElems, numLastElems) = + runLengthEncodeWithSum( + values.reverseIterator, + numToSample, + reverse = true + ) - Some(Samples(n, firstElems, numFirstElems, lastElems, numLastElems)) + Some( + Samples( + n, + Runs(firstElems, numFirstElems), + Runs(lastElems, numLastElems) + ) + ) } else None val sortedSamplesOpt: Option[Samples[K, Int]] = if (!alreadySorted) { // Count occurrences of the least N distinct values. - val (leastElems, numLeastElems) = runLengthEncodeWithSum[K](sorted.iterator, numToSample) + val (leastElems, numLeastElems) = + runLengthEncodeWithSum[K]( + sorted.iterator, + numToSample + ) // Count occurrences of the greatest N distinct values. - val (greatestElems, numGreatestElems) = runLengthEncodeWithSum(sorted.reverseIterator, numToSample, reverse = true) + val (greatestElems, numGreatestElems) = + runLengthEncodeWithSum( + sorted.reverseIterator, + numToSample, + reverse = true + ) - Some(Samples(n, leastElems, numLeastElems, greatestElems, numGreatestElems)) + Some( + Samples( + n, + Runs(leastElems, numLeastElems), + Runs(greatestElems, numGreatestElems) + ) + ) } else None @@ -267,7 +306,8 @@ object Stats { ps .iterator .buffered - ).toVector + ) + .toVector /** * Compute percentiles listed in `ps` of the data in `values`. @@ -348,7 +388,7 @@ object Stats { * @return pairs of (percentile, value). */ private def percentiles[T: Numeric](values: IndexedSeq[T]): Vector[(Rational, Double)] = { - val n = values.length - 1 + val n = values.length + 1 val denominators: Iterator[Int] = { lazy val pow10s: Stream[Int] = 100 #:: pow10s.map(_ * 10) @@ -360,8 +400,6 @@ object Stats { ) ++ pow10s.iterator // 1/99, .1/99.9, .01/99.99, … } - val nd = n.toDouble - denominators .takeWhile(_ <= n) .flatMap { @@ -369,10 +407,10 @@ object Stats { val loPercentile = Rational(100, d) val hiPercentile = 100 - loPercentile - val loFloor = n / d + val loFloor = n / d - 1 val loRemainder = n % d - val hiCeil = n - loFloor + val hiCeil = n - 2 - loFloor val (lo, hi) = if (loRemainder == 0) @@ -403,7 +441,7 @@ object Stats { private def getMedian[T: Numeric](sorted: Vector[T]): Double = { val n = sorted.length if (n == 0) - -1 + NaN else if (n % 2 == 0) (sorted(n / 2 - 1) + sorted(n / 2)).toDouble() / 2.0 else diff --git a/src/test/scala/org/hammerlab/stats/ShowTest.scala b/src/test/scala/org/hammerlab/stats/ShowTest.scala new file mode 100644 index 0000000..e1eca47 --- /dev/null +++ b/src/test/scala/org/hammerlab/stats/ShowTest.scala @@ -0,0 +1,399 @@ +package org.hammerlab.stats + +import cats.Show +import cats.instances.all.{ catsStdShowForInt, catsStdShowForLong } +import cats.syntax.all._ +import org.hammerlab.test.Suite +import spire.math.Numeric + +import scala.util.Random.{ nextInt, setSeed, shuffle } + +/** + * Test constructing [[Stats]] instances. + */ +class ShowTest extends Suite { + + setSeed(123L) + + def check[K : Numeric : Ordering : Show](input: Seq[K], lines: String*): Unit = + Stats(input).show should be( + lines.mkString("\n") + ) + + def check[K : Numeric : Ordering : Show](input: Seq[K], numToSample: Int, lines: String*): Unit = + Stats( + input, + numToSample + ) + .show should be( + lines.mkString("\n") + ) + + def check[K : Numeric : Ordering : Show](input: Seq[K], + numToSample: Int, + onlySampleSorted: Boolean, + lines: String*): Unit = + Stats( + input, + numToSample, + onlySampleSorted + ) + .show should be( + lines.mkString("\n") + ) + + test("empty") { + check[Int]( + Nil, + "(empty)" + ) + } + + test("0 to 0") { + check( + 0 to 0, + "num: 1, mean: 0, stddev: 0, mad: 0", + "elems: 0" + ) + } + + test("0 to 1") { + check( + 0 to 1, + "num: 2, mean: 0.5, stddev: 0.5, mad: 0.5", + "elems: 0, 1" + ) + } + + test("1 to 0") { + check( + 1 to 0 by -1, + "num: 2, mean: 0.5, stddev: 0.5, mad: 0.5", + "elems: 1, 0", + "sorted: 0, 1" + ) + } + + test("0 to 2") { + check( + 0 to 2, + "num: 3, mean: 1, stddev: 0.8, mad: 1", + "elems: 0, 1, 2", + "50: 1" + ) + } + + test("2 to 0") { + check( + 2 to 0 by -1, + "num: 3, mean: 1, stddev: 0.8, mad: 1", + "elems: 2, 1, 0", + "sorted: 0, 1, 2", + "50: 1" + ) + } + + test("0 to 3") { + check( + 0 to 3, + "num: 4, mean: 1.5, stddev: 1.1, mad: 1", + "elems: 0, 1, 2, 3", + "50: 1.5" + ) + } + + test("3 to 0") { + check( + 3 to 0 by -1, + "num: 4, mean: 1.5, stddev: 1.1, mad: 1", + "elems: 3, 2, 1, 0", + "sorted: 0, 1, 2, 3", + "50: 1.5" + ) + } + + test("1 to 3") { + check( + 1 to 3, + "num: 3, mean: 2, stddev: 0.8, mad: 1", + "elems: 1, 2, 3", + "25: 1", + "50: 2", + "75: 3" + ) + } + + test("3 to 1") { + check( + 3 to 1 by -1, + "num: 3, mean: 2, stddev: 0.8, mad: 1", + "elems: 3, 2, 1", + "sorted: 1, 2, 3", + "25: 1", + "50: 2", + "75: 3" + ) + } + + test("1 to 9") { + check( + 1 to 9, + "num: 9, mean: 5, stddev: 2.6, mad: 2", + "elems: 1, 2, 3, 4, 5, 6, 7, 8, 9", + "10: 1", + "25: 2.5", + "50: 5", + "75: 7.5", + "90: 9" + ) + } + + test("9 to 1") { + check( + 9 to 1 by -1, + "num: 9, mean: 5, stddev: 2.6, mad: 2", + "elems: 9, 8, 7, 6, 5, 4, 3, 2, 1", + "sorted: 1, 2, 3, 4, 5, 6, 7, 8, 9", + "10: 1", + "25: 2.5", + "50: 5", + "75: 7.5", + "90: 9" + ) + } + + val shuffled0to10 = shuffle(0 to 10).toArray + + test("0 to 10 sample 5") { + check( + shuffled0to10, + numToSample = 5, + "num: 11, mean: 5, stddev: 3.2, mad: 3", + "elems: 9, 3, 7, 1, 6, …, 4, 8, 2, 0, 10", + "sorted: 0, 1, 2, 3, 4, …, 6, 7, 8, 9, 10", + "10: 1", + "25: 2.5", + "50: 5", + "75: 7.5", + "90: 9" + ) + } + + test("0 to 10 sample 4") { + check( + shuffled0to10, + numToSample = 4, + "num: 11, mean: 5, stddev: 3.2, mad: 3", + "elems: 9, 3, 7, 1, …, 8, 2, 0, 10", + "sorted: 0, 1, 2, 3, …, 7, 8, 9, 10", + "10: 1", + "25: 2.5", + "50: 5", + "75: 7.5", + "90: 9" + ) + } + + test("0 to 10 sample 3") { + check( + shuffled0to10, + numToSample = 3, + "num: 11, mean: 5, stddev: 3.2, mad: 3", + "elems: 9, 3, 7, …, 2, 0, 10", + "sorted: 0, 1, 2, …, 8, 9, 10", + "10: 1", + "25: 2.5", + "50: 5", + "75: 7.5", + "90: 9" + ) + } + + test("0 to 10 sample 2") { + check( + shuffled0to10, + numToSample = 2, + "num: 11, mean: 5, stddev: 3.2, mad: 3", + "elems: 9, 3, …, 0, 10", + "sorted: 0, 1, …, 9, 10", + "10: 1", + "25: 2.5", + "50: 5", + "75: 7.5", + "90: 9" + ) + } + + test("0 to 10 sample 1") { + check( + shuffled0to10, + numToSample = 1, + "num: 11, mean: 5, stddev: 3.2, mad: 3", + "elems: 9, …, 10", + "sorted: 0, …, 10", + "10: 1", + "25: 2.5", + "50: 5", + "75: 7.5", + "90: 9" + ) + } + + test("0 to 10 sample 0") { + check( + shuffled0to10, + numToSample = 0, + "num: 11, mean: 5, stddev: 3.2, mad: 3", + "10: 1", + "25: 2.5", + "50: 5", + "75: 7.5", + "90: 9" + ) + } + + test("0 to 100") { + check( + 0 to 100, + "num: 101, mean: 50, stddev: 29.2, mad: 25", + "elems: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, …, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100", + "1: 1", + "5: 5", + "10: 10", + "25: 25", + "50: 50", + "75: 75", + "90: 90", + "95: 95", + "99: 99" + ) + } + + test("100 to 0") { + check( + 100 to 0 by -1, + "num: 101, mean: 50, stddev: 29.2, mad: 25", + "elems: 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, …, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0", + "sorted: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, …, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100", + "1: 1", + "5: 5", + "10: 10", + "25: 25", + "50: 50", + "75: 75", + "90: 90", + "95: 95", + "99: 99" + ) + } + + val shuffledDigits = (0 until 100).map(_ ⇒ nextInt(10)) + + test("100 digits") { + check( + shuffledDigits, + "num: 100, mean: 4.3, stddev: 2.9, mad: 2", + "elems: 9, 6, 2, 5, 7, 9, 0, 5, 4, 6, …, 1, 9, 0×2, 8, 0, 7×2, 0, 6, 2, 4", + "sorted: 0×15, 1×7, 2×9, 3×10, 4×10, 5×11, 6×11, 7×9, 8×9, 9×9", + "5: 0", + "10: 0", + "25: 2", + "50: 4", + "75: 7", + "90: 8", + "95: 9" + + ) + } + + test("100 digits sample 4") { + check( + shuffledDigits, + numToSample = 4, + "num: 100, mean: 4.3, stddev: 2.9, mad: 2", + "elems: 9, 6, 2, 5, …, 0, 6, 2, 4", + "sorted: 0×15, 1×7, 2×9, 3×10, …, 6×11, 7×9, 8×9, 9×9", + "5: 0", + "10: 0", + "25: 2", + "50: 4", + "75: 7", + "90: 8", + "95: 9" + ) + } + + test("100 digits sample 4 only sample sorted") { + check( + shuffledDigits, + numToSample = 4, + onlySampleSorted = true, + "num: 100, mean: 4.3, stddev: 2.9, mad: 2", + "sorted: 0×15, 1×7, 2×9, 3×10, …, 6×11, 7×9, 8×9, 9×9", + "5: 0", + "10: 0", + "25: 2", + "50: 4", + "75: 7", + "90: 8", + "95: 9" + ) + } + + val sortedShuffledDigits = shuffledDigits.sorted + + test("100 sorted digits") { + check( + sortedShuffledDigits, + "num: 100, mean: 4.3, stddev: 2.9, mad: 2", + "elems: 0×15, 1×7, 2×9, 3×10, 4×10, 5×11, 6×11, 7×9, 8×9, 9×9", + "5: 0", + "10: 0", + "25: 2", + "50: 4", + "75: 7", + "90: 8", + "95: 9" + ) + } + + test("100 sorted digits only sample sorted overridden") { + check( + sortedShuffledDigits, + numToSample = 4, + onlySampleSorted = true, + "num: 100, mean: 4.3, stddev: 2.9, mad: 2", + "elems: 0×15, 1×7, 2×9, 3×10, …, 6×11, 7×9, 8×9, 9×9", + "5: 0", + "10: 0", + "25: 2", + "50: 4", + "75: 7", + "90: 8", + "95: 9" + ) + } + + test("values over Int.MAX_VALUE") { + check( + Seq( + 10000000000L, + 100000000000L, + 100000000000L, + 1000000000000L, + 1000000000000L, + 10000000000L, + 1000000000000L, + 100000000000L, + 10000000000L, + 10000000000L + ), + "num: 10, mean: 334000000000, stddev: 437588848121.2, mad: 90000000000", + "elems: 10000000000, 100000000000×2, 1000000000000×2, 10000000000, 1000000000000, 100000000000, 10000000000×2", + "sorted: 10000000000×4, 100000000000×3, 1000000000000×3", + "25: 10000000000", + "50: 100000000000", + "75: 325000000000" + ) + } +} diff --git a/src/test/scala/org/hammerlab/stats/StatsTest.scala b/src/test/scala/org/hammerlab/stats/StatsTest.scala index 53c1757..331d3e8 100644 --- a/src/test/scala/org/hammerlab/stats/StatsTest.scala +++ b/src/test/scala/org/hammerlab/stats/StatsTest.scala @@ -1,400 +1,94 @@ package org.hammerlab.stats -import cats.Show -import cats.instances.all.{ catsStdShowForInt, catsStdShowForLong } -import cats.syntax.all._ +import Double.NaN import org.hammerlab.test.Suite +import org.scalactic.Equality +import shapeless.{ Generic, HNil } import spire.math.Numeric -import scala.util.Random -import scala.util.Random.shuffle - -/** - * Test constructing [[Stats]] instances. - */ -class StatsTest extends Suite { - - Random.setSeed(123L) - - def check[K : Numeric : Ordering : Show](input: Seq[K], lines: String*): Unit = - Stats(input).show should be( - lines.mkString("\n") - ) - - def check[K : Numeric : Ordering : Show](input: Seq[K], numToSample: Int, lines: String*): Unit = +class StatsTest + extends Suite { + + implicit val de = + new Equality[Double] { + override def areEqual(a: Double, b: Any): Boolean = + b match { + case d: Double ⇒ + if (a.isNaN && d.isNaN) true + else a == d + case _ ⇒ + false + } + } + + def check[K : Numeric : Ordering](input: Seq[K], + expected: Stats[K, Int]): Unit = + Stats(input) should ===(expected) + + def check[K : Numeric : Ordering](input: Seq[K], + numToSample: Int, + expected: Stats[K, Int]): Unit = Stats( input, numToSample - ) - .show should be( - lines.mkString("\n") + ) should be( + expected ) - def check[K : Numeric : Ordering : Show](input: Seq[K], - numToSample: Int, - onlySampleSorted: Boolean, - lines: String*): Unit = + def check[K : Numeric : Ordering](input: Seq[K], + numToSample: Int, + onlySampleSorted: Boolean, + expected: Stats[K, Int]): Unit = Stats( input, numToSample, onlySampleSorted - ) - .show should be( - lines.mkString("\n") + ) should be( + expected ) test("empty") { - check[Int]( - Nil, - "(empty)" - ) - } - - test("0 to 0") { - check( - 0 to 0, - "num: 1, mean: 0, stddev: 0, mad: 0", - "elems: 0" - ) - } - - test("0 to 1") { - check( - 0 to 1, - "num: 2, mean: 0.5, stddev: 0.5, mad: 0.5", - "elems: 0, 1" - ) - } - - test("1 to 0") { - check( - 1 to 0 by -1, - "num: 2, mean: 0.5, stddev: 0.5, mad: 0.5", - "elems: 1, 0", - "sorted: 0, 1" - ) - } - - test("0 to 2") { - check( - 0 to 2, - "num: 3, mean: 1, stddev: 0.8, mad: 1", - "elems: 0, 1, 2", - "50: 1" - ) - } - - test("2 to 0") { - check( - 2 to 0 by -1, - "num: 3, mean: 1, stddev: 0.8, mad: 1", - "elems: 2, 1, 0", - "sorted: 0, 1, 2", - "50: 1" - ) - } - - test("0 to 3") { - check( - 0 to 3, - "num: 4, mean: 1.5, stddev: 1.1, mad: 1", - "elems: 0, 1, 2, 3", - "50: 1.5" - ) - } - - test("3 to 0") { - check( - 3 to 0 by -1, - "num: 4, mean: 1.5, stddev: 1.1, mad: 1", - "elems: 3, 2, 1, 0", - "sorted: 0, 1, 2, 3", - "50: 1.5" - ) - } - - test("0 to 4") { - check( - 0 to 4, - "num: 5, mean: 2, stddev: 1.4, mad: 1", - "elems: 0, 1, 2, 3, 4", - "25: 1", - "50: 2", - "75: 3" - ) - } + val stats = Stats[Int](Nil) + val hl = Generic[Stats[Int, Int]].to(stats) + hl should be(0 :: NaN :: NaN :: NaN :: NaN :: None :: None :: List() :: HNil) +// check[Int]( +// Nil, +// Stats.empty[Int, Int] +// ) + } + +// test("0 to 0") { +// check( +// 0 to 0, +// "num: 1, mean: 0, stddev: 0, mad: 0", +// "elems: 0" +// ) +// } +// +// test("0 to 1") { +// check( +// 0 to 1, +// "num: 2, mean: 0.5, stddev: 0.5, mad: 0.5", +// "elems: 0, 1" +// ) +// } +// +// test("1 to 0") { +// check( +// 1 to 0 by -1, +// "num: 2, mean: 0.5, stddev: 0.5, mad: 0.5", +// "elems: 1, 0", +// "sorted: 0, 1" +// ) +// } +// +// test("0 to 2") { +// check( +// 0 to 2, +// "num: 3, mean: 1, stddev: 0.8, mad: 1", +// "elems: 0, 1, 2", +// "50: 1" +// ) +// } - test("4 to 0") { - check( - 4 to 0 by -1, - "num: 5, mean: 2, stddev: 1.4, mad: 1", - "elems: 4, 3, 2, 1, 0", - "sorted: 0, 1, 2, 3, 4", - "25: 1", - "50: 2", - "75: 3" - ) - } - - test("0 to 10") { - check( - 0 to 10, - "num: 11, mean: 5, stddev: 3.2, mad: 3", - "elems: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - test("10 to 0") { - check( - 10 to 0 by -1, - "num: 11, mean: 5, stddev: 3.2, mad: 3", - "elems: 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0", - "sorted: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - val shuffled0to10 = shuffle(0 to 10).toArray - - test("0 to 10 sample 5") { - check( - shuffled0to10, - numToSample = 5, - "num: 11, mean: 5, stddev: 3.2, mad: 3", - "elems: 9, 3, 7, 1, 6, …, 4, 8, 2, 0, 10", - "sorted: 0, 1, 2, 3, 4, …, 6, 7, 8, 9, 10", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - test("0 to 10 sample 4") { - check( - shuffled0to10, - numToSample = 4, - "num: 11, mean: 5, stddev: 3.2, mad: 3", - "elems: 9, 3, 7, 1, …, 8, 2, 0, 10", - "sorted: 0, 1, 2, 3, …, 7, 8, 9, 10", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - test("0 to 10 sample 3") { - check( - shuffled0to10, - numToSample = 3, - "num: 11, mean: 5, stddev: 3.2, mad: 3", - "elems: 9, 3, 7, …, 2, 0, 10", - "sorted: 0, 1, 2, …, 8, 9, 10", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - test("0 to 10 sample 2") { - check( - shuffled0to10, - numToSample = 2, - "num: 11, mean: 5, stddev: 3.2, mad: 3", - "elems: 9, 3, …, 0, 10", - "sorted: 0, 1, …, 9, 10", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - test("0 to 10 sample 1") { - check( - shuffled0to10, - numToSample = 1, - "num: 11, mean: 5, stddev: 3.2, mad: 3", - "elems: 9, …, 10", - "sorted: 0, …, 10", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - test("0 to 10 sample 0") { - check( - shuffled0to10, - numToSample = 0, - "num: 11, mean: 5, stddev: 3.2, mad: 3", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - test("0 to 100") { - check( - 0 to 100, - "num: 101, mean: 50, stddev: 29.2, mad: 25", - "elems: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, …, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100", - "1: 1", - "5: 5", - "10: 10", - "25: 25", - "50: 50", - "75: 75", - "90: 90", - "95: 95", - "99: 99" - ) - } - - test("100 to 0") { - check( - 100 to 0 by -1, - "num: 101, mean: 50, stddev: 29.2, mad: 25", - "elems: 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, …, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0", - "sorted: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, …, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100", - "1: 1", - "5: 5", - "10: 10", - "25: 25", - "50: 50", - "75: 75", - "90: 90", - "95: 95", - "99: 99" - ) - } - - val shuffledDigits = (0 until 100).map(_ ⇒ Random.nextInt(10)) - - test("100 digits") { - check( - shuffledDigits, - "num: 100, mean: 4.3, stddev: 2.9, mad: 2", - "elems: 9, 6, 2, 5, 7, 9, 0, 5, 4, 6, …, 1, 9, 0×2, 8, 0, 7×2, 0, 6, 2, 4", - "sorted: 0×15, 1×7, 2×9, 3×10, 4×10, 5×11, 6×11, 7×9, 8×9, 9×9", - "5: 0", - "10: 0", - "25: 2", - "50: 4", - "75: 7", - "90: 8", - "95: 9" - - ) - } - - test("100 digits sample 4") { - check( - shuffledDigits, - numToSample = 4, - "num: 100, mean: 4.3, stddev: 2.9, mad: 2", - "elems: 9, 6, 2, 5, …, 0, 6, 2, 4", - "sorted: 0×15, 1×7, 2×9, 3×10, …, 6×11, 7×9, 8×9, 9×9", - "5: 0", - "10: 0", - "25: 2", - "50: 4", - "75: 7", - "90: 8", - "95: 9" - ) - } - - test("100 digits sample 4 only sample sorted") { - check( - shuffledDigits, - numToSample = 4, - onlySampleSorted = true, - "num: 100, mean: 4.3, stddev: 2.9, mad: 2", - "sorted: 0×15, 1×7, 2×9, 3×10, …, 6×11, 7×9, 8×9, 9×9", - "5: 0", - "10: 0", - "25: 2", - "50: 4", - "75: 7", - "90: 8", - "95: 9" - ) - } - - val sortedShuffledDigits = shuffledDigits.sorted - - test("100 sorted digits") { - check( - sortedShuffledDigits, - "num: 100, mean: 4.3, stddev: 2.9, mad: 2", - "elems: 0×15, 1×7, 2×9, 3×10, 4×10, 5×11, 6×11, 7×9, 8×9, 9×9", - "5: 0", - "10: 0", - "25: 2", - "50: 4", - "75: 7", - "90: 8", - "95: 9" - ) - } - - test("100 sorted digits only sample sorted overridden") { - check( - sortedShuffledDigits, - numToSample = 4, - onlySampleSorted = true, - "num: 100, mean: 4.3, stddev: 2.9, mad: 2", - "elems: 0×15, 1×7, 2×9, 3×10, …, 6×11, 7×9, 8×9, 9×9", - "5: 0", - "10: 0", - "25: 2", - "50: 4", - "75: 7", - "90: 8", - "95: 9" - ) - } - - test("values over Int.MAX_VALUE") { - check( - Seq( - 10000000000L, - 100000000000L, - 100000000000L, - 1000000000000L, - 1000000000000L, - 10000000000L, - 1000000000000L, - 100000000000L, - 10000000000L, - 10000000000L - ), - "num: 10, mean: 334000000000, stddev: 437588848121.2, mad: 90000000000", - "elems: 10000000000, 100000000000×2, 1000000000000×2, 10000000000, 1000000000000, 100000000000, 10000000000×2", - "sorted: 10000000000×4, 100000000000×3, 1000000000000×3", - "25: 10000000000", - "50: 100000000000", - "75: 325000000000" - ) - } } From a683a38450bbf26ff081df4d2ff46b6cdba574ac Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Wed, 2 Aug 2017 23:43:50 +0000 Subject: [PATCH 14/20] update Stats percentile tests, handle empty stats better --- .../scala/org/hammerlab/math/package.scala | 6 +- .../scala/org/hammerlab/stats/Samples.scala | 2 - .../scala/org/hammerlab/stats/Stats.scala | 291 ++++++++---------- .../scala/org/hammerlab/types/package.scala | 11 + ...StatsHistTest.scala => HistShowTest.scala} | 2 +- .../scala/org/hammerlab/stats/ShowTest.scala | 197 ++++++------ .../scala/org/hammerlab/stats/StatsTest.scala | 153 ++++++--- 7 files changed, 344 insertions(+), 318 deletions(-) create mode 100644 src/main/scala/org/hammerlab/types/package.scala rename src/test/scala/org/hammerlab/stats/{StatsHistTest.scala => HistShowTest.scala} (99%) diff --git a/src/main/scala/org/hammerlab/math/package.scala b/src/main/scala/org/hammerlab/math/package.scala index b28f395..d27932c 100644 --- a/src/main/scala/org/hammerlab/math/package.scala +++ b/src/main/scala/org/hammerlab/math/package.scala @@ -1,6 +1,7 @@ package org.hammerlab -import spire.math.Integral +import spire.implicits._ +import spire.math._ package object math { /** @@ -15,4 +16,7 @@ package object math { ) ) } + + def interpolate[N: Numeric](start: N, end: N, delta: Double): Double = + start.toDouble() + delta * (end - start).toDouble() } diff --git a/src/main/scala/org/hammerlab/stats/Samples.scala b/src/main/scala/org/hammerlab/stats/Samples.scala index b7d3379..f14aaef 100644 --- a/src/main/scala/org/hammerlab/stats/Samples.scala +++ b/src/main/scala/org/hammerlab/stats/Samples.scala @@ -10,9 +10,7 @@ import spire.math.Integral * Used by [[Stats]] to wrap some [[Runs]] of elements from the start and end of a dataset. * @param n total number of elements in the dataset. * @param first [[Runs]] of elements from the start of the dataset. - * @param numFirst the number of elements represented by the [[Runs]] in [[first]], i.e. the sum of the their values. * @param last [[Runs]] of elements from the end of the dataset. - * @param numLast the number of elements represented by the [[Runs]] in [[last]], i.e. the sum of the their values. * @tparam K arbitrary element type * @tparam V [[Integral]] type, e.g. [[Int]] or [[Long]]. */ diff --git a/src/main/scala/org/hammerlab/stats/Stats.scala b/src/main/scala/org/hammerlab/stats/Stats.scala index b684b11..0e8e1f7 100644 --- a/src/main/scala/org/hammerlab/stats/Stats.scala +++ b/src/main/scala/org/hammerlab/stats/Stats.scala @@ -5,6 +5,8 @@ import cats.Show.show import cats.instances.all.catsStdShowForString import cats.syntax.all._ import org.hammerlab.iterator.RunLengthIterator._ +import org.hammerlab.math.interpolate +import org.hammerlab.types._ import spire.implicits._ import spire.math.{ Integral, Numeric, Rational } @@ -13,6 +15,78 @@ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.math.{ abs, ceil, floor, sqrt } +sealed abstract class StatsI[K: Numeric, V: Integral] + +object StatsI { + implicit def makeShow[ + K : Numeric : Show, + V: Integral : Show + ]( + implicit + percentileShow: Show[Rational] = showPercentile, + statShow: Show[Double] = showDouble + ): Show[StatsI[K, V]] = + show { + case Empty() ⇒ "(empty)" + case Stats(n, mean, stddev, _, mad, samplesOpt, sortedSamplesOpt, percentiles) ⇒ + def pair[L: Show, R: Show](l: L, r: R): String = + show"$l:\t$r" + + val strings = ArrayBuffer[String]() + + strings += + List( + pair("num", n), + pair("mean", mean), + pair("stddev", stddev), + pair("mad", mad) + ) + .mkString(",\t") + + for { + samples ← samplesOpt + if samples.nonEmpty + } { + strings += pair("elems", samples) + } + + for { + sortedSamples ← sortedSamplesOpt + if sortedSamples.nonEmpty + } { + strings += pair("sorted", sortedSamples) + } + + strings ++= + percentiles.map { + case (k, v) ⇒ + pair(k, v) + } + + strings.mkString("\n") + } + + def showDouble: Show[Double] = + show( + d ⇒ + if (floor(d).toLong == ceil(d).toLong) + d.toLong.toString + else + "%.1f".format(d) + ) + + def showPercentile: Show[Rational] = + show( + r ⇒ + if (r.isWhole()) + r.toLong.toString + else + r.toDouble.toString + ) +} + +case class Empty[K: Numeric, V: Integral]() extends StatsI[K, V] + /** * Wrapper for some computed statistics about a dataset of [[Numeric]] elements. * @@ -33,6 +107,7 @@ case class Stats[K: Numeric, V: Integral](n: V, samplesOpt: Option[Samples[K, V]], sortedSamplesOpt: Option[Samples[K, V]], percentiles: Seq[(Rational, Double)]) + extends StatsI[K, V] /** * Helpers for constructing [[Stats]] / computing the statistics that populate a [[Stats]] instance. @@ -49,7 +124,7 @@ object Stats { */ def fromHist[K: Numeric: Ordering, V: Integral](v: Iterable[(K, V)], numToSample: Int = 10, - onlySampleSorted: Boolean = false): Stats[K, V] = { + onlySampleSorted: Boolean = false): StatsI[K, V] = { var alreadySorted = true val hist = mutable.HashMap[K, V]() @@ -76,7 +151,7 @@ object Stats { } if (values.isEmpty) - return Empty + return Empty[K, V]() val sorted = if (alreadySorted) @@ -87,9 +162,9 @@ object Stats { } yield key → hist(key) - val ps = histPercentiles(n, sorted) + val percentiles = histPercentiles(n, sorted) - val median = ps(ps.length / 2)._2 + val median = percentiles(percentiles.length / 2)._2 val medianDeviationsBuilder = Vector.newBuilder[(Double, V)] @@ -118,35 +193,20 @@ object Stats { val mean = sum / n.toDouble() val stddev = sqrt(sumSquares / n.toDouble() - mean * mean) + def samples(vs: Vector[(K, V)]): Samples[K, V] = + Samples[K, V]( + n, + vs.take(numToSample), + vs.takeRight(numToSample) + ) + val samplesOpt = - if (alreadySorted || !onlySampleSorted) { - val firstElems = values.take(numToSample) - val lastElems = values.takeRight(numToSample) - - Some( - Samples[K, V]( - n, - firstElems, - lastElems - ) - ) - } else - None + (alreadySorted || !onlySampleSorted) | + samples(values) val sortedSamplesOpt = - if (!alreadySorted) { - val leastElems = sorted.take(numToSample) - val greatestElems = sorted.takeRight(numToSample) - - Some( - Samples( - n, - leastElems, - greatestElems - ) - ) - } else - None + !alreadySorted | + samples(sorted) Stats( n, @@ -154,7 +214,7 @@ object Stats { median, mad, samplesOpt, sortedSamplesOpt, - ps + percentiles ) } @@ -167,7 +227,7 @@ object Stats { */ def apply[K: Numeric: Ordering](v: Iterable[K], numToSample: Int = 10, - onlySampleSorted: Boolean = false): Stats[K, Int] = { + onlySampleSorted: Boolean = false): StatsI[K, Int] = { val vBuilder = Vector.newBuilder[K] var alreadySorted = true @@ -185,7 +245,7 @@ object Stats { val values = vBuilder.result() if (values.isEmpty) - return Empty + return Empty[K, Int]() val n = values.length @@ -214,60 +274,36 @@ object Stats { val mean = sum / n val stddev = sqrt(sumSquares / n - mean * mean) - val samplesOpt: Option[Samples[K, Int]] = - if (alreadySorted || !onlySampleSorted) { - - // Count occurrences of the first N distinct values. - val (firstElems, numFirstElems) = - runLengthEncodeWithSum( - values.iterator, - numToSample - ) - - // Count occurrences of the last N distinct values. - val (lastElems, numLastElems) = - runLengthEncodeWithSum( - values.reverseIterator, - numToSample, - reverse = true - ) + def samples(vs: Vector[K]): Samples[K, Int] = { + // Count occurrences of the first N distinct values. + val (firstElems, numFirstElems) = + runLengthEncodeWithSum( + vs.iterator, + numToSample + ) - Some( - Samples( - n, - Runs(firstElems, numFirstElems), - Runs(lastElems, numLastElems) - ) + // Count occurrences of the last N distinct values. + val (lastElems, numLastElems) = + runLengthEncodeWithSum( + vs.reverseIterator, + numToSample, + reverse = true ) - } else - None - - val sortedSamplesOpt: Option[Samples[K, Int]] = - if (!alreadySorted) { - // Count occurrences of the least N distinct values. - val (leastElems, numLeastElems) = - runLengthEncodeWithSum[K]( - sorted.iterator, - numToSample - ) - // Count occurrences of the greatest N distinct values. - val (greatestElems, numGreatestElems) = - runLengthEncodeWithSum( - sorted.reverseIterator, - numToSample, - reverse = true - ) + Samples( + n, + Runs(firstElems, numFirstElems), + Runs(lastElems, numLastElems) + ) + } - Some( - Samples( - n, - Runs(leastElems, numLeastElems), - Runs(greatestElems, numGreatestElems) - ) - ) - } else - None + val samplesOpt = + (alreadySorted || !onlySampleSorted) | + samples(values) + + val sortedSamplesOpt = + !alreadySorted | + samples(sorted) new Stats( n, @@ -279,21 +315,6 @@ object Stats { ) } - /** - * Construct an empty [[Stats]] instance. - */ - private def empty[K: Numeric, V: Integral]: Stats[K, V] = - new Stats( - n = Integral[V].zero, - mean = 0, - stddev = 0, - median = 0, - mad = 0, - samplesOpt = None, - sortedSamplesOpt = None, - percentiles = Nil - ) - /** * Compute percentiles listed in `ps` of the data in `values`; wrapper for implementation below. */ @@ -421,8 +442,8 @@ object Stats { else { val floorWeight = loRemainder.toDouble() / d ( - values(loFloor).toDouble() * floorWeight + values(loFloor + 1).toDouble() * (1 - floorWeight), - values( hiCeil).toDouble() * floorWeight + values( hiCeil - 1).toDouble() * (1 - floorWeight) + interpolate(values(loFloor), values(loFloor + 1), floorWeight), + interpolate(values( hiCeil), values( hiCeil - 1), floorWeight) ) } @@ -471,74 +492,4 @@ object Stats { } runs → sum } - - implicit def makeShow[ - K : Numeric : Show, - V: Integral : Show - ]( - implicit - percentileShow: Show[Rational] = showPercentile, - statShow: Show[Double] = showDouble - ): Show[Stats[K, V]] = - show { - case Stats(n, mean, stddev, median, mad, samplesOpt, sortedSamplesOpt, percentiles) ⇒ - if (n == 0) - "(empty)" - else { - - def pair[L: Show, R: Show](l: L, r: R): String = - show"$l:\t$r" - - val strings = ArrayBuffer[String]() - - strings += - List( - pair("num", n), - pair("mean", mean), - pair("stddev", stddev), - pair("mad", mad) - ) - .mkString(",\t") - - for { - samples ← samplesOpt - if samples.nonEmpty - } { - strings += pair("elems", samples) - } - - for { - sortedSamples ← sortedSamplesOpt - if sortedSamples.nonEmpty - } { - strings += pair("sorted", sortedSamples) - } - - strings ++= - percentiles.map { - case (k, v) ⇒ - pair(k, v) - } - - strings.mkString("\n") - } - } - - def showDouble: Show[Double] = - show( - d ⇒ - if (floor(d).toLong == ceil(d).toLong) - d.toLong.toString - else - "%.1f".format(d) - ) - - def showPercentile: Show[Rational] = - show( - r ⇒ - if (r.isWhole()) - r.toLong.toString - else - r.toDouble.toString - ) } diff --git a/src/main/scala/org/hammerlab/types/package.scala b/src/main/scala/org/hammerlab/types/package.scala new file mode 100644 index 0000000..4c28b0c --- /dev/null +++ b/src/main/scala/org/hammerlab/types/package.scala @@ -0,0 +1,11 @@ +package org.hammerlab + +package object types { + implicit class BoolOps(val b: Boolean) extends AnyVal { + def |[A](a: ⇒ A): Option[A] = + if (b) + Some(a) + else + None + } +} diff --git a/src/test/scala/org/hammerlab/stats/StatsHistTest.scala b/src/test/scala/org/hammerlab/stats/HistShowTest.scala similarity index 99% rename from src/test/scala/org/hammerlab/stats/StatsHistTest.scala rename to src/test/scala/org/hammerlab/stats/HistShowTest.scala index 91db115..8840f39 100644 --- a/src/test/scala/org/hammerlab/stats/StatsHistTest.scala +++ b/src/test/scala/org/hammerlab/stats/HistShowTest.scala @@ -15,7 +15,7 @@ import scala.util.Random * with an associated repetition count, which allows the total number of elements represented to be much larger * ([[Long]] vs. [[Int]]). */ -class StatsHistTest extends Suite { +class HistShowTest extends Suite { Random.setSeed(123L) diff --git a/src/test/scala/org/hammerlab/stats/ShowTest.scala b/src/test/scala/org/hammerlab/stats/ShowTest.scala index e1eca47..a183b0c 100644 --- a/src/test/scala/org/hammerlab/stats/ShowTest.scala +++ b/src/test/scala/org/hammerlab/stats/ShowTest.scala @@ -53,7 +53,8 @@ class ShowTest extends Suite { check( 0 to 0, "num: 1, mean: 0, stddev: 0, mad: 0", - "elems: 0" + "elems: 0", + "50: 0" ) } @@ -61,7 +62,8 @@ class ShowTest extends Suite { check( 0 to 1, "num: 2, mean: 0.5, stddev: 0.5, mad: 0.5", - "elems: 0, 1" + "elems: 0, 1", + "50: 0.5" ) } @@ -70,7 +72,8 @@ class ShowTest extends Suite { 1 to 0 by -1, "num: 2, mean: 0.5, stddev: 0.5, mad: 0.5", "elems: 1, 0", - "sorted: 0, 1" + "sorted: 0, 1", + "50: 0.5" ) } @@ -79,7 +82,9 @@ class ShowTest extends Suite { 0 to 2, "num: 3, mean: 1, stddev: 0.8, mad: 1", "elems: 0, 1, 2", - "50: 1" + "25: 0", + "50: 1", + "75: 2" ) } @@ -89,7 +94,9 @@ class ShowTest extends Suite { "num: 3, mean: 1, stddev: 0.8, mad: 1", "elems: 2, 1, 0", "sorted: 0, 1, 2", - "50: 1" + "25: 0", + "50: 1", + "75: 2" ) } @@ -98,7 +105,9 @@ class ShowTest extends Suite { 0 to 3, "num: 4, mean: 1.5, stddev: 1.1, mad: 1", "elems: 0, 1, 2, 3", - "50: 1.5" + "25: 0.3", + "50: 1.5", + "75: 2.8" ) } @@ -108,30 +117,9 @@ class ShowTest extends Suite { "num: 4, mean: 1.5, stddev: 1.1, mad: 1", "elems: 3, 2, 1, 0", "sorted: 0, 1, 2, 3", - "50: 1.5" - ) - } - - test("1 to 3") { - check( - 1 to 3, - "num: 3, mean: 2, stddev: 0.8, mad: 1", - "elems: 1, 2, 3", - "25: 1", - "50: 2", - "75: 3" - ) - } - - test("3 to 1") { - check( - 3 to 1 by -1, - "num: 3, mean: 2, stddev: 0.8, mad: 1", - "elems: 3, 2, 1", - "sorted: 1, 2, 3", - "25: 1", - "50: 2", - "75: 3" + "25: 0.3", + "50: 1.5", + "75: 2.8" ) } @@ -162,15 +150,15 @@ class ShowTest extends Suite { ) } - val shuffled0to10 = shuffle(0 to 10).toArray + val shuffled1to9 = shuffle(1 to 9).toArray - test("0 to 10 sample 5") { + test("1 to 9 sample 5") { check( - shuffled0to10, + shuffled1to9, numToSample = 5, - "num: 11, mean: 5, stddev: 3.2, mad: 3", - "elems: 9, 3, 7, 1, 6, …, 4, 8, 2, 0, 10", - "sorted: 0, 1, 2, 3, 4, …, 6, 7, 8, 9, 10", + "num: 9, mean: 5, stddev: 2.6, mad: 2", + "elems: 8, 4, 5, 3, 1, 6, 7, 2, 9", + "sorted: 1, 2, 3, 4, 5, 6, 7, 8, 9", "10: 1", "25: 2.5", "50: 5", @@ -179,13 +167,13 @@ class ShowTest extends Suite { ) } - test("0 to 10 sample 4") { + test("1 to 9 sample 4") { check( - shuffled0to10, + shuffled1to9, numToSample = 4, - "num: 11, mean: 5, stddev: 3.2, mad: 3", - "elems: 9, 3, 7, 1, …, 8, 2, 0, 10", - "sorted: 0, 1, 2, 3, …, 7, 8, 9, 10", + "num: 9, mean: 5, stddev: 2.6, mad: 2", + "elems: 8, 4, 5, 3, …, 6, 7, 2, 9", + "sorted: 1, 2, 3, 4, …, 6, 7, 8, 9", "10: 1", "25: 2.5", "50: 5", @@ -194,13 +182,13 @@ class ShowTest extends Suite { ) } - test("0 to 10 sample 3") { + test("1 to 9 sample 3") { check( - shuffled0to10, + shuffled1to9, numToSample = 3, - "num: 11, mean: 5, stddev: 3.2, mad: 3", - "elems: 9, 3, 7, …, 2, 0, 10", - "sorted: 0, 1, 2, …, 8, 9, 10", + "num: 9, mean: 5, stddev: 2.6, mad: 2", + "elems: 8, 4, 5, …, 7, 2, 9", + "sorted: 1, 2, 3, …, 7, 8, 9", "10: 1", "25: 2.5", "50: 5", @@ -209,13 +197,13 @@ class ShowTest extends Suite { ) } - test("0 to 10 sample 2") { + test("1 to 9 sample 2") { check( - shuffled0to10, + shuffled1to9, numToSample = 2, - "num: 11, mean: 5, stddev: 3.2, mad: 3", - "elems: 9, 3, …, 0, 10", - "sorted: 0, 1, …, 9, 10", + "num: 9, mean: 5, stddev: 2.6, mad: 2", + "elems: 8, 4, …, 2, 9", + "sorted: 1, 2, …, 8, 9", "10: 1", "25: 2.5", "50: 5", @@ -224,13 +212,13 @@ class ShowTest extends Suite { ) } - test("0 to 10 sample 1") { + test("1 to 9 sample 1") { check( - shuffled0to10, + shuffled1to9, numToSample = 1, - "num: 11, mean: 5, stddev: 3.2, mad: 3", - "elems: 9, …, 10", - "sorted: 0, …, 10", + "num: 9, mean: 5, stddev: 2.6, mad: 2", + "elems: 8, …, 9", + "sorted: 1, …, 9", "10: 1", "25: 2.5", "50: 5", @@ -239,11 +227,11 @@ class ShowTest extends Suite { ) } - test("0 to 10 sample 0") { + test("1 to 9 sample 0") { check( - shuffled0to10, + shuffled1to9, numToSample = 0, - "num: 11, mean: 5, stddev: 3.2, mad: 3", + "num: 9, mean: 5, stddev: 2.6, mad: 2", "10: 1", "25: 2.5", "50: 5", @@ -252,11 +240,11 @@ class ShowTest extends Suite { ) } - test("0 to 100") { + test("1 to 99") { check( - 0 to 100, - "num: 101, mean: 50, stddev: 29.2, mad: 25", - "elems: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, …, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100", + 1 to 99, + "num: 99, mean: 50, stddev: 28.6, mad: 25", + "elems: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, …, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99", "1: 1", "5: 5", "10: 10", @@ -269,12 +257,12 @@ class ShowTest extends Suite { ) } - test("100 to 0") { + test("99 to 1") { check( - 100 to 0 by -1, - "num: 101, mean: 50, stddev: 29.2, mad: 25", - "elems: 100, 99, 98, 97, 96, 95, 94, 93, 92, 91, …, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0", - "sorted: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, …, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100", + 99 to 1 by -1, + "num: 99, mean: 50, stddev: 28.6, mad: 25", + "elems: 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, …, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1", + "sorted: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, …, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99", "1: 1", "5: 5", "10: 10", @@ -292,17 +280,18 @@ class ShowTest extends Suite { test("100 digits") { check( shuffledDigits, - "num: 100, mean: 4.3, stddev: 2.9, mad: 2", - "elems: 9, 6, 2, 5, 7, 9, 0, 5, 4, 6, …, 1, 9, 0×2, 8, 0, 7×2, 0, 6, 2, 4", - "sorted: 0×15, 1×7, 2×9, 3×10, 4×10, 5×11, 6×11, 7×9, 8×9, 9×9", + "num: 100, mean: 4.3, stddev: 2.9, mad: 2.5", + "elems: 5, 3, 9, 6, 2, 5, 7, 9, 0, 5, …, 7, 9, 1, 9, 0×2, 8, 0, 7×2, 0, 6", + "sorted: 0×15, 1×7, 2×8, 3×11, 4×9, 5×12, 6×11, 7×9, 8×9, 9×9", + "1: 0", "5: 0", "10: 0", "25: 2", - "50: 4", + "50: 4.5", "75: 7", "90: 8", - "95: 9" - + "95: 9", + "99: 9" ) } @@ -310,16 +299,18 @@ class ShowTest extends Suite { check( shuffledDigits, numToSample = 4, - "num: 100, mean: 4.3, stddev: 2.9, mad: 2", - "elems: 9, 6, 2, 5, …, 0, 6, 2, 4", - "sorted: 0×15, 1×7, 2×9, 3×10, …, 6×11, 7×9, 8×9, 9×9", + "num: 100, mean: 4.3, stddev: 2.9, mad: 2.5", + "elems: 5, 3, 9, 6, …, 0, 7×2, 0, 6", + "sorted: 0×15, 1×7, 2×8, 3×11, …, 6×11, 7×9, 8×9, 9×9", + "1: 0", "5: 0", "10: 0", "25: 2", - "50: 4", + "50: 4.5", "75: 7", "90: 8", - "95: 9" + "95: 9", + "99: 9" ) } @@ -328,15 +319,17 @@ class ShowTest extends Suite { shuffledDigits, numToSample = 4, onlySampleSorted = true, - "num: 100, mean: 4.3, stddev: 2.9, mad: 2", - "sorted: 0×15, 1×7, 2×9, 3×10, …, 6×11, 7×9, 8×9, 9×9", + "num: 100, mean: 4.3, stddev: 2.9, mad: 2.5", + "sorted: 0×15, 1×7, 2×8, 3×11, …, 6×11, 7×9, 8×9, 9×9", + "1: 0", "5: 0", "10: 0", "25: 2", - "50: 4", + "50: 4.5", "75: 7", "90: 8", - "95: 9" + "95: 9", + "99: 9" ) } @@ -345,15 +338,17 @@ class ShowTest extends Suite { test("100 sorted digits") { check( sortedShuffledDigits, - "num: 100, mean: 4.3, stddev: 2.9, mad: 2", - "elems: 0×15, 1×7, 2×9, 3×10, 4×10, 5×11, 6×11, 7×9, 8×9, 9×9", + "num: 100, mean: 4.3, stddev: 2.9, mad: 2.5", + "elems: 0×15, 1×7, 2×8, 3×11, 4×9, 5×12, 6×11, 7×9, 8×9, 9×9", + "1: 0", "5: 0", "10: 0", "25: 2", - "50: 4", + "50: 4.5", "75: 7", "90: 8", - "95: 9" + "95: 9", + "99: 9" ) } @@ -362,38 +357,42 @@ class ShowTest extends Suite { sortedShuffledDigits, numToSample = 4, onlySampleSorted = true, - "num: 100, mean: 4.3, stddev: 2.9, mad: 2", - "elems: 0×15, 1×7, 2×9, 3×10, …, 6×11, 7×9, 8×9, 9×9", + "num: 100, mean: 4.3, stddev: 2.9, mad: 2.5", + "elems: 0×15, 1×7, 2×8, 3×11, …, 6×11, 7×9, 8×9, 9×9", + "1: 0", "5: 0", "10: 0", "25: 2", - "50: 4", + "50: 4.5", "75: 7", "90: 8", - "95: 9" + "95: 9", + "99: 9" ) } test("values over Int.MAX_VALUE") { check( Seq( - 10000000000L, - 100000000000L, - 100000000000L, + 10000000000L, + 100000000000L, + 100000000000L, 1000000000000L, 1000000000000L, - 10000000000L, + 10000000000L, 1000000000000L, - 100000000000L, - 10000000000L, - 10000000000L + 100000000000L, + 10000000000L, + 10000000000L ), "num: 10, mean: 334000000000, stddev: 437588848121.2, mad: 90000000000", "elems: 10000000000, 100000000000×2, 1000000000000×2, 10000000000, 1000000000000, 100000000000, 10000000000×2", "sorted: 10000000000×4, 100000000000×3, 1000000000000×3", + "10: 10000000000", "25: 10000000000", "50: 100000000000", - "75: 325000000000" + "75: 1000000000000", + "90: 1000000000000" ) } } diff --git a/src/test/scala/org/hammerlab/stats/StatsTest.scala b/src/test/scala/org/hammerlab/stats/StatsTest.scala index 331d3e8..d56b10a 100644 --- a/src/test/scala/org/hammerlab/stats/StatsTest.scala +++ b/src/test/scala/org/hammerlab/stats/StatsTest.scala @@ -1,11 +1,12 @@ package org.hammerlab.stats -import Double.NaN import org.hammerlab.test.Suite import org.scalactic.Equality -import shapeless.{ Generic, HNil } +import spire.implicits._ import spire.math.Numeric +import scala.math.sqrt + class StatsTest extends Suite { @@ -22,12 +23,12 @@ class StatsTest } def check[K : Numeric : Ordering](input: Seq[K], - expected: Stats[K, Int]): Unit = + expected: StatsI[K, Int]): Unit = Stats(input) should ===(expected) def check[K : Numeric : Ordering](input: Seq[K], numToSample: Int, - expected: Stats[K, Int]): Unit = + expected: StatsI[K, Int]): Unit = Stats( input, numToSample @@ -38,7 +39,7 @@ class StatsTest def check[K : Numeric : Ordering](input: Seq[K], numToSample: Int, onlySampleSorted: Boolean, - expected: Stats[K, Int]): Unit = + expected: StatsI[K, Int]): Unit = Stats( input, numToSample, @@ -48,47 +49,109 @@ class StatsTest ) test("empty") { - val stats = Stats[Int](Nil) - val hl = Generic[Stats[Int, Int]].to(stats) - hl should be(0 :: NaN :: NaN :: NaN :: NaN :: None :: None :: List() :: HNil) -// check[Int]( -// Nil, -// Stats.empty[Int, Int] -// ) + check[Int]( + Nil, + Empty[Int, Int]() + ) } -// test("0 to 0") { -// check( -// 0 to 0, -// "num: 1, mean: 0, stddev: 0, mad: 0", -// "elems: 0" -// ) -// } -// -// test("0 to 1") { -// check( -// 0 to 1, -// "num: 2, mean: 0.5, stddev: 0.5, mad: 0.5", -// "elems: 0, 1" -// ) -// } -// -// test("1 to 0") { -// check( -// 1 to 0 by -1, -// "num: 2, mean: 0.5, stddev: 0.5, mad: 0.5", -// "elems: 1, 0", -// "sorted: 0, 1" -// ) -// } -// -// test("0 to 2") { -// check( -// 0 to 2, -// "num: 3, mean: 1, stddev: 0.8, mad: 1", -// "elems: 0, 1, 2", -// "50: 1" -// ) -// } + test("0 to 0") { + check( + 0 to 0, + Stats( + n = 1, + mean = 0, + stddev = 0, + median = 0, + mad = 0, + samplesOpt = + Some( + Samples( + 1, + Runs(Seq(0 → 1)), + Runs(Seq(0 → 1)) + ) + ), + sortedSamplesOpt = None, + percentiles = Vector(r"50" → 0.0) + ) + ) + } + + test("0 to 1") { + check( + 0 to 1, + Stats( + n = 2, + mean = .5, + stddev = .5, + median = .5, + mad = .5, + samplesOpt = + Some( + Samples( + 2, + Runs(Seq(0 → 1, 1 → 1)), + Runs(Seq(0 → 1, 1 → 1)) + ) + ), + sortedSamplesOpt = None, + percentiles = Vector(r"50" → .5) + ) + ) + } + + test("1 to 0") { + check( + 1 to 0 by -1, + Stats( + n = 2, + mean = .5, + stddev = .5, + median = .5, + mad = .5, + samplesOpt = + Some( + Samples( + 2, + Runs(Seq(1 → 1, 0 → 1)), + Runs(Seq(1 → 1, 0 → 1)) + ) + ), + sortedSamplesOpt = + Some( + Samples( + 2, + Runs(Seq(0 → 1, 1 → 1)), + Runs(Seq(0 → 1, 1 → 1)) + ) + ), + percentiles = Vector(r"50" → .5) + ) + ) + } + + test("0 to 2") { + check( + 0 to 2, + Stats( + n = 3, + mean = 1, + stddev = sqrt(2 / 3.0), + median = 1, + mad = 1, + samplesOpt = + Some( + Samples( + 3, + Runs(Seq(0 → 1, 1 → 1, 2 → 1)), + Runs(Seq(0 → 1, 1 → 1, 2 → 1)) + ) + ), + sortedSamplesOpt = None, + percentiles = Vector(r"25" → 0, r"50" → 1, r"75" → 2) + ) + ) + } } From cd37a1b8b7bca80d6e48674aa1cdac92e292f9ff Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Thu, 3 Aug 2017 00:06:51 +0000 Subject: [PATCH 15/20] rename Stats interface, impls --- src/main/scala/org/hammerlab/stats/Runs.scala | 2 +- .../scala/org/hammerlab/stats/Samples.scala | 2 +- .../scala/org/hammerlab/stats/Stats.scala | 203 +++++++++--------- .../org/hammerlab/stats/HistShowTest.scala | 2 +- .../scala/org/hammerlab/stats/ShowTest.scala | 2 +- .../scala/org/hammerlab/stats/StatsTest.scala | 28 +-- 6 files changed, 114 insertions(+), 125 deletions(-) diff --git a/src/main/scala/org/hammerlab/stats/Runs.scala b/src/main/scala/org/hammerlab/stats/Runs.scala index cb053dc..7218511 100644 --- a/src/main/scala/org/hammerlab/stats/Runs.scala +++ b/src/main/scala/org/hammerlab/stats/Runs.scala @@ -7,7 +7,7 @@ import spire.implicits._ import spire.math.Integral /** - * Convenience class wrapping a sequence of key-number pairs, used in run-length-encoding in [[Stats]]. + * Convenience class wrapping a sequence of key-number pairs, used in run-length-encoding in [[NonEmpty]]. */ case class Runs[K, V: Integral](elems: Seq[(K, V)], num: V) diff --git a/src/main/scala/org/hammerlab/stats/Samples.scala b/src/main/scala/org/hammerlab/stats/Samples.scala index f14aaef..e4fac09 100644 --- a/src/main/scala/org/hammerlab/stats/Samples.scala +++ b/src/main/scala/org/hammerlab/stats/Samples.scala @@ -7,7 +7,7 @@ import spire.implicits._ import spire.math.Integral /** - * Used by [[Stats]] to wrap some [[Runs]] of elements from the start and end of a dataset. + * Used by [[NonEmpty]] to wrap some [[Runs]] of elements from the start and end of a dataset. * @param n total number of elements in the dataset. * @param first [[Runs]] of elements from the start of the dataset. * @param last [[Runs]] of elements from the end of the dataset. diff --git a/src/main/scala/org/hammerlab/stats/Stats.scala b/src/main/scala/org/hammerlab/stats/Stats.scala index 0e8e1f7..193418a 100644 --- a/src/main/scala/org/hammerlab/stats/Stats.scala +++ b/src/main/scala/org/hammerlab/stats/Stats.scala @@ -15,107 +15,18 @@ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.math.{ abs, ceil, floor, sqrt } -sealed abstract class StatsI[K: Numeric, V: Integral] - -object StatsI { - implicit def makeShow[ - K : Numeric : Show, - V: Integral : Show - ]( - implicit - percentileShow: Show[Rational] = showPercentile, - statShow: Show[Double] = showDouble - ): Show[StatsI[K, V]] = - show { - case Empty() ⇒ "(empty)" - case Stats(n, mean, stddev, _, mad, samplesOpt, sortedSamplesOpt, percentiles) ⇒ - def pair[L: Show, R: Show](l: L, r: R): String = - show"$l:\t$r" - - val strings = ArrayBuffer[String]() - - strings += - List( - pair("num", n), - pair("mean", mean), - pair("stddev", stddev), - pair("mad", mad) - ) - .mkString(",\t") - - for { - samples ← samplesOpt - if samples.nonEmpty - } { - strings += pair("elems", samples) - } - - for { - sortedSamples ← sortedSamplesOpt - if sortedSamples.nonEmpty - } { - strings += pair("sorted", sortedSamples) - } - - strings ++= - percentiles.map { - case (k, v) ⇒ - pair(k, v) - } - - strings.mkString("\n") - } - - def showDouble: Show[Double] = - show( - d ⇒ - if (floor(d).toLong == ceil(d).toLong) - d.toLong.toString - else - "%.1f".format(d) - ) - - def showPercentile: Show[Rational] = - show( - r ⇒ - if (r.isWhole()) - r.toLong.toString - else - r.toDouble.toString - ) -} - -case class Empty[K: Numeric, V: Integral]() extends StatsI[K, V] - /** - * Wrapper for some computed statistics about a dataset of [[Numeric]] elements. + * Stores some computed statistics about a dataset of [[Numeric]] elements. * - * @param n number of elements in the dataset. - * @param mad median absolute deviation (from the median). - * @param samplesOpt "sample" elements; the start and end of the data. - * @param sortedSamplesOpt "sample" elements; the least and greatest elements. If the dataset is already sorted, meaning - * this would be equivalent to [[samplesOpt]], it is omitted. - * @param percentiles selected percentiles of the dataset. * @tparam K [[Numeric]] element type. TODO(ryan): allow this to be non-[[Numeric]]. * @tparam V [[Integral]] value type. */ -case class Stats[K: Numeric, V: Integral](n: V, - mean: Double, - stddev: Double, - median: Double, - mad: Double, - samplesOpt: Option[Samples[K, V]], - sortedSamplesOpt: Option[Samples[K, V]], - percentiles: Seq[(Rational, Double)]) - extends StatsI[K, V] +sealed abstract class Stats[K: Numeric, V: Integral] -/** - * Helpers for constructing [[Stats]] / computing the statistics that populate a [[Stats]] instance. - */ object Stats { /** - * Construct a [[Stats]] from a sequence of "runs"; elements paired with a count of repetitions. + * Construct a [[NonEmpty]] from a sequence of "runs"; elements paired with a count of repetitions. * * @param v values. * @param numToSample highlight this many "runs" of data from the start and end of the data; likewise the least and @@ -124,7 +35,7 @@ object Stats { */ def fromHist[K: Numeric: Ordering, V: Integral](v: Iterable[(K, V)], numToSample: Int = 10, - onlySampleSorted: Boolean = false): StatsI[K, V] = { + onlySampleSorted: Boolean = false): Stats[K, V] = { var alreadySorted = true val hist = mutable.HashMap[K, V]() @@ -208,7 +119,7 @@ object Stats { !alreadySorted | samples(sorted) - Stats( + NonEmpty( n, mean, stddev, median, mad, @@ -219,7 +130,8 @@ object Stats { } /** - * Construct a [[Stats]] instance from input data `v`. + * Construct a [[NonEmpty]] instance from input data `v`. + * * @param v values. * @param numToSample highlight this many "runs" of data from the start and end of the data; likewise the least and * greatest elements (and repetition counts). @@ -227,7 +139,7 @@ object Stats { */ def apply[K: Numeric: Ordering](v: Iterable[K], numToSample: Int = 10, - onlySampleSorted: Boolean = false): StatsI[K, Int] = { + onlySampleSorted: Boolean = false): Stats[K, Int] = { val vBuilder = Vector.newBuilder[K] var alreadySorted = true @@ -305,7 +217,7 @@ object Stats { !alreadySorted | samples(sorted) - new Stats( + NonEmpty( n, mean, stddev, median, mad, @@ -350,6 +262,7 @@ object Stats { override def next(): (Rational, Double) = { val (percentile, idx) = percentiles.next() + while(elemsPast <= idx) { val (k, v) = values.next() curK = Some(k.toDouble()) @@ -390,11 +303,11 @@ object Stats { val hiIdx = nd - loIdx if (d == 2) - // Median (50th percentile, denominator 2) only emits one tuple. + // Median (50th percentile, denominator 2) only emits one tuple. Iterator(loPercentile → loIdx) else - // In general, we emit two tuples per "denominator", one on the high side and one on the low. For example, for - // denominator 4, we emit the 25th and 75th percentiles. + // In general, we emit two tuples per "denominator", one on the high side and one on the low. For example, for + // denominator 4, we emit the 25th and 75th percentiles. Iterator(loPercentile → loIdx, hiPercentile → hiIdx) } .toArray @@ -454,7 +367,7 @@ object Stats { // In general, we emit two tuples per "denominator", one on the high side and one on the low. For example, for // denominator 4, we emit the 25th and 75th percentiles. Iterator(loPercentile → lo, hiPercentile → hi) - } + } .toVector .sortBy(_._1) } @@ -492,4 +405,92 @@ object Stats { } runs → sum } + + implicit def makeShow[ + K : Numeric : Show, + V: Integral : Show + ]( + implicit + percentileShow: Show[Rational] = showPercentile, + statShow: Show[Double] = showDouble + ): Show[Stats[K, V]] = + show { + case Empty() ⇒ "(empty)" + case NonEmpty(n, mean, stddev, _, mad, samplesOpt, sortedSamplesOpt, percentiles) ⇒ + def pair[L: Show, R: Show](l: L, r: R): String = + show"$l:\t$r" + + val strings = ArrayBuffer[String]() + + strings += + List( + pair("num", n), + pair("mean", mean), + pair("stddev", stddev), + pair("mad", mad) + ) + .mkString(",\t") + + for { + samples ← samplesOpt + if samples.nonEmpty + } { + strings += pair("elems", samples) + } + + for { + sortedSamples ← sortedSamplesOpt + if sortedSamples.nonEmpty + } { + strings += pair("sorted", sortedSamples) + } + + strings ++= + percentiles.map { + case (k, v) ⇒ + pair(k, v) + } + + strings.mkString("\n") + } + + def showDouble: Show[Double] = + show( + d ⇒ + if (floor(d).toLong == ceil(d).toLong) + d.toLong.toString + else + "%.1f".format(d) + ) + + def showPercentile: Show[Rational] = + show( + r ⇒ + if (r.isWhole()) + r.toLong.toString + else + r.toDouble.toString + ) } + +case class Empty[K: Numeric, V: Integral]() extends Stats[K, V] + +/** + * Stores some computed statistics about a dataset of [[Numeric]] elements. + * + * @param n number of elements in the dataset. + * @param mad median absolute deviation (from the median). + * @param samplesOpt "sample" elements; the start and end of the data. + * @param sortedSamplesOpt "sample" elements; the least and greatest elements. If the dataset is already sorted, meaning + * this would be equivalent to [[samplesOpt]], it is omitted. + * @param percentiles selected percentiles of the dataset. + */ +case class NonEmpty[K: Numeric, V: Integral](n: V, + mean: Double, + stddev: Double, + median: Double, + mad: Double, + samplesOpt: Option[Samples[K, V]], + sortedSamplesOpt: Option[Samples[K, V]], + percentiles: Seq[(Rational, Double)]) + extends Stats[K, V] diff --git a/src/test/scala/org/hammerlab/stats/HistShowTest.scala b/src/test/scala/org/hammerlab/stats/HistShowTest.scala index 8840f39..4574809 100644 --- a/src/test/scala/org/hammerlab/stats/HistShowTest.scala +++ b/src/test/scala/org/hammerlab/stats/HistShowTest.scala @@ -11,7 +11,7 @@ import spire.math.Integral import scala.util.Random /** - * Tests of the [[Stats.fromHist]] API for constructing [[Stats]] instances from "histograms" of elements that each come + * Tests of the [[Stats.fromHist]] API for constructing [[NonEmpty]] instances from "histograms" of elements that each come * with an associated repetition count, which allows the total number of elements represented to be much larger * ([[Long]] vs. [[Int]]). */ diff --git a/src/test/scala/org/hammerlab/stats/ShowTest.scala b/src/test/scala/org/hammerlab/stats/ShowTest.scala index a183b0c..d853cba 100644 --- a/src/test/scala/org/hammerlab/stats/ShowTest.scala +++ b/src/test/scala/org/hammerlab/stats/ShowTest.scala @@ -9,7 +9,7 @@ import spire.math.Numeric import scala.util.Random.{ nextInt, setSeed, shuffle } /** - * Test constructing [[Stats]] instances. + * Test the default [[Show.show]] method of [[Stats]] instances. */ class ShowTest extends Suite { diff --git a/src/test/scala/org/hammerlab/stats/StatsTest.scala b/src/test/scala/org/hammerlab/stats/StatsTest.scala index d56b10a..0d0a03d 100644 --- a/src/test/scala/org/hammerlab/stats/StatsTest.scala +++ b/src/test/scala/org/hammerlab/stats/StatsTest.scala @@ -10,25 +10,13 @@ import scala.math.sqrt class StatsTest extends Suite { - implicit val de = - new Equality[Double] { - override def areEqual(a: Double, b: Any): Boolean = - b match { - case d: Double ⇒ - if (a.isNaN && d.isNaN) true - else a == d - case _ ⇒ - false - } - } - def check[K : Numeric : Ordering](input: Seq[K], - expected: StatsI[K, Int]): Unit = - Stats(input) should ===(expected) + expected: Stats[K, Int]): Unit = + Stats(input) should be(expected) def check[K : Numeric : Ordering](input: Seq[K], numToSample: Int, - expected: StatsI[K, Int]): Unit = + expected: Stats[K, Int]): Unit = Stats( input, numToSample @@ -39,7 +27,7 @@ class StatsTest def check[K : Numeric : Ordering](input: Seq[K], numToSample: Int, onlySampleSorted: Boolean, - expected: StatsI[K, Int]): Unit = + expected: Stats[K, Int]): Unit = Stats( input, numToSample, @@ -58,7 +46,7 @@ class StatsTest test("0 to 0") { check( 0 to 0, - Stats( + NonEmpty( n = 1, mean = 0, stddev = 0, @@ -81,7 +69,7 @@ class StatsTest test("0 to 1") { check( 0 to 1, - Stats( + NonEmpty( n = 2, mean = .5, stddev = .5, @@ -104,7 +92,7 @@ class StatsTest test("1 to 0") { check( 1 to 0 by -1, - Stats( + NonEmpty( n = 2, mean = .5, stddev = .5, @@ -134,7 +122,7 @@ class StatsTest test("0 to 2") { check( 0 to 2, - Stats( + NonEmpty( n = 3, mean = 1, stddev = sqrt(2 / 3.0), From 4e3d94d90b8835c00bae4bf3687a67563a2ead07 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Thu, 3 Aug 2017 03:16:55 +0000 Subject: [PATCH 16/20] unify flat/hist percentile-computations --- .../scala/org/hammerlab/math/package.scala | 3 + .../scala/org/hammerlab/stats/Stats.scala | 148 ++++++++---------- .../org/hammerlab/stats/HistShowTest.scala | 48 ++++-- 3 files changed, 106 insertions(+), 93 deletions(-) diff --git a/src/main/scala/org/hammerlab/math/package.scala b/src/main/scala/org/hammerlab/math/package.scala index d27932c..c4db735 100644 --- a/src/main/scala/org/hammerlab/math/package.scala +++ b/src/main/scala/org/hammerlab/math/package.scala @@ -17,6 +17,9 @@ package object math { ) } + def interpolate[N: Integral](start: N, end: N, delta: Rational): Rational = + Rational(start.toSafeLong) + delta * Rational((end - start).toSafeLong) + def interpolate[N: Numeric](start: N, end: N, delta: Double): Double = start.toDouble() + delta * (end - start).toDouble() } diff --git a/src/main/scala/org/hammerlab/stats/Stats.scala b/src/main/scala/org/hammerlab/stats/Stats.scala index 193418a..c3458ef 100644 --- a/src/main/scala/org/hammerlab/stats/Stats.scala +++ b/src/main/scala/org/hammerlab/stats/Stats.scala @@ -7,7 +7,9 @@ import cats.syntax.all._ import org.hammerlab.iterator.RunLengthIterator._ import org.hammerlab.math.interpolate import org.hammerlab.types._ -import spire.implicits._ +//import spire.implicits._ +//import spire.math._ +import spire.syntax.all._ import spire.math.{ Integral, Numeric, Rational } import scala.Double.NaN @@ -18,6 +20,8 @@ import scala.math.{ abs, ceil, floor, sqrt } /** * Stores some computed statistics about a dataset of [[Numeric]] elements. * + * Two concrete implementations are below: [[Empty]] and [[NonEmpty]]. + * * @tparam K [[Numeric]] element type. TODO(ryan): allow this to be non-[[Numeric]]. * @tparam V [[Integral]] value type. */ @@ -95,7 +99,10 @@ object Stats { medianDeviations, Seq( Rational(50) → - ((n.toDouble() - 1) / 2.0) + ( + (n - 1) /~ 2, + ((n - 1) % 2).toDouble() / 2.0 + ) ) ) .head @@ -231,7 +238,7 @@ object Stats { * Compute percentiles listed in `ps` of the data in `values`; wrapper for implementation below. */ private def getRunPercentiles[K: Numeric, V: Integral](values: Seq[(K, V)], - ps: Seq[(Rational, Double)]): Vector[(Rational, Double)] = + ps: Seq[(Rational, (V, Double))]): Vector[(Rational, Double)] = getRunPercentiles( values .iterator @@ -252,81 +259,43 @@ object Stats { * @return pairs of (percentile, value). */ private def getRunPercentiles[K: Numeric, V: Integral](values: BufferedIterator[(K, V)], - percentiles: BufferedIterator[(Rational, Double)]): Iterator[(Rational, Double)] = + percentiles: BufferedIterator[(Rational, (V, Double))]): Iterator[(Rational, Double)] = new Iterator[(Rational, Double)] { - var elemsPast = 0.0 - var curK: Option[Double] = None + var elemsPast = Integral[V].zero + var curK: Option[K] = None override def hasNext: Boolean = percentiles.hasNext override def next(): (Rational, Double) = { - val (percentile, idx) = percentiles.next() + val (percentile, (floor, remainder)) = percentiles.next() - while(elemsPast <= idx) { + while(elemsPast <= floor) { val (k, v) = values.next() - curK = Some(k.toDouble()) - elemsPast += v.toDouble() + curK = Some(k) + elemsPast += v } - val distancePast = elemsPast - idx + val distancePast = elemsPast - floor percentile → ( - if (distancePast < 1) - curK.get * distancePast + values.head._1.toDouble() * (1 - distancePast) + if (distancePast == 1 && values.hasNext) + interpolate(curK.get, values.head._1, remainder) else - curK.get + curK.get.toDouble() ) } } - /** - * Compute some relevant percentiles based on the number of elements present. - * @return pairs of (percentile, value). - */ - private def histPercentiles[K: Numeric, V: Integral](N: V, - values: IndexedSeq[(K, V)]): Vector[(Rational, Double)] = { - val n = N - 1 - val denominators: Iterator[Int] = Iterator(2, 4, 10, 20, 100, 1000, 10000) - - val nd = n.toDouble - val percentileIdxs = - denominators - .takeWhile(d ⇒ d <= n || d == 2) // Always take the median (denominator 2 aka 50th percentile). - .flatMap { - d ⇒ - val loPercentile = Rational(100, d) - val hiPercentile = 100 - loPercentile - - val loIdx = nd / d - val hiIdx = nd - loIdx - - if (d == 2) - // Median (50th percentile, denominator 2) only emits one tuple. - Iterator(loPercentile → loIdx) - else - // In general, we emit two tuples per "denominator", one on the high side and one on the low. For example, for - // denominator 4, we emit the 25th and 75th percentiles. - Iterator(loPercentile → loIdx, hiPercentile → hiIdx) - } - .toArray - .sortBy(_._1) + private def percentileIdxs[K: Numeric, V: Integral](N: V): Vector[(Rational, (V, Double))] = { + val n = N + 1 - getRunPercentiles(values, percentileIdxs) - } + implicit def fromInt = Integral[V].fromInt _ - /** - * Compute some relevant percentiles based on the number of elements present. - * - * @return pairs of (percentile, value). - */ - private def percentiles[T: Numeric](values: IndexedSeq[T]): Vector[(Rational, Double)] = { - val n = values.length + 1 - - val denominators: Iterator[Int] = { - lazy val pow10s: Stream[Int] = 100 #:: pow10s.map(_ * 10) - Iterator( + val denominators: Iterator[V] = { + lazy val pow10s: Stream[V] = 100 #:: pow10s.map(_ * 10) + Iterator[V]( 2, // 50 4, // 25/75 10, // 10/90 @@ -338,40 +307,59 @@ object Stats { .takeWhile(_ <= n) .flatMap { d ⇒ - val loPercentile = Rational(100, d) + val loPercentile = Rational(100, d.toSafeLong) val hiPercentile = 100 - loPercentile - val loFloor = n / d - 1 - val loRemainder = n % d - - val hiCeil = n - 2 - loFloor - - val (lo, hi) = - if (loRemainder == 0) - ( - values(loFloor).toDouble(), - values( hiCeil).toDouble() - ) - else { - val floorWeight = loRemainder.toDouble() / d - ( - interpolate(values(loFloor), values(loFloor + 1), floorWeight), - interpolate(values( hiCeil), values( hiCeil - 1), floorWeight) - ) - } + val loFloor: V = n /~ d - 1 + val loRemainder = (n % d).toDouble() / d.toDouble() + + val hiFloor = n - 3 - loFloor + val hiRemainder = 1 - loRemainder if (d == 2) // Median (50th percentile, denominator 2) only emits one tuple. - Iterator(loPercentile → lo) + Iterator(loPercentile → (loFloor, loRemainder)) else // In general, we emit two tuples per "denominator", one on the high side and one on the low. For example, for - // denominator 4, we emit the 25th and 75th percentiles. - Iterator(loPercentile → lo, hiPercentile → hi) + // denominator 4, we emit the 25th and 75th percentiles. + Iterator( + loPercentile → (loFloor, loRemainder), + hiPercentile → (hiFloor, hiRemainder) + ) } .toVector .sortBy(_._1) } + /** + * Compute some relevant percentiles based on the number of elements present. + * @return pairs of (percentile, value). + */ + private def histPercentiles[K: Numeric, V: Integral](N: V, + values: IndexedSeq[(K, V)]): Vector[(Rational, Double)] = + getRunPercentiles( + values, + percentileIdxs(N) + ) + + /** + * Compute some relevant percentiles based on the number of elements present. + * + * @return pairs of (percentile, value). + */ + private def percentiles[T: Numeric](values: IndexedSeq[T]): Vector[(Rational, Double)] = + percentileIdxs(values.length) + .map { + case (d, (floor, weight)) ⇒ + val value = + if (weight == 0) + values(floor).toDouble() + else + interpolate(values(floor), values(floor + 1), weight) + + d → value + } + private def getMedian[T: Numeric](sorted: Vector[T]): Double = { val n = sorted.length if (n == 0) diff --git a/src/test/scala/org/hammerlab/stats/HistShowTest.scala b/src/test/scala/org/hammerlab/stats/HistShowTest.scala index 4574809..23e19cb 100644 --- a/src/test/scala/org/hammerlab/stats/HistShowTest.scala +++ b/src/test/scala/org/hammerlab/stats/HistShowTest.scala @@ -87,7 +87,9 @@ class HistShowTest extends Suite { "num: 3, mean: 2, stddev: 2.2, mad: 1", "elems: 0, 5, 1", "sorted: 0, 1, 5", - "50: 1" + "25: 0", + "50: 1", + "75: 5" ) } @@ -96,7 +98,9 @@ class HistShowTest extends Suite { List(0 → 1, 1 → 2), "num: 3, mean: 0.7, stddev: 0.5, mad: 0", "elems: 0, 1×2", - "50: 1" + "25: 0", + "50: 1", + "75: 1" ) } @@ -105,9 +109,11 @@ class HistShowTest extends Suite { List(1 → 5, 2 → 4), "num: 9, mean: 1.4, stddev: 0.5, mad: 0", "elems: 1×5, 2×4", + "10: 1", "25: 1", "50: 1", - "75: 2" + "75: 2", + "90: 2" ) } @@ -116,9 +122,11 @@ class HistShowTest extends Suite { List(0 → 5, 1 → 5), "num: 10, mean: 0.5, stddev: 0.5, mad: 0.5", "elems: 0×5, 1×5", + "10: 0", "25: 0", "50: 0.5", - "75: 1" + "75: 1", + "90: 1" ) } @@ -127,9 +135,11 @@ class HistShowTest extends Suite { List(0 → 4, 1 → 6), "num: 10, mean: 0.6, stddev: 0.5, mad: 0", "elems: 0×4, 1×6", + "10: 0", "25: 0", "50: 1", - "75: 1" + "75: 1", + "90: 1" ) } @@ -138,8 +148,8 @@ class HistShowTest extends Suite { (1 to 10).map(i ⇒ i → i), "num: 55, mean: 7, stddev: 2.4, mad: 2", "elems: 1, 2×2, 3×3, 4×4, 5×5, 6×6, 7×7, 8×8, 9×9, 10×10", - "5: 2.7", - "10: 3.4", + "5: 2", + "10: 3", "25: 5", "50: 7", "75: 9", @@ -153,11 +163,11 @@ class HistShowTest extends Suite { (0 to 10).map(i ⇒ i → 1), "num: 11, mean: 5, stddev: 3.2, mad: 3", "elems: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10", - "10: 1", - "25: 2.5", + "10: 0.2", + "25: 2", "50: 5", - "75: 7.5", - "90: 9" + "75: 8", + "90: 9.8" ) } @@ -168,7 +178,7 @@ class HistShowTest extends Suite { "elems: 0×2, 10×7, 3×5, 0×2, 3", "sorted: 0×4, 3×6, 10×7", "10: 0", - "25: 3", + "25: 1.5", "50: 3", "75: 10", "90: 10" @@ -186,6 +196,12 @@ class HistShowTest extends Suite { "num: 12000000100, mean: 1.2, stddev: 0.4, mad: 0", "elems: 1×10000000000, 2×1000000000, 1×100, 2×1000000000", "sorted: 1×10000000100, 2×2000000000", + "1.0E-8: 1", + "1.0E-7: 1", + "1.0E-6: 1", + "1.0E-5: 1", + "1.0E-4: 1", + "0.001: 1", "0.01: 1", "0.1: 1", "1: 1", @@ -198,7 +214,13 @@ class HistShowTest extends Suite { "95: 2", "99: 2", "99.9: 2", - "99.99: 2" + "99.99: 2", + "99.999: 2", + "99.9999: 2", + "99.99999: 2", + "99.999999: 2", + "99.9999999: 2", + "99.99999999: 2" ) } } From cd4e64f24c930f9e27f48c1fa6e2f20d0f55734f Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Thu, 3 Aug 2017 15:30:07 +0000 Subject: [PATCH 17/20] expose n, sum on Stats --- .../scala/org/hammerlab/stats/Stats.scala | 19 +++++++++++++------ .../scala/org/hammerlab/stats/StatsTest.scala | 4 ++++ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/main/scala/org/hammerlab/stats/Stats.scala b/src/main/scala/org/hammerlab/stats/Stats.scala index c3458ef..1752042 100644 --- a/src/main/scala/org/hammerlab/stats/Stats.scala +++ b/src/main/scala/org/hammerlab/stats/Stats.scala @@ -7,10 +7,8 @@ import cats.syntax.all._ import org.hammerlab.iterator.RunLengthIterator._ import org.hammerlab.math.interpolate import org.hammerlab.types._ -//import spire.implicits._ -//import spire.math._ -import spire.syntax.all._ import spire.math.{ Integral, Numeric, Rational } +import spire.syntax.all._ import scala.Double.NaN import scala.collection.mutable @@ -25,7 +23,10 @@ import scala.math.{ abs, ceil, floor, sqrt } * @tparam K [[Numeric]] element type. TODO(ryan): allow this to be non-[[Numeric]]. * @tparam V [[Integral]] value type. */ -sealed abstract class Stats[K: Numeric, V: Integral] +sealed abstract class Stats[K: Numeric, V: Integral] { + def n: V + def sum: Double +} object Stats { @@ -128,6 +129,7 @@ object Stats { NonEmpty( n, + sum, mean, stddev, median, mad, samplesOpt, @@ -226,6 +228,7 @@ object Stats { NonEmpty( n, + sum, mean, stddev, median, mad, samplesOpt, @@ -404,7 +407,7 @@ object Stats { ): Show[Stats[K, V]] = show { case Empty() ⇒ "(empty)" - case NonEmpty(n, mean, stddev, _, mad, samplesOpt, sortedSamplesOpt, percentiles) ⇒ + case NonEmpty(n, _, mean, stddev, _, mad, samplesOpt, sortedSamplesOpt, percentiles) ⇒ def pair[L: Show, R: Show](l: L, r: R): String = show"$l:\t$r" @@ -461,7 +464,10 @@ object Stats { ) } -case class Empty[K: Numeric, V: Integral]() extends Stats[K, V] +case class Empty[K: Numeric, V: Integral]() extends Stats[K, V] { + override def n: V = Integral[V].zero + override def sum: Double = 0 +} /** * Stores some computed statistics about a dataset of [[Numeric]] elements. @@ -474,6 +480,7 @@ case class Empty[K: Numeric, V: Integral]() extends Stats[K, V] * @param percentiles selected percentiles of the dataset. */ case class NonEmpty[K: Numeric, V: Integral](n: V, + sum: Double, mean: Double, stddev: Double, median: Double, diff --git a/src/test/scala/org/hammerlab/stats/StatsTest.scala b/src/test/scala/org/hammerlab/stats/StatsTest.scala index 0d0a03d..31e7b42 100644 --- a/src/test/scala/org/hammerlab/stats/StatsTest.scala +++ b/src/test/scala/org/hammerlab/stats/StatsTest.scala @@ -48,6 +48,7 @@ class StatsTest 0 to 0, NonEmpty( n = 1, + sum = 0, mean = 0, stddev = 0, median = 0, @@ -71,6 +72,7 @@ class StatsTest 0 to 1, NonEmpty( n = 2, + sum = 1, mean = .5, stddev = .5, median = .5, @@ -94,6 +96,7 @@ class StatsTest 1 to 0 by -1, NonEmpty( n = 2, + sum = 1, mean = .5, stddev = .5, median = .5, @@ -124,6 +127,7 @@ class StatsTest 0 to 2, NonEmpty( n = 3, + sum = 3, mean = 1, stddev = sqrt(2 / 3.0), median = 1, From 506e4207e6a4be4089b8eb25497d951b025fd8fe Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Thu, 3 Aug 2017 16:14:10 +0000 Subject: [PATCH 18/20] Stats.show improvements --- .../scala/org/hammerlab/stats/Stats.scala | 46 ++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/src/main/scala/org/hammerlab/stats/Stats.scala b/src/main/scala/org/hammerlab/stats/Stats.scala index 1752042..e3744e3 100644 --- a/src/main/scala/org/hammerlab/stats/Stats.scala +++ b/src/main/scala/org/hammerlab/stats/Stats.scala @@ -6,6 +6,7 @@ import cats.instances.all.catsStdShowForString import cats.syntax.all._ import org.hammerlab.iterator.RunLengthIterator._ import org.hammerlab.math.interpolate +import org.hammerlab.stats.Stats.{ makeShow, showDouble, showPercentile } import org.hammerlab.types._ import spire.math.{ Integral, Numeric, Rational } import spire.syntax.all._ @@ -26,6 +27,16 @@ import scala.math.{ abs, ceil, floor, sqrt } sealed abstract class Stats[K: Numeric, V: Integral] { def n: V def sum: Double + def showStatsAsElems(implicit + showElem: Show[K], + showCount: Show[V], + percentileShow: Show[Rational] = showPercentile): String + + def show(implicit + showElem: Show[K], + showCount: Show[V], + showStat: Show[Double] = showDouble, + percentileShow: Show[Rational] = showPercentile): String } object Stats { @@ -467,6 +478,18 @@ object Stats { case class Empty[K: Numeric, V: Integral]() extends Stats[K, V] { override def n: V = Integral[V].zero override def sum: Double = 0 + override def showStatsAsElems(implicit + showElem: Show[K], + showCount: Show[V], + percentileShow: Show[Rational]): String = + "(empty)" + + override def show(implicit + showElem: Show[K], + showCount: Show[V], + showStat: Show[Double], + percentileShow: Show[Rational]): String = + "(empty)" } /** @@ -488,4 +511,25 @@ case class NonEmpty[K: Numeric, V: Integral](n: V, samplesOpt: Option[Samples[K, V]], sortedSamplesOpt: Option[Samples[K, V]], percentiles: Seq[(Rational, Double)]) - extends Stats[K, V] + extends Stats[K, V] { + + override def showStatsAsElems(implicit + showElem: Show[K], + showCount: Show[V], + percentileShow: Show[Rational]): String = { + implicit val showStat = + Show.show[Double]( + stat ⇒ + showElem.show(Numeric[K].fromDouble(stat)) + ) + + makeShow.show(this) + } + + override def show(implicit + showElem: Show[K], + showCount: Show[V], + showStat: Show[Double], + percentileShow: Show[Rational]): String = + makeShow.show(this) +} From f127573b4d44c08acad4863dfa54c224d63f474e Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Thu, 3 Aug 2017 21:55:54 +0000 Subject: [PATCH 19/20] move out math, stats modules --- build.sbt | 11 +- .../scala/org/hammerlab/math/Format.scala | 48 -- .../math/HypergeometricDistribution.scala | 74 --- .../scala/org/hammerlab/math/Monoid.scala | 60 -- .../org/hammerlab/math/PartiallyOrdered.scala | 45 -- .../org/hammerlab/math/RoundNumbers.scala | 48 -- src/main/scala/org/hammerlab/math/Steps.scala | 81 --- .../scala/org/hammerlab/math/VarNum.scala | 92 --- .../scala/org/hammerlab/math/package.scala | 25 - src/main/scala/org/hammerlab/stats/Runs.scala | 39 -- .../scala/org/hammerlab/stats/Samples.scala | 53 -- .../scala/org/hammerlab/stats/Stats.scala | 535 ------------------ .../scala/org/hammerlab/math/CeilTest.scala | 28 - .../scala/org/hammerlab/math/FormatTest.scala | 138 ----- .../math/HypergeometricDistributionTest.scala | 109 ---- .../scala/org/hammerlab/math/MonoidTest.scala | 19 - .../scala/org/hammerlab/math/StepsTest.scala | 25 - .../scala/org/hammerlab/math/VarNumTest.scala | 133 ----- .../org/hammerlab/stats/HistShowTest.scala | 226 -------- .../scala/org/hammerlab/stats/ShowTest.scala | 398 ------------- .../scala/org/hammerlab/stats/StatsTest.scala | 149 ----- 21 files changed, 1 insertion(+), 2335 deletions(-) delete mode 100644 src/main/scala/org/hammerlab/math/Format.scala delete mode 100644 src/main/scala/org/hammerlab/math/HypergeometricDistribution.scala delete mode 100644 src/main/scala/org/hammerlab/math/Monoid.scala delete mode 100644 src/main/scala/org/hammerlab/math/PartiallyOrdered.scala delete mode 100644 src/main/scala/org/hammerlab/math/RoundNumbers.scala delete mode 100644 src/main/scala/org/hammerlab/math/Steps.scala delete mode 100644 src/main/scala/org/hammerlab/math/VarNum.scala delete mode 100644 src/main/scala/org/hammerlab/math/package.scala delete mode 100644 src/main/scala/org/hammerlab/stats/Runs.scala delete mode 100644 src/main/scala/org/hammerlab/stats/Samples.scala delete mode 100644 src/main/scala/org/hammerlab/stats/Stats.scala delete mode 100644 src/test/scala/org/hammerlab/math/CeilTest.scala delete mode 100644 src/test/scala/org/hammerlab/math/FormatTest.scala delete mode 100644 src/test/scala/org/hammerlab/math/HypergeometricDistributionTest.scala delete mode 100644 src/test/scala/org/hammerlab/math/MonoidTest.scala delete mode 100644 src/test/scala/org/hammerlab/math/StepsTest.scala delete mode 100644 src/test/scala/org/hammerlab/math/VarNumTest.scala delete mode 100644 src/test/scala/org/hammerlab/stats/HistShowTest.scala delete mode 100644 src/test/scala/org/hammerlab/stats/ShowTest.scala delete mode 100644 src/test/scala/org/hammerlab/stats/StatsTest.scala diff --git a/build.sbt b/build.sbt index 948a049..7ba76b5 100644 --- a/build.sbt +++ b/build.sbt @@ -4,13 +4,4 @@ version := "1.3.0-SNAPSHOT" addScala212 -deps ++= Seq( - cats, - commons_math, - shapeless, - spire -) - -testDeps += kryo - -testUtilsVersion := "1.2.4-SNAPSHOT" +deps += spire diff --git a/src/main/scala/org/hammerlab/math/Format.scala b/src/main/scala/org/hammerlab/math/Format.scala deleted file mode 100644 index 0903527..0000000 --- a/src/main/scala/org/hammerlab/math/Format.scala +++ /dev/null @@ -1,48 +0,0 @@ -package org.hammerlab.math - -import cats.Show -import cats.Show.show -import spire.implicits._ -import spire.math.Integral - -import scala.math.round - -object Format { - def scientific[I: Integral](n: I, precision: Int): String = - if (n < 0) - s"-${scientific(-n, precision)}" - else { - assert(precision >= 2) - val digits = n.toString - val numDigits = digits.length - if (numDigits > precision + 3) { - - val integral = implicitly[Integral[I]] - import integral.fromDouble - - val roundedDigits = - fromDouble( - round( - s"${digits.substring(0, precision)}.${digits(precision)}" - .toDouble - ) - ) - .toString - - val first = roundedDigits.head - val rest = roundedDigits.substring(1, precision) - - s"$first.${rest}e${numDigits - 1 + roundedDigits.length - precision}" - } else - digits - } - - def scientific[I: Integral](precision: Int): Show[I] = - show(scientific(_, precision)) - - object scientific { - implicit def digits2[I: Integral] = scientific[I](2) - implicit def digits3[I: Integral] = scientific[I](3) - implicit def digits4[I: Integral] = scientific[I](3) - } -} diff --git a/src/main/scala/org/hammerlab/math/HypergeometricDistribution.scala b/src/main/scala/org/hammerlab/math/HypergeometricDistribution.scala deleted file mode 100644 index 8b90dec..0000000 --- a/src/main/scala/org/hammerlab/math/HypergeometricDistribution.scala +++ /dev/null @@ -1,74 +0,0 @@ -package org.hammerlab.math - -import org.apache.commons.math3.util.FastMath - -import scala.collection.mutable.ArrayBuffer - -/** - * Implementation of a hypergeometric distribution, modeled after - * [[org.apache.commons.math3.distribution.HypergeometricDistribution]], but supporting [[Long]] parameters. - * @param N Population size. - * @param K Number of successes. - * @param n Number to sample. - */ -case class HypergeometricDistribution(N: Long, K: Long, n: Int) { - - // These will be filled with n+1 elements corresponding to the PDF and CDF values for k ∈ [0, n]. - val pdf = ArrayBuffer[Double]() - val cdf = ArrayBuffer[Double]() - - // This will be set to the log of the binomial coefficient C(N, n), which is used multiple times in subsequent - // calculations. - var d = 0.0 - - // logs of k!, for k in [0, n]. - val logBinomPartialSumsLo = ArrayBuffer[Double]() - - // logs of K! / (K - k)!, for k in [0, n]. - val logBinomPartialSumsK = ArrayBuffer[Double]() - - // logs of (N - K)! / (N - K - k)!, for k in [0, n]. - val logBinomPartialSumsNK = ArrayBuffer[Double]() - - // Compute log-arrays described above. - (0 to n).foreach(k ⇒ { - if (k == 0) { - logBinomPartialSumsLo += 0 - logBinomPartialSumsK += 0 - logBinomPartialSumsNK += 0 - } else { - logBinomPartialSumsLo += (logBinomPartialSumsLo(k - 1) + FastMath.log(k)) - logBinomPartialSumsK += (logBinomPartialSumsK(k - 1) + FastMath.log(K + 1 - k)) - logBinomPartialSumsNK += (logBinomPartialSumsNK(k - 1) + FastMath.log(N - K + 1 - k)) - - d += FastMath.log(N + 1 - k) - d -= FastMath.log(k) - } - }) - - // Compute PDF and CDF. - (0 to n).foreach(k ⇒ { - val p1 = logBinomPartialSumsK(k) - logBinomPartialSumsLo(k) - val p2 = logBinomPartialSumsNK(n - k) - logBinomPartialSumsLo(n - k) - val v = FastMath.exp(p1 + p2 - d) - pdf += v - if (k == 0) - cdf += v - else - cdf += (v + cdf(k - 1)) - }) - - // Given a double x in [0, 1], binary-search the CDF to find the greatest integer k such that CDF(k) ≤ x. - def invCDF(x: Double, start: Int = 0, end: Int = n): Int = { - if (start == end) - start - else { - val mid = (start + end) / 2 - val c = cdf(mid) - if (x <= c) - invCDF(x, start, mid) - else - invCDF(x, mid + 1, end) - } - } -} diff --git a/src/main/scala/org/hammerlab/math/Monoid.scala b/src/main/scala/org/hammerlab/math/Monoid.scala deleted file mode 100644 index da864a1..0000000 --- a/src/main/scala/org/hammerlab/math/Monoid.scala +++ /dev/null @@ -1,60 +0,0 @@ -package org.hammerlab.math - -import shapeless._ - -/** - * Copied/Adapted from - * https://github.com/milessabin/shapeless/blob/shapeless-2.3.2/examples/src/main/scala/shapeless/examples/monoids.scala - */ - -trait MonoidSyntax[T] { - def |+|(b: T): T -} - -object MonoidSyntax { - implicit def monoidSyntax[T](a: T)(implicit mt: Monoid[T]): MonoidSyntax[T] = - new MonoidSyntax[T] { - def |+|(b: T) = mt.append(a, b) - } -} - -trait Monoid[T] { - def zero: T - def append(a: T, b: T): T -} - -object Monoid extends ProductTypeClassCompanion[Monoid] { - def zero[T](implicit mt: Monoid[T]) = mt.zero - - implicit def longMonoid: Monoid[Long] = new Monoid[Long] { - def zero = 0 - def append(a: Long, b: Long) = a + b - } - - implicit def intMonoid: Monoid[Int] = new Monoid[Int] { - override def zero: Int = 0 - override def append(a: Int, b: Int): Int = a + b - } - - implicit def stringMonoid: Monoid[String] = new Monoid[String] { - override def zero: String = "" - override def append(a: String, b: String): String = a + b - } - - object typeClass extends ProductTypeClass[Monoid] { - def emptyProduct = new Monoid[HNil] { - def zero = HNil - def append(a: HNil, b: HNil) = HNil - } - - def product[F, T <: HList](mh: Monoid[F], mt: Monoid[T]) = new Monoid[F :: T] { - def zero = mh.zero :: mt.zero - def append(a: F :: T, b: F :: T) = mh.append(a.head, b.head) :: mt.append(a.tail, b.tail) - } - - def project[F, G](instance: => Monoid[G], to: F => G, from: G => F) = new Monoid[F] { - def zero = from(instance.zero) - def append(a: F, b: F) = from(instance.append(to(a), to(b))) - } - } -} diff --git a/src/main/scala/org/hammerlab/math/PartiallyOrdered.scala b/src/main/scala/org/hammerlab/math/PartiallyOrdered.scala deleted file mode 100644 index 982f6fa..0000000 --- a/src/main/scala/org/hammerlab/math/PartiallyOrdered.scala +++ /dev/null @@ -1,45 +0,0 @@ -package org.hammerlab.math - -/** - * Fork of [[scala.math.PartiallyOrdered]] trait that inherits [[Any]], making it a universal trait suitable for - * inheritance by value-classes. See https://issues.scala-lang.org/browse/SI-10128. - * - * A class for partially ordered data. - * - * Forked from the Scala standard-lib in order to make it a universal trait, for mixing-in to value-classes. - * - * @author Martin Odersky - * @version 1.0, 23/04/2004 - */ -trait PartiallyOrdered[+A] extends Any { - - /** Result of comparing `'''this'''` with operand `that`. - * Returns `None` if operands are not comparable. - * If operands are comparable, returns `Some(x)` where - * - `x < 0` iff `'''this''' < that` - * - `x == 0` iff `'''this''' == that` - * - `x > 0` iff `'''this''' > that` - */ - def tryCompareTo [B >: A : PartiallyOrdered](that: B): Option[Int] - - def < [B >: A : PartiallyOrdered](that: B): Boolean = - this tryCompareTo that match { - case Some(x) if x < 0 ⇒ true - case _ ⇒ false - } - def > [B >: A : PartiallyOrdered](that: B): Boolean = - this tryCompareTo that match { - case Some(x) if x > 0 ⇒ true - case _ ⇒ false - } - def <= [B >: A : PartiallyOrdered](that: B): Boolean = - this tryCompareTo that match { - case Some(x) if x <= 0 ⇒ true - case _ ⇒ false - } - def >= [B >: A : PartiallyOrdered](that: B): Boolean = - this tryCompareTo that match { - case Some(x) if x >= 0 ⇒ true - case _ ⇒ false - } -} diff --git a/src/main/scala/org/hammerlab/math/RoundNumbers.scala b/src/main/scala/org/hammerlab/math/RoundNumbers.scala deleted file mode 100644 index b5f76f2..0000000 --- a/src/main/scala/org/hammerlab/math/RoundNumbers.scala +++ /dev/null @@ -1,48 +0,0 @@ -package org.hammerlab.math - -import org.hammerlab.iterator.SimpleBufferedIterator -import spire.math.Integral -import spire.implicits._ - -/** - * Emit an exponentially-increasing sequence of integers composed of repetitions of `steps` scaled by successive powers - * of `base`. - */ -class RoundNumbers[I: Integral] private(steps: Seq[Int], - base: Int = 10, - limitOpt: Option[I]) - extends SimpleBufferedIterator[I] { - - private var idx = 0 - private var basePow: I = Integral[I].one - - override protected def _advance: Option[I] = { - val n = steps(idx) * basePow - if (limitOpt.exists(_ < n)) - None - else - Some(n) - } - - override protected def postNext(): Unit = { - idx += 1 - if (idx == steps.size) { - idx = 0 - basePow *= base - } - } -} - -/** - * Constructors. - */ -object RoundNumbers { - def apply[I: Integral](steps: Seq[Int], - limit: I, - base: Int = 10): Iterator[I] = - new RoundNumbers(steps, base, Some(limit)) - - def apply(steps: Seq[Int], - base: Int): Iterator[Long] = - new RoundNumbers[Long](steps, base, None) -} diff --git a/src/main/scala/org/hammerlab/math/Steps.scala b/src/main/scala/org/hammerlab/math/Steps.scala deleted file mode 100644 index f4d457b..0000000 --- a/src/main/scala/org/hammerlab/math/Steps.scala +++ /dev/null @@ -1,81 +0,0 @@ -package org.hammerlab.math - -import math.{ exp, log, max, min } -import scala.collection.immutable.SortedSet - -/** - * Some utilities for generating exponential sequences of integers that can be used as e.g. histogram-bucket boundaries. - */ -object Steps { - - /** - * Divide [0, maxDepth] into N geometrically-evenly-spaced steps (of size ≈maxDepth^(1/N)). - * - * Until the k-th step is bigger than k, the whole number k is used in its stead. - */ - def geometricEvenSteps(maxDepth: Int, N: Int = 100): SortedSet[Int] = { - val logMaxDepth = log(maxDepth) - - SortedSet( - 0 :: - ( - for { - i ← 1 until N - } yield - min( - maxDepth, - max( - i, - exp( - (i - 1) * logMaxDepth / (N - 2) - ) - .toInt - ) - ) - ) - .toList: _* - ) - } - - /** - * Produce a set of "round numbers" between 0 and a provided N, inclusive. - * - * Coverage is relatively dense but the total number of sampled/returned integers is still O(log(N)) in the input N; - * specifically, 35 integers are returned in each factor-of-10 window (detailed below). - * - * The absolute difference between consecutive integers is non-decreasing over the entire range and, (after the [0,10] - * interval), no two consecutive integers returned are more than 10% different from one another. - * - * - * 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, base case: include all of [0, 10]. - * 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, step by one from 10% to 20% of the next power of 10 (100 here). - * 20, 22, 24, 26, 28, - * 30, 32, 34, 36, 38, - * 40, 42, 44, 46, 48, step by two from 20% to 50% of the next power of 10. - * 50, 55, - * 60, 65, - * 70, 75, - * 80, 85, - * 90, 95, step by five from 50% to 100% of the next power of 10. - * - * …then repeat the [10, 95] portion, multiplied by powers of 10: - * - * 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, this is 10x the "steps by one" section above. - * 200, 220, 240, 260, 280, - * 300, 320, 340, 360, 380, likewise, 10x the "steps by two" from above. - * - * …etc. - */ - def roundNumbers(maxDepth: Int): SortedSet[Int] = - SortedSet( - ( - (0 until 10) ++ - RoundNumbers( - (10 until 20) ++ (20 until 50 by 2) ++ (50 until 100 by 5), - maxDepth, - 10 - ) - .toSeq - ): _* - ) -} diff --git a/src/main/scala/org/hammerlab/math/VarNum.scala b/src/main/scala/org/hammerlab/math/VarNum.scala deleted file mode 100644 index eeb22c1..0000000 --- a/src/main/scala/org/hammerlab/math/VarNum.scala +++ /dev/null @@ -1,92 +0,0 @@ -package org.hammerlab.math - -import java.io.{OutputStream, InputStream} - -/** - * Serialization wrapper for [[Long]]s which burns one bit per byte indicating whether any more bytes follow. - * - * Can utilize less serialized space than naively writing 8-byte [[Long]]s in datasets where absolute values tend to be - * less than 2^48 more often than they are ≥ 2^55. - * - * [[Long]]'s absolute values correspond to the following number of serialized bytes: - * - * [ 0, 2^6): 1 byte - * [ 2^6, 2^13): 2 bytes - * [2^13, 2^20): 3 bytes - * [2^20, 2^27): 4 bytes - * [2^27, 2^34): 5 bytes - * [2^34, 2^41): 6 bytes - * [2^41, 2^48): 7 bytes - * [2^48, 2^55): 8 bytes - * [2^55, 2^63): 9 bytes - * - * The first byte, in addition to its most significant bit indicating whether any more bites follow, uses its - * second-most-significant bit to represent the sign of the [[Long]]. - */ -object VarNum { - def write(output: OutputStream, l: Long): Unit = { - var n = l - var more = true - var total = 0 - while (more) { - if (total == 56) { - output.write(n.toByte) - more = false - } else { - val b = - if (total == 0) { - val byte = - if (n < 0) { - n = -n - (n & 0x3F).toByte | 0x40 - } else { - (n & 0x3F).toByte - } - - n = n >> 6 - byte - } else { - val byte = (n & 0x7F).toByte - n = n >> 7 - byte - } - - total += 7 - more = n > 0 - output.write(b | (if (more) 0x80 else 0).toByte) - } - } - } - - def read(input: InputStream): Long = { - var l = 0L - var bits = 0 - val readBytes = Array[Byte](0) - var negate = false - while (bits < 63) { - input.read(readBytes) - val b = readBytes(0) - if (bits == 55) { - l += ((b & 0xffL) << bits) - bits += 8 - } else { - if (bits == 0) { - negate = (b & 0x40) > 0 - l += (b & 0x3FL) - bits += 6 - } else { - l += (b & 0x7FL) << bits - bits += 7 - } - - if ((b & 0x80) == 0) { - bits = 63 - } - } - } - if (negate) - -l - else - l - } -} diff --git a/src/main/scala/org/hammerlab/math/package.scala b/src/main/scala/org/hammerlab/math/package.scala deleted file mode 100644 index c4db735..0000000 --- a/src/main/scala/org/hammerlab/math/package.scala +++ /dev/null @@ -1,25 +0,0 @@ -package org.hammerlab - -import spire.implicits._ -import spire.math._ - -package object math { - /** - * Simple helper for rounding-up integer-division - */ - def ceil[N: Integral](numerator: N, denominator: N): N = { - val numeric = implicitly[Integral[N]] - import numeric._ - fromDouble( - scala.math.ceil( - toDouble(numerator) / toDouble(denominator) - ) - ) - } - - def interpolate[N: Integral](start: N, end: N, delta: Rational): Rational = - Rational(start.toSafeLong) + delta * Rational((end - start).toSafeLong) - - def interpolate[N: Numeric](start: N, end: N, delta: Double): Double = - start.toDouble() + delta * (end - start).toDouble() -} diff --git a/src/main/scala/org/hammerlab/stats/Runs.scala b/src/main/scala/org/hammerlab/stats/Runs.scala deleted file mode 100644 index 7218511..0000000 --- a/src/main/scala/org/hammerlab/stats/Runs.scala +++ /dev/null @@ -1,39 +0,0 @@ -package org.hammerlab.stats - -import cats.Show -import cats.Show.show -import cats.implicits._ -import spire.implicits._ -import spire.math.Integral - -/** - * Convenience class wrapping a sequence of key-number pairs, used in run-length-encoding in [[NonEmpty]]. - */ -case class Runs[K, V: Integral](elems: Seq[(K, V)], num: V) - -object Runs { - implicit def runsToSeq[K, V: Integral](runs: Runs[K, V]): Seq[(K, V)] = runs.elems - implicit def seqToRuns[K, V: Integral](elems: Seq[(K, V)]): Runs[K, V] = apply[K, V](elems) - - def apply[K, V: Integral](elems: Seq[(K, V)]): Runs[K, V] = - Runs( - elems, - elems - .map(_._2) - .reduce(_ + _) - ) - - implicit def makeShow[K, V: Integral](implicit elemShow: Show[K], countShow: Show[V]): Show[Runs[K, V]] = - show { - case Runs(elems, _) ⇒ - elems - .map { - case (elem, count) ⇒ - if (count == 1) - elem.show - else - show"$elem×$count" - } - .mkString(", ") - } -} diff --git a/src/main/scala/org/hammerlab/stats/Samples.scala b/src/main/scala/org/hammerlab/stats/Samples.scala deleted file mode 100644 index e4fac09..0000000 --- a/src/main/scala/org/hammerlab/stats/Samples.scala +++ /dev/null @@ -1,53 +0,0 @@ -package org.hammerlab.stats - -import cats.Show -import cats.Show.show -import cats.implicits._ -import spire.implicits._ -import spire.math.Integral - -/** - * Used by [[NonEmpty]] to wrap some [[Runs]] of elements from the start and end of a dataset. - * @param n total number of elements in the dataset. - * @param first [[Runs]] of elements from the start of the dataset. - * @param last [[Runs]] of elements from the end of the dataset. - * @tparam K arbitrary element type - * @tparam V [[Integral]] type, e.g. [[Int]] or [[Long]]. - */ -case class Samples[K, V: Integral](n: V, - first: Runs[K, V], - last: Runs[K, V]) { - def isEmpty: Boolean = first.isEmpty - def nonEmpty: Boolean = first.nonEmpty -} - -object Samples { - implicit def makeShow[K, V: Integral](implicit showRuns: Show[Runs[K, V]]): Show[Samples[K, V]] = - show { - case Samples(n, first, last) ⇒ - val numSampled = first.num + last.num - val numSkipped = n - numSampled - if (numSkipped > 0) - s"${first.show}, …, ${last.show}" - else - removeOverlap(-numSkipped, first, last).show - } - - def removeOverlap[K, V: Integral](num: V, - first: Runs[K, V], - last: Runs[K, V]): Runs[K, V] = { - val lastIt = last.iterator.buffered - var dropped = Integral[V].zero - Runs( - first ++ - lastIt - .dropWhile { - t ⇒ - val (_, count) = t - val drop = dropped < num - dropped += count - drop - } - ) - } -} diff --git a/src/main/scala/org/hammerlab/stats/Stats.scala b/src/main/scala/org/hammerlab/stats/Stats.scala deleted file mode 100644 index e3744e3..0000000 --- a/src/main/scala/org/hammerlab/stats/Stats.scala +++ /dev/null @@ -1,535 +0,0 @@ -package org.hammerlab.stats - -import cats.Show -import cats.Show.show -import cats.instances.all.catsStdShowForString -import cats.syntax.all._ -import org.hammerlab.iterator.RunLengthIterator._ -import org.hammerlab.math.interpolate -import org.hammerlab.stats.Stats.{ makeShow, showDouble, showPercentile } -import org.hammerlab.types._ -import spire.math.{ Integral, Numeric, Rational } -import spire.syntax.all._ - -import scala.Double.NaN -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer -import scala.math.{ abs, ceil, floor, sqrt } - -/** - * Stores some computed statistics about a dataset of [[Numeric]] elements. - * - * Two concrete implementations are below: [[Empty]] and [[NonEmpty]]. - * - * @tparam K [[Numeric]] element type. TODO(ryan): allow this to be non-[[Numeric]]. - * @tparam V [[Integral]] value type. - */ -sealed abstract class Stats[K: Numeric, V: Integral] { - def n: V - def sum: Double - def showStatsAsElems(implicit - showElem: Show[K], - showCount: Show[V], - percentileShow: Show[Rational] = showPercentile): String - - def show(implicit - showElem: Show[K], - showCount: Show[V], - showStat: Show[Double] = showDouble, - percentileShow: Show[Rational] = showPercentile): String -} - -object Stats { - - /** - * Construct a [[NonEmpty]] from a sequence of "runs"; elements paired with a count of repetitions. - * - * @param v values. - * @param numToSample highlight this many "runs" of data from the start and end of the data; likewise the least and - * greatest elements (and repetition counts). - * @param onlySampleSorted only highlight the least and greatest elements; omit the first and last - */ - def fromHist[K: Numeric: Ordering, V: Integral](v: Iterable[(K, V)], - numToSample: Int = 10, - onlySampleSorted: Boolean = false): Stats[K, V] = { - - var alreadySorted = true - val hist = mutable.HashMap[K, V]() - var n = Integral[V].zero - - val values = { - val vBuilder = Vector.newBuilder[(K, V)] - var prevOpt: Option[K] = None - for { - (value, num) ← reencode[K, V](v.iterator) - } { - if (alreadySorted) { - if (prevOpt.exists(_ > value)) - alreadySorted = false - else - prevOpt = Some(value) - } - vBuilder += value → num - n += num - hist.update(value, hist.getOrElse(value, Integral[V].zero) + num) - } - - vBuilder.result() - } - - if (values.isEmpty) - return Empty[K, V]() - - val sorted = - if (alreadySorted) - values - else - for { - key ← hist.keys.toVector.sorted - } yield - key → hist(key) - - val percentiles = histPercentiles(n, sorted) - - val median = percentiles(percentiles.length / 2)._2 - - val medianDeviationsBuilder = Vector.newBuilder[(Double, V)] - - var sum = 0.0 - var sumSquares = 0.0 - for ((value, num) ← sorted) { - val d = value.toDouble - sum += d * num.toDouble() - sumSquares += d * d * num.toDouble() - medianDeviationsBuilder += abs(d - median) → num - } - - val medianDeviations = medianDeviationsBuilder.result().sortBy(_._1) - - val mad = - getRunPercentiles( - medianDeviations, - Seq( - Rational(50) → - ( - (n - 1) /~ 2, - ((n - 1) % 2).toDouble() / 2.0 - ) - ) - ) - .head - ._2 - - val mean = sum / n.toDouble() - val stddev = sqrt(sumSquares / n.toDouble() - mean * mean) - - def samples(vs: Vector[(K, V)]): Samples[K, V] = - Samples[K, V]( - n, - vs.take(numToSample), - vs.takeRight(numToSample) - ) - - val samplesOpt = - (alreadySorted || !onlySampleSorted) | - samples(values) - - val sortedSamplesOpt = - !alreadySorted | - samples(sorted) - - NonEmpty( - n, - sum, - mean, stddev, - median, mad, - samplesOpt, - sortedSamplesOpt, - percentiles - ) - } - - /** - * Construct a [[NonEmpty]] instance from input data `v`. - * - * @param v values. - * @param numToSample highlight this many "runs" of data from the start and end of the data; likewise the least and - * greatest elements (and repetition counts). - * @param onlySampleSorted only highlight the least and greatest elements; omit the first and last - */ - def apply[K: Numeric: Ordering](v: Iterable[K], - numToSample: Int = 10, - onlySampleSorted: Boolean = false): Stats[K, Int] = { - - val vBuilder = Vector.newBuilder[K] - var alreadySorted = true - var prevOpt: Option[K] = None - for (value ← v) { - if (alreadySorted) { - if (prevOpt.exists(_ > value)) - alreadySorted = false - else - prevOpt = Some(value) - } - vBuilder += value - } - - val values = vBuilder.result() - - if (values.isEmpty) - return Empty[K, Int]() - - val n = values.length - - val sorted = - if (alreadySorted) - values - else - values.sorted - - val median = getMedian(sorted) - - val medianDeviationsBuilder = Vector.newBuilder[Double] - - var sum = 0.0 - var sumSquares = 0.0 - for (value ← sorted) { - val d = value.toDouble - sum += d - sumSquares += d * d - medianDeviationsBuilder += abs(d - median) - } - - val medianDeviations = medianDeviationsBuilder.result().sorted - val mad = getMedian(medianDeviations) - - val mean = sum / n - val stddev = sqrt(sumSquares / n - mean * mean) - - def samples(vs: Vector[K]): Samples[K, Int] = { - // Count occurrences of the first N distinct values. - val (firstElems, numFirstElems) = - runLengthEncodeWithSum( - vs.iterator, - numToSample - ) - - // Count occurrences of the last N distinct values. - val (lastElems, numLastElems) = - runLengthEncodeWithSum( - vs.reverseIterator, - numToSample, - reverse = true - ) - - Samples( - n, - Runs(firstElems, numFirstElems), - Runs(lastElems, numLastElems) - ) - } - - val samplesOpt = - (alreadySorted || !onlySampleSorted) | - samples(values) - - val sortedSamplesOpt = - !alreadySorted | - samples(sorted) - - NonEmpty( - n, - sum, - mean, stddev, - median, mad, - samplesOpt, - sortedSamplesOpt, - percentiles(sorted) - ) - } - - /** - * Compute percentiles listed in `ps` of the data in `values`; wrapper for implementation below. - */ - private def getRunPercentiles[K: Numeric, V: Integral](values: Seq[(K, V)], - ps: Seq[(Rational, (V, Double))]): Vector[(Rational, Double)] = - getRunPercentiles( - values - .iterator - .buffered, - ps - .iterator - .buffered - ) - .toVector - - /** - * Compute percentiles listed in `ps` of the data in `values`. - * - * @param values runs of elements. - * @param percentiles percentiles to compute, specified as tuples where the key is the percentile and the value is the - * index in `values` at which that percentile lies (interpolated to be a fractional amount between - * two indices, where appropriate). - * @return pairs of (percentile, value). - */ - private def getRunPercentiles[K: Numeric, V: Integral](values: BufferedIterator[(K, V)], - percentiles: BufferedIterator[(Rational, (V, Double))]): Iterator[(Rational, Double)] = - new Iterator[(Rational, Double)] { - - var elemsPast = Integral[V].zero - var curK: Option[K] = None - - override def hasNext: Boolean = percentiles.hasNext - - override def next(): (Rational, Double) = { - val (percentile, (floor, remainder)) = percentiles.next() - - while(elemsPast <= floor) { - val (k, v) = values.next() - curK = Some(k) - elemsPast += v - } - - val distancePast = elemsPast - floor - - percentile → - ( - if (distancePast == 1 && values.hasNext) - interpolate(curK.get, values.head._1, remainder) - else - curK.get.toDouble() - ) - } - } - - private def percentileIdxs[K: Numeric, V: Integral](N: V): Vector[(Rational, (V, Double))] = { - val n = N + 1 - - implicit def fromInt = Integral[V].fromInt _ - - val denominators: Iterator[V] = { - lazy val pow10s: Stream[V] = 100 #:: pow10s.map(_ * 10) - Iterator[V]( - 2, // 50 - 4, // 25/75 - 10, // 10/90 - 20 // 5/95 - ) ++ pow10s.iterator // 1/99, .1/99.9, .01/99.99, … - } - - denominators - .takeWhile(_ <= n) - .flatMap { - d ⇒ - val loPercentile = Rational(100, d.toSafeLong) - val hiPercentile = 100 - loPercentile - - val loFloor: V = n /~ d - 1 - val loRemainder = (n % d).toDouble() / d.toDouble() - - val hiFloor = n - 3 - loFloor - val hiRemainder = 1 - loRemainder - - if (d == 2) - // Median (50th percentile, denominator 2) only emits one tuple. - Iterator(loPercentile → (loFloor, loRemainder)) - else - // In general, we emit two tuples per "denominator", one on the high side and one on the low. For example, for - // denominator 4, we emit the 25th and 75th percentiles. - Iterator( - loPercentile → (loFloor, loRemainder), - hiPercentile → (hiFloor, hiRemainder) - ) - } - .toVector - .sortBy(_._1) - } - - /** - * Compute some relevant percentiles based on the number of elements present. - * @return pairs of (percentile, value). - */ - private def histPercentiles[K: Numeric, V: Integral](N: V, - values: IndexedSeq[(K, V)]): Vector[(Rational, Double)] = - getRunPercentiles( - values, - percentileIdxs(N) - ) - - /** - * Compute some relevant percentiles based on the number of elements present. - * - * @return pairs of (percentile, value). - */ - private def percentiles[T: Numeric](values: IndexedSeq[T]): Vector[(Rational, Double)] = - percentileIdxs(values.length) - .map { - case (d, (floor, weight)) ⇒ - val value = - if (weight == 0) - values(floor).toDouble() - else - interpolate(values(floor), values(floor + 1), weight) - - d → value - } - - private def getMedian[T: Numeric](sorted: Vector[T]): Double = { - val n = sorted.length - if (n == 0) - NaN - else if (n % 2 == 0) - (sorted(n / 2 - 1) + sorted(n / 2)).toDouble() / 2.0 - else - sorted(n / 2).toDouble() - } - - /** - * Find the first `N` "runs" from the beginning of `it`. If `reverse`, return them in reversed order. - */ - private def runLengthEncodeWithSum[K: Numeric](it: Iterator[K], - N: Int, - reverse: Boolean = false): (Seq[(K, Int)], Int) = { - var sum = 0 - var i = 0 - val runs = ArrayBuffer[(K, Int)]() - val runLengthIterator = it.runLengthEncode() - while (i < N && runLengthIterator.hasNext) { - val (elem, count) = runLengthIterator.next() - - if (reverse) - runs.prepend(elem → count) - else - runs += ((elem, count)) - - sum += count - i += 1 - } - runs → sum - } - - implicit def makeShow[ - K : Numeric : Show, - V: Integral : Show - ]( - implicit - percentileShow: Show[Rational] = showPercentile, - statShow: Show[Double] = showDouble - ): Show[Stats[K, V]] = - show { - case Empty() ⇒ "(empty)" - case NonEmpty(n, _, mean, stddev, _, mad, samplesOpt, sortedSamplesOpt, percentiles) ⇒ - def pair[L: Show, R: Show](l: L, r: R): String = - show"$l:\t$r" - - val strings = ArrayBuffer[String]() - - strings += - List( - pair("num", n), - pair("mean", mean), - pair("stddev", stddev), - pair("mad", mad) - ) - .mkString(",\t") - - for { - samples ← samplesOpt - if samples.nonEmpty - } { - strings += pair("elems", samples) - } - - for { - sortedSamples ← sortedSamplesOpt - if sortedSamples.nonEmpty - } { - strings += pair("sorted", sortedSamples) - } - - strings ++= - percentiles.map { - case (k, v) ⇒ - pair(k, v) - } - - strings.mkString("\n") - } - - def showDouble: Show[Double] = - show( - d ⇒ - if (floor(d).toLong == ceil(d).toLong) - d.toLong.toString - else - "%.1f".format(d) - ) - - def showPercentile: Show[Rational] = - show( - r ⇒ - if (r.isWhole()) - r.toLong.toString - else - r.toDouble.toString - ) -} - -case class Empty[K: Numeric, V: Integral]() extends Stats[K, V] { - override def n: V = Integral[V].zero - override def sum: Double = 0 - override def showStatsAsElems(implicit - showElem: Show[K], - showCount: Show[V], - percentileShow: Show[Rational]): String = - "(empty)" - - override def show(implicit - showElem: Show[K], - showCount: Show[V], - showStat: Show[Double], - percentileShow: Show[Rational]): String = - "(empty)" -} - -/** - * Stores some computed statistics about a dataset of [[Numeric]] elements. - * - * @param n number of elements in the dataset. - * @param mad median absolute deviation (from the median). - * @param samplesOpt "sample" elements; the start and end of the data. - * @param sortedSamplesOpt "sample" elements; the least and greatest elements. If the dataset is already sorted, meaning - * this would be equivalent to [[samplesOpt]], it is omitted. - * @param percentiles selected percentiles of the dataset. - */ -case class NonEmpty[K: Numeric, V: Integral](n: V, - sum: Double, - mean: Double, - stddev: Double, - median: Double, - mad: Double, - samplesOpt: Option[Samples[K, V]], - sortedSamplesOpt: Option[Samples[K, V]], - percentiles: Seq[(Rational, Double)]) - extends Stats[K, V] { - - override def showStatsAsElems(implicit - showElem: Show[K], - showCount: Show[V], - percentileShow: Show[Rational]): String = { - implicit val showStat = - Show.show[Double]( - stat ⇒ - showElem.show(Numeric[K].fromDouble(stat)) - ) - - makeShow.show(this) - } - - override def show(implicit - showElem: Show[K], - showCount: Show[V], - showStat: Show[Double], - percentileShow: Show[Rational]): String = - makeShow.show(this) -} diff --git a/src/test/scala/org/hammerlab/math/CeilTest.scala b/src/test/scala/org/hammerlab/math/CeilTest.scala deleted file mode 100644 index 3c229cd..0000000 --- a/src/test/scala/org/hammerlab/math/CeilTest.scala +++ /dev/null @@ -1,28 +0,0 @@ -package org.hammerlab.math - -import org.hammerlab.test.Suite -import org.scalactic.TypeCheckedTripleEquals - -class CeilTest - extends Suite - with TypeCheckedTripleEquals { - - test("ints") { - ceil( 0, 20) should ===(0) - ceil(10, 20) should ===(1) - ceil(19, 20) should ===(1) - ceil(20, 20) should ===(1) - ceil(21, 20) should ===(2) - } - - test("longs") { - ceil( 0L, 20L) should ===(0) - ceil(10L, 20L) should ===(1L) - ceil(19L, 20L) should ===(1L) - ceil(20L, 20L) should ===(1L) - ceil(21L, 20L) should ===(2L) - - ceil(1L << 40, 1 << 4) should be(1L << 36) - ceil(1L << 40, 1L << 36) should be(1 << 4) - } -} diff --git a/src/test/scala/org/hammerlab/math/FormatTest.scala b/src/test/scala/org/hammerlab/math/FormatTest.scala deleted file mode 100644 index 6ad75da..0000000 --- a/src/test/scala/org/hammerlab/math/FormatTest.scala +++ /dev/null @@ -1,138 +0,0 @@ -package org.hammerlab.math - -import cats.Show -import cats.syntax.all._ -import org.hammerlab.test.Suite -import spire.math._ -import Format.scientific - -class FormatTest - extends Suite { - - def check[I : Integral : Show](i: I, expected: String): Unit = { - i.show should be(expected) - } - - test("2-digit ints") { - import scientific.digits2 - - (-20 to 20).foreach(n ⇒ check(n, n.toString)) - - check( -100, "-100") - - check( 99, "99") - check( 100, "100") - check( 101, "101") - - check( 999, "999") - check( 1000, "1000") - check( 1001, "1001") - - check( 9999, "9999") - check( 10000, "10000") - check( 10001, "10001") - - check( 99999, "99999") - check( 100000, "1.0e5") - check( 100001, "1.0e5") - - check( 104999, "1.0e5") - check( 105000, "1.1e5") - check( 105001, "1.1e5") - - check( 144999, "1.4e5") - check( 145000, "1.5e5") - check( 145001, "1.5e5") - - check( 149999, "1.5e5") - check( 150000, "1.5e5") - check( 150001, "1.5e5") - - check( 199999, "2.0e5") - check( 200000, "2.0e5") - check( 200001, "2.0e5") - - check( 449999, "4.5e5") - check( 450000, "4.5e5") - check( 450001, "4.5e5") - - check(-494999, "-4.9e5") - check(-495000, "-5.0e5") - check(-495001, "-5.0e5") - - check( 494999, "4.9e5") - check( 495000, "5.0e5") - check( 495001, "5.0e5") - - check( 994999, "9.9e5") - check( 995000, "1.0e6") - check( 995001, "1.0e6") - - check( 999999, "1.0e6") - check(1000000, "1.0e6") - check(1000001, "1.0e6") - - check(1049999, "1.0e6") - check(1050000, "1.1e6") - } - - test("3-digit ints") { - import scientific.digits3 - - (-20 to 20).foreach(n ⇒ check(n, n.toString)) - - check( -100, "-100") - - check( 99, "99") - check( 100, "100") - check( 101, "101") - - check( 999, "999") - check( 1000, "1000") - check( 1001, "1001") - - check( 9999, "9999") - check( 10000, "10000") - check( 10001, "10001") - - check( 99999, "99999") - check( 100000, "100000") - check( 100001, "100001") - - check( 999999, "999999") - check( 1000000, "1.00e6") - check( 1000001, "1.00e6") - - check( 1004999, "1.00e6") - check( 1005000, "1.01e6") - check( 1005001, "1.01e6") - - check( 1044999, "1.04e6") - check( 1045000, "1.05e6") - check( 1045001, "1.05e6") - - check( 1049999, "1.05e6") - check( 1050000, "1.05e6") - check( 1050001, "1.05e6") - - check( 1944999, "1.94e6") - check( 1945000, "1.95e6") - check( 1945001, "1.95e6") - - check( 1994999, "1.99e6") - check( 1995000, "2.00e6") - check( 1995001, "2.00e6") - - check( 1999999, "2.00e6") - check( 2000000, "2.00e6") - check( 2000001, "2.00e6") - - check( 9994999, "9.99e6") - check(10000000, "1.00e7") - check(10000001, "1.00e7") - - check(10049999, "1.00e7") - check(10050000, "1.01e7") - check(10050001, "1.01e7") - } -} diff --git a/src/test/scala/org/hammerlab/math/HypergeometricDistributionTest.scala b/src/test/scala/org/hammerlab/math/HypergeometricDistributionTest.scala deleted file mode 100644 index fff6bbb..0000000 --- a/src/test/scala/org/hammerlab/math/HypergeometricDistributionTest.scala +++ /dev/null @@ -1,109 +0,0 @@ -package org.hammerlab.math - -import org.scalactic.Equality -import org.hammerlab.test.Suite - -import org.apache.commons.math3.distribution.{HypergeometricDistribution ⇒ ApacheHyperGeometricDistribution} - -import scala.collection.mutable.ArrayBuffer - -class HypergeometricDistributionTest extends Suite { - - var epsilon = 0.00001 - - implicit val tolerance = - new Equality[Double] { - override def areEqual(a: Double, b: Any): Boolean = - b match { - case d: Double ⇒ a === d +- epsilon - case _ ⇒ false - } - } - - implicit val approxBuffers = - new Equality[ArrayBuffer[Double]] { - override def areEqual(a: ArrayBuffer[Double], b: Any): Boolean = - b match { - case s: ArrayBuffer[Double] ⇒ a.size == s.size && a.zip(s).forall(t ⇒ t._1 === t._2) - case _ ⇒ false - } - } - - def compareToApache(hgd: HypergeometricDistribution): Unit = { - val N = hgd.N.toInt - val K = hgd.K.toInt - val n = hgd.n - - val apache = new ApacheHyperGeometricDistribution(N, K, n) - - hgd.pdf should ===( - ArrayBuffer((0 to n).map(apache.probability): _*) - ) - - hgd.cdf should ===( - ArrayBuffer((0 to n).map(apache.cumulativeProbability): _*) - ) - } - - test("10-4-2") { - val hgd = HypergeometricDistribution(10, 4, 2) - - hgd.pdf should ===( - ArrayBuffer( - 1.0 / 3, - 8.0 / 15, - 2.0 / 15 - ) - ) - - hgd.cdf should ===( - ArrayBuffer( - 1.0 / 3, - 13.0 / 15, - 1 - ) - ) - - List[Double]( - 0, - 1.0 / 3 - epsilon, - 1.0 / 3, - 13.0 / 15 - epsilon, - 13.0 / 15, - 1 - epsilon, - 1 - ).map(hgd.invCDF(_)) should be( - List( - 0, 0, 1, 1, 2, 2, 2 - ) - ) - - compareToApache(hgd) - } - - test("500-100-10") { - val hgd = HypergeometricDistribution(500, 100, 10) - - compareToApache(hgd) - } - - test("5000000000-4000000000-10") { - val hgd = HypergeometricDistribution(5000000000L, 4000000000L, 10) - - hgd.pdf should be( - ArrayBuffer( - 1.0239999631360417E-7, // 0 - 4.0959998894081015E-6, // 1 - 7.372799858073784E-5, // 2 - 7.864319899730114E-4, // 3 - 0.005505023958712356, // 4 - 0.026424115107515793, // 5 - 0.08808038393393967, // 6 - 0.20132659215099705, // 7 - 0.30198988830199097, // 8 - 0.26843545599999896, // 9 - 0.10737418215840859 // 10 - ) - ) - } -} diff --git a/src/test/scala/org/hammerlab/math/MonoidTest.scala b/src/test/scala/org/hammerlab/math/MonoidTest.scala deleted file mode 100644 index 274354a..0000000 --- a/src/test/scala/org/hammerlab/math/MonoidTest.scala +++ /dev/null @@ -1,19 +0,0 @@ -package org.hammerlab.math - -import org.hammerlab.test.Suite -import MonoidSyntax._ -import Monoid.zero - -class MonoidTest - extends Suite { - test("case class") { - case class Foo(a: Int, b: String, c: Long) - - val foo1 = Foo(111, "abc", 123) - val foo2 = Foo(222, "def", 456) - - foo1 |+| foo2 should be(Foo(333, "abcdef", 579)) - foo1 |+| zero[Foo] should be(foo1) - foo2 |+| zero[Foo] should be(foo2) - } -} diff --git a/src/test/scala/org/hammerlab/math/StepsTest.scala b/src/test/scala/org/hammerlab/math/StepsTest.scala deleted file mode 100644 index 8248994..0000000 --- a/src/test/scala/org/hammerlab/math/StepsTest.scala +++ /dev/null @@ -1,25 +0,0 @@ -package org.hammerlab.math - -import org.hammerlab.math.Steps._ -import org.hammerlab.test.Suite -import org.hammerlab.test.matchers.seqs.SeqMatcher.seqMatch - -class StepsTest - extends Suite { - test("roundNumbers") { - roundNumbers(200) should seqMatch( - ( 0 until 20) ++ - ( 20 until 50 by 2) ++ - ( 50 until 100 by 5) ++ - (100 to 200 by 10) - ) - } - - test("geometricEvenSteps") { - geometricEvenSteps(1000, 20) should seqMatch( - Seq( - 0, 1, 2, 3, 4, 5, 6, 9, 14, 21, 31, 46, 68, 99, 146, 215, 316, 464, 681, 999 - ) - ) - } -} diff --git a/src/test/scala/org/hammerlab/math/VarNumTest.scala b/src/test/scala/org/hammerlab/math/VarNumTest.scala deleted file mode 100644 index 65e9f55..0000000 --- a/src/test/scala/org/hammerlab/math/VarNumTest.scala +++ /dev/null @@ -1,133 +0,0 @@ -package org.hammerlab.math - -import java.io.ByteArrayOutputStream - -import com.esotericsoftware.kryo.io.{Input, Output} -import org.hammerlab.test.Suite - -class VarNumTest extends Suite { - - // [0,8] - testBytes(0, List(0)) - testBytes(1, List(1)) - testBytes(2, List(2)) - testBytes(3, List(3)) - testBytes(4, List(4)) - testBytes(5, List(5)) - testBytes(6, List(6)) - testBytes(7, List(7)) - testBytes(8, List(8)) - - // Jump from 1 byte to 2 bytes at 2^6. - testBytes(0x3b, List(0x3b)) - testBytes(0x3f, List(0x3f)) - testBytes(0x40, List(0x80, 0x01)) - testBytes(0x41, List(0x81, 0x01)) - testBytes(0x42, List(0x82, 0x01)) - testBytes(0x43, List(0x83, 0x01)) - testBytes(0x44, List(0x84, 0x01)) - testBytes(0x45, List(0x85, 0x01)) - - // Jump from 2 bytes to 3 bytes at 2^13. - testBytes(0x1fff, List(0xbf, 0x7f)) - testBytes(0x2000, List(0x80, 0x80, 0x01)) - testBytes(0x2001, List(0x81, 0x80, 0x01)) - testBytes(0x2002, List(0x82, 0x80, 0x01)) - testBytes(0x2003, List(0x83, 0x80, 0x01)) - testBytes(0x2004, List(0x84, 0x80, 0x01)) - - // 3 bytes to 4 bytes at 2^20. - testBytes( 0xfffff, List(0xbf, 0xff, 0x7f)) - testBytes(0x100000, List(0x80, 0x80, 0x80, 0x01)) - testBytes(0x100001, List(0x81, 0x80, 0x80, 0x01)) - testBytes(0x100002, List(0x82, 0x80, 0x80, 0x01)) - testBytes(0x100003, List(0x83, 0x80, 0x80, 0x01)) - testBytes(0x100004, List(0x84, 0x80, 0x80, 0x01)) - - // 4 bytes to 5 bytes at 2^27. - testBytes(0x7ffffff, List(0xbf, 0xff, 0xff, 0x7f)) - testBytes(0x8000000, List(0x80, 0x80, 0x80, 0x80, 0x01)) - testBytes(0x8000001, List(0x81, 0x80, 0x80, 0x80, 0x01)) - testBytes(0x8000002, List(0x82, 0x80, 0x80, 0x80, 0x01)) - testBytes(0x8000003, List(0x83, 0x80, 0x80, 0x80, 0x01)) - testBytes(0x8000004, List(0x84, 0x80, 0x80, 0x80, 0x01)) - - // 5 bytes to 6 bytes at 2^34. - testBytes(0x3ffffffffL, List(0xbf, 0xff, 0xff, 0xff, 0x7f)) - testBytes(0x400000000L, List(0x80, 0x80, 0x80, 0x80, 0x80, 0x01)) - testBytes(0x400000001L, List(0x81, 0x80, 0x80, 0x80, 0x80, 0x01)) - testBytes(0x400000002L, List(0x82, 0x80, 0x80, 0x80, 0x80, 0x01)) - testBytes(0x400000003L, List(0x83, 0x80, 0x80, 0x80, 0x80, 0x01)) - testBytes(0x400000004L, List(0x84, 0x80, 0x80, 0x80, 0x80, 0x01)) - - // Largest Long. - testBytes(0x7fffffffffffffffL, List(0xbf, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff)) - - // Every 4th Fibonacci number from 21 to 2^63. - testBytes( 21L, List(0x15)) - testBytes( 144L, List(0x90, 0x02)) - testBytes( 987L, List(0x9b, 0x0f)) - testBytes( 6765L, List(0xad, 0x69)) - testBytes( 46368L, List(0xa0, 0xd4, 0x05)) - testBytes( 317811L, List(0xb3, 0xe5, 0x26)) - testBytes( 2178309L, List(0x85, 0xf4, 0x89, 0x02)) - testBytes( 14930352L, List(0xb0, 0xc6, 0x9e, 0x0e)) - testBytes( 102334155L, List(0x8b, 0xfb, 0xcb, 0x61)) - testBytes( 701408733L, List(0x9d, 0x97, 0xf5, 0x9c, 0x05)) - testBytes( 4807526976L, List(0x80, 0xa9, 0xe8, 0xe8, 0x23)) - testBytes( 32951280099L, List(0xa3, 0x87, 0xe5, 0xc0, 0xf5, 0x01)) - testBytes( 225851433717L, List(0xb5, 0x8b, 0xdb, 0xdc, 0x92, 0x0d)) - testBytes( 1548008755920L, List(0x90, 0xcb, 0x98, 0xc8, 0x8d, 0x5a)) - testBytes( 10610209857723L, List(0xbb, 0x82, 0xd1, 0x9c, 0xcc, 0xe9, 0x04)) - testBytes( 72723460248141L, List(0x8d, 0xc9, 0x9e, 0x80, 0x88, 0x89, 0x21)) - testBytes( 498454011879264L, List(0xa0, 0xfd, 0x84, 0xe5, 0xeb, 0xd5, 0xe2, 0x01)) - testBytes( 3416454622906707L, List(0x93, 0xa5, 0x84, 0xc3, 0xea, 0xcf, 0x91, 0x0c)) - testBytes( 23416728348467685L, List(0xa5, 0x87, 0x99, 0xf0, 0xfd, 0xd8, 0x98, 0x53)) - testBytes( 160500643816367088L, List(0xb0, 0x8f, 0xab, 0xce, 0x86, 0x9f, 0x9b, 0xba, 0x04)) - testBytes(1100087778366101931L, List(0xab, 0xe6, 0x94, 0xb4, 0xb0, 0x80, 0xa6, 0xc4, 0x1e)) - testBytes(7540113804746346429L, List(0xbd, 0xbe, 0xe6, 0x9e, 0xcc, 0xe3, 0xee, 0xa3, 0xd1)) - - /** - * Generate test cases that verify that: - * - * - `l` is written to bytes matching `expected` bytes, - * - those `expected` bytes are read back in to a [[Long]] equal to `l`, - * - and then do the same for the additive inverse of `l` with a list of `expected` bytes where the sign bit (0x40 - * in the first byte) is flipped. - */ - def testBytes(l: Long, expected: List[Int]): Unit = { - - test(s"0x${l.toHexString} ($l)") { - run(l, expected) - } - - if (l > 0) { - val negExpected = { - val head = expected.head - if ((head & 0x40) > 0) { - throw new IllegalArgumentException(s"Expected bytes [${expected.mkString(",")}] already have sign bit set negative") - } - (head | 0x40) :: expected.tail - } - - test(s"-0x${l.toHexString} (-$l)") { - run(-l, negExpected) - } - } - } - - def run(l: Long, expected: List[Int]): Unit = { - val baos = new ByteArrayOutputStream() - - val op = new Output(baos) - VarNum.write(op, l) - op.close() - - val bytes = baos.toByteArray - bytes should be(expected.toArray.map(_.toByte)) - - val ip = new Input(bytes) - VarNum.read(ip) should be(l) - ip.close() - } -} diff --git a/src/test/scala/org/hammerlab/stats/HistShowTest.scala b/src/test/scala/org/hammerlab/stats/HistShowTest.scala deleted file mode 100644 index 23e19cb..0000000 --- a/src/test/scala/org/hammerlab/stats/HistShowTest.scala +++ /dev/null @@ -1,226 +0,0 @@ -package org.hammerlab.stats - -import cats.Show -import cats.implicits.{ catsStdShowForInt, catsStdShowForLong } -import cats.syntax.all._ -import org.hammerlab.stats.Stats.fromHist -import org.hammerlab.test.Suite -import spire.implicits._ -import spire.math.Integral - -import scala.util.Random - -/** - * Tests of the [[Stats.fromHist]] API for constructing [[NonEmpty]] instances from "histograms" of elements that each come - * with an associated repetition count, which allows the total number of elements represented to be much larger - * ([[Long]] vs. [[Int]]). - */ -class HistShowTest extends Suite { - - Random.setSeed(123L) - - def check[V: Integral : Show](input: Seq[(Int, V)], - lines: String*): Unit = - fromHist(input).show should be(lines.mkString("\n")) - - def check[V: Integral : Show](input: Seq[(Int, V)], - numToSample: Int, - lines: String*): Unit = - fromHist( - input, - numToSample - ) - .show should be( - lines.mkString("\n") - ) - - def check[V: Integral : Show](input: Seq[(Int, V)], - numToSample: Int, - onlySampleSorted: Boolean, - lines: String*): Unit = - fromHist( - input, - numToSample, - onlySampleSorted - ) - .show should be( - lines.mkString("\n") - ) - - test("empty") { - check( - List[(Int, Int)](), - "(empty)" - ) - } - - test("single") { - check( - List(0 → 1), - "num: 1, mean: 0, stddev: 0, mad: 0", - "elems: 0", - "50: 0" - ) - } - - test("double") { - check( - List(0 → 2), - "num: 2, mean: 0, stddev: 0, mad: 0", - "elems: 0×2", - "50: 0" - ) - } - - test("two singles") { - check( - List(0 → 1, 1 → 1), - "num: 2, mean: 0.5, stddev: 0.5, mad: 0.5", - "elems: 0, 1", - "50: 0.5" - ) - } - - test("three singles") { - check( - List(0 → 1, 5 → 1, 1 → 1), - "num: 3, mean: 2, stddev: 2.2, mad: 1", - "elems: 0, 5, 1", - "sorted: 0, 1, 5", - "25: 0", - "50: 1", - "75: 5" - ) - } - - test("single double") { - check( - List(0 → 1, 1 → 2), - "num: 3, mean: 0.7, stddev: 0.5, mad: 0", - "elems: 0, 1×2", - "25: 0", - "50: 1", - "75: 1" - ) - } - - test("1×5 2×4") { - check( - List(1 → 5, 2 → 4), - "num: 9, mean: 1.4, stddev: 0.5, mad: 0", - "elems: 1×5, 2×4", - "10: 1", - "25: 1", - "50: 1", - "75: 2", - "90: 2" - ) - } - - test("0×5 1×5") { - check( - List(0 → 5, 1 → 5), - "num: 10, mean: 0.5, stddev: 0.5, mad: 0.5", - "elems: 0×5, 1×5", - "10: 0", - "25: 0", - "50: 0.5", - "75: 1", - "90: 1" - ) - } - - test("0×4 1×6") { - check( - List(0 → 4, 1 → 6), - "num: 10, mean: 0.6, stddev: 0.5, mad: 0", - "elems: 0×4, 1×6", - "10: 0", - "25: 0", - "50: 1", - "75: 1", - "90: 1" - ) - } - - test("x(x) 1 to 10") { - check( - (1 to 10).map(i ⇒ i → i), - "num: 55, mean: 7, stddev: 2.4, mad: 2", - "elems: 1, 2×2, 3×3, 4×4, 5×5, 6×6, 7×7, 8×8, 9×9, 10×10", - "5: 2", - "10: 3", - "25: 5", - "50: 7", - "75: 9", - "90: 10", - "95: 10" - ) - } - - test("singletons") { - check( - (0 to 10).map(i ⇒ i → 1), - "num: 11, mean: 5, stddev: 3.2, mad: 3", - "elems: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10", - "10: 0.2", - "25: 2", - "50: 5", - "75: 8", - "90: 9.8" - ) - } - - test("re-encode") { - check( - List(0 → 1, 0 → 1, 10 → 3, 10 → 4, 3 → 5, 0 → 2, 3 → 1), - "num: 17, mean: 5.2, stddev: 4.2, mad: 3", - "elems: 0×2, 10×7, 3×5, 0×2, 3", - "sorted: 0×4, 3×6, 10×7", - "10: 0", - "25: 1.5", - "50: 3", - "75: 10", - "90: 10" - ) - } - - test("large hist") { - check( - List[(Int, Long)]( - 1 → 10000000000L, - 2 → 1000000000, - 1 → 100, - 2 → 1000000000 - ), - "num: 12000000100, mean: 1.2, stddev: 0.4, mad: 0", - "elems: 1×10000000000, 2×1000000000, 1×100, 2×1000000000", - "sorted: 1×10000000100, 2×2000000000", - "1.0E-8: 1", - "1.0E-7: 1", - "1.0E-6: 1", - "1.0E-5: 1", - "1.0E-4: 1", - "0.001: 1", - "0.01: 1", - "0.1: 1", - "1: 1", - "5: 1", - "10: 1", - "25: 1", - "50: 1", - "75: 1", - "90: 2", - "95: 2", - "99: 2", - "99.9: 2", - "99.99: 2", - "99.999: 2", - "99.9999: 2", - "99.99999: 2", - "99.999999: 2", - "99.9999999: 2", - "99.99999999: 2" - ) - } -} diff --git a/src/test/scala/org/hammerlab/stats/ShowTest.scala b/src/test/scala/org/hammerlab/stats/ShowTest.scala deleted file mode 100644 index d853cba..0000000 --- a/src/test/scala/org/hammerlab/stats/ShowTest.scala +++ /dev/null @@ -1,398 +0,0 @@ -package org.hammerlab.stats - -import cats.Show -import cats.instances.all.{ catsStdShowForInt, catsStdShowForLong } -import cats.syntax.all._ -import org.hammerlab.test.Suite -import spire.math.Numeric - -import scala.util.Random.{ nextInt, setSeed, shuffle } - -/** - * Test the default [[Show.show]] method of [[Stats]] instances. - */ -class ShowTest extends Suite { - - setSeed(123L) - - def check[K : Numeric : Ordering : Show](input: Seq[K], lines: String*): Unit = - Stats(input).show should be( - lines.mkString("\n") - ) - - def check[K : Numeric : Ordering : Show](input: Seq[K], numToSample: Int, lines: String*): Unit = - Stats( - input, - numToSample - ) - .show should be( - lines.mkString("\n") - ) - - def check[K : Numeric : Ordering : Show](input: Seq[K], - numToSample: Int, - onlySampleSorted: Boolean, - lines: String*): Unit = - Stats( - input, - numToSample, - onlySampleSorted - ) - .show should be( - lines.mkString("\n") - ) - - test("empty") { - check[Int]( - Nil, - "(empty)" - ) - } - - test("0 to 0") { - check( - 0 to 0, - "num: 1, mean: 0, stddev: 0, mad: 0", - "elems: 0", - "50: 0" - ) - } - - test("0 to 1") { - check( - 0 to 1, - "num: 2, mean: 0.5, stddev: 0.5, mad: 0.5", - "elems: 0, 1", - "50: 0.5" - ) - } - - test("1 to 0") { - check( - 1 to 0 by -1, - "num: 2, mean: 0.5, stddev: 0.5, mad: 0.5", - "elems: 1, 0", - "sorted: 0, 1", - "50: 0.5" - ) - } - - test("0 to 2") { - check( - 0 to 2, - "num: 3, mean: 1, stddev: 0.8, mad: 1", - "elems: 0, 1, 2", - "25: 0", - "50: 1", - "75: 2" - ) - } - - test("2 to 0") { - check( - 2 to 0 by -1, - "num: 3, mean: 1, stddev: 0.8, mad: 1", - "elems: 2, 1, 0", - "sorted: 0, 1, 2", - "25: 0", - "50: 1", - "75: 2" - ) - } - - test("0 to 3") { - check( - 0 to 3, - "num: 4, mean: 1.5, stddev: 1.1, mad: 1", - "elems: 0, 1, 2, 3", - "25: 0.3", - "50: 1.5", - "75: 2.8" - ) - } - - test("3 to 0") { - check( - 3 to 0 by -1, - "num: 4, mean: 1.5, stddev: 1.1, mad: 1", - "elems: 3, 2, 1, 0", - "sorted: 0, 1, 2, 3", - "25: 0.3", - "50: 1.5", - "75: 2.8" - ) - } - - test("1 to 9") { - check( - 1 to 9, - "num: 9, mean: 5, stddev: 2.6, mad: 2", - "elems: 1, 2, 3, 4, 5, 6, 7, 8, 9", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - test("9 to 1") { - check( - 9 to 1 by -1, - "num: 9, mean: 5, stddev: 2.6, mad: 2", - "elems: 9, 8, 7, 6, 5, 4, 3, 2, 1", - "sorted: 1, 2, 3, 4, 5, 6, 7, 8, 9", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - val shuffled1to9 = shuffle(1 to 9).toArray - - test("1 to 9 sample 5") { - check( - shuffled1to9, - numToSample = 5, - "num: 9, mean: 5, stddev: 2.6, mad: 2", - "elems: 8, 4, 5, 3, 1, 6, 7, 2, 9", - "sorted: 1, 2, 3, 4, 5, 6, 7, 8, 9", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - test("1 to 9 sample 4") { - check( - shuffled1to9, - numToSample = 4, - "num: 9, mean: 5, stddev: 2.6, mad: 2", - "elems: 8, 4, 5, 3, …, 6, 7, 2, 9", - "sorted: 1, 2, 3, 4, …, 6, 7, 8, 9", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - test("1 to 9 sample 3") { - check( - shuffled1to9, - numToSample = 3, - "num: 9, mean: 5, stddev: 2.6, mad: 2", - "elems: 8, 4, 5, …, 7, 2, 9", - "sorted: 1, 2, 3, …, 7, 8, 9", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - test("1 to 9 sample 2") { - check( - shuffled1to9, - numToSample = 2, - "num: 9, mean: 5, stddev: 2.6, mad: 2", - "elems: 8, 4, …, 2, 9", - "sorted: 1, 2, …, 8, 9", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - test("1 to 9 sample 1") { - check( - shuffled1to9, - numToSample = 1, - "num: 9, mean: 5, stddev: 2.6, mad: 2", - "elems: 8, …, 9", - "sorted: 1, …, 9", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - test("1 to 9 sample 0") { - check( - shuffled1to9, - numToSample = 0, - "num: 9, mean: 5, stddev: 2.6, mad: 2", - "10: 1", - "25: 2.5", - "50: 5", - "75: 7.5", - "90: 9" - ) - } - - test("1 to 99") { - check( - 1 to 99, - "num: 99, mean: 50, stddev: 28.6, mad: 25", - "elems: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, …, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99", - "1: 1", - "5: 5", - "10: 10", - "25: 25", - "50: 50", - "75: 75", - "90: 90", - "95: 95", - "99: 99" - ) - } - - test("99 to 1") { - check( - 99 to 1 by -1, - "num: 99, mean: 50, stddev: 28.6, mad: 25", - "elems: 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, …, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1", - "sorted: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, …, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99", - "1: 1", - "5: 5", - "10: 10", - "25: 25", - "50: 50", - "75: 75", - "90: 90", - "95: 95", - "99: 99" - ) - } - - val shuffledDigits = (0 until 100).map(_ ⇒ nextInt(10)) - - test("100 digits") { - check( - shuffledDigits, - "num: 100, mean: 4.3, stddev: 2.9, mad: 2.5", - "elems: 5, 3, 9, 6, 2, 5, 7, 9, 0, 5, …, 7, 9, 1, 9, 0×2, 8, 0, 7×2, 0, 6", - "sorted: 0×15, 1×7, 2×8, 3×11, 4×9, 5×12, 6×11, 7×9, 8×9, 9×9", - "1: 0", - "5: 0", - "10: 0", - "25: 2", - "50: 4.5", - "75: 7", - "90: 8", - "95: 9", - "99: 9" - ) - } - - test("100 digits sample 4") { - check( - shuffledDigits, - numToSample = 4, - "num: 100, mean: 4.3, stddev: 2.9, mad: 2.5", - "elems: 5, 3, 9, 6, …, 0, 7×2, 0, 6", - "sorted: 0×15, 1×7, 2×8, 3×11, …, 6×11, 7×9, 8×9, 9×9", - "1: 0", - "5: 0", - "10: 0", - "25: 2", - "50: 4.5", - "75: 7", - "90: 8", - "95: 9", - "99: 9" - ) - } - - test("100 digits sample 4 only sample sorted") { - check( - shuffledDigits, - numToSample = 4, - onlySampleSorted = true, - "num: 100, mean: 4.3, stddev: 2.9, mad: 2.5", - "sorted: 0×15, 1×7, 2×8, 3×11, …, 6×11, 7×9, 8×9, 9×9", - "1: 0", - "5: 0", - "10: 0", - "25: 2", - "50: 4.5", - "75: 7", - "90: 8", - "95: 9", - "99: 9" - ) - } - - val sortedShuffledDigits = shuffledDigits.sorted - - test("100 sorted digits") { - check( - sortedShuffledDigits, - "num: 100, mean: 4.3, stddev: 2.9, mad: 2.5", - "elems: 0×15, 1×7, 2×8, 3×11, 4×9, 5×12, 6×11, 7×9, 8×9, 9×9", - "1: 0", - "5: 0", - "10: 0", - "25: 2", - "50: 4.5", - "75: 7", - "90: 8", - "95: 9", - "99: 9" - ) - } - - test("100 sorted digits only sample sorted overridden") { - check( - sortedShuffledDigits, - numToSample = 4, - onlySampleSorted = true, - "num: 100, mean: 4.3, stddev: 2.9, mad: 2.5", - "elems: 0×15, 1×7, 2×8, 3×11, …, 6×11, 7×9, 8×9, 9×9", - "1: 0", - "5: 0", - "10: 0", - "25: 2", - "50: 4.5", - "75: 7", - "90: 8", - "95: 9", - "99: 9" - ) - } - - test("values over Int.MAX_VALUE") { - check( - Seq( - 10000000000L, - 100000000000L, - 100000000000L, - 1000000000000L, - 1000000000000L, - 10000000000L, - 1000000000000L, - 100000000000L, - 10000000000L, - 10000000000L - ), - "num: 10, mean: 334000000000, stddev: 437588848121.2, mad: 90000000000", - "elems: 10000000000, 100000000000×2, 1000000000000×2, 10000000000, 1000000000000, 100000000000, 10000000000×2", - "sorted: 10000000000×4, 100000000000×3, 1000000000000×3", - "10: 10000000000", - "25: 10000000000", - "50: 100000000000", - "75: 1000000000000", - "90: 1000000000000" - ) - } -} diff --git a/src/test/scala/org/hammerlab/stats/StatsTest.scala b/src/test/scala/org/hammerlab/stats/StatsTest.scala deleted file mode 100644 index 31e7b42..0000000 --- a/src/test/scala/org/hammerlab/stats/StatsTest.scala +++ /dev/null @@ -1,149 +0,0 @@ -package org.hammerlab.stats - -import org.hammerlab.test.Suite -import org.scalactic.Equality -import spire.implicits._ -import spire.math.Numeric - -import scala.math.sqrt - -class StatsTest - extends Suite { - - def check[K : Numeric : Ordering](input: Seq[K], - expected: Stats[K, Int]): Unit = - Stats(input) should be(expected) - - def check[K : Numeric : Ordering](input: Seq[K], - numToSample: Int, - expected: Stats[K, Int]): Unit = - Stats( - input, - numToSample - ) should be( - expected - ) - - def check[K : Numeric : Ordering](input: Seq[K], - numToSample: Int, - onlySampleSorted: Boolean, - expected: Stats[K, Int]): Unit = - Stats( - input, - numToSample, - onlySampleSorted - ) should be( - expected - ) - - test("empty") { - check[Int]( - Nil, - Empty[Int, Int]() - ) - } - - test("0 to 0") { - check( - 0 to 0, - NonEmpty( - n = 1, - sum = 0, - mean = 0, - stddev = 0, - median = 0, - mad = 0, - samplesOpt = - Some( - Samples( - 1, - Runs(Seq(0 → 1)), - Runs(Seq(0 → 1)) - ) - ), - sortedSamplesOpt = None, - percentiles = Vector(r"50" → 0.0) - ) - ) - } - - test("0 to 1") { - check( - 0 to 1, - NonEmpty( - n = 2, - sum = 1, - mean = .5, - stddev = .5, - median = .5, - mad = .5, - samplesOpt = - Some( - Samples( - 2, - Runs(Seq(0 → 1, 1 → 1)), - Runs(Seq(0 → 1, 1 → 1)) - ) - ), - sortedSamplesOpt = None, - percentiles = Vector(r"50" → .5) - ) - ) - } - - test("1 to 0") { - check( - 1 to 0 by -1, - NonEmpty( - n = 2, - sum = 1, - mean = .5, - stddev = .5, - median = .5, - mad = .5, - samplesOpt = - Some( - Samples( - 2, - Runs(Seq(1 → 1, 0 → 1)), - Runs(Seq(1 → 1, 0 → 1)) - ) - ), - sortedSamplesOpt = - Some( - Samples( - 2, - Runs(Seq(0 → 1, 1 → 1)), - Runs(Seq(0 → 1, 1 → 1)) - ) - ), - percentiles = Vector(r"50" → .5) - ) - ) - } - - test("0 to 2") { - check( - 0 to 2, - NonEmpty( - n = 3, - sum = 3, - mean = 1, - stddev = sqrt(2 / 3.0), - median = 1, - mad = 1, - samplesOpt = - Some( - Samples( - 3, - Runs(Seq(0 → 1, 1 → 1, 2 → 1)), - Runs(Seq(0 → 1, 1 → 1, 2 → 1)) - ) - ), - sortedSamplesOpt = None, - percentiles = Vector(r"25" → 0, r"50" → 1, r"75" → 2) - ) - ) - } - -} From 76d42ac9a5b376ffa1ed66278f458ebfba46bbbd Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Sat, 5 Aug 2017 16:06:06 +0000 Subject: [PATCH 20/20] 1.3.0 --- build.sbt | 2 +- project/plugins.sbt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 7ba76b5..d6eb9bd 100644 --- a/build.sbt +++ b/build.sbt @@ -1,6 +1,6 @@ name := "iterator" -version := "1.3.0-SNAPSHOT" +version := "1.3.0" addScala212 diff --git a/project/plugins.sbt b/project/plugins.sbt index ba64db2..f53eab3 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1 +1 @@ -addSbtPlugin("org.hammerlab" % "sbt-parent" % "3.0.0-SNAPSHOT") +addSbtPlugin("org.hammerlab" % "sbt-parent" % "3.0.0")