In [1]:
%AddJar file:/app/scala_lib/build/libs/robustInfer-scala-0.1.0.jar

Starting download from file:/app/scala_lib/build/libs/robustInfer-scala-0.1.0.jar
Finished download of robustInfer-scala-0.1.0.jar
Using cached version of robustInfer-scala-0.1.0.jar


In [2]:
import robustinfer._

In [3]:
import org.apache.commons.math3.distribution.CauchyDistribution
import scala.util.Random
// Fixed-seed generator (seed 234).
// NOTE(review): none of the visible cells passes `rng` to the samplers, so this seed does
// not by itself make the simulations below reproducible — confirm whether it was meant to
// be wired into the distributions.
val rng = new Random(234)

rng = scala.util.Random@1bef0b2f


scala.util.Random@1bef0b2f

In [9]:
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext

/**
 * Draws `n` values from `dist`, snaps each one to the nearest multiple of 5.0 and clamps
 * negative results to 0.0.
 *
 * The rounding puts every value on a coarse grid, which produces ties; the clamp makes the
 * sample non-negative and piles all non-positive draws onto exactly 0.0.
 *
 * @param dist Cauchy distribution to sample from
 * @param n    number of values to draw
 * @return the `n` rounded, non-negative draws
 */
def samplePositiveCauchyWithTie(dist: CauchyDistribution, n: Int): Seq[Double] = {
  val gridStep = 5.0
  Seq.fill(n) {
    // math.round yields a Long, so the product is an exact multiple of the grid step
    val snapped = math.round(dist.sample() / gridStep) * gridStep
    math.max(0.0, snapped)
  }
}

/**
 * Generates one pair of simulated samples from two Cauchy distributions that differ only in
 * location.
 *
 * The first sample comes from a Cauchy with median 0.0 and scale 1.0, the second from a
 * Cauchy with median `diff` and scale 1.0. Both are post-processed by
 * `samplePositiveCauchyWithTie` and then distributed through the notebook's `sc`.
 *
 * @param n    number of observations per sample
 * @param diff location shift of the second distribution relative to the first
 * @return the two samples as `(first, second)` RDDs
 */
def simulationCauchy(n: Int, diff: Double): (RDD[Double], RDD[Double]) = {
  val unitScale = 1.0
  val (baseline, shifted) =
    (new CauchyDistribution(0.0, unitScale), new CauchyDistribution(diff, unitScale))

  // Draw both samples on the driver before handing them to Spark
  val baselineSample = samplePositiveCauchyWithTie(baseline, n)
  val shiftedSample = samplePositiveCauchyWithTie(shifted, n)

  (sc.parallelize(baselineSample), sc.parallelize(shiftedSample))
}

samplePositiveCauchyWithTie: (dist: org.apache.commons.math3.distribution.CauchyDistribution, n: Int)Seq[Double]
simulationCauchy: (n: Int, diff: Double)(org.apache.spark.rdd.RDD[Double], org.apache.spark.rdd.RDD[Double])


In [10]:
import scala.collection.mutable.{ArrayBuffer, Map}

/**
 * Monte-Carlo comparison of the two-sample tests exposed by `TwoSample`.
 *
 * For each of `maxIter` replicates a fresh pair of samples is obtained from
 * `dataGenerator(n, diff)` and three tests are run on it: `TwoSample.tTest`,
 * `TwoSample.zeroTrimmedU` with `tieCorrection = true`, and `TwoSample.zeroTrimmedU` with
 * its default arguments. The `_2` component of each result is used as the p-value.
 * After all replicates, the fraction of p-values below `alpha` is printed for each test,
 * always in the order tTest, zTUTie, zTU.
 *
 * Progress is printed roughly ten times per run.
 *
 * @param n             first argument forwarded to `dataGenerator`
 * @param diff          second argument forwarded to `dataGenerator`
 * @param dataGenerator produces one pair of samples per call
 * @param maxIter       number of replicates; must be positive (default 200)
 * @param alpha         rejection threshold applied to the p-values (default 0.05)
 * @throws IllegalArgumentException if `maxIter` is not positive
 */
def runSimulation(
    n: Int,
    diff: Double,
    dataGenerator: (Int, Double) => (RDD[Double], RDD[Double]),
    maxIter: Int = 200,
    alpha: Double = 0.05
): Unit = {
  require(maxIter > 0, s"maxIter must be positive, got $maxIter")

  // Fixed reporting order; must match the order of the p-values collected per replicate
  val testNames = Seq("tTest", "zTUTie", "zTU")

  // Clamp to 1 so that runs with fewer than 10 replicates do not divide by zero
  val progressStep = math.max(1, maxIter / 10)

  // One row of p-values (in `testNames` order) per replicate
  val pValuesByIteration: Seq[Seq[Double]] = (1 to maxIter).map { iter =>
    val (x, y) = dataGenerator(n, diff)

    val tTestP: Double = TwoSample.tTest(x, y)._2
    // Mann-Whitney U is currently disabled:
    // val mwUP: Double = TwoSample.mwU(x, y)._2
    val zTUTieP: Double = TwoSample.zeroTrimmedU(x, y, tieCorrection = true)._2
    val zTUP: Double = TwoSample.zeroTrimmedU(x, y)._2

    if (iter % progressStep == 0) {
      println(s"Iteration $iter/$maxIter completed.")
    }

    Seq(tTestP, zTUTieP, zTUP)
  }

  // Transpose to one column of p-values per test, then report each rejection rate
  testNames.zip(pValuesByIteration.transpose).foreach { case (testName, values) =>
    val proportion = values.count(_ < alpha).toDouble / maxIter
    println(s"$testName: Proportion of p-values < $alpha = $proportion")
  }
}

runSimulation: (n: Int, diff: Double, dataGenerator: (Int, Double) => (org.apache.spark.rdd.RDD[Double], org.apache.spark.rdd.RDD[Double]))Unit


In [11]:
// Shifted scenario: 100 observations per sample, second Cauchy median moved by 1.0.
// The generator is passed as a function value, so runSimulation itself does not depend on
// which distribution is being simulated.
runSimulation(100, 1.0, simulationCauchy)

Iteration 20/200 completed.
Iteration 40/200 completed.
Iteration 60/200 completed.
Iteration 80/200 completed.
Iteration 100/200 completed.
Iteration 120/200 completed.
Iteration 140/200 completed.
Iteration 160/200 completed.
Iteration 180/200 completed.
Iteration 200/200 completed.
zTU: Proportion of p-values < 0.05 = 0.195
zTUTie: Proportion of p-values < 0.05 = 0.205
tTest: Proportion of p-values < 0.05 = 0.065


In [12]:
// Null scenario: diff = 0.0 gives both samples the same Cauchy distribution, so the printed
// proportions are empirical false-positive rates.
runSimulation(100, 0.0, simulationCauchy)

Iteration 20/200 completed.
Iteration 40/200 completed.
Iteration 60/200 completed.
Iteration 80/200 completed.
Iteration 100/200 completed.
Iteration 120/200 completed.
Iteration 140/200 completed.
Iteration 160/200 completed.
Iteration 180/200 completed.
Iteration 200/200 completed.
zTU: Proportion of p-values < 0.05 = 0.045
zTUTie: Proportion of p-values < 0.05 = 0.05
tTest: Proportion of p-values < 0.05 = 0.02


In [13]:
// Poisson Distribution Generator
import org.apache.commons.math3.distribution.PoissonDistribution

/**
 * Generates one pair of simulated count samples from two Poisson distributions.
 *
 * The first sample is drawn from a Poisson with mean 0.5, the second from a Poisson with
 * mean `0.5 + diff`. Counts are converted to `Double` and distributed through the
 * notebook's `sc`. Because the values are small integers, the samples contain many ties.
 *
 * @param n    number of observations per sample
 * @param diff amount added to the mean of the second distribution
 * @return the two samples as `(first, second)` RDDs
 */
def simulationPoisson(n: Int, diff: Double): (RDD[Double], RDD[Double]) = {
  val baseMean = 0.5
  val (baseline, shifted) =
    (new PoissonDistribution(baseMean), new PoissonDistribution(baseMean + diff))

  // Draws n integer counts from the given distribution, widened to Double
  def draw(dist: PoissonDistribution): Seq[Double] = Seq.fill(n)(dist.sample().toDouble)

  val baselineCounts = draw(baseline)
  val shiftedCounts = draw(shifted)

  (sc.parallelize(baselineCounts), sc.parallelize(shiftedCounts))
}

simulationPoisson: (n: Int, diff: Double)(org.apache.spark.rdd.RDD[Double], org.apache.spark.rdd.RDD[Double])


In [16]:
// Shifted scenario: 100 observations per sample, Poisson means 0.5 vs. 0.5 + 0.1.
runSimulation(100, 0.1, simulationPoisson)


=== Poisson Distribution (lots of ties) ===
Iteration 20/200 completed.
Iteration 40/200 completed.
Iteration 60/200 completed.
Iteration 80/200 completed.
Iteration 100/200 completed.
Iteration 120/200 completed.
Iteration 140/200 completed.
Iteration 160/200 completed.
Iteration 180/200 completed.
Iteration 200/200 completed.
zTU: Proportion of p-values < 0.05 = 0.12
zTUTie: Proportion of p-values < 0.05 = 0.135
tTest: Proportion of p-values < 0.05 = 0.135


In [18]:
// Null scenario: diff = 0.0 gives both samples the same Poisson(0.5) distribution, so the
// printed proportions are empirical false-positive rates.
runSimulation(100, 0.0, simulationPoisson)

Iteration 20/200 completed.
Iteration 40/200 completed.
Iteration 60/200 completed.
Iteration 80/200 completed.
Iteration 100/200 completed.
Iteration 120/200 completed.
Iteration 140/200 completed.
Iteration 160/200 completed.
Iteration 180/200 completed.
Iteration 200/200 completed.
zTU: Proportion of p-values < 0.05 = 0.03
zTUTie: Proportion of p-values < 0.05 = 0.055
tTest: Proportion of p-values < 0.05 = 0.065


lastException = null


null