## load jar
Since the repo is mounted into the container under /app, the absolute path of the jar is: ```/app/scala_lib/build/libs/robustInfer-scala-0.1.0.jar```

In [1]:
%AddJar file:/app/scala_lib/build/libs/robustInfer-scala-0.1.0.jar

Starting download from file:/app/scala_lib/build/libs/robustInfer-scala-0.1.0.jar
Finished download of robustInfer-scala-0.1.0.jar
Using cached version of robustInfer-scala-0.1.0.jar


In [2]:
import robustinfer._

## small test

In [3]:
// Toy clustered data: clusters "1" and "2" with two observations each.
// Every entry is (cluster id, covariate array, response).
val data = Seq(
  ("1", Array(0.0, 1.0), 1.0),
  ("1", Array(1.0, -1.0), 0.0),
  ("2", Array(0.5, 0.5), 1.0),
  ("2", Array(1.0, 0.2), 0.0)
).map { case (cluster, covariates, response) => Obs(cluster, covariates, response) }

// Hand the observations to Spark as a Dataset
val df = spark.createDataset(data)

// Fit a GEE with an exchangeable working correlation and display its summary
val gee = new GEE(corStruct = Exchangeable)
gee.fit(df)
gee.summary()

Updating R at warm-up iteration 0
Updating R at warm-up iteration 5
Warm-up iterations completed: 10, converged: false
Iteration: DenseVector(31.63248781163211, -42.916880238607014, 0.6267263792368888), ||delta|| = 4.607049104662032
Iteration: DenseVector(34.069822921562334, -46.22160766010823, 0.6971014269323877), ||delta|| = 4.106918360018712
Iteration: DenseVector(36.01035459506486, -48.903660252126606, 0.824488517978528), ||delta|| = 3.3128985425430977
Iteration: DenseVector(37.42469655493082, -50.90668313410217, 0.9960910521533917), ||delta|| = 2.4580299580945746
Iteration: DenseVector(38.69044404948559, -52.57841155139295, 0.9091657405574657), ||delta|| = 2.0986540045813373
Iteration: DenseVector(39.681422130298905, -53.87345119464639, 0.8281467676034963), ||delta|| = 1.632706130394858
Iteration: DenseVector(40.42548123186567, -54.82398666975567, 0.8018376396034246), ||delta|| = 1.2074078872865144
Iteration: DenseVector(41.024309566724746, -55.585927402148684, 0.7910792495666288)

data = List(Obs(1,[D@6c87a461,1.0,None,None), Obs(1,[D@10705c32,0.0,None,None), Obs(2,[D@63df7975,1.0,None,None), Obs(2,[D@44838f5,0.0,None,None))
df = [i: string, x: array<double> ... 3 more fields]
gee = robustinfer.GEE@1d60b598


EESummary(DenseVector(42.3430901452824, -57.28375103748985, 0.7671272123191027),0.2859845644832337    -0.3294780144074219  -0.09787819849504109
-0.32947801440742197  0.3817841516317764   0.10814057735932561
-0.0978781984950411   0.10814057735932567  0.043223025921816004  )


In [4]:
// Print the fitted coefficients as a table (names, coef, se, z, p-value)
gee.dfSummary().show()

+---------+------------------+-------------------+------------------+--------------------+
|    names|              coef|                 se|                 z|             p-value|
+---------+------------------+-------------------+------------------+--------------------+
|intercept|  42.3430901452824| 0.5347752467001756| 79.17922605161691|                 0.0|
|    beta1|-57.28375103748985| 0.6178868437115136|-92.70912889712727|                 0.0|
|    beta2|0.7671272123191027|0.20790148128817168|3.6898592908810968|2.243781247428522...|
+---------+------------------+-------------------+------------------+--------------------+



## Simulation (MWU)

In [5]:
import org.apache.commons.math3.distribution.CauchyDistribution
import scala.util.Random
// Seeded generator (seed 234).
// NOTE(review): none of the cells visible here reads `rng`; the Cauchy draws call
// dist.sample() on distributions that are built without it — confirm whether it was
// meant to make the simulation reproducible.
val rng = new Random(234)

rng = scala.util.Random@6db74262


scala.util.Random@6db74262

In [6]:
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext

// Draws `n` values from `dist` and replaces every negative draw by 0.0,
// so the returned sample is non-negative.
def samplePositiveCauchy(dist: CauchyDistribution, n: Int): Seq[Double] = {
  val rawDraws = Seq.fill(n)(dist.sample())
  rawDraws.map(draw => math.max(0.0, draw))
}

// Simulates two samples of size `n` and returns each one as an RDD.
// The first sample is drawn from a Cauchy distribution with median 0.0 and scale 1.0,
// the second from one with median `diff` and scale 1.0; both are passed through
// samplePositiveCauchy, so negative draws become 0.0.
def simulationCauchy(n: Int, diff: Double): (RDD[Double], RDD[Double]) = {
  val baseline = new CauchyDistribution(0.0, 1.0)
  val shifted = new CauchyDistribution(diff, 1.0)

  val baselineDraws = samplePositiveCauchy(baseline, n)
  val shiftedDraws = samplePositiveCauchy(shifted, n)

  (sc.parallelize(baselineDraws), sc.parallelize(shiftedDraws))
}

samplePositiveCauchy: (dist: org.apache.commons.math3.distribution.CauchyDistribution, n: Int)Seq[Double]
simulationCauchy: (n: Int, diff: Double)(org.apache.spark.rdd.RDD[Double], org.apache.spark.rdd.RDD[Double])


In [7]:
import scala.collection.mutable.{ArrayBuffer, Map}

// Runs 200 replications of the two-sample simulation and prints, for each test,
// the share of replications whose recorded p-value is below 0.05.
//   n    - size of each of the two samples handed to simulationCauchy
//   diff - median of the second Cauchy sample (the first one has median 0.0)
// Nothing is returned; progress and results are written with println.
def runSimulation(n: Int, diff: Double): Unit = {
  val maxIter = 200
  val progressEvery = maxIter / 10
  val pValues = Map(
    "tTest" -> ArrayBuffer[Double](),
    "mwU" -> ArrayBuffer[Double](),
    "zTU" -> ArrayBuffer[Double]()
  )

  for (iter <- 1 to maxIter) {
    val (x, y) = simulationCauchy(n, diff)

    // Element _2 of every test result is what gets recorded as its p-value
    val tTestP = TwoSample.tTest(x, y)._2
    val mwUP = TwoSample.mwU(x, y)._2
    val zTUP = TwoSample.zeroTrimmedU(x, y)._2

    pValues("tTest") += tTestP
    pValues("mwU") += mwUP
    pValues("zTU") += zTUP

    // Report progress ten times over the whole run
    if (iter % progressEvery == 0) {
      println(s"Iteration $iter/$maxIter completed.")
    }
  }

  // Rejection rate at the 0.05 level, per test
  val proportions = pValues.map { case (testName, recorded) =>
    val rate = recorded.count(p => p < 0.05).toDouble / maxIter
    testName -> rate
  }

  proportions.foreach { case (testName, rate) =>
    println(s"$testName: Proportion of p-values < 0.05 = $rate")
  }
}

runSimulation: (n: Int, diff: Double)Unit


In [8]:
// n = 100 draws per sample, with the second sample's Cauchy median set to diff = 1.0:
// the printed proportions are rejection rates at the 0.05 level under a real shift
runSimulation(100, 1.0)

Iteration 20/200 completed.
Iteration 40/200 completed.
Iteration 60/200 completed.
Iteration 80/200 completed.
Iteration 100/200 completed.
Iteration 120/200 completed.
Iteration 140/200 completed.
Iteration 160/200 completed.
Iteration 180/200 completed.
Iteration 200/200 completed.
zTU: Proportion of p-values < 0.05 = 0.965
tTest: Proportion of p-values < 0.05 = 0.125
mwU: Proportion of p-values < 0.05 = 0.975


In [9]:
// Same sample size with diff = 0.0, so both samples use the same Cauchy parameters:
// the printed proportions are rejection rates at the 0.05 level when there is no shift
runSimulation(100, 0.0)

Iteration 20/200 completed.
Iteration 40/200 completed.
Iteration 60/200 completed.
Iteration 80/200 completed.
Iteration 100/200 completed.
Iteration 120/200 completed.
Iteration 140/200 completed.
Iteration 160/200 completed.
Iteration 180/200 completed.
Iteration 200/200 completed.
zTU: Proportion of p-values < 0.05 = 0.05
tTest: Proportion of p-values < 0.05 = 0.01
mwU: Proportion of p-values < 0.05 = 0.03


In [13]:
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
import scala.collection.mutable.ArrayBuffer

// Runs 200 replications of the two-sample simulation and returns every test result
// as a DataFrame with columns TestName, Z, PValue and U.
//   n    - size of each of the two samples handed to simulationCauchy
//   diff - median of the second Cauchy sample (the first one has median 0.0)
// Each replication contributes three rows, in the order tTest, mwU, zTU.
def runSimulationDf(n: Int, diff: Double): DataFrame = {
  // getOrCreate() hands back the already running session when there is one
  val spark = SparkSession.builder()
    .appName("Simulation")
    .master("local[*]") // Use local mode for simplicity
    .getOrCreate()

  val maxIter = 200
  val progressEvery = maxIter / 10
  // One entry per test and replication: (test name, element _1, element _2, element _3)
  val results = ArrayBuffer[(String, Double, Double, Double)]()

  for (iter <- 1 to maxIter) {
    val (x, y) = simulationCauchy(n, diff)

    val tTestResults = TwoSample.tTest(x, y)
    val mwUResults = TwoSample.mwU(x, y)
    val zTUResults = TwoSample.zeroTrimmedU(x, y)

    results += (("tTest", tTestResults._1, tTestResults._2, tTestResults._3))
    results += (("mwU", mwUResults._1, mwUResults._2, mwUResults._3))
    results += (("zTU", zTUResults._1, zTUResults._2, zTUResults._3))

    // Report progress ten times over the whole run
    if (iter % progressEvery == 0) {
      println(s"Iteration $iter/$maxIter completed.")
    }
  }

  // A string column for the test name followed by three non-nullable double columns
  val schema = StructType(
    StructField("TestName", StringType, nullable = false) +:
      Seq("Z", "PValue", "U").map(StructField(_, DoubleType, nullable = false))
  )

  val rows = results.map { case (name, stat, p, third) => Row(name, stat, p, third) }

  spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)
}

runSimulationDf: (n: Int, diff: Double)org.apache.spark.sql.DataFrame


In [14]:
// Run the simulation with n = 100 per sample and diff = 1.0 (second sample's median shifted);
// the DataFrame holds three rows (tTest, mwU, zTU) per replication
val df = runSimulationDf(100, 1.0)

// Show the first rows of the DataFrame (show() prints 20 rows by default)
df.show()

Iteration 20/200 completed.
Iteration 40/200 completed.
Iteration 60/200 completed.
Iteration 80/200 completed.
Iteration 100/200 completed.
Iteration 120/200 completed.
Iteration 140/200 completed.
Iteration 160/200 completed.
Iteration 180/200 completed.
Iteration 200/200 completed.
+--------+-------------------+--------------------+-------------------+
|TestName|                  Z|              PValue|                  U|
+--------+-------------------+--------------------+-------------------+
|   tTest|-0.5323557414710041|  0.5944796324537083|  0.349196161828488|
|     mwU|  3.557574219655608|3.742953640339941E-4|             0.6456|
|     zTU| 3.5752116651613997|3.499446606261003E-4| 0.6941157133464826|
|   tTest| 0.5116182709048532|  0.6089181956959913|-0.6397527399722434|
|     mwU| 3.9729503304670457|7.098785602677182E-5|             0.6626|
|     zTU|  4.125561560099488|3.698312378008772E-5| 0.7328341855368883|
|   tTest|-0.9790661045119029|  0.3275473181444273|   3.7928722488

df = [TestName: string, Z: double ... 2 more fields]


[TestName: string, Z: double ... 2 more fields]

In [15]:
// Run the simulation with n = 100 per sample and diff = 0.0 (both samples share the same
// Cauchy parameters); the DataFrame holds three rows (tTest, mwU, zTU) per replication
val df = runSimulationDf(100, 0.0)

// Show the first rows of the DataFrame (show() prints 20 rows by default)
df.show()

Iteration 20/200 completed.
Iteration 40/200 completed.
Iteration 60/200 completed.
Iteration 80/200 completed.
Iteration 100/200 completed.
Iteration 120/200 completed.
Iteration 140/200 completed.
Iteration 160/200 completed.
Iteration 180/200 completed.
Iteration 200/200 completed.
+--------+--------------------+-------------------+------------------+
|TestName|                   Z|             PValue|                 U|
+--------+--------------------+-------------------+------------------+
|   tTest| -0.6435748352307389|  0.519851172867468|1.0639939810798402|
|     mwU| -1.0946382214324948| 0.2736751637374537|            0.4552|
|     zTU| -1.0166097148997475|0.30933913082668374|0.4173525377229081|
|   tTest| -0.8636190935886839| 0.3877971717704072|0.6880614199375452|
|     mwU| 0.31031038866501526| 0.7563249318138219|            0.5127|
|     zTU|  0.8003568004726366| 0.4235041025061954|0.5800619834710744|
|   tTest| -0.7543987350075011|0.45060983094618434|1.1766446220870044|
|   

df = [TestName: string, Z: double ... 2 more fields]


[TestName: string, Z: double ... 2 more fields]

## GEE (TOADD)

In [3]:
// SparkSession is imported ahead of its first use, so the cell does not depend on an
// import that happens to be in scope from an earlier cell
import org.apache.spark.sql.{DataFrame, SparkSession}

// Create (or reuse) the Spark session; it is bound to a val so that
// `import spark.implicits._` has a stable identifier to import from
val spark = SparkSession.builder()
  .appName("Binomial Simulation")
  .master("local[*]") // Use local mode for simplicity
  .getOrCreate()

import spark.implicits._
import breeze.linalg._
import breeze.numerics._
import org.apache.spark.sql.Dataset
import scala.util.Random
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
import scala.collection.mutable.ArrayBuffer

// Shared settings for the binomial GEE simulation
val rand = new Random(123) // fixed seed; simulateBinomialData draws from this generator
val trueBeta = DenseVector(0.0, 1.0, -1.0) // (intercept, coefficient 1, coefficient 2)
val nClusters = 1000
val obsPerCluster = 3 // observations generated per cluster by simulateBinomialData

spark = org.apache.spark.sql.SparkSession@6938671e
rand = scala.util.Random@681d89f6
trueBeta = DenseVector(0.0, 1.0, -1.0)
nClusters = 1000
obsPerCluster = 3


3

In [4]:
// Simulates clustered binary responses from a logistic model.
// For each of `nClusters` clusters, `obsPerCluster` observations are generated: two
// standard-normal covariates are drawn from the shared `rand`, the linear predictor is
// trueBeta(0) + trueBeta(1) * x1 + trueBeta(2) * x2, and the response is 1.0 with
// probability 1 / (1 + exp(-eta)), otherwise 0.0.
// Returns a Dataset of Obs keyed by the cluster index; the constant 1.0 used for the
// intercept is not stored in the covariate array.
def simulateBinomialData(nClusters: Int, trueBeta: DenseVector[Double]): Dataset[robustinfer.Obs] = {
  val observations = for {
    clusterId <- 0 until nClusters
    _ <- 0 until obsPerCluster
  } yield {
    // Leading 1.0 is the intercept term of the design row
    val designRow = Array(1.0, rand.nextGaussian(), rand.nextGaussian())
    val eta = designRow.indices.map(k => designRow(k) * trueBeta(k)).sum
    val prob = 1.0 / (1.0 + math.exp(-eta))
    val y = if (rand.nextDouble() < prob) 1.0 else 0.0
    Obs(clusterId.toString, designRow.tail, y)
  }

  // Create the Dataset from the simulated observations
  spark.createDataset(observations)
}

simulateBinomialData: (nClusters: Int, trueBeta: breeze.linalg.DenseVector[Double])org.apache.spark.sql.Dataset[robustinfer.Obs]


In [5]:
// Monte-Carlo check of the GEE z-test for the coefficient at index 1.
// Repeats 200 times: simulate a data set with simulateBinomialData, fit a default GEE,
// take summary.beta(1) with sqrt(summary.variance(1, 1)) as its standard error, and
// compute a two-sided standard-normal p-value for beta / se.
//   nClusters - number of clusters per simulated data set
//   trueBeta  - coefficients passed on to the data generator
// Prints progress every 20 iterations and the share of p-values below 0.05, then returns
// one row per replication with columns Beta, PValue and StandardError.
def runBinomialSimulation(nClusters: Int, trueBeta: DenseVector[Double]): DataFrame = {
  val maxIter = 200
  // The standard normal does not change between replications, so build it once
  val stdNormal = new org.apache.commons.math3.distribution.NormalDistribution()
  val results = ArrayBuffer[(Double, Double, Double)]() // Store beta, p-value, and standard error

  for (iter <- 1 to maxIter) {
    val df = simulateBinomialData(nClusters, trueBeta)
    val gee = new GEE()
    gee.fit(df, verbose = false)

    // Collect results
    val summary = gee.summary()
    val beta1 = summary.beta(1)
    val se = math.sqrt(summary.variance(1, 1))
    val z = beta1 / se
    // 2 * Phi(-|z|) is mathematically the same as 2 * (1 - Phi(|z|)), but it avoids
    // subtracting a value close to 1 from 1, which loses precision for large |z|
    val pValue = 2 * stdNormal.cumulativeProbability(-math.abs(z))

    results += ((beta1, pValue, se))

    // Print progress every maxIter / 10 steps
    if (iter % (maxIter / 10) == 0) {
      println(s"Iteration $iter/$maxIter completed.")
    }
  }

  // Calculate proportion of p-values < 0.05
  val proportion = results.count(_._2 < 0.05).toDouble / maxIter
  println(s"Proportion of p-values < 0.05: $proportion")

  // Create a DataFrame from the results
  val schema = StructType(Seq(
    StructField("Beta", DoubleType, nullable = false),
    StructField("PValue", DoubleType, nullable = false),
    StructField("StandardError", DoubleType, nullable = false)
  ))

  val rows = results.map { case (beta, pValue, se) =>
    Row(beta, pValue, se)
  }

  spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)
}

runBinomialSimulation: (nClusters: Int, trueBeta: breeze.linalg.DenseVector[Double])org.apache.spark.sql.DataFrame


In [7]:
// 200 clusters per data set; trueBeta(1) = 0.0 and index 1 is the coefficient the
// simulation tests, so the printed proportion is the rejection rate with no true effect
// NOTE(review): `redDf0` looks like a typo for `resDf0` (compare `resDf1`) — confirm before renaming
val redDf0 = runBinomialSimulation(200, DenseVector(0.0, 0.0, -1.0))

Iteration 20/200 completed.
Iteration 40/200 completed.
Iteration 60/200 completed.
Iteration 80/200 completed.
Iteration 100/200 completed.
Iteration 120/200 completed.
Iteration 140/200 completed.
Iteration 160/200 completed.
Iteration 180/200 completed.
Iteration 200/200 completed.
Proportion of p-values < 0.05: 0.055


redDf0 = [Beta: double, PValue: double ... 1 more field]


[Beta: double, PValue: double ... 1 more field]

In [8]:
// 200 clusters per data set with trueBeta(1) = 1.0: the printed proportion is the
// rejection rate of the index-1 coefficient test under a true effect
val resDf1 = runBinomialSimulation(200, DenseVector(0.0, 1.0, -1.0))

Iteration 20/200 completed.
Iteration 40/200 completed.
Iteration 60/200 completed.
Iteration 80/200 completed.
Iteration 100/200 completed.
Iteration 120/200 completed.
Iteration 140/200 completed.
Iteration 160/200 completed.
Iteration 180/200 completed.


resDf1 = [Beta: double, PValue: double ... 1 more field]


Iteration 200/200 completed.
Proportion of p-values < 0.05: 1.0


[Beta: double, PValue: double ... 1 more field]