In [33]:
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import breeze.linalg._
import breeze.numerics._

## small data

In [34]:
// One observation of clustered data: cluster label i, covariates x_ij, outcome y_ij.
case class Obs(i: String, x: Array[Double], y: Double)

// Toy sample: clusters "1" and "2" with two observations each, written as plain
// (cluster, covariates, outcome) triples and then lifted into Obs.
val data = Seq(
  ("1", Array(1.0, 2.0), 1.0),
  ("1", Array(1.5, 1.8), 0.0),
  ("2", Array(0.5, 0.7), 1.0),
  ("2", Array(1.1, 0.9), 1.0)
).map { case (i, x, y) => Obs(i, x, y) }

// `spark` is not defined in this file; presumably the kernel-provided SparkSession.
val df = spark.createDataset(data)

defined class Obs
data = List(Obs(1,[D@4825503b,1.0), Obs(1,[D@6c3e559a,0.0), Obs(2,[D@672eaf57,1.0), Obs(2,[D@5488747c,1.0))
df = [i: string, x: array<double> ... 1 more field]


[i: string, x: array<double> ... 1 more field]

In [35]:
// Cluster label of the first row Spark returns; throws if df is empty.
val firstClusterId = df.select("i").head().getString(0)

firstClusterId = 1


1

In [36]:
// Number of observations carrying the label firstClusterId.
val t = df.filter(col("i") === firstClusterId).count().toInt

t = 2


2

In [37]:
// Starting value for the coefficients; a var because the Fisher-scoring loop reassigns it.
var beta = DenseVector.zeros[Double](2)  // \beta \in \mathbb{R}^p
// With rho = 0.0 the working correlation R built in computeClusterStats is the identity matrix.
val rho = 0.0  // exchangeable working correlation \rho

/**
 * Score and information contributions of one cluster to the GEE under a logistic mean model.
 *
 * With \mu_i = sigmoid(X_i \beta), A_i = diag(\mu_i (1 - \mu_i)) and an exchangeable working
 * correlation R (1 on the diagonal, rho elsewhere), the working covariance is
 * V_i = A_i^{1/2} R A_i^{1/2} and the mean derivative is D_i = A_i X_i.
 *
 * @param cluster observations of a single cluster; each row of X_i is one observation's x
 * @param beta    current coefficient vector \beta
 * @param rho     off-diagonal entry of the exchangeable working correlation
 * @return (U_i, B_i) where U_i = D_i^T V_i^+ (Y_i - \mu_i), B_i = D_i^T V_i^+ D_i and
 *         V_i^+ is the pseudo-inverse of V_i
 */
def computeClusterStats(cluster: Seq[Obs], beta: DenseVector[Double], rho: Double): (DenseVector[Double], DenseMatrix[Double]) = {
  val design = DenseMatrix(cluster.map(_.x): _*)      // X_i \in \mathbb{R}^{m_i \times p}
  val outcome = DenseVector(cluster.map(_.y): _*)     // Y_i \in \mathbb{R}^{m_i}
  val mean = sigmoid(design * beta)                   // \mu_i(\beta)
  val variance = mean *:* (1.0 - mean)                // element-wise \mu (1 - \mu)
  val size = outcome.length                           // m_i

  val varDiag = diag(variance)                        // A_i
  val sdDiag = diag(variance.map(v => sqrt(v)))       // A_i^{1/2}
  val workingCorr = DenseMatrix.tabulate(size, size) { (r, c) => if (r == c) 1.0 else rho }
  val workingCovInv = pinv(sdDiag * workingCorr * sdDiag)

  val deriv = varDiag * design                        // D_i = \partial \mu_i / \partial \beta^T
  val weighted = deriv.t * workingCovInv              // D_i^T V_i^+, shared by both outputs
  (weighted * (outcome - mean), weighted * deriv)
}

beta = DenseVector(0.0, 0.0)
rho = 0.0


computeClusterStats: (cluster: Seq[Obs], beta: breeze.linalg.DenseVector[Double], rho: Double)(breeze.linalg.DenseVector[Double], breeze.linalg.DenseMatrix[Double])


0.0

In [38]:
// Per-cluster (score, information) pairs at the current beta, flattened to plain arrays
// so they can be summed element-wise.
val statsRdd = df.rdd
  .groupBy(obs => obs.i)
  .map { case (_, members) =>
    val (score, info) = computeClusterStats(members.toSeq, beta, rho)
    (score.toArray, info.toArray)
  }

statsRdd = MapPartitionsRDD[68] at map at <console>:66


MapPartitionsRDD[68] at map at <console>:66

In [39]:
// def sumVectors(v1: Array[Double], v2: Array[Double]): Array[Double] = v1.zip(v2).map(t => t._1 + t._2)
// def sumMatrices(m1: Array[Double], m2: Array[Double]): Array[Double] = m1.zip(m2).map(t => t._1 + t._2)

// Element-wise sum over clusters of the score arrays and of the flattened information arrays.
val aggStats = statsRdd.reduce { (left, right) =>
  val scoreSum = left._1.zip(right._1).map(pair => pair._1 + pair._2)
  val infoSum = left._2.zip(right._2).map(pair => pair._1 + pair._2)
  (scoreSum, infoSum)
}

aggStats = (Array(0.55, 0.9),Array(1.1775, 1.51, 1.51, 2.1350000000000002))


(Array(0.55, 0.9),Array(1.1775, 1.51, 1.51, 2.1350000000000002))

In [40]:
// Rebuild Breeze objects from the reduced arrays: U is the summed score vector and B the
// summed information matrix, reshaped to beta.length x beta.length.
// NOTE(review): relies on the constructor using the same element order as toArray
// (Breeze documents DenseMatrix as column-major) — confirm.
val U = new DenseVector(aggStats._1)
val B = new DenseMatrix(beta.length, beta.length, aggStats._2)

U = DenseVector(0.55, 0.9)
B = 


1.1775  1.51
1.51    2.1350000000000002


In [41]:
// Single Fisher-scoring update delta = B^{-1} U from the aggregated statistics.
// NOTE(review): the iterative loop further down uses pinv(B) rather than inv(B).
val delta = inv(B) * U

delta = DenseVector(-0.7899941204767731, 0.9802768720936431)


DenseVector(-0.7899941204767731, 0.9802768720936431)

In [42]:
df.show()

+---+----------+---+
|  i|         x|  y|
+---+----------+---+
|  1|[1.0, 2.0]|1.0|
|  1|[1.5, 1.8]|0.0|
|  2|[0.5, 0.7]|1.0|
|  2|[1.1, 0.9]|1.0|
+---+----------+---+



In [43]:
// Fisher-scoring settings: iteration cap and threshold on the Euclidean norm of the update.
val maxIter = 10
val tol = 1e-6

// The grouping of observations by cluster does not depend on beta, so it is built once and
// cached instead of triggering a fresh shuffle on every iteration.
val clusters = df.rdd
  .groupBy(_.i)
  .map { case (_, obsSeq) => obsSeq.toSeq }
  .cache()

// GEE Fisher scoring loop
var iter = 0
var converged = false
while (iter < maxIter && !converged) {
  // Per-cluster score / information at the current beta, flattened for the reduce.
  val statsRdd = clusters.map { cluster =>
    val (score, info) = computeClusterStats(cluster, beta, rho)
    (score.toArray, info.toArray)
  }

  // Element-wise sums over all clusters.
  val aggStats = statsRdd.reduce { case ((u1, b1), (u2, b2)) =>
    val u = u1.zip(u2).map { case (a, b) => a + b }
    val b = b1.zip(b2).map { case (a, b) => a + b }
    (u, b)
  }

  val U = new DenseVector(aggStats._1)
  val B = new DenseMatrix(beta.length, beta.length, aggStats._2)
  // pinv keeps the step defined even if the summed information matrix is singular.
  val delta = pinv(B) * U
  val stepNorm = norm(delta)  // computed once, used for both logging and the stopping rule
  beta = beta + delta

  println(s"Iter $iter: ||delta|| = $stepNorm, beta = $beta")
  converged = stepNorm < tol
  iter += 1
}

clusters.unpersist()

// Make a run that stopped on the iteration cap distinguishable from a converged one.
if (!converged) println(s"Warning: no convergence after $maxIter iterations (tol = $tol)")
println(s"Final beta: $beta")

Iter 0: ||delta|| = 1.258981118345138, beta = DenseVector(-0.7899941204767761, 0.9802768720936443)
Iter 1: ||delta|| = 0.19504442133781175, beta = DenseVector(-0.9276849887190464, 1.1184200988465025)
Iter 2: ||delta|| = 0.006603013387385123, beta = DenseVector(-0.9323978070245222, 1.1230449371586555)
Iter 3: ||delta|| = 7.15161831723832E-6, beta = DenseVector(-0.9324029191684396, 1.1230499383214325)
Iter 4: ||delta|| = 8.311897613962925E-12, beta = DenseVector(-0.9324029191743826, 1.1230499383272436)
Final beta: DenseVector(-0.9324029191743826, 1.1230499383272436)


maxIter = 10
tol = 1.0E-6
iter = 5
converged = true


true

In [1]:
// Fitted probability sigmoid(beta . x) for every observation, paired with its observed outcome.
val predictions = df.map { obs =>
  val features = new DenseVector(obs.x)
  val linearPredictor = beta dot features
  (obs.y, sigmoid(linearPredictor))
}.collect()

println("Actual vs Predicted Probabilities:")
for ((y, p) <- predictions) {
  println(f"y = $y%.1f, predicted = $p%.4f")
}

predictions = Array((1.0,0.788131133072431), (0.0,0.6508745280313687), (1.0,0.5793080448314912), (1.0,0.49627550224070044))


Actual vs Predicted Probabilities:
y = 1.0, predicted = 0.7881
y = 0.0, predicted = 0.6509
y = 1.0, predicted = 0.5793
y = 1.0, predicted = 0.4963


Array((1.0,0.788131133072431), (0.0,0.6508745280313687), (1.0,0.5793080448314912), (1.0,0.49627550224070044))

### correlation estimation

In [8]:
// Per-cluster outer product (Y_i - \mu_i)(Y_i - \mu_i)^T of the residuals at the current beta,
// flattened to an array.
// NOTE(review): these are raw residuals, not Pearson (variance-standardised) ones, and the
// row order inside a cluster is whatever groupBy yields — confirm both are intended.
val covMatByCluster = df.rdd
  .groupBy(obs => obs.i)
  .map { case (_, members) =>
    val rows = members.toSeq
    val design = DenseMatrix(rows.map(_.x): _*)
    val outcome = DenseVector(rows.map(_.y): _*)
    val residual = outcome - sigmoid(design * beta)
    (residual * residual.t).toArray
  }

covMatByCluster = MapPartitionsRDD[101] at map at <console>:64


MapPartitionsRDD[101] at map at <console>:64

In [10]:
// Element-wise sum of the flattened residual outer products over all clusters.
// zip silently truncates to the shorter array, so clusters of different sizes would produce a
// meaningless sum; fail fast instead.
val aggCov = covMatByCluster.reduce { (a, b) =>
  require(a.length == b.length, s"clusters differ in size: ${a.length} vs ${b.length} covariance entries")
  a.zip(b).map { case (x, y) => x + y }
}

aggCov = Array(0.22187013791667454, 0.07401279506261255, 0.07401279506261255, 0.6773760208829156)


Array(0.22187013791667454, 0.07401279506261255, 0.07401279506261255, 0.6773760208829156)

In [14]:
// Average the summed outer products over the number of clusters and reshape to t x t.
// NOTE(review): t was measured on a single cluster (firstClusterId); this assumes every
// cluster has exactly t observations — confirm.
val nClusters = covMatByCluster.count()
val avgCovMat = new DenseMatrix(t, t, aggCov.map(_ / nClusters))

nClusters = 2
avgCovMat = 


0.11093506895833727  0.03700639753130627
0.03700639753130627  0.3386880104414578


In [15]:
// Square roots of the diagonal of the averaged residual covariance.
val stddevs = for (k <- 0 until t) yield math.sqrt(avgCovMat(k, k))

// Scale every covariance entry by the two matching standard deviations to get correlations.
val corrMat = DenseMatrix.tabulate(t, t) { (r, c) =>
  avgCovMat(r, c) / (stddevs(r) * stddevs(c))
}

stddevs = Vector(0.3330691654271486, 0.581969080313944)
corrMat = 


0.9999999999999999   0.19091606282450307
0.19091606282450307  1.0


In [16]:
// Exchangeable estimate: mean of all off-diagonal entries of corrMat.
val rhoHat_exchangeable = {
  val offDiags = (0 until t).flatMap { r =>
    (0 until t).filter(_ != r).map(c => corrMat(r, c))
  }
  offDiags.sum / offDiags.size
}

// Working correlation with 1 on the diagonal and the common estimate everywhere else.
val R_exchangeable = DenseMatrix.tabulate(t, t) { (r, c) =>
  if (r == c) 1.0 else rhoHat_exchangeable
}

rhoHat_exchangeable = 0.19091606282450307
R_exchangeable = 


1.0                  0.19091606282450307
0.19091606282450307  1.0


In [17]:
// AR(1) estimate: mean of the lag-1 entries corrMat(k, k + 1).
val rhoHat_ar1 = {
  val lags = (0 until t - 1).map(k => corrMat(k, k + 1))
  lags.sum / lags.size
}

// Working correlation whose (r, c) entry is rhoHat_ar1 raised to the lag |r - c|.
val R_ar1 = DenseMatrix.tabulate(t, t) { (r, c) =>
  math.pow(rhoHat_ar1, math.abs(r - c))
}

rhoHat_ar1 = 0.19091606282450307
R_ar1 = 


1.0                  0.19091606282450307
0.19091606282450307  1.0


## large data

In [11]:
import scala.util.Random


In [12]:
// Simulation settings: fixed seed, data-generating coefficients, and design size.
val rand = new Random(42)
val trueBeta = DenseVector(1.0, -1.0)
val nClusters = 1000
val obsPerCluster = 2

// Every observation draws two standard-normal covariates and then an outcome that is 1.0 with
// probability sigmoid(x . trueBeta). Draw order per observation: x(0), x(1), uniform for y.
val syntheticData = for {
  clusterId <- 0 until nClusters
  _ <- 0 until obsPerCluster
} yield {
  val x = Array(rand.nextGaussian(), rand.nextGaussian())
  val eta = x.indices.map(k => x(k) * trueBeta(k)).sum
  val prob = 1.0 / (1.0 + math.exp(-eta))
  val y = if (rand.nextDouble() < prob) 1.0 else 0.0
  Obs(clusterId.toString, x, y)
}

// Rebinds df to the simulated data.
val df = spark.createDataset(syntheticData)

rand = scala.util.Random@462d72d9
trueBeta = DenseVector(1.0, -1.0)
nClusters = 1000
obsPerCluster = 2
syntheticData = Vector(Obs(0,[D@51a39d98,1.0), Obs(0,[D@6f8d8edf,0.0), Obs(1,[D@344339ff,1.0), Obs(1,[D@19d52e14,0.0), Obs(2,[D@54d72feb,0.0), Obs(2,[D@57afb8d8,1.0), Obs(3,[D@7a8eff13,1.0), Obs(3,[D@4a1bcbd8,0.0), Obs(4,[D@27de0d07,0.0), Obs(4,[D@d9d085c,1.0), Obs(5,[D@66089b,0.0), Obs(5,[D@14374644,0.0), Obs(6,[D@43d6c7f0,0.0), Obs(6,[D@266cf83e,1.0), Obs(7,[D@167e0eb8,1.0), Obs(7,[D@38d56f55,1.0), Obs(8,[D@4dc9d6f8,1.0), Obs(8,[D@c257a77,1.0), Obs(9,[D@6ac8be85,1.0), Obs(9,[D@7974cb4e,1.0), Obs(10,[D@7a09bc8e,0.0), Obs(10,[D@5dd8977a,1.0), Obs(11,[D@724b26d2,0.0), Obs(11,[D@2d...


Vector(Obs(0,[D@51a39d98,1.0), Obs(0,[D@6f8d8edf,0.0), Obs(1,[D@344339ff,1.0), Obs(1,[D@19d52e14,0.0), Obs(2,[D@54d72feb,0.0), Obs(2,[D@57afb8d8,1.0), Obs(3,[D@7a8eff13,1.0), Obs(3,[D@4a1bcbd8,0.0), Obs(4,[D@27de0d07,0.0), Obs(4,[D@d9d085c,1.0), Obs(5,[D@66089b,0.0), Obs(5,[D@14374644,0.0), Obs(6,[D@43d6c7f0,0.0), Obs(6,[D@266cf83e,1.0), Obs(7,[D@167e0eb8,1.0), Obs(7,[D@38d56f55,1.0), Obs(8,[D@4dc9d6f8,1.0), Obs(8,[D@c257a77,1.0), Obs(9,[D@6ac8be85,1.0), Obs(9,[D@7974cb4e,1.0), Obs(10,[D@7a09bc8e,0.0), Obs(10,[D@5dd8977a,1.0), Obs(11,[D@724b26d2,0.0), Obs(11,[D@2d...

In [13]:
// Fisher-scoring settings and starting values for this fit.
val maxIter = 10
val tol = 1e-6
var beta = DenseVector.zeros[Double](2)  // \beta \in \mathbb{R}^p
val rho = 0.0  // exchangeable working correlation \rho

// The grouping of observations by cluster does not depend on beta, so it is built once and
// cached instead of triggering a fresh shuffle on every iteration.
val clusters = df.rdd
  .groupBy(_.i)
  .map { case (_, obsSeq) => obsSeq.toSeq }
  .cache()

// GEE Fisher scoring loop
var iter = 0
var converged = false
while (iter < maxIter && !converged) {
  // Per-cluster score / information at the current beta, flattened for the reduce.
  val statsRdd = clusters.map { cluster =>
    val (score, info) = computeClusterStats(cluster, beta, rho)
    (score.toArray, info.toArray)
  }

  // Element-wise sums over all clusters.
  val aggStats = statsRdd.reduce { case ((u1, b1), (u2, b2)) =>
    val u = u1.zip(u2).map { case (a, b) => a + b }
    val b = b1.zip(b2).map { case (a, b) => a + b }
    (u, b)
  }

  val U = new DenseVector(aggStats._1)
  val B = new DenseMatrix(beta.length, beta.length, aggStats._2)
  // pinv keeps the step defined even if the summed information matrix is singular.
  val delta = pinv(B) * U
  val stepNorm = norm(delta)  // computed once, used for both logging and the stopping rule
  beta = beta + delta

  println(s"Iter $iter: ||delta|| = $stepNorm, beta = $beta")
  converged = stepNorm < tol
  iter += 1
}

clusters.unpersist()

// Make a run that stopped on the iteration cap distinguishable from a converged one.
if (!converged) println(s"Warning: no convergence after $maxIter iterations (tol = $tol)")
println(s"Final beta: $beta")

Iter 0: ||delta|| = 1.0404917804109737, beta = DenseVector(0.7249442352032562, -0.7463772510924778)
Iter 1: ||delta|| = 0.3382194731075008, beta = DenseVector(0.9614684872693847, -0.9881388902614616)
Iter 2: ||delta|| = 0.06060561475175511, beta = DenseVector(1.0040070695000878, -1.0313072763934434)
Iter 3: ||delta|| = 0.0016735821632032374, beta = DenseVector(1.0051849439990785, -1.0324961787955016)
Iter 4: ||delta|| = 1.2318771581922373E-6, beta = DenseVector(1.0051858127138649, -1.0324970522117558)
Iter 5: ||delta|| = 6.6666687118037E-13, beta = DenseVector(1.0051858127143356, -1.0324970522122279)
Final beta: DenseVector(1.0051858127143356, -1.0324970522122279)


maxIter = 10
tol = 1.0E-6
beta = DenseVector(1.0051858127143356, -1.0324970522122279)
rho = 0.0
iter = 6
converged = true


true

## historical debug

In [10]:
// Extract cluster 1 from the DataFrame
// Pull the observations labelled "1" back to the driver; collect() returns an Array[Obs].
val cluster = df.filter(col("i") === "1").collect()

// Show the contents of cluster 1
// cluster.show()

cluster = Array(Obs(1,[D@3096b40c,1.0), Obs(1,[D@2995107,0.0))


Array(Obs(1,[D@3096b40c,1.0), Obs(1,[D@2995107,0.0))

In [46]:
  // Debug cell: the steps of computeClusterStats unrolled for the collected `cluster`, so that
  // every intermediate can be inspected. Unlike the function, V_i is inverted with inv, not pinv.
  val X_i = DenseMatrix(cluster.map(_.x): _*)        // X_i \in \mathbb{R}^{m_i \times p}
  val Y_i = DenseVector(cluster.map(_.y): _*)        // Y_i \in \mathbb{R}^{m_i}
  val mu_i = sigmoid(X_i * beta)                     // \mu_i(\beta)
  val A_i = diag(mu_i *:* (1.0 - mu_i))               // A_i = diag(Var(Y_i))
  val A_sqrt = diag(mu_i.map(m => sqrt(m * (1.0 - m))))  // A_i^{1/2}
  val m_i = Y_i.length                               // cluster size m_i
  val R = DenseMatrix.tabulate(m_i, m_i)((j, k) => if (j == k) 1.0 else rho)  // R: exchangeable
  val V_i = A_sqrt * R * A_sqrt                      // working covariance A_i^{1/2} R A_i^{1/2}
  val V_i_inv = inv(V_i)
  val D_i = A_i * X_i                             // D_i = \partial \mu_i / \partial \beta^T
  val resid_i = Y_i - mu_i                        // residuals Y_i - \mu_i(\beta)
  val U_i = D_i.t * V_i_inv * resid_i             // score contribution
  val B_i = D_i.t * V_i_inv * D_i                 // information contribution

lastException = null
X_i = 
Y_i = DenseVector(1.0, 0.0)
mu_i = DenseVector(0.5, 0.5)
A_i = 
A_sqrt = 
m_i = 2
R = 
V_i = 
V_i_inv = 
D_i = 
resid_i = DenseVector(0.5, -0.5)
U_i = DenseVector(-0.3125, 0.125)


1.0  2.0
1.5  1.8
0.25  0.0
0.0   0.25
0.5  0.0
0.0  0.5
1.0  0.2
0.2  1.0
0.25  0.05
0.05  0.25
4.166666666666667    -0.8333333333333335
-0.8333333333333335  4.166666666666667
0.25   0.5
0.375  0.45
B_i: breeze.linalg.DenseMatr...


DenseVector(-0.3125, 0.125)

In [47]:
U_i

DenseVector(-0.3125, 0.125)

In [51]:
B_i

0.6901041666666667  0.9739583333333334
0.9739583333333335  1.5104166666666667


In [52]:
println(B_i.getClass)

class breeze.linalg.DenseMatrix$mcD$sp


In [33]:
  // Earlier debug variant kept for reference.
  // NOTE(review): V_i is built as A_i * R * A_i here, whereas computeClusterStats uses
  // A_sqrt * R * A_sqrt; the recorded U_i and B_i of this cell therefore differ from the
  // function's results for the same inputs.
  val R = DenseMatrix.tabulate(m_i, m_i)((j, k) => if (j == k) 1.0 else rho)  // R: exchangeable
  val V_i = A_i * R * A_i
  val V_i_inv = inv(V_i)
  val D_i = A_i * X_i                             // D_i = \partial \mu_i / \partial \beta^T
  val resid_i = Y_i - mu_i                        // residuals Y_i - \mu_i(\beta)
  val U_i = D_i.t * V_i_inv * resid_i             // score contribution
  val B_i = D_i.t * V_i_inv * D_i                 // information contribution

R = 
V_i = 
V_i_inv = 
D_i = 
resid_i = DenseVector(0.5, -0.5)
U_i = DenseVector(-1.25, 0.5)
B_i = 


1.0  0.2
0.2  1.0
0.0625  0.0125
0.0125  0.0625
16.666666666666668  -3.333333333333334
-3.333333333333334  16.666666666666668
0.25   0.5
0.375  0.45
2.760416666666667  3.8958333333333335
3.895833333333334  6.041666666666667


In [5]:
// Per-cluster statistics via the helper; `cluster`, `beta` and `rho` must already be defined
// in the session (the recorded run of this cell failed because `cluster` was not).
val (U_i, B_i) = computeClusterStats(cluster, beta, rho)

Unknown Error: <console>:38: error: not found: value cluster
       val (U_i, B_i) = computeClusterStats(cluster, beta, rho)
                                            ^
<console>:38: error: not found: value U_i
       val (U_i, B_i) = computeClusterStats(cluster, beta, rho)
            ^
<console>:38: error: not found: value B_i
       val (U_i, B_i) = computeClusterStats(cluster, beta, rho)
                 ^


In [60]:
U_i.toArray

Array(-0.3125, 0.125)

In [59]:
B_i.toArray

Array(0.6901041666666667, 0.9739583333333335, 0.9739583333333334, 1.5104166666666667)