Compute RSVD
============

Here we read the preprocessed data and compute the RSVD.

In [None]:
import com.criteo.rsvd._
import scala.util.Random
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.sql.functions.{min, max}

  

>     import com.criteo.rsvd._
>     import scala.util.Random
>     import org.apache.spark.mllib.linalg.distributed.MatrixEntry
>     import org.apache.spark.sql.functions.{min, max}

  

### Set up RSVD config with JSON file

In [None]:
// Persist the RSVD settings as a two-column (key, value) JSON dataset,
// replacing any configuration previously written to the same path.
val config_map = Map(
  "embeddingDim" -> 100,
  "oversample" -> 30,
  "powerIter" -> 1,
  "seed" -> 0,
  "blockSize" -> 50000,
  "partitionWidthInBlocks" -> 35,
  "partitionHeightInBlocks" -> 10
)
val config_spark_save = config_map.toSeq.toDF("key","value")
config_spark_save.write.format("json").mode("overwrite").save("/projects/group21/rsvd_config.json")

  

>     config_map: scala.collection.immutable.Map[String,Int] = Map(seed -> 0, oversample -> 30, blockSize -> 50000, partitionWidthInBlocks -> 35, partitionHeightInBlocks -> 10, powerIter -> 1, embeddingDim -> 100)
>     config_spark_save: org.apache.spark.sql.DataFrame = [key: string, value: int]

In [None]:
// Load the config back from JSON into a Map[String, Int] (assuming only integer values).
// The columns are selected by name first, so the positional reads below do not
// depend on the column order of the schema inferred by the JSON reader.
val config_spark = {
  val settings = spark.read.json("/projects/group21/rsvd_config.json").select("key", "value")
  settings.rdd.map(row => row(0).toString -> row(1).toString.toInt).collect().toMap
}

  

>     config_spark: scala.collection.immutable.Map[String,Int] = Map(seed -> 0, oversample -> 30, blockSize -> 50000, partitionWidthInBlocks -> 35, partitionHeightInBlocks -> 10, powerIter -> 1, embeddingDim -> 100)

In [None]:
// Assemble the RSVD configuration from the settings loaded from JSON.
// Neither set of singular vectors is requested here.
val config = {
  // A Map[String, Int] is itself a String => Int lookup function.
  val setting: String => Int = config_spark
  RSVDConfig(
    embeddingDim = setting("embeddingDim"),
    oversample = setting("oversample"),
    powerIter = setting("powerIter"),
    seed = setting("seed"),
    blockSize = setting("blockSize"),
    partitionWidthInBlocks = setting("partitionWidthInBlocks"),
    partitionHeightInBlocks = setting("partitionHeightInBlocks"),
    computeLeftSingularVectors = false,
    computeRightSingularVectors = false
  )
}

  

>     config: com.criteo.rsvd.RSVDConfig = RSVDConfig(100,30,1,0,50000,35,10,false,false)

  

### Create pipeline for computing RSVD from a DataFrame of edges

In [None]:
/**
 * Builds the edge/node incidence matrix of a graph and runs RSVD on it.
 *
 * Each input row `(src, dst, id)` contributes two entries to matrix row `id - 1`:
 * `-1` in column `src - 1` and `+1` in column `dst - 1`.
 *
 * @param groupedCanonicalEdges edges with exactly three Int columns, in the order
 *                              `src`, `dst`, `id`; the `- 1` offsets suggest node and
 *                              edge ids are 1-based (TODO confirm against the producer)
 * @param config                RSVD settings; also supplies the block size and the
 *                              partition dimensions used to build the block matrix
 * @return the result of `RSVD.run` on the incidence matrix
 */
def computeRSVD (groupedCanonicalEdges : org.apache.spark.sql.DataFrame, config : RSVDConfig): RsvdResults = {
  // One matrix row per edge: the height is taken from the number of input rows.
  val matHeight = groupedCanonicalEdges.count()
  // The width must cover the largest node index on either endpoint; looking at
  // `dst` alone would leave a `src` index above every `dst` outside the matrix.
  val Row(maxSrc: Int, maxDst: Int) = groupedCanonicalEdges.agg(max("src"), max("dst")).head
  val matWidth = math.max(maxSrc, maxDst)
  val incidenceMatrixEntries = groupedCanonicalEdges.rdd.flatMap{
    case Row(src: Int, dst: Int, id: Int) => List(MatrixEntry(id-1, src-1, -1), MatrixEntry(id-1, dst-1, 1))
  }
  // Create block matrix and compute RSVD
  val matrixToDecompose = BlockMatrix.fromMatrixEntries(incidenceMatrixEntries, matHeight = matHeight, matWidth = matWidth, config.blockSize, config.partitionHeightInBlocks, config.partitionWidthInBlocks)
  RSVD.run(matrixToDecompose, config, sc)
}

  

>     computeRSVD: (groupedCanonicalEdges: org.apache.spark.sql.DataFrame, config: com.criteo.rsvd.RSVDConfig)com.criteo.rsvd.RsvdResults

  

### Compute and save RSVD for Ethereum graph

In [None]:
// Prefix shared by everything written for the Ethereum graph.
val rsvd_results_path: String = "/projects/group21/test_ethereum_"

// Load the canonical Ethereum edges and discard the "flow" column.
val groupedCanonicalEdges = {
  val rawEdges = spark.read.parquet("/projects/group21/test_ethereum_canonical_edges")
  rawEdges.drop("flow")
}

// Decompose the incidence matrix built from the edges.
val RsvdResults(leftSingularVectors, singularValues, rightSingularVectors) =
  computeRSVD(groupedCanonicalEdges, config)

// Store the singular values as a one-column DataFrame, overwriting earlier output.
val singularDF = sc.parallelize(singularValues.toArray).toDF()
singularDF.write.mode("overwrite").parquet(rsvd_results_path + "SingularValues")

  

>     groupedCanonicalEdges: org.apache.spark.sql.DataFrame = [src: int, dst: int ... 1 more field]
>     rsvd_results_path: String = /projects/group21/test_ethereum_
>     leftSingularVectors: Option[com.criteo.rsvd.SkinnyBlockMatrix] = None
>     singularValues: breeze.linalg.DenseVector[Double] = DenseVector(458.40347228630355, 345.1960245414591, 297.38956434965013, 271.6473930317088, 223.76310129031808, 206.09769977563982, 189.1963246644718, 137.79672308619433, 137.56623843203414, 135.54290900239602, 132.08931171241727, 128.72107390801995, 120.93923408578102, 120.62389811722166, 119.5447107766826, 118.21384174672244, 112.19577063002343, 111.39418637351413, 106.97086678678656, 106.5641595917104, 102.47323443175746, 100.63351038159678, 99.9528847938503, 96.35068331317144, 94.63883930018115, 93.63420288135472, 91.14614276423288, 85.69575649594555, 85.33070560572034, 82.7806418087701, 79.46368530163006, 78.30172308417197, 77.52345859456187, 75.89415601567978, 75.17444923371288, 74.60719378218755, 72.76955624490425, 72.46205544360612, 72.11945747937338, 71.36743768743939, 69.68190130698407, 69.34880256578178, 69.23852630919075, 68.349699932709, 68.08810547606782, 67.58052315764029, 67.00671105014123, 66.7375533289733, 66.50556789291336, 65.61826598949622, 65.26026410908351, 64.08933544145536, 63.75670758508847, 63.284124094998404, 61.879367517237704, 61.72183276227414, 60.67402992766071, 60.11054425362685, 60.043685003620446, 59.48403109878358, 58.701932680150286, 58.32674555053222, 57.33272290092479, 56.718331334692344, 55.98877873308879, 55.47326395906372, 54.952717142978095, 54.58134293797019, 54.224229790295766, 53.448450653730504, 53.27335173954756, 53.015192945979386, 52.83027431053384, 52.538750139678044, 52.04259394726296, 51.959976901583175, 51.465183251045325, 51.275215752441895, 50.96925279272193, 50.85019051825306, 50.55657458775341, 50.27755187580496, 49.79019131986308, 49.466635437303324, 49.313416136040296, 49.14614394421691, 48.816142694566054, 48.382666803452764, 48.214673282871296, 48.1100921416426, 47.881278635894425, 47.61474033120143, 47.42272886233879, 47.203789646001, 46.978521992237454, 46.83620532681898, 46.53323067801463, 46.02425975867161, 45.681123495339364, 45.61205652605019)
>     rightSingularVectors: Option[com.criteo.rsvd.SkinnyBlockMatrix] = None
>     singularDF: org.apache.spark.sql.DataFrame = [value: double]

  

### Compute and save RSVD for Erdős–Rényi graphs

In [None]:
// For each uniform random graph (indices 0 through 9): load its edges, run
// RSVD, and write the singular values to a per-graph parquet dataset.
(0 to 9).foreach { i =>
  val edgesPath = "/projects/group21/uniform_random_graph" + i
  val outputPath = "/projects/group21/uniform_random_graph_" + "SingularValues" + i

  val edges = spark.read.parquet(edgesPath)
  // Only the singular values are kept; the singular-vector fields are ignored.
  val RsvdResults(_, singularValues, _) = computeRSVD(edges, config)

  val valuesDF = sc.parallelize(singularValues.toArray).toDF()
  valuesDF.write.mode("overwrite").parquet(outputPath)
}

  

### Compute and save RSVD for R-MAT graphs

In [None]:
// For each R-MAT random graph (indices 0 through 9): load its edges, run
// RSVD, and write the singular values to a per-graph parquet dataset.
(0 to 9).foreach { i =>
  val edgesPath = "/projects/group21/rmat_random_graph" + i
  val outputPath = "/projects/group21/rmat_random_graph_" + "SingularValues" + i

  val edges = spark.read.parquet(edgesPath)
  // Only the singular values are kept; the singular-vector fields are ignored.
  val RsvdResults(_, singularValues, _) = computeRSVD(edges, config)

  val valuesDF = sc.parallelize(singularValues.toArray).toDF()
  valuesDF.write.mode("overwrite").parquet(outputPath)
}