## This notebook calculates the PageRank score for each node of the graph

### Load packages

In [ ]:
%%configure -f
{
"conf": {
    "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
    "spark.dynamicAllocation.enabled": true,
    "spark.dynamicAllocation.minExecutors": 2,
    "spark.dynamicAllocation.maxExecutors": 40
   }
}

In [ ]:
import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.lit

In [ ]:
// Input and output locations used by the cells below. Both start out empty and are
// declared as `var`, presumably so an orchestrating pipeline can overwrite them before
// the run — TODO confirm.
// Location of the parquet edge list that is loaded in the next cell.
var edge_path = ""
// Destination the final cell writes the node/score table to.
var page_rank_path = ""

### Load data

In [ ]:
// Read the edge list stored as parquet at `edge_path`.
// NOTE(review): "header" is normally a CSV option; presumably it has no effect on parquet — confirm.
val df = spark.read
    .format("parquet")
    .option("header", "true")
    .load(edge_path)

### Convert edges to integers if not already done

In [ ]:
// Cast both endpoint columns and the edge_count column to Integer, one column after another.
val df1 = Seq("issuer_id_indexed", "receiver_id_indexed", "edge_count")
    .foldLeft(df) { (acc, name) => acc.withColumn(name, col(name).cast("Integer")) }

### Create vertices

In [ ]:
// Build the vertex set: every distinct value that appears as issuer or receiver becomes a
// vertex, paired with a Long id (GraphX vertex ids are Longs).
//
// zipWithIndex numbers elements by position, and the order coming out of `distinct`
// (a shuffle) is not guaranteed to be the same when the RDD is re-evaluated. This RDD is
// evaluated several times (vertexDf in both edge joins, plus the graph itself), so the
// values are sorted first to make the id assignment reproducible, and the result is cached
// so it is not recomputed for every use.
val vertices : RDD[(VertexId, Integer)] = df1
    .select(explode(array('issuer_id_indexed, 'receiver_id_indexed)).as("node")) // one row per edge endpoint
    .distinct // we remove duplicates
    .orderBy("node") // fixed order => stable index assignment
    .rdd.map(_.getAs[Integer](0)) // transform to RDD
    .zipWithIndex // associate a long index to each vertex
    .map(_.swap) // (node, id) -> (id, node)
    .cache()

// Same pairs as a DataFrame; used to translate node values into vertex ids when building edges.
val vertexDf = vertices.toDF("id", "node")

### Create edges

In [ ]:
// Turn every row of df1 into a GraphX edge by replacing both endpoints with their vertex ids;
// the "edge_count" column is carried along as the edge attribute.
val edges : RDD[Edge[Integer]] = {
    // first lookup: vertex id of the issuer, kept under the name "idS"
    val withSourceId = df1
        .join(vertexDf, df1("issuer_id_indexed") === vertexDf("node"))
        .select('issuer_id_indexed, 'receiver_id_indexed, 'edge_count, 'id as 'idS)

    // second lookup: vertex id of the receiver, which arrives as column "id"
    val withBothIds = withSourceId
        .join(vertexDf, df1("receiver_id_indexed") === vertexDf("node"))

    withBothIds.rdd.map { row =>
        val src = row.getAs[Long]("idS")
        val dst = row.getAs[Long]("id")
        Edge(src, dst, row.getAs[Integer]("edge_count"))
    }
}

### Create graph

In [ ]:
// Assemble the property graph from the vertex and edge RDDs built in the previous cells
val graph: Graph[Integer, Integer] = Graph(vertices, edges)

### Run pagerank

In [ ]:
import scala.util.control.NonFatal

// Score every vertex with PageRank (iterated until convergence with tolerance 0.1) and join
// the scores back onto vertexDf so the original node values sit next to them.
//
// Best-effort: if the computation fails, the failure is printed and every node gets a score
// of 0.0. The fallback uses a Double literal so that pagerank_score has the same column
// type on both paths (GraphX PageRank ranks are Doubles); an Integer 0 here would make the
// saved schema depend on whether the run succeeded.
val pgrDf = try {
    val ranks = graph.pageRank(0.1)
    val rankDf = ranks.vertices.toDF("node_id","pagerank_score")
    rankDf.join(vertexDf, $"node_id"===$"id")
} catch {
    // NonFatal lets JVM-fatal errors and interrupts propagate instead of being swallowed.
    case NonFatal(e) =>
        println("Exception occurred : " + e)
        vertexDf.withColumn("pagerank_score", lit(0.0))
}


### Save dataframe

In [ ]:
// Keep only the node value and its score, expose the node column under the name
// "issuer_id_indexed", and overwrite whatever already exists at `page_rank_path`.
// NOTE(review): "header" is normally a CSV option; presumably it has no effect on parquet — confirm.
val scoresOut = pgrDf
    .select(pgrDf("node"), pgrDf("pagerank_score"))
    .toDF("issuer_id_indexed", "pagerank_score")

scoresOut.write
    .mode("overwrite")
    .option("header", "true")
    .format("parquet")
    .save(page_rank_path)