# Analyse Blockchain with GraphX

_Trying to identify interesting addresses in the blockchain transaction graph_

## Basic setup

Here we create the Spark session that is required for all further DataFrame processing.


In [ ]:
// Entry point for the DataFrame API: reuse an already running session if there
// is one, otherwise start a local session with four worker threads.
val spark = SparkSession.builder().master("local[4]").getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@285e3ec8


## Check the data on disk

In [ ]:
:sh du -h /tmp/nodes.parquet

25M	/tmp/nodes.parquet

import sys.process._




In [ ]:
:sh du -h /tmp/edges.parquet

11M	/tmp/edges.parquet

import sys.process._




## Load the data

In [ ]:
// Load the vertex table. The parquet reader is named explicitly so the cell
// does not depend on the session-wide `spark.sql.sources.default` setting.
// The tuple-style column names `_1` / `_2` are renamed to meaningful ones.
val rawNodes = spark.read
  .parquet("/tmp/nodes.parquet")
  .withColumnRenamed("_1", "id")
  .withColumnRenamed("_2", "Wallet")

rawNodes: org.apache.spark.sql.DataFrame = [id: bigint, Wallet: string]


### Number of vertices

In [ ]:
// Number of rows in the vertex table; this is an action and runs a Spark job.
rawNodes.count()

res7: Long = 546651


In [ ]:
import org.apache.spark.sql.functions.regexp_replace

// Remove the "bitcoinaddress_" marker from every wallet identifier and cache
// the cleaned table, because later cells read it again.
val nodes = rawNodes
  .withColumn("Wallet", regexp_replace($"Wallet", "bitcoinaddress_", ""))
  .cache()
nodes.show(numRows = 5)

+---+--------------------+
| id|              Wallet|
+---+--------------------+
|  0|9303DBB4C75A56057...|
|  1|4D3826A813A4B4E9B...|
|  2|BECC6154EEF33464E...|
|  3|4B5E0300F11C2932F...|
|  4|44730B80C9D5EF65D...|
+---+--------------------+
only showing top 5 rows

import org.apache.spark.sql.functions.regexp_replace
nodes: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: bigint, Wallet: string]


In [ ]:
// Load the edge table. The parquet reader is named explicitly so the cell does
// not depend on the session-wide `spark.sql.sources.default` setting.
// `srcId` / `dstId` are renamed to `src` / `dst`; the result is cached because
// it is used by both actions below and by later cells.
val edges = spark.read
  .parquet("/tmp/edges.parquet")
  .withColumnRenamed("srcId", "src")
  .withColumnRenamed("dstId", "dst")
  .cache()
edges.show(5)
edges.count()

+------+------+-----+
|   src|   dst| attr|
+------+------+-----+
|150102|107378|input|
|470403|107378|input|
|232249| 97703|input|
|539070| 97703|input|
|131174|176711|input|
+------+------+-----+
only showing top 5 rows

edges: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [src: bigint, dst: bigint ... 1 more field]
res11: Long = 2087249


# Creating the Graph

In [ ]:
// GraphX works on RDDs, so the DataFrames are converted here.
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

// Vertices: (vertex id, wallet hash). Columns are read by name through typed
// getters rather than by position with unchecked casts.
val nodesRdd: RDD[(VertexId, String)] =
  nodes.rdd.map(row => (row.getAs[Long]("id"), row.getAs[String]("Wallet")))

// Edges: the `attr` column is carried over as the edge attribute. When the
// attribute is omitted, `Edge` falls back to `null`, which would leave a null
// reference typed as `Option[String]`. `Option(...)` maps a missing value to
// `None` instead.
val edgesRdd: RDD[Edge[Option[String]]] =
  edges.rdd.map { row =>
    Edge(
      row.getAs[Long]("src"),
      row.getAs[Long]("dst"),
      Option(row.getAs[String]("attr"))
    )
  }


import org.apache.spark.graphx._
nodesRdd: org.apache.spark.rdd.RDD[(org.apache.spark.graphx.VertexId, String)] = MapPartitionsRDD[29] at map at <console>:86
edgesRdd: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[Option[String]]] = MapPartitionsRDD[33] at map at <console>:87


In [ ]:
// Assemble the property graph: wallet hashes on the vertices, the edge RDD as
// the connections between them.
val graph = Graph(vertices = nodesRdd, edges = edgesRdd)

graph: org.apache.spark.graphx.Graph[String,Option[String]] = org.apache.spark.graphx.impl.GraphImpl@34414ef5


### Triangle count

In [ ]:
// Triangle count for every vertex of the full graph.
val triCounts: VertexRDD[Int] = graph.triangleCount().vertices
// Evaluating the value only echoes the RDD handle; no rows are displayed.
triCounts

triCounts: org.apache.spark.graphx.VertexRDD[Int] = VertexRDDImpl[147] at RDD at VertexRDD.scala:57
res21: org.apache.spark.graphx.VertexRDD[Int] = VertexRDDImpl[147] at RDD at VertexRDD.scala:57


In [ ]:
// Keep only the vertices whose wallet hash starts with "0", together with the
// edges between them. The predicate is null-safe: `Graph(...)` assigns a null
// attribute to any vertex that appears only in the edge list, and calling
// `startsWith` on it would throw a NullPointerException.
val subgraph = graph.subgraph(
  vpred = (id, wallet) => Option(wallet).exists(_.startsWith("0"))
)
println(subgraph.vertices.count())
println(subgraph.edges.count())


28877
6943
subgraph: org.apache.spark.graphx.Graph[String,Option[String]] = org.apache.spark.graphx.impl.GraphImpl@557aa67b


In [ ]:
// PageRank on the reduced graph, iterating until the ranks converge within a
// tolerance of 0.001. The result is cached for reuse.
val pagerankGraph = subgraph.pageRank(tol = 0.001)
pagerankGraph.cache()

pagerankGraph: org.apache.spark.graphx.Graph[Double,Double] = org.apache.spark.graphx.impl.GraphImpl@62f76ba9
res74: org.apache.spark.graphx.Graph[Double,Double] = org.apache.spark.graphx.impl.GraphImpl@62f76ba9


In [ ]:
// (vertex id, rank) pairs of the subgraph, shown as a two-column table.
val ranks = pagerankGraph.vertices
ranks.toDF().show()

+------+------------------+
|    _1|                _2|
+------+------------------+
|108150|0.9152523662181615|
|527646|0.9152523662181615|
|442260|1.3042346218608802|
| 86303|0.9152523662181615|
|392525|0.9152523662181615|
| 18690|0.9152523662181615|
| 36449|0.9152523662181615|
|280091|1.6932168775035987|
|534849|0.9152523662181615|
|439124|0.9152523662181615|
|462546|0.9152523662181615|
|477211|0.9152523662181615|
|191268|0.9152523662181615|
|363951|0.9152523662181615|
|286643|0.9152523662181615|
|410508|0.9152523662181615|
|353003|0.9152523662181615|
|544908|0.9152523662181615|
|224175|0.9152523662181615|
|181783| 1.174573869979974|
+------+------------------+
only showing top 20 rows

ranks: org.apache.spark.graphx.VertexRDD[Double] = VertexRDDImpl[2721] at RDD at VertexRDD.scala:57


In [ ]:
// Same PageRank run as above, but on the complete graph.
graph
  .pageRank(tol = 0.001)
  .vertices
  .toDF()
  .show()

+------+-------------------+
|    _1|                 _2|
+------+-------------------+
|249487|  0.425519260832563|
|536298|0.17570600495678032|
|108150|0.17570600495678032|
|444087|0.17570600495678032|
|111517|0.17570600495678032|
|257320| 0.6897936787896283|
|363132|  1.331790309639232|
|231966| 1.9869194312599217|
|292341| 2.5880830687057723|
|451731| 1.5510694471398851|
|475832|0.17570600495678032|
|213045| 2.0287429723438564|
| 32676|  1.503250416026544|
|433811|0.17570600495678032|
|161980|0.17570600495678032|
| 81508| 1.6274962844457763|
|254457|0.17570600495678032|
|137823| 1.0928021426655743|
|363818|0.36758542414348955|
|161091| 1.6864558304320325|
+------+-------------------+
only showing top 20 rows



### Computing connected components 

In [ ]:
// Label every vertex with the identifier of its connected component.
val cc = graph.connectedComponents()

The `cc` variable is the original graph, except that each vertex's payload/property is now only the cluster to which it belongs. A cluster is identified by the smallest `VertexId` it contains.

#### Number of connected components 

Computing the number of clusters can easily be done by counting the number of distinct `payload` values among the vertices.

In [ ]:
// Each vertex carries its component id, so the number of distinct ids is the
// number of connected components.
<strong style="color: red">{cc.vertices.values.distinct().count()}</strong>

res16: scala.xml.Elem = <strong style="color: red">4774</strong>
