# Analyse Blockchain with GraphX

_Trying identify interesting addresses in the blockchain transaction graph_

## Basic setup

Here we will create spark session that is necessary for further dataframe processing.


In [ ]:
val spark = SparkSession.builder
                    .master("local[4]")
                    .getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@2d0dbb1f


## Check the data on disk

In [ ]:
:sh du -h /tmp/nodes.parquet

53M	/tmp/nodes.parquet

import sys.process._




In [ ]:
:sh du -h /tmp/edges.parquet

34M	/tmp/edges.parquet

import sys.process._




## Load the data

In [ ]:
val rawNodes = spark.read.load("/tmp/nodes.parquet")
                    .withColumnRenamed("_1", "id")
                    .withColumnRenamed("_2", "address")

rawNodes: org.apache.spark.sql.DataFrame = [id: bigint, address: string]


### Number of vertices

In [ ]:
rawNodes.count

res8: Long = 1213698


In [ ]:
import org.apache.spark.sql.functions.regexp_replace

val nodes = rawNodes.withColumn("address", regexp_replace($"address", "bitcoinaddress_", "")).cache()
nodes.show(5)

+---+--------------------+
| id|             address|
+---+--------------------+
|  0|                null|
|  1|6CC3A5C1A3DDC35B5...|
|  2|C6AB2C48504334228...|
|  3|5B0CD86C0689949A2...|
|  4|4F2D5BE9122725E7F...|
+---+--------------------+
only showing top 5 rows

import org.apache.spark.sql.functions.regexp_replace
nodes: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: bigint, address: string]


In [ ]:
val edges = spark.read.load("/tmp/edges.parquet")
                      .withColumnRenamed("srcId", "src")
                      .withColumnRenamed("dstId", "dst")
                      .cache()
edges.show(5)
edges.count()

+-------+------+-----+
|    src|   dst| attr|
+-------+------+-----+
| 216625|778439|input|
|1199465|778439|input|
| 669823|400608|input|
| 416571|418699|input|
| 418699|418699|input|
+-------+------+-----+
only showing top 5 rows

edges: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [src: bigint, dst: bigint ... 1 more field]
res12: Long = 12806337


# Creating the Graph

In [ ]:
// GraphX expects RDDs, so we need to do the conversion from dataframes here

// todo: ugly
import org.apache.spark.graphx._
val nodesRdd: RDD[(VertexId, String)] = nodes.rdd.map(row => (row(0).asInstanceOf[Long], row(1).asInstanceOf[String]))
val edgesRdd: RDD[Edge[Option[String]]] = edges.rdd.map(row => Edge(row(0).asInstanceOf[Long], row(1).asInstanceOf[Long]))


import org.apache.spark.graphx._
nodesRdd: org.apache.spark.rdd.RDD[(org.apache.spark.graphx.VertexId, String)] = MapPartitionsRDD[31] at map at <console>:86
edgesRdd: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[Option[String]]] = MapPartitionsRDD[35] at map at <console>:87


In [ ]:
val graph = Graph(nodesRdd, edgesRdd)

graph: org.apache.spark.graphx.Graph[String,Option[String]] = org.apache.spark.graphx.impl.GraphImpl@31d84261


Triangle count

In [ ]:
val triCounts = graph.triangleCount().vertices
triCounts

triCounts: org.apache.spark.graphx.VertexRDD[Int] = VertexRDDImpl[147] at RDD at VertexRDD.scala:57
res21: org.apache.spark.graphx.VertexRDD[Int] = VertexRDDImpl[147] at RDD at VertexRDD.scala:57


In [ ]:
val ranks = graph.pageRank(0.001)
                 .vertices
                 .toDF("id", "rank")

ranks.show

ranks: org.apache.spark.sql.DataFrame = [id: bigint, rank: double]


In [ ]:
val sortedRanks = ranks.join(nodes, "id")
                       .sort(desc("rank"))

sortedRanks.show(5, false)

+-------+------------------+----------------------------------------+
|id     |rank              |address                                 |
+-------+------------------+----------------------------------------+
|0      |137529.98432158327|null                                    |
|351756 |13618.23058844001 |C825A1ECF2A6830C4401620C3A16F1995057C2AB|
|275668 |3765.1174232892163|DE21D51F82F065DF011CFB3CDCE09C6F71FC716B|
|1108326|2629.374661399439 |D63066643AFA128CE4BEBB2523242ADF5F07A0A9|
|1178604|2440.636714969531 |AA3750AA18B8A0F3F0590731E1FAB934856680CF|
+-------+------------------+----------------------------------------+
only showing top 5 rows

sortedRanks: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: bigint, rank: double ... 1 more field]


In [ ]:
val top5 = sortedRanks.take(5).tail.map(_(2).toString)

top5: Array[String] = Array(C825A1ECF2A6830C4401620C3A16F1995057C2AB, DE21D51F82F065DF011CFB3CDCE09C6F71FC716B, D63066643AFA128CE4BEBB2523242ADF5F07A0A9, AA3750AA18B8A0F3F0590731E1FAB934856680CF)


In [ ]:
import scala.io.Source.fromURL

def hashToAddress(hash: String): String = 
  fromURL(s"https://blockchain.info/q/hashtoaddress/$hash").mkString

val topAddresses = top5.map(hashToAddress)
topAddresses


import scala.io.Source.fromURL
hashToAddress: (hash: String)String
topAddresses: Array[String] = Array(1KFHE7w8BhaENAswwryaoccDb6qcT6DbYY, 1MFXYK1XucKFfhPhW9HDHD3vsM9BKey4qm, 1LXXawYPVsMEpsDWmEJ3RZBTbNxPQwXv5t, 1GX28yLjVWux7ws4UQ9FB4MnLH4UKTPK2z)
res101: Array[String] = Array(1KFHE7w8BhaENAswwryaoccDb6qcT6DbYY, 1MFXYK1XucKFfhPhW9HDHD3vsM9BKey4qm, 1LXXawYPVsMEpsDWmEJ3RZBTbNxPQwXv5t, 1GX28yLjVWux7ws4UQ9FB4MnLH4UKTPK2z)


In [ ]:
def getBallance(address: String): String = 
  fromURL(s"https://blockchain.info/q/addressbalance/$address").mkString

getBallance("1KFHE7w8BhaENAswwryaoccDb6qcT6DbYY")

// todo: in usd
// todo: check why the first 1 is missing in the page rank\

// getreceivedbyaddress/Address - Get the total number of bitcoins received by an address (in satoshi). Multiple addresses separated by | Do not use to process payments without the confirmations parameter
// Add the parameters start_time and end_time to restrict received by to a specific time period. Provided times should be a unix timestamp in milliseconds. Multiple addresses separated by |

// getsentbyaddress/Address - Get the total number of bitcoins send by an address (in satoshi). Multiple addresses separated by | Do not use to process payments without the confirmations parameter
// addressbalance/Address - Get the balance of an address (in satoshi). Multiple addresses separated by | Do not use to process payments without the confirmations parameter
// addressfirstseen/Address - Timestamp of the block an address was first confirmed in.


getBallance: (address: String)String
res109: String = 75511369157


In [ ]:
hashtoaddress/Hash

<iframe 
width="700" frameborder="0" height="350" 
src="http://bitcoinwhoswho.com/address/1MFXYK1XucKFfhPhW9HDHD3vsM9BKey4qm"></iframe>
// <iframe src="https://www.google.com/search?q=Im+Feeling+Lucky" width=700 height=350></iframe>


res68: scala.xml.Elem = <iframe width="700" frameborder="0" height="350" src="http://bitcoinwhoswho.com/address/1MFXYK1XucKFfhPhW9HDHD3vsM9BKey4qm"></iframe>
