In [10]:
spark.stop()

### General Imports and Spark Config

In [1]:
import pyspark
from pyspark.sql import SparkSession
from graphframes import *
from delta import *
from delta.tables import *

import pyspark.sql.functions as f

In [2]:
config = pyspark.SparkConf().setAll([
    ('spark.executor.memory', '12g'), 
    ('spark.executor.cores', '3'), 
    ('spark.cores.max', '9'),
    ('spark.driver.memory','1g'),
    ('spark.executor.instances', '1'),
    ('spark.dynamicAllocation.enabled', 'true'),
    ('spark.dynamicAllocation.shuffleTracking.enabled', 'true'),
    ('spark.dynamicAllocation.executorIdleTimeout', '60s'),
    ('spark.dynamicAllocation.minExecutors', '0'),
    ('spark.dynamicAllocation.maxExecutors', '3'),
    ('spark.dynamicAllocation.initialExecutors', '1'),
    ('spark.dynamicAllocation.executorAllocationRatio', '1'),
    ('spark.worker.cleanup.enabled', 'true'),
    ('spark.worker.cleanup.interval', '60'),
    ('spark.shuffle.service.db.enabled', 'true'),
    ('spark.worker.cleanup.appDataTtl', '60')
    ('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector:10.0.2')
])

### Build Spark Session using DeltaLake Extension

In [3]:
builder = pyspark.sql.SparkSession.builder \
    .appName("Neo4Stream") \
    .master("spark://172.23.149.212:7077") \
    .config(conf=config) \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

22/06/08 09:30:17 WARN Utils: Your hostname, algorand-druid-and-spark resolves to a loopback address: 127.0.0.1; using 172.23.149.212 instead (on interface ens3)
22/06/08 09:30:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/ubuntu/.local/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0b8141b4-5a3d-4464-a025-0ace7a2af202;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 169ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-core_2.12;1.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      d

### Read the transaction table

### TODO noch neue nodes einbauen für andere dinge wie asset ids und nicht nur addressen, neue labels verpassen, selection stimmt irgendwie noch nicht, allgemeine überprüfung was mit anderen typen von transactions ist, also asset tx, paytx und auch app tx

In [4]:
# txn table to find all transactions between them
dfTxn = spark.read.format("delta").load("/mnt/delta/bronze/algod_indexer_public_txn_flat").limit(1000)
dfTxn = dfTxn.repartition(9)

                                                                                

### Build the transaction Network and write to Neo4j

Important: In neo4j there is a naming convention, node labels should use camelcase (beginning with uppercase) and relationship labels should use all uppercase with _

 #### Pepare the vertices/nodes dataframe for payment transactions

Prepare the edges/relationship dataframe

In [32]:
dfPaymentTx = dfTxn.filter(dfTxn.TYPEENUM == 1).select(dfTxn.TXN_SND, dfTxn.TXN_RCV, dfTxn.TXN_AMT, dfTxn.TXN_FEE, dfTxn.ROUND, dfTxn.INTRA)
dfPaymentTx.show(1, False, True)
print(dfPaymentTx.count())

                                                                                

-RECORD 0-----------------------------------------------
 TXN_SND | 7OUGoX3hg950O7LF51T+7uRwEWw9PMlVpp0axxDgZY0= 
 TXN_RCV | 8uB+aByCd418plJsSONsH7srs5CAkpjo8U4aHmvoh/c= 
 TXN_AMT | 1                                            
 TXN_FEE | 1000                                         
 ROUND   | 217819                                       
 INTRA   | 0                                            
only showing top 1 row





292


                                                                                

Create the vertices in Neo4j to later add the relationships

In [33]:
dfTxnSender = dfPaymentTx.select(dfPaymentTx.TXN_SND.alias("userAddress"))
dfTxnReceiver = dfPaymentTx.select(dfPaymentTx.TXN_RCV.alias("userAddress"))
dfPaymentNodes = dfTxnSender.union(dfTxnReceiver)
print(dfPaymentNodes.count())



584


                                                                                

Write the vertices to neo4j

In [34]:
dfPaymentNodes.write.format("org.neo4j.spark.DataSource") \
  .mode("Overwrite") \
  .option("url", "bolt://172.23.149.212:7687") \
  .option("labels", ":UserAddress") \
  .option("node.keys", "userAddress") \
  .save()

22/06/08 09:51:54 WARN SchemaService: Switching to query schema resolution
                                                                                

Write the relationships in neo4j

In [36]:
dfPaymentTx.write.format("org.neo4j.spark.DataSource") \
  .option("url", "bolt://172.23.149.212:7687") \
  .mode("Overwrite") \
  .option("relationship", "PAYMENT") \
  .option("relationship.save.strategy", "keys") \
  .option("relationship.source.labels", ":UserAddress") \
  .option("relationship.source.save.mode", "Overwrite") \
  .option("relationship.source.node.keys", "TXN_SND:userAddress") \
  .option("relationship.target.labels", ":UserAddress") \
  .option("relationship.target.save.mode", "Overwrite") \
  .option("relationship.target.node.keys", "TXN_RCV:userAddress") \
  .option("relationship.properties", "TXN_AMT:amount, TXN_FEE:fee, ROUND:blockNumber, INTRA:intraBlockTxNumber") \
  .save()

22/06/08 09:53:15 WARN SchemaService: Switching to query schema resolution
22/06/08 09:53:15 WARN SchemaService: Switching to query schema resolution
22/06/08 09:53:15 WARN SchemaService: Switching to query schema resolution
22/06/08 09:53:15 WARN SchemaService: For the following exception
org.neo4j.driver.exceptions.ClientException: Unable to convert scala.collection.immutable.Map$Map1 to Neo4j Value.
	at org.neo4j.driver.Values.value(Values.java:134)
	at org.neo4j.driver.internal.util.Extract.mapOfValues(Extract.java:203)
	at org.neo4j.driver.internal.AbstractQueryRunner.parameters(AbstractQueryRunner.java:69)
	at org.neo4j.driver.internal.AbstractQueryRunner.run(AbstractQueryRunner.java:43)
	at org.neo4j.spark.service.SchemaService.retrieveSchemaFromApoc(SchemaService.scala:68)
	at org.neo4j.spark.service.SchemaService.liftedTree2$1(SchemaService.scala:171)
	at org.neo4j.spark.service.SchemaService.structForRelationship(SchemaService.scala:155)
	at org.neo4j.spark.service.SchemaServ

22/06/08 09:53:26 WARN TaskSetManager: Lost task 11.0 in stage 453.0 (TID 88588) (172.23.149.212 executor 1): org.neo4j.driver.exceptions.TransientException: ForsetiClient[transactionId=4791, clientId=29] can't acquire ExclusiveLock{owner=ForsetiClient[transactionId=4785, clientId=3]} on NODE(44), because holders of that lock are waiting for ForsetiClient[transactionId=4791, clientId=29].
 Wait list:ExclusiveLock[
Client[4785] waits for [ForsetiClient[transactionId=4793, clientId=18],ForsetiClient[transactionId=4791, clientId=29]]]
	at org.neo4j.driver.internal.util.Futures.blockingGet(Futures.java:143)
	at org.neo4j.driver.internal.InternalTransaction.commit(InternalTransaction.java:39)
	at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:73)
	at org.neo4j.spark.writer.BaseDataWriter.commit(BaseDataWriter.scala:106)
	at org.neo4j.spark.writer.Neo4jDataWriter.commit(Neo4jDataWriter.scala:9)
	at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.$an

 Wait list:ExclusiveLock[
Client[4790] waits for [ForsetiClient[transactionId=4793, clientId=18]]]
	at org.neo4j.driver.internal.util.Futures.blockingGet(Futures.java:143)
	at org.neo4j.driver.internal.InternalTransaction.commit(InternalTransaction.java:39)
	at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:73)
	at org.neo4j.spark.writer.BaseDataWriter.commit(BaseDataWriter.scala:106)
	at org.neo4j.spark.writer.Neo4jDataWriter.commit(Neo4jDataWriter.scala:9)
	at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.$anonfun$run$1(WriteToDataSourceV2Exec.scala:430)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1496)
	at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:457)
	at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:358)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:

22/06/08 09:53:27 WARN TaskSetManager: Lost task 11.2 in stage 453.0 (TID 88595) (172.23.149.212 executor 9): org.neo4j.driver.exceptions.TransientException: ForsetiClient[transactionId=4798, clientId=18] can't acquire ExclusiveLock{owner=ForsetiClient[transactionId=4780, clientId=19]} on NODE(44), because holders of that lock are waiting for ForsetiClient[transactionId=4798, clientId=18].
 Wait list:ExclusiveLock[
Client[4780] waits for [ForsetiClient[transactionId=4798, clientId=18]]]
	at org.neo4j.driver.internal.util.Futures.blockingGet(Futures.java:143)
	at org.neo4j.driver.internal.InternalTransaction.commit(InternalTransaction.java:39)
	at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:73)
	at org.neo4j.spark.writer.BaseDataWriter.commit(BaseDataWriter.scala:106)
	at org.neo4j.spark.writer.Neo4jDataWriter.commit(Neo4jDataWriter.scala:9)
	at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.$anonfun$run$1(WriteToDataSourceV2Exec.scala:430)

                                                                                

#### Keyreg transactions and nodes

In [None]:
dfKeyregTx = dfTxn.filter(dfTxn.TYPEENUM == 2)
dfKeyregTx.show(1, False, True)

#### Asset configuration nodes and transactions

In [None]:
dfAssetConfigTx = dfTxn.filter(dfTxn.TYPEENUM == 3)
dfAssetConfigTx.show(1, False, True)

#### Asset transfer nodes and transactions

The xaid is the asset it which should also be node in the graph

In [None]:
dfAssetTransferTx = dfTxn.filter(dfTxn.TYPEENUM == 4)
dfAssetTransferTx.show(1, False, True)

#### Asset freeze nodes and transactions

The faid is the asset id being frozen. fadd is the address of the account whose asset is being frozen or unfrozen

In [None]:
dfAssetFreezeTx = dfTxn.filter(dfTxn.TYPEENUM == 5)
dfAssetFreezeTx.show(1, False, True)

#### Application nodes and transactions

The apid is the id of the application which should be a node

In [None]:
dfApplicationCallTx = dfTxn.filter(dfTxn.TYPEENUM == 6)
dfApplicationCallTx.show(1, False, True)