In [None]:
from pyspark.sql.types import  *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from datetime import datetime, date
from pyspark.sql import Row
from delta import *
from delta.tables import *
import pyspark
import tabulate
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

In [None]:
# Stop a session left over from an earlier run of this notebook so that the
# builder further down creates a new one instead of reusing it.
# On a fresh kernel `spark` has not been assigned yet; skip the stop in that
# case instead of aborting "Restart & Run All" with a NameError.
try:
    spark.stop()
except NameError:
    pass

In [None]:
# Spark properties for this notebook, keyed by property name.
spark_settings = {
    # Executor / driver sizing.
    # NOTE(review): spark.cores.max equals spark.executor.cores, which on a
    # standalone master presumably caps the application at a single executor,
    # so the 5-executor settings below may never be reached -- confirm.
    "spark.executor.memory": "16g",
    "spark.executor.cores": "9",
    "spark.cores.max": "9",
    "spark.driver.memory": "32g",
    "spark.executor.instances": "5",
    # Dynamic allocation: scale between 0 and 5 executors, starting with 1.
    "spark.dynamicAllocation.enabled": "true",
    "spark.dynamicAllocation.shuffleTracking.enabled": "true",
    "spark.dynamicAllocation.executorIdleTimeout": "60s",
    "spark.dynamicAllocation.minExecutors": "0",
    "spark.dynamicAllocation.maxExecutors": "5",
    "spark.dynamicAllocation.initialExecutors": "1",
    "spark.dynamicAllocation.executorAllocationRatio": "1",
    # Notebook / SQL behaviour.
    "spark.sql.repl.eagerEval.enabled": "true",
    "spark.databricks.delta.retentionDurationCheck.enabled": "false",
    # Connector package resolved when the session starts.
    "spark.jars.packages": "org.mongodb.spark:mongo-spark-connector:10.0.2",
}

config = pyspark.SparkConf().setAll(list(spark_settings.items()))

In [None]:
# Create (or reuse) the session named "MongoDB-Test" on the standalone master,
# applying the properties collected in `config`.
spark = (
    SparkSession.builder
    .config(conf=config)
    .appName("MongoDB-Test")
    .master("spark://172.23.149.212:7077")
    .getOrCreate()
)

In [None]:
# (column name, Spark type class) pairs in column order. This schema is passed
# to the MongoDB reader for the `txn` collection; every column is nullable.
_TXN_COLUMN_TYPES = (
    ("_id", StringType),
    ("asset", LongType),
    ("extra", StringType),
    ("intra", LongType),
    ("round", LongType),
    ("rr", LongType),
    ("sig", StringType),
    ("txid", StringType),
    ("txn_aamt", LongType),
    ("txn_aclose", StringType),
    ("txn_afrz", BooleanType),
    ("txn_amt", LongType),
    ("txn_apaa", StringType),
    ("txn_apan", LongType),
    ("txn_apap", StringType),
    ("txn_apar", StringType),
    ("txn_apas", StringType),
    ("txn_apat", StringType),
    ("txn_apep", StringType),
    ("txn_apfa", StringType),
    ("txn_apgs", StringType),
    ("txn_apid", LongType),
    ("txn_apls", StringType),
    ("txn_apsu", StringType),
    ("txn_arcv", StringType),
    ("txn_asnd", StringType),
    ("txn_caid", LongType),
    ("txn_close", StringType),
    ("txn_fadd", StringType),
    ("txn_faid", LongType),
    ("txn_fee", LongType),
    ("txn_fv", LongType),
    ("txn_gen", StringType),
    ("txn_gh", StringType),
    ("txn_grp", StringType),
    ("txn_lsig", StringType),
    ("txn_lv", LongType),
    ("txn_lx", StringType),
    ("txn_msig", StringType),
    ("txn_nonpart", BooleanType),
    ("txn_note", StringType),
    ("txn_rcv", StringType),
    ("txn_rekey", StringType),
    ("txn_selkey", StringType),
    ("txn_sig", StringType),
    ("txn_snd", StringType),
    ("txn_type", StringType),
    ("txn_votefst", LongType),
    ("txn_votekd", LongType),
    ("txn_votekey", StringType),
    ("txn_votelst", LongType),
    ("txn_xaid", LongType),
    ("typeenum", LongType),
)

schema = StructType(
    [
        StructField(column_name, spark_type(), True)
        for column_name, spark_type in _TXN_COLUMN_TYPES
    ]
)

In [None]:
# Reader options for the `txn` collection of the `algorand` database.
# NOTE(review): the change-stream and checkpoint options look streaming-oriented,
# and sampleSize is presumably only relevant to schema inference; confirm whether
# they are needed for a batch read that supplies an explicit schema.
mongo_read_options = {
    "spark.mongodb.connection.uri": "mongodb://172.23.149.212:27017",
    "spark.mongodb.database": "algorand",
    "spark.mongodb.collection": "txn",
    "spark.mongodb.read.readPreference.name": "primaryPreferred",
    "spark.mongodb.change.stream.publish.full.document.only": "true",
    "forceDeleteTempCheckpointLocation": "true",
    "spark.mongodb.read.sampleSize": 1000,
}

# Load the collection as a DataFrame using the explicit `schema`.
df = (
    spark.read.format("mongodb")
    .options(**mongo_read_options)
    .schema(schema)
    .load()
)

In [None]:
# Print the column names and types Spark associates with `df`.
df.printSchema()

In [None]:
# Count the rows of `df`. This is a Spark action, so it triggers the actual
# read from MongoDB.
df.count()

In [None]:
# Project the columns that are copied to the `algorand_silver.test` collection.
# NOTE(review): this cell used to select "index" and "created_at", but the
# explicit `schema` applied to `df` defines neither column, so that selection
# could not be resolved. "round" and "intra" are stand-ins taken from the
# schema -- confirm which columns are actually wanted here.
result = df.select("round", "intra")

In [None]:
# Print the column names and types of the projected DataFrame.
result.printSchema()

In [None]:
# Display the first rows of `result` as a text table (a Spark action).
result.show()

In [None]:
# Writer options targeting the `test` collection of the `algorand_silver` database.
# NOTE(review): forceDeleteTempCheckpointLocation looks streaming-oriented;
# confirm whether it has any effect on this batch write.
mongo_write_options = {
    "spark.mongodb.connection.uri": "mongodb://172.23.149.212:27017",
    "spark.mongodb.database": "algorand_silver",
    "spark.mongodb.collection": "test",
    "forceDeleteTempCheckpointLocation": "true",
}

# Save `result` through the MongoDB connector in "overwrite" mode.
(
    result.write.format("mongodb")
    .options(**mongo_write_options)
    .mode("overwrite")
    .save()
)