In [8]:
import json

# Load every MSD_* JSON export found under `prefix` in MinIO into a Spark
# DataFrame (one per entity type) and derive a flattened 'reagents' DataFrame
# from the reaction stoichiometry. Results are collected in `spark_dataframes`.
minio_client = get_minio_client()
bucket = "cdm-lake"
prefix = "msd_source/"
objects = minio_client.list_objects(bucket, prefix, recursive=False)

msd_db="modelseed_biochemistry"
spark = get_spark_session()
create_namespace_if_not_exists(spark, msd_db)

# Fields stripped from every record before it is handed to Spark.
delete_keys = ['thermodynamics','notes','linked_compound','is_obsolete', # compounds
               'is_core','is_cofactor','comprised_of','class', # compounds
               'abstract_compound','aliases', # compounds
               'abstract_reaction','code','ec_numbers','pathways', # reactions
               'linked_reaction','stoichiometry','definition','equation', # reactions
               'compound_ids' # reactions
              ]

# reaction id -> stoichiometry entries, captured before the field is stripped
stoichiometries = dict()

# entity type (e.g. 'compounds', 'reactions', 'reagents') -> Spark DataFrame
spark_dataframes = dict()
for file_obj in objects:

    if('/MSD_' not in file_obj.object_name):
        continue

    print(file_obj.object_name)

    all_entities = list()

    # compounds or reactions: taken from the text that follows '/MSD_' so the
    # result does not depend on how many underscores the prefix contains.
    msd_type = file_obj.object_name.split('/MSD_', 1)[1].lower().split('_')[0].split('.')[0]

    # NOTE(review): the table name is not qualified with msd_db, so this drops
    # from whatever database is current on the session; the write cell uses
    # mode("overwrite") on f"{msd_db}.{msd_type}" -- confirm this drop is needed.
    spark.sql(f"DROP TABLE IF EXISTS {msd_type}")
    file_resp = minio_client.get_object(bucket,file_obj.object_name)
    try:
        biochem_entities = file_resp.json()
    finally:
        # Always hand the HTTP connection back, even if parsing fails.
        file_resp.close()
        file_resp.release_conn()

    for entity in biochem_entities:
        # Keep the stoichiometry aside once per reaction (records without the
        # key are simply skipped) before it is removed below.
        if(msd_type == 'reactions' and 'stoichiometry' in entity):
            stoichiometries[entity['id']]=entity['stoichiometry']

        for field in delete_keys:
            entity.pop(field, None)

        # has to be a list of json strings
        all_entities.append(json.dumps(entity))

    rdd = spark.sparkContext.parallelize(all_entities)
    df = spark.read.json(rdd)
    spark_dataframes[msd_type]=df

# One row per (reaction, stoichiometry entry).
rgt_entities=list()
for rxn, rxn_reagents in stoichiometries.items():
    for rgt in rxn_reagents:
        entity = {'reaction_id':rxn,
                  'compound_id':rgt['compound'],
                  'compartment_index':rgt['compartment'],
                  'stoichiometry':rgt['coefficient']}
        rgt_entities.append(json.dumps(entity))
rdd=spark.sparkContext.parallelize(rgt_entities)
df=spark.read.json(rdd)
spark_dataframes['reagents']=df

25/01/06 17:25:08 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Namespace modelseed_biochemistry is ready to use.
msd_source/MSD_Compounds.json


25/01/06 17:25:13 WARN TaskSetManager: Stage 5 contains a task of very large size (8822 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

msd_source/MSD_Reactions.json


25/01/06 17:25:30 WARN TaskSetManager: Stage 6 contains a task of very large size (6258 KiB). The maximum recommended task size is 1000 KiB.
25/01/06 17:25:32 WARN TaskSetManager: Stage 7 contains a task of very large size (13007 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [9]:
# Switch the session to the biochemistry namespace, then list its tables.
spark = get_spark_session()
for statement in ("USE modelseed_biochemistry", "SHOW TABLES"):
    spark.sql(statement).show()

25/01/06 17:29:51 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
25/01/06 17:29:51 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


++
||
++
++

+--------------------+--------------------+-----------+
|           namespace|           tableName|isTemporary|
+--------------------+--------------------+-----------+
|modelseed_biochem...|           compounds|      false|
|modelseed_biochem...|kegg_pathway_reac...|      false|
|modelseed_biochem...|       kegg_pathways|      false|
|modelseed_biochem...|metacyc_pathway_r...|      false|
|modelseed_biochem...|    metacyc_pathways|      false|
|modelseed_biochem...|   pathway_reactions|      false|
|modelseed_biochem...|            pathways|      false|
|modelseed_biochem...|           reactions|      false|
|modelseed_biochem...|            reagents|      false|
+--------------------+--------------------+-----------+



In [10]:
# Preview the first rows of the two DataFrames loaded from the MSD files.
for preview_type in ('compounds', 'reactions'):
    spark_dataframes[preview_type].show()

25/01/06 17:29:57 WARN TaskSetManager: Stage 8 contains a task of very large size (8822 KiB). The maximum recommended task size is 1000 KiB.


+------------+------+-------+---------+--------------+--------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------------+
|abbreviation|charge| deltag|deltagerr|       formula|      id|            inchikey| mass|                name|                 pka|                 pkb|              smiles|          source|
+------------+------+-------+---------+--------------+--------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------------+
|         h2o|     0| -37.54|     0.18|           H2O|cpd00001|XLYOFNOQVPJJNP-UH...| 18.0|                 H2O|           1:1:15.70|           1:1:-1.80|                   O|Primary Database|
|         atp|    -3|-548.85|     0.36| C10H13N5O13P3|cpd00002|ZKHQWZAMYRWXGA-KQ...|504.0|                 ATP|1:14:12.60;1:22:3...|1:9:-3.03;1:14:-3...|Nc1ncnc2c1ncn2[C@...|Primary Database|
|         nad|    -1|-286.41|     1.59| 

25/01/06 17:29:58 WARN TaskSetManager: Stage 9 contains a task of very large size (6258 KiB). The maximum recommended task size is 1000 KiB.


+------------+------+---------+---------+--------+------------+--------------------+-------------+------------------+------+
|abbreviation|deltag|deltagerr|direction|      id|is_transport|                name|reversibility|            source|status|
+------------+------+---------+---------+--------+------------+--------------------+-------------+------------------+------+
|      R00004| -3.46|     0.05|     NULL|rxn00001|           0|diphosphate phosp...|            >|  Primary Database|    OK|
|      R00005|-20.14|     1.86|     NULL|rxn00002|           0|urea-1-carboxylat...|            >|  Primary Database|    OK|
|      R00006|  8.27|      0.9|     NULL|rxn00003|           0|pyruvate:pyruvate...|            <|  Primary Database|    OK|
|      R00008|  4.49|     0.57|     NULL|rxn00004|           0|4-hydroxy-4-methy...|            =|  Primary Database|    OK|
|      R00009|-46.06|     1.64|     NULL|rxn00006|           0|hydrogen-peroxide...|            >|  Primary Database|    OK|


In [11]:
from pyspark.sql.types import (
	StringType, LongType, DoubleType, BooleanType
)
from pyspark.sql.functions import col

# Cast the Spark-inferred column types of every loaded DataFrame and store the
# result back in `spark_dataframes`, so that later cells reading from the
# dictionary see the cast schema.
for msd_type in spark_dataframes:
    df=spark_dataframes[msd_type]
    for field, dtype in df.dtypes:

        if(dtype == 'string'):
            df = df.withColumn(field, col(field).cast(StringType()))
        elif(dtype == 'double'):
            df = df.withColumn(field, col(field).cast(DoubleType()))
        elif(dtype == 'bigint' and field == 'is_transport'):
            # is_transport is inferred as an integer; store it as a boolean.
            df = df.withColumn(field, col(field).cast(BooleanType()))
        elif(dtype == 'bigint'):
            df = df.withColumn(field, col(field).cast(LongType()))
        elif(dtype == 'boolean'):
            # Already boolean (e.g. this cell was run before): nothing to do.
            continue
        else:
            print("Unsupported field?",field,dtype)

    # DataFrames are immutable: withColumn returns a new frame, so without
    # this assignment every cast above would be thrown away.
    spark_dataframes[msd_type]=df


In [12]:
# Write each DataFrame to the bucket in Delta format and register it as a
# table in the msd_db namespace, replacing any previous contents and schema.
for msd_type, df in spark_dataframes.items():
    delta_file = f"msd_delta/{msd_type}.delta"
    spark_table = f"{msd_db}.{msd_type}"
    writer = (
        df.write
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .option("compression", "snappy")
        .option("path", f"s3a://{bucket}/{delta_file}")
        .format("delta")
    )
    writer.saveAsTable(spark_table)
    print(f"Spark table {spark_table} created.")

25/01/06 17:30:43 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/01/06 17:30:54 WARN TaskSetManager: Stage 17 contains a task of very large size (8822 KiB). The maximum recommended task size is 1000 KiB.
25/01/06 17:30:57 WARN TaskSetManager: Lost task 1.0 in stage 17.0 (TID 74) (development-yarn-nodemanager-4-1 executor 2): org.apache.spark.SparkException: [TASK_WRITE_FAILED] Task failed while writing rows to s3a://cdm-lake/msd_delta/compounds.delta.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:774)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.executeTask(DeltaFileFormatWriter.scala:447)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.$anonfun$executeWrite$2(DeltaFileFormatWriter.scala:274)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at or

Py4JJavaError: An error occurred while calling o769.saveAsTable.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 17.0 failed 4 times, most recent failure: Lost task 0.3 in stage 17.0 (TID 78) (development-yarn-nodemanager-4-1 executor 2): org.apache.spark.SparkException: [TASK_WRITE_FAILED] Task failed while writing rows to s3a://cdm-lake/msd_delta/compounds.delta.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:774)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.executeTask(DeltaFileFormatWriter.scala:447)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.$anonfun$executeWrite$2(DeltaFileFormatWriter.scala:274)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.nio.file.AccessDeniedException: msd_delta/compounds.delta/part-00000-84ccd335-86b5-49a2-b633-781b68b2fce8-c000.snappy.parquet: Writing Object on msd_delta/compounds.delta/part-00000-84ccd335-86b5-49a2-b633-781b68b2fce8-c000.snappy.parquet: com.amazonaws.services.s3.model.AmazonS3Exception: Access Denied. (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 18182AC012BF7CA3; S3 Extended Request ID: 6008fa4fca9724794957431f1b601abef4f4d0a7ea602326c858484a7fb219ca; Proxy: null), S3 Extended Request ID: 6008fa4fca9724794957431f1b601abef4f4d0a7ea602326c858484a7fb219ca:AccessDenied
	at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:255)
	at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:119)
	at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$4(Invoker.java:322)
	at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:414)
	at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:318)
	at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:293)
	at org.apache.hadoop.fs.s3a.WriteOperationHelper.retry(WriteOperationHelper.java:208)
	at org.apache.hadoop.fs.s3a.WriteOperationHelper.putObject(WriteOperationHelper.java:563)
	at org.apache.hadoop.fs.s3a.S3ABlockOutputStream.lambda$putObject$0(S3ABlockOutputStream.java:562)
	at org.apache.hadoop.thirdparty.com.google.common.util.concurrent.TrustedListenableFutureTask$TrustedFutureInterruptibleTask.runInterruptibly(TrustedListenableFutureTask.java:125)
	at org.apache.hadoop.thirdparty.com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:69)
	at org.apache.hadoop.thirdparty.com.google.common.util.concurrent.TrustedListenableFutureTask.run(TrustedListenableFutureTask.java:78)
	at org.apache.hadoop.util.SemaphoredDelegatingExecutor$RunnableWithPermitRelease.run(SemaphoredDelegatingExecutor.java:196)
	at org.apache.hadoop.util.SemaphoredDelegatingExecutor$RunnableWithPermitRelease.run(SemaphoredDelegatingExecutor.java:196)
	... 3 more
Caused by: com.amazonaws.services.s3.model.AmazonS3Exception: Access Denied. (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 18182AC012BF7CA3; S3 Extended Request ID: 6008fa4fca9724794957431f1b601abef4f4d0a7ea602326c858484a7fb219ca; Proxy: null), S3 Extended Request ID: 6008fa4fca9724794957431f1b601abef4f4d0a7ea602326c858484a7fb219ca
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1879)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleServiceErrorResponse(AmazonHttpClient.java:1418)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1387)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1157)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:814)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:781)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:755)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:715)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:697)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:561)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:541)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5456)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5403)
	at com.amazonaws.services.s3.AmazonS3Client.access$300(AmazonS3Client.java:421)
	at com.amazonaws.services.s3.AmazonS3Client$PutObjectStrategy.invokeServiceCall(AmazonS3Client.java:6531)
	at com.amazonaws.services.s3.AmazonS3Client.uploadObject(AmazonS3Client.java:1861)
	at com.amazonaws.services.s3.AmazonS3Client.putObject(AmazonS3Client.java:1821)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$putObjectDirect$17(S3AFileSystem.java:2877)
	at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDurationOfSupplier(IOStatisticsBinding.java:604)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.putObjectDirect(S3AFileSystem.java:2874)
	at org.apache.hadoop.fs.s3a.WriteOperationHelper.lambda$putObject$7(WriteOperationHelper.java:566)
	at org.apache.hadoop.fs.store.audit.AuditingFunctions.lambda$withinAuditSpan$0(AuditingFunctions.java:62)
	at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:117)
	... 15 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.$anonfun$executeWrite$1(DeltaFileFormatWriter.scala:263)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.writeAndCommit(DeltaFileFormatWriter.scala:295)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.executeWrite(DeltaFileFormatWriter.scala:234)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.write(DeltaFileFormatWriter.scala:214)
	at org.apache.spark.sql.delta.files.TransactionalWrite.$anonfun$writeFiles$1(TransactionalWrite.scala:440)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.delta.files.TransactionalWrite.writeFiles(TransactionalWrite.scala:398)
	at org.apache.spark.sql.delta.files.TransactionalWrite.writeFiles$(TransactionalWrite.scala:371)
	at org.apache.spark.sql.delta.OptimisticTransaction.writeFiles(OptimisticTransaction.scala:147)
	at org.apache.spark.sql.delta.files.TransactionalWrite.writeFiles(TransactionalWrite.scala:246)
	at org.apache.spark.sql.delta.files.TransactionalWrite.writeFiles$(TransactionalWrite.scala:242)
	at org.apache.spark.sql.delta.OptimisticTransaction.writeFiles(OptimisticTransaction.scala:147)
	at org.apache.spark.sql.delta.files.TransactionalWrite.writeFiles(TransactionalWrite.scala:235)
	at org.apache.spark.sql.delta.files.TransactionalWrite.writeFiles$(TransactionalWrite.scala:232)
	at org.apache.spark.sql.delta.OptimisticTransaction.writeFiles(OptimisticTransaction.scala:147)
	at org.apache.spark.sql.delta.commands.WriteIntoDelta.writeFiles(WriteIntoDelta.scala:349)
	at org.apache.spark.sql.delta.commands.WriteIntoDelta.writeAndReturnCommitData(WriteIntoDelta.scala:297)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.doDeltaWrite$1(CreateDeltaTableCommand.scala:250)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.handleCreateTableAsSelect(CreateDeltaTableCommand.scala:276)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.handleCommit(CreateDeltaTableCommand.scala:150)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.$anonfun$run$2(CreateDeltaTableCommand.scala:110)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:168)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:166)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.recordFrameProfile(CreateDeltaTableCommand.scala:57)
	at org.apache.spark.sql.delta.metering.DeltaLogging.$anonfun$recordDeltaOperationInternal$1(DeltaLogging.scala:136)
	at com.databricks.spark.util.DatabricksLogging.recordOperation(DatabricksLogging.scala:128)
	at com.databricks.spark.util.DatabricksLogging.recordOperation$(DatabricksLogging.scala:117)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.recordOperation(CreateDeltaTableCommand.scala:57)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperationInternal(DeltaLogging.scala:135)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:125)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:115)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.recordDeltaOperation(CreateDeltaTableCommand.scala:57)
	at org.apache.spark.sql.delta.commands.CreateDeltaTableCommand.run(CreateDeltaTableCommand.scala:110)
	at org.apache.spark.sql.delta.catalog.DeltaCatalog.$anonfun$createDeltaTable$1(DeltaCatalog.scala:184)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:168)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:166)
	at org.apache.spark.sql.delta.catalog.DeltaCatalog.recordFrameProfile(DeltaCatalog.scala:65)
	at org.apache.spark.sql.delta.catalog.DeltaCatalog.org$apache$spark$sql$delta$catalog$DeltaCatalog$$createDeltaTable(DeltaCatalog.scala:95)
	at org.apache.spark.sql.delta.catalog.DeltaCatalog$StagedDeltaTableV2.$anonfun$commitStagedChanges$1(DeltaCatalog.scala:545)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile(DeltaLogging.scala:168)
	at org.apache.spark.sql.delta.metering.DeltaLogging.recordFrameProfile$(DeltaLogging.scala:166)
	at org.apache.spark.sql.delta.catalog.DeltaCatalog.recordFrameProfile(DeltaCatalog.scala:65)
	at org.apache.spark.sql.delta.catalog.DeltaCatalog$StagedDeltaTableV2.commitStagedChanges(DeltaCatalog.scala:507)
	at org.apache.spark.sql.execution.datasources.v2.V2CreateTableAsSelectBaseExec.$anonfun$writeToTable$1(WriteToDataSourceV2Exec.scala:580)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.v2.V2CreateTableAsSelectBaseExec.writeToTable(WriteToDataSourceV2Exec.scala:573)
	at org.apache.spark.sql.execution.datasources.v2.V2CreateTableAsSelectBaseExec.writeToTable$(WriteToDataSourceV2Exec.scala:567)
	at org.apache.spark.sql.execution.datasources.v2.AtomicReplaceTableAsSelectExec.writeToTable(WriteToDataSourceV2Exec.scala:183)
	at org.apache.spark.sql.execution.datasources.v2.AtomicReplaceTableAsSelectExec.run(WriteToDataSourceV2Exec.scala:216)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:859)
	at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:634)
	at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:568)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: org.apache.spark.SparkException: [TASK_WRITE_FAILED] Task failed while writing rows to s3a://cdm-lake/msd_delta/compounds.delta.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:774)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.executeTask(DeltaFileFormatWriter.scala:447)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.$anonfun$executeWrite$2(DeltaFileFormatWriter.scala:274)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.nio.file.AccessDeniedException: msd_delta/compounds.delta/part-00000-84ccd335-86b5-49a2-b633-781b68b2fce8-c000.snappy.parquet: Writing Object on msd_delta/compounds.delta/part-00000-84ccd335-86b5-49a2-b633-781b68b2fce8-c000.snappy.parquet: com.amazonaws.services.s3.model.AmazonS3Exception: Access Denied. (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 18182AC012BF7CA3; S3 Extended Request ID: 6008fa4fca9724794957431f1b601abef4f4d0a7ea602326c858484a7fb219ca; Proxy: null), S3 Extended Request ID: 6008fa4fca9724794957431f1b601abef4f4d0a7ea602326c858484a7fb219ca:AccessDenied
	at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:255)
	at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:119)
	at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$4(Invoker.java:322)
	at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:414)
	at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:318)
	at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:293)
	at org.apache.hadoop.fs.s3a.WriteOperationHelper.retry(WriteOperationHelper.java:208)
	at org.apache.hadoop.fs.s3a.WriteOperationHelper.putObject(WriteOperationHelper.java:563)
	at org.apache.hadoop.fs.s3a.S3ABlockOutputStream.lambda$putObject$0(S3ABlockOutputStream.java:562)
	at org.apache.hadoop.thirdparty.com.google.common.util.concurrent.TrustedListenableFutureTask$TrustedFutureInterruptibleTask.runInterruptibly(TrustedListenableFutureTask.java:125)
	at org.apache.hadoop.thirdparty.com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:69)
	at org.apache.hadoop.thirdparty.com.google.common.util.concurrent.TrustedListenableFutureTask.run(TrustedListenableFutureTask.java:78)
	at org.apache.hadoop.util.SemaphoredDelegatingExecutor$RunnableWithPermitRelease.run(SemaphoredDelegatingExecutor.java:196)
	at org.apache.hadoop.util.SemaphoredDelegatingExecutor$RunnableWithPermitRelease.run(SemaphoredDelegatingExecutor.java:196)
	... 3 more
Caused by: com.amazonaws.services.s3.model.AmazonS3Exception: Access Denied. (Service: Amazon S3; Status Code: 403; Error Code: AccessDenied; Request ID: 18182AC012BF7CA3; S3 Extended Request ID: 6008fa4fca9724794957431f1b601abef4f4d0a7ea602326c858484a7fb219ca; Proxy: null), S3 Extended Request ID: 6008fa4fca9724794957431f1b601abef4f4d0a7ea602326c858484a7fb219ca
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1879)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleServiceErrorResponse(AmazonHttpClient.java:1418)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1387)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1157)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:814)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:781)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:755)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:715)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:697)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:561)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:541)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5456)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5403)
	at com.amazonaws.services.s3.AmazonS3Client.access$300(AmazonS3Client.java:421)
	at com.amazonaws.services.s3.AmazonS3Client$PutObjectStrategy.invokeServiceCall(AmazonS3Client.java:6531)
	at com.amazonaws.services.s3.AmazonS3Client.uploadObject(AmazonS3Client.java:1861)
	at com.amazonaws.services.s3.AmazonS3Client.putObject(AmazonS3Client.java:1821)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$putObjectDirect$17(S3AFileSystem.java:2877)
	at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDurationOfSupplier(IOStatisticsBinding.java:604)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.putObjectDirect(S3AFileSystem.java:2874)
	at org.apache.hadoop.fs.s3a.WriteOperationHelper.lambda$putObject$7(WriteOperationHelper.java:566)
	at org.apache.hadoop.fs.store.audit.AuditingFunctions.lambda$withinAuditSpan$0(AuditingFunctions.java:62)
	at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:117)
	... 15 more


25/01/06 17:31:06 WARN TaskSetManager: Lost task 1.1 in stage 17.0 (TID 75) (development-yarn-nodemanager-3-1 executor 4): TaskKilled (Stage cancelled: Job aborted due to stage failure: Task 0 in stage 17.0 failed 4 times, most recent failure: Lost task 0.3 in stage 17.0 (TID 78) (development-yarn-nodemanager-4-1 executor 2): org.apache.spark.SparkException: [TASK_WRITE_FAILED] Task failed while writing rows to s3a://cdm-lake/msd_delta/compounds.delta.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:774)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.executeTask(DeltaFileFormatWriter.scala:447)
	at org.apache.spark.sql.delta.files.DeltaFileFormatWriter$.$anonfun$executeWrite$2(DeltaFileFormatWriter.scala:274)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)

In [6]:
# Sanity check: report the row count of each table, one result per query,
# in the order compounds -> reactions -> reagents.
row_count_queries = (
    "SELECT COUNT(*) AS Compound_Count from compounds",
    "SELECT COUNT(*) AS Reaction_Count from reactions",
    "SELECT COUNT(*) AS Reagent_Count from reagents",
)
for row_count_query in row_count_queries:
    spark.sql(row_count_query).show()

                                                                                

+--------------+
|Compound_Count|
+--------------+
|         45706|
+--------------+



                                                                                

+--------------+
|Reaction_Count|
+--------------+
|         56009|
+--------------+



                                                                                

+-------------+
|Reagent_Count|
+-------------+
|       262506|
+-------------+



In [35]:
# Preview the first 10 rows of each table. Every query carries an ORDER BY
# so the rows shown are the lowest-sorted ids rather than an arbitrary sample.
preview_queries = [
    "SELECT * from compounds ORDER BY id LIMIT 10",
    "SELECT * from reactions ORDER BY id LIMIT 10",
    "SELECT * from reagents ORDER BY reaction_id,compound_id LIMIT 10",
]
for preview_query in preview_queries:
    spark.sql(preview_query).show()

                                                                                

+------------+------+-------+---------+--------------+--------+--------------------+-----+---------+--------------------+--------------------+--------------------+----------------+
|abbreviation|charge| deltag|deltagerr|       formula|      id|            inchikey| mass|     name|                 pka|                 pkb|              smiles|          source|
+------------+------+-------+---------+--------------+--------+--------------------+-----+---------+--------------------+--------------------+--------------------+----------------+
|         h2o|     0| -37.54|     0.18|           H2O|cpd00001|XLYOFNOQVPJJNP-UH...| 18.0|      H2O|           1:1:15.70|           1:1:-1.80|                   O|Primary Database|
|         atp|    -3|-548.85|     0.36| C10H13N5O13P3|cpd00002|ZKHQWZAMYRWXGA-KQ...|504.0|      ATP|1:14:12.60;1:22:3...|1:9:-3.03;1:14:-3...|Nc1ncnc2c1ncn2[C@...|Primary Database|
|         nad|    -1|-286.41|     1.59| C21H26N7O14P2|cpd00003|BAWFJGJZGIEFAR-NN...|662.0|     

                                                                                

+------------+------+---------+---------+--------+------------+--------------------+-------------+----------------+------+
|abbreviation|deltag|deltagerr|direction|      id|is_transport|                name|reversibility|          source|status|
+------------+------+---------+---------+--------+------------+--------------------+-------------+----------------+------+
|      R00004| -3.46|     0.05|     NULL|rxn00001|           0|diphosphate phosp...|            >|Primary Database|    OK|
|      R00005|-20.14|     1.86|     NULL|rxn00002|           0|urea-1-carboxylat...|            >|Primary Database|    OK|
|      R00006|  8.27|      0.9|     NULL|rxn00003|           0|pyruvate:pyruvate...|            <|Primary Database|    OK|
|      R00008|  4.49|     0.57|     NULL|rxn00004|           0|4-hydroxy-4-methy...|            =|Primary Database|    OK|
|      R00009|-46.06|     1.64|     NULL|rxn00006|           0|hydrogen-peroxide...|            >|Primary Database|    OK|
|      R00010| -



+-----------------+-----------+-----------+-------------+
|compartment_index|compound_id|reaction_id|stoichiometry|
+-----------------+-----------+-----------+-------------+
|                0|   cpd00001|   rxn00001|         -1.0|
|                0|   cpd00009|   rxn00001|          2.0|
|                0|   cpd00012|   rxn00001|         -1.0|
|                0|   cpd00067|   rxn00001|          1.0|
|                0|   cpd00001|   rxn00002|         -1.0|
|                0|   cpd00011|   rxn00002|          2.0|
|                0|   cpd00013|   rxn00002|          2.0|
|                0|   cpd00067|   rxn00002|         -3.0|
|                0|   cpd00742|   rxn00002|         -1.0|
|                0|   cpd00011|   rxn00003|         -1.0|
+-----------------+-----------+-----------+-------------+



                                                                                

Stopping Spark session after timeout...
Stopping Spark session after timeout...


In [25]:
# List reagent rows whose compound name contains "quinon" (case-insensitive),
# together with the id and name of the compound and of the reaction.
# The reagents table links the two: it is matched to compounds on compound_id
# and to reactions on reaction_id.
# Adjacent string literals are concatenated by Python into a single SQL string.
quinone_reagents_sql = (
    'SELECT rxn.id,rxn.name,cpd.id,cpd.name,rgt.compartment_index,rgt.stoichiometry '
    '        FROM compounds AS cpd, reagents AS rgt, reactions AS rxn '
    '        WHERE LOWER(cpd.name) LIKE "%quinon%" '
    '        AND rgt.compound_id = cpd.id '
    '        AND rgt.reaction_id = rxn.id'
)
quinone_reagents_df = spark.sql(quinone_reagents_sql)
quinone_reagents_df.show()

+--------+--------------------+--------+--------------------+-----------------+-------------+
|      id|                name|      id|                name|compartment_index|stoichiometry|
+--------+--------------------+--------+--------------------+-----------------+-------------+
|rxn33538|            RXN-2543|cpd29943|2,3-dimethyl-6-ph...|                0|         -1.0|
|rxn33627|                   -|cpd03449|2-Octaprenyl-3-me...|                0|         -1.0|
|rxn33627|                   -|cpd03449|2-Octaprenyl-3-me...|                1|          1.0|
|rxn33767|            RXN-2542|cpd29953|2-methyl-6-phytyl...|                0|         -1.0|
|rxn33767|            RXN-2542|cpd29943|2,3-dimethyl-6-ph...|                0|          1.0|
|rxn34195|alcohol dehydroge...|cpd30039|PQQ (Pyrroloquino...|                0|         -1.0|
|rxn34195|alcohol dehydroge...|cpd30040|PQQ (pyrroloquino...|                0|          1.0|
|rxn34200|cytochrome b6/f c...|cpd12011|       Plastoquinone

In [30]:
# Count the distinct reactions that have at least one reagent whose compound
# name contains "quinon" (case-insensitive). The join conditions are the same
# compound_id / reaction_id matches through the reagents table.
# Adjacent string literals are concatenated by Python into a single SQL string.
quinone_reaction_count_sql = (
    'SELECT COUNT(DISTINCT(rxn.id)) AS Reaction_Count '
    '        FROM compounds AS cpd, reagents AS rgt, reactions AS rxn '
    '        WHERE LOWER(cpd.name) LIKE "%quinon%" '
    '        AND rgt.compound_id = cpd.id '
    '        AND rgt.reaction_id = rxn.id'
)
quinone_reaction_count_df = spark.sql(quinone_reaction_count_sql)
quinone_reaction_count_df.show()

+--------------+
|Reaction_Count|
+--------------+
|          1317|
+--------------+

Stopping Spark session after timeout...
Stopping Spark session after timeout...
Stopping Spark session after timeout...
