In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import types

In [3]:
spark = SparkSession.builder \
    .appName("Data_Transformation") \
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") \
    .config("spark.jars", "gcs-connector-hadoop3-latest.jar,spark-bigquery-latest_2.12.jar") \
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", "/home/chenchen/.gc/my-creds.json") \
    .getOrCreate()

25/03/18 12:45:58 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
25/03/18 12:45:58 INFO SharedState: Warehouse path is 'file:/home/chenchen/DE-Zoomcamp-Project/spark_transformation/spark-warehouse'.


In [4]:
df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .option("compression", "gzip") \
    .option("encoding", "us-ascii") \
    .csv("gs://de-zoomcamp-project-453801-terra-bucket/stock_dataset/stock/1742135256.3842237.3b60b69d81.csv.gz") 

25/03/18 12:46:11 INFO GhfsStorageStatistics: Detected potential high latency for operation op_get_file_status. latencyMs=749; previousMaxLatencyMs=0; operationCount=1; context=gs://de-zoomcamp-project-453801-terra-bucket/stock_dataset/stock/1742135256.3842237.3b60b69d81.csv.gz
25/03/18 12:46:11 INFO InMemoryFileIndex: It took 88 ms to list leaf files for 1 paths.
25/03/18 12:46:11 INFO InMemoryFileIndex: It took 41 ms to list leaf files for 1 paths.
25/03/18 12:46:14 INFO FileSourceStrategy: Pushed Filters: 
25/03/18 12:46:14 INFO FileSourceStrategy: Post-Scan Filters: 
25/03/18 12:46:14 INFO FileSourceStrategy: Output Data Schema: struct<value: string>
25/03/18 12:46:15 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 201.5 KiB, free 434.2 MiB)
25/03/18 12:46:15 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 34.6 KiB, free 434.2 MiB)
25/03/18 12:46:15 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on de-zo

                                                                                

25/03/18 12:46:17 INFO CodeGenerator: Code generated in 14.231533 ms
25/03/18 12:46:17 INFO SparkContext: Starting job: csv at NativeMethodAccessorImpl.java:0
25/03/18 12:46:17 INFO DAGScheduler: Got job 1 (csv at NativeMethodAccessorImpl.java:0) with 1 output partitions
25/03/18 12:46:17 INFO DAGScheduler: Final stage: ResultStage 1 (csv at NativeMethodAccessorImpl.java:0)
25/03/18 12:46:17 INFO DAGScheduler: Parents of final stage: List()
25/03/18 12:46:17 INFO DAGScheduler: Missing parents: List()
25/03/18 12:46:17 INFO DAGScheduler: Submitting ResultStage 1 (MapPartitionsRDD[10] at csv at NativeMethodAccessorImpl.java:0), which has no missing parents
25/03/18 12:46:17 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 18.4 KiB, free 434.1 MiB)
25/03/18 12:46:17 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 8.8 KiB, free 434.1 MiB)
25/03/18 12:46:17 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on de-zoom

In [5]:
df.show(5)

25/03/18 12:46:17 INFO FileSourceStrategy: Pushed Filters: 
25/03/18 12:46:17 INFO FileSourceStrategy: Post-Scan Filters: 
25/03/18 12:46:17 INFO FileSourceStrategy: Output Data Schema: struct<open: double, high: double, low: double, close: double, volume: double ... 15 more fields>
25/03/18 12:46:18 INFO CodeGenerator: Code generated in 51.916017 ms
25/03/18 12:46:18 INFO MemoryStore: Block broadcast_3 stored as values in memory (estimated size 201.4 KiB, free 433.9 MiB)
25/03/18 12:46:18 INFO MemoryStore: Block broadcast_3_piece0 stored as bytes in memory (estimated size 34.6 KiB, free 433.9 MiB)
25/03/18 12:46:18 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal:46511 (size: 34.6 KiB, free: 434.3 MiB)
25/03/18 12:46:18 INFO SparkContext: Created broadcast 3 from showString at NativeMethodAccessorImpl.java:0
25/03/18 12:46:18 INFO FileSourceScanExec: Planning scan with bin packing, max size: 4194304 b

In [6]:
from pyspark.sql import functions as F
from pyspark.sql.types import DateType

# Assuming you have the DataFrame loaded as `df`

# Convert the 'date' column to a proper Date type (if it's not already)
df = df.withColumn("date", F.to_date(df["date"], "yyyy-MM-dd"))

# Extract year and month from the date to perform monthly aggregation
df = df.withColumn("year_month", F.date_format("date", "yyyy-MM"))

# Group by 'symbol' and 'year_month' and calculate the required aggregates
monthly_df = df.groupBy("symbol", "year_month") \
    .agg(
        F.avg("open").alias("avg_open"),
        F.max("high").alias("max_high"),
        F.min("low").alias("min_low"),
        F.avg("close").alias("avg_close"),
        F.max("volume").alias("max_volume"),
        F.avg("adj_high").alias("avg_adj_high"),
        F.min("adj_low").alias("min_adj_low"),
        F.avg("adj_close").alias("avg_adj_close"),
        F.avg("adj_open").alias("avg_adj_open"),
        F.avg("adj_volume").alias("avg_adj_volume"),
        F.avg("split_factor").alias("avg_split_factor"),
        F.avg("dividend").alias("avg_dividend")
    )

# Show the result
monthly_df.show(5)


25/03/18 12:46:29 INFO FileSourceStrategy: Pushed Filters: 
25/03/18 12:46:29 INFO FileSourceStrategy: Post-Scan Filters: 
25/03/18 12:46:29 INFO FileSourceStrategy: Output Data Schema: struct<open: double, high: double, low: double, close: double, volume: double ... 12 more fields>
25/03/18 12:46:29 INFO BlockManagerInfo: Removed broadcast_0_piece0 on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal:46511 in memory (size: 34.6 KiB, free: 434.3 MiB)
25/03/18 12:46:29 INFO BlockManagerInfo: Removed broadcast_3_piece0 on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal:46511 in memory (size: 34.6 KiB, free: 434.4 MiB)
25/03/18 12:46:29 INFO BlockManagerInfo: Removed broadcast_2_piece0 on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal:46511 in memory (size: 8.8 KiB, free: 434.4 MiB)
25/03/18 12:46:30 INFO CodeGenerator: Code generated in 223.499614 ms
25/03/18 12:46:30 INFO BlockManagerInfo: Removed broadcast_4_piec

[Stage 3:>                                                          (0 + 1) / 1]

25/03/18 12:46:31 INFO Executor: Finished task 0.0 in stage 3.0 (TID 3). 2833 bytes result sent to driver
25/03/18 12:46:31 INFO TaskSetManager: Finished task 0.0 in stage 3.0 (TID 3) in 840 ms on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal (executor driver) (1/1)
25/03/18 12:46:31 INFO TaskSchedulerImpl: Removed TaskSet 3.0, whose tasks have all completed, from pool 
25/03/18 12:46:31 INFO DAGScheduler: ShuffleMapStage 3 (showString at NativeMethodAccessorImpl.java:0) finished in 0.915 s
25/03/18 12:46:31 INFO DAGScheduler: looking for newly runnable stages
25/03/18 12:46:31 INFO DAGScheduler: running: Set()
25/03/18 12:46:31 INFO DAGScheduler: waiting: Set()
25/03/18 12:46:31 INFO DAGScheduler: failed: Set()
25/03/18 12:46:31 INFO ShufflePartitionsUtil: For shuffle(0), advisory target size: 67108864, actual target size 1048576, minimum partition size: 1048576
25/03/18 12:46:31 INFO HashAggregateExec: spark.sql.codegen.aggregate.map.twolevel.enabled is set

                                                                                

In [7]:
# Extract the year and week number from the date
df = df.withColumn("year", F.year(df["date"]))
df = df.withColumn("week", F.weekofyear(df["date"]))

# Combine year and week into a single column 'year_week' in "yyyy-week" format
df = df.withColumn("year_week", F.concat_ws("-", df["year"], df["week"]))

# Group by 'symbol' and 'year_week' and calculate the required aggregates
weekly_df = df.groupBy("symbol", "year_week") \
    .agg(
        F.avg("open").alias("avg_open"),
        F.max("high").alias("max_high"),
        F.min("low").alias("min_low"),
        F.avg("close").alias("avg_close"),
        F.max("volume").alias("max_volume"),
        F.avg("adj_high").alias("avg_adj_high"),
        F.min("adj_low").alias("min_adj_low"),
        F.avg("adj_close").alias("avg_adj_close"),
        F.avg("adj_open").alias("avg_adj_open"),
        F.avg("adj_volume").alias("avg_adj_volume"),
        F.avg("split_factor").alias("avg_split_factor"),
        F.avg("dividend").alias("avg_dividend")
    )

# Show the result
weekly_df.show(5)


25/03/18 12:46:41 INFO FileSourceStrategy: Pushed Filters: 
25/03/18 12:46:41 INFO FileSourceStrategy: Post-Scan Filters: 
25/03/18 12:46:41 INFO FileSourceStrategy: Output Data Schema: struct<open: double, high: double, low: double, close: double, volume: double ... 12 more fields>
25/03/18 12:46:41 INFO BlockManagerInfo: Removed broadcast_7_piece0 on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal:46511 in memory (size: 32.4 KiB, free: 434.3 MiB)
25/03/18 12:46:41 INFO BlockManagerInfo: Removed broadcast_5_piece0 on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal:46511 in memory (size: 34.6 KiB, free: 434.4 MiB)
25/03/18 12:46:41 INFO BlockManagerInfo: Removed broadcast_6_piece0 on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal:46511 in memory (size: 30.5 KiB, free: 434.4 MiB)
25/03/18 12:46:41 INFO CodeGenerator: Code generated in 80.20447 ms
25/03/18 12:46:41 INFO MemoryStore: Block broadcast_8 stored as va

In [13]:
#  #.option("clusteredFields", "symbol, year_month") \
monthly_df.write \
    .format("bigquery") \
    .option("temporaryGcsBucket", "de-zoomcamp-project-453801-terra-bucket") \
    .option("table", "de-zoomcamp-project-453801.demo_dataset.monthly_stock_data") \
    .option("partitionField", "symbol") \
    .mode("overwrite") \
    .save()

25/03/18 13:06:37 INFO FileSourceStrategy: Pushed Filters: 
25/03/18 13:06:37 INFO FileSourceStrategy: Post-Scan Filters: 
25/03/18 13:06:37 INFO FileSourceStrategy: Output Data Schema: struct<open: double, high: double, low: double, close: double, volume: double ... 12 more fields>
25/03/18 13:06:37 INFO ParquetFileFormat: Using default output committer for Parquet: org.apache.parquet.hadoop.ParquetOutputCommitter
25/03/18 13:06:37 INFO MemoryStore: Block broadcast_22 stored as values in memory (estimated size 201.4 KiB, free 434.0 MiB)
25/03/18 13:06:37 INFO BlockManagerInfo: Removed broadcast_19_piece0 on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal:46511 in memory (size: 34.6 KiB, free: 434.4 MiB)
25/03/18 13:06:37 INFO MemoryStore: Block broadcast_22_piece0 stored as bytes in memory (estimated size 34.6 KiB, free 434.2 MiB)
25/03/18 13:06:37 INFO BlockManagerInfo: Added broadcast_22_piece0 in memory on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-pr

[Stage 21:>                                                         (0 + 1) / 1]

25/03/18 13:06:38 INFO GoogleCloudStorageFileSystem: Successfully repaired 'gs://de-zoomcamp-project-453801-terra-bucket/.spark-bigquery-local-1742301844485-b7ef6750-8029-4e8d-ad36-016bee1a9f92/_temporary/0/_temporary/' directory.
25/03/18 13:06:38 INFO FileOutputCommitter: Saved output of task 'attempt_202503181306378933985505372282929_0021_m_000000_15' to gs://de-zoomcamp-project-453801-terra-bucket/.spark-bigquery-local-1742301844485-b7ef6750-8029-4e8d-ad36-016bee1a9f92/_temporary/0/task_202503181306378933985505372282929_0021_m_000000
25/03/18 13:06:38 INFO SparkHadoopMapRedUtil: attempt_202503181306378933985505372282929_0021_m_000000_15: Committed. Elapsed time: 443 ms.
25/03/18 13:06:38 INFO Executor: Finished task 0.0 in stage 21.0 (TID 15). 4854 bytes result sent to driver
25/03/18 13:06:38 INFO TaskSetManager: Finished task 0.0 in stage 21.0 (TID 15) in 821 ms on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal (executor driver) (1/1)
25/03/18 13:06:38 I

                                                                                

25/03/18 13:06:39 INFO GoogleCloudStorageFileSystem: Successfully repaired 'gs://de-zoomcamp-project-453801-terra-bucket/.spark-bigquery-local-1742301844485-b7ef6750-8029-4e8d-ad36-016bee1a9f92/_temporary/0/task_202503181306378933985505372282929_0021_m_000000/' directory.
25/03/18 13:06:39 INFO GoogleCloudStorageFileSystem: Successfully repaired 'gs://de-zoomcamp-project-453801-terra-bucket/.spark-bigquery-local-1742301844485-b7ef6750-8029-4e8d-ad36-016bee1a9f92/' directory.
25/03/18 13:06:39 INFO BlockManagerInfo: Removed broadcast_24_piece0 on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal:46511 in memory (size: 103.3 KiB, free: 434.4 MiB)
25/03/18 13:06:39 INFO FileFormatWriter: Write Job 4e2c13fb-f65a-4edb-8c79-dcfbabd0a99b committed. Elapsed time: 975 ms.
25/03/18 13:06:39 INFO FileFormatWriter: Finished processing stats for write job 4e2c13fb-f65a-4edb-8c79-dcfbabd0a99b.
25/03/18 13:06:40 ERROR BigQueryClient: Unable to create the job to load to de-zoomc

Py4JJavaError: An error occurred while calling o179.save.
: com.google.cloud.bigquery.connector.common.BigQueryConnectorException: Failed to write to BigQuery
	at com.google.cloud.spark.bigquery.write.BigQueryWriteHelper.writeDataFrameToBigQuery(BigQueryWriteHelper.java:110)
	at com.google.cloud.spark.bigquery.write.BigQueryDeprecatedIndirectInsertableRelation.insert(BigQueryDeprecatedIndirectInsertableRelation.java:43)
	at com.google.cloud.spark.bigquery.write.CreatableRelationProviderHelper.createRelation(CreatableRelationProviderHelper.java:54)
	at com.google.cloud.spark.bigquery.BigQueryRelationProvider.createRelation(BigQueryRelationProvider.scala:107)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:47)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryException: The field specified for time partitioning can only be of type TIMESTAMP, DATE or DATETIME. The type found is: STRING.
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.spi.v2.HttpBigQueryRpc.translate(HttpBigQueryRpc.java:115)
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.spi.v2.HttpBigQueryRpc.create(HttpBigQueryRpc.java:220)
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryImpl$5.call(BigQueryImpl.java:405)
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryImpl$5.call(BigQueryImpl.java:394)
	at com.google.cloud.spark.bigquery.repackaged.com.google.api.gax.retrying.DirectRetryingExecutor.submit(DirectRetryingExecutor.java:103)
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryRetryHelper.run(BigQueryRetryHelper.java:86)
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryRetryHelper.runWithRetries(BigQueryRetryHelper.java:49)
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryImpl.create(BigQueryImpl.java:393)
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQueryImpl.create(BigQueryImpl.java:358)
	at com.google.cloud.bigquery.connector.common.BigQueryClient.createAndWaitFor(BigQueryClient.java:328)
	at com.google.cloud.bigquery.connector.common.BigQueryClient.createAndWaitFor(BigQueryClient.java:323)
	at com.google.cloud.bigquery.connector.common.BigQueryClient.loadDataIntoTable(BigQueryClient.java:564)
	at com.google.cloud.spark.bigquery.write.BigQueryWriteHelper.loadDataToBigQuery(BigQueryWriteHelper.java:134)
	at com.google.cloud.spark.bigquery.write.BigQueryWriteHelper.writeDataFrameToBigQuery(BigQueryWriteHelper.java:107)
	... 44 more
Caused by: com.google.cloud.spark.bigquery.repackaged.com.google.api.client.googleapis.json.GoogleJsonResponseException: 400 Bad Request
POST https://www.googleapis.com/bigquery/v2/projects/de-zoomcamp-project-453801/jobs?prettyPrint=false
{
  "code": 400,
  "errors": [
    {
      "domain": "global",
      "message": "The field specified for time partitioning can only be of type TIMESTAMP, DATE or DATETIME. The type found is: STRING.",
      "reason": "invalid"
    }
  ],
  "message": "The field specified for time partitioning can only be of type TIMESTAMP, DATE or DATETIME. The type found is: STRING.",
  "status": "INVALID_ARGUMENT"
}
	at com.google.cloud.spark.bigquery.repackaged.com.google.api.client.googleapis.json.GoogleJsonResponseException.from(GoogleJsonResponseException.java:146)
	at com.google.cloud.spark.bigquery.repackaged.com.google.api.client.googleapis.services.json.AbstractGoogleJsonClientRequest.newExceptionOnError(AbstractGoogleJsonClientRequest.java:118)
	at com.google.cloud.spark.bigquery.repackaged.com.google.api.client.googleapis.services.json.AbstractGoogleJsonClientRequest.newExceptionOnError(AbstractGoogleJsonClientRequest.java:37)
	at com.google.cloud.spark.bigquery.repackaged.com.google.api.client.googleapis.services.AbstractGoogleClientRequest$3.interceptResponse(AbstractGoogleClientRequest.java:466)
	at com.google.cloud.spark.bigquery.repackaged.com.google.api.client.http.HttpRequest.execute(HttpRequest.java:1111)
	at com.google.cloud.spark.bigquery.repackaged.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:552)
	at com.google.cloud.spark.bigquery.repackaged.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.executeUnparsed(AbstractGoogleClientRequest.java:493)
	at com.google.cloud.spark.bigquery.repackaged.com.google.api.client.googleapis.services.AbstractGoogleClientRequest.execute(AbstractGoogleClientRequest.java:603)
	at com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.spi.v2.HttpBigQueryRpc.create(HttpBigQueryRpc.java:218)
	... 56 more


25/03/18 13:14:04 INFO BlockManagerInfo: Removed broadcast_22_piece0 on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal:46511 in memory (size: 34.6 KiB, free: 434.4 MiB)


In [9]:
weekly_df.write \
    .format("bigquery") \
    .option("temporaryGcsBucket", "de-zoomcamp-project-453801-terra-bucket") \
    .option("table", "de-zoomcamp-project-453801.demo_dataset.weekly_stock_data") \
    .mode("overwrite") \
    .save()

25/03/18 12:49:30 INFO FileSourceStrategy: Pushed Filters: 
25/03/18 12:49:30 INFO FileSourceStrategy: Post-Scan Filters: 
25/03/18 12:49:30 INFO FileSourceStrategy: Output Data Schema: struct<open: double, high: double, low: double, close: double, volume: double ... 12 more fields>
25/03/18 12:49:30 INFO ParquetFileFormat: Using default output committer for Parquet: org.apache.parquet.hadoop.ParquetOutputCommitter
25/03/18 12:49:30 INFO MemoryStore: Block broadcast_14 stored as values in memory (estimated size 201.4 KiB, free 434.0 MiB)
25/03/18 12:49:30 INFO MemoryStore: Block broadcast_14_piece0 stored as bytes in memory (estimated size 34.6 KiB, free 433.9 MiB)
25/03/18 12:49:30 INFO BlockManagerInfo: Added broadcast_14_piece0 in memory on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal:46511 (size: 34.6 KiB, free: 434.3 MiB)
25/03/18 12:49:30 INFO SparkContext: Created broadcast 14 from save at BigQueryWriteHelper.java:105
25/03/18 12:49:30 INFO BlockManag

[Stage 14:>                                                         (0 + 1) / 1]

25/03/18 12:49:31 INFO GoogleCloudStorageFileSystem: Successfully repaired 'gs://de-zoomcamp-project-453801-terra-bucket/.spark-bigquery-local-1742301844485-957dce8e-03c5-4561-a07c-f9ad8e86e139/_temporary/0/_temporary/' directory.
25/03/18 12:49:31 INFO FileOutputCommitter: Saved output of task 'attempt_202503181249302461627306395990593_0014_m_000000_10' to gs://de-zoomcamp-project-453801-terra-bucket/.spark-bigquery-local-1742301844485-957dce8e-03c5-4561-a07c-f9ad8e86e139/_temporary/0/task_202503181249302461627306395990593_0014_m_000000
25/03/18 12:49:31 INFO SparkHadoopMapRedUtil: attempt_202503181249302461627306395990593_0014_m_000000_10: Committed. Elapsed time: 443 ms.
25/03/18 12:49:31 INFO Executor: Finished task 0.0 in stage 14.0 (TID 10). 4854 bytes result sent to driver
25/03/18 12:49:31 INFO TaskSetManager: Finished task 0.0 in stage 14.0 (TID 10) in 848 ms on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal (executor driver) (1/1)
25/03/18 12:49:31 I

                                                                                

25/03/18 12:49:32 INFO GoogleCloudStorageFileSystem: Successfully repaired 'gs://de-zoomcamp-project-453801-terra-bucket/.spark-bigquery-local-1742301844485-957dce8e-03c5-4561-a07c-f9ad8e86e139/_temporary/0/task_202503181249302461627306395990593_0014_m_000000/' directory.
25/03/18 12:49:32 INFO GoogleCloudStorageFileSystem: Successfully repaired 'gs://de-zoomcamp-project-453801-terra-bucket/.spark-bigquery-local-1742301844485-957dce8e-03c5-4561-a07c-f9ad8e86e139/' directory.
25/03/18 12:49:32 INFO BlockManagerInfo: Removed broadcast_16_piece0 on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal:46511 in memory (size: 103.6 KiB, free: 434.4 MiB)
25/03/18 12:49:32 INFO FileFormatWriter: Write Job 9793a380-f214-43e8-8ae7-c0be446b7e6f committed. Elapsed time: 1016 ms.
25/03/18 12:49:32 INFO FileFormatWriter: Finished processing stats for write job 9793a380-f214-43e8-8ae7-c0be446b7e6f.
25/03/18 12:49:33 INFO BigQueryClient: Submitted job LoadJobConfiguration{type=LOAD

In [11]:
df.write \
    .format("bigquery") \
    .option("temporaryGcsBucket", "de-zoomcamp-project-453801-terra-bucket") \
    .option("table", "de-zoomcamp-project-453801.demo_dataset.daily_stock_data") \
    .mode("overwrite") \
    .save()

25/03/18 12:49:53 INFO FileSourceStrategy: Pushed Filters: 
25/03/18 12:49:53 INFO FileSourceStrategy: Post-Scan Filters: 
25/03/18 12:49:53 INFO FileSourceStrategy: Output Data Schema: struct<open: double, high: double, low: double, close: double, volume: double ... 15 more fields>
25/03/18 12:49:53 INFO ParquetFileFormat: Using default output committer for Parquet: org.apache.parquet.hadoop.ParquetOutputCommitter
25/03/18 12:49:53 INFO FileOutputCommitter: File Output Committer Algorithm version is 1
25/03/18 12:49:53 INFO FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
25/03/18 12:49:53 INFO SQLHadoopMapReduceCommitProtocol: Using user defined output committer class org.apache.parquet.hadoop.ParquetOutputCommitter
25/03/18 12:49:53 INFO FileOutputCommitter: File Output Committer Algorithm version is 1
25/03/18 12:49:53 INFO FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders un

[Stage 15:>                                                         (0 + 1) / 1]

25/03/18 12:49:55 INFO GoogleCloudStorageFileSystem: Successfully repaired 'gs://de-zoomcamp-project-453801-terra-bucket/.spark-bigquery-local-1742301844485-5524d064-a83c-4183-9557-25cd029a757d/_temporary/0/_temporary/' directory.
25/03/18 12:49:55 INFO FileOutputCommitter: Saved output of task 'attempt_202503181249537726674266397982598_0015_m_000000_11' to gs://de-zoomcamp-project-453801-terra-bucket/.spark-bigquery-local-1742301844485-5524d064-a83c-4183-9557-25cd029a757d/_temporary/0/task_202503181249537726674266397982598_0015_m_000000
25/03/18 12:49:55 INFO SparkHadoopMapRedUtil: attempt_202503181249537726674266397982598_0015_m_000000_11: Committed. Elapsed time: 454 ms.
25/03/18 12:49:55 INFO Executor: Finished task 0.0 in stage 15.0 (TID 11). 2566 bytes result sent to driver
25/03/18 12:49:55 INFO TaskSetManager: Finished task 0.0 in stage 15.0 (TID 11) in 1212 ms on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal (executor driver) (1/1)
25/03/18 12:49:55 

                                                                                

25/03/18 12:49:55 INFO GoogleCloudStorageFileSystem: Successfully repaired 'gs://de-zoomcamp-project-453801-terra-bucket/.spark-bigquery-local-1742301844485-5524d064-a83c-4183-9557-25cd029a757d/_temporary/0/task_202503181249537726674266397982598_0015_m_000000/' directory.
25/03/18 12:49:55 INFO GoogleCloudStorageFileSystem: Successfully repaired 'gs://de-zoomcamp-project-453801-terra-bucket/.spark-bigquery-local-1742301844485-5524d064-a83c-4183-9557-25cd029a757d/' directory.
25/03/18 12:49:56 INFO BlockManagerInfo: Removed broadcast_18_piece0 on de-zoomcamp-project.europe-west1-b.c.de-zoomcamp-project-453801.internal:46511 in memory (size: 81.8 KiB, free: 434.4 MiB)
25/03/18 12:49:56 INFO FileFormatWriter: Write Job bfd07b87-c375-4c9b-aa9b-1e484a6a30e3 committed. Elapsed time: 978 ms.
25/03/18 12:49:56 INFO FileFormatWriter: Finished processing stats for write job bfd07b87-c375-4c9b-aa9b-1e484a6a30e3.
25/03/18 12:49:56 INFO BigQueryClient: Submitted job LoadJobConfiguration{type=LOAD, 