In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import types

In [2]:
# Create (or reuse) the notebook's SparkSession. The fs.gs.impl Hadoop setting
# names the filesystem class used for gs:// paths.
spark = (
    SparkSession.builder
    .appName("GCS_Spark_Read")
    .config(
        "spark.hadoop.fs.gs.impl",
        "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
    )
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/03/17 13:21:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [35]:
# Read the gzip-compressed stock CSV export from the GCS bucket. The first row
# is the header and column types are inferred by Spark.
df = (
    spark.read
    .options(
        header="true",
        inferSchema="true",
        compression="gzip",
        encoding="us-ascii",
    )
    .csv("gs://de-zoomcamp-project-453801-terra-bucket/stock_dataset/stock/1742135256.3842237.3b60b69d81.csv.gz")
)

                                                                                

In [36]:
# Preview the first five rows to sanity-check the columns and inferred types.
df.show(n=5)

+-------+--------+------+------+-----------+--------+--------+---------+--------+-----------+------------+--------+------+--------+-------------------+--------------------+--------------+
|   open|    high|   low| close|     volume|adj_high| adj_low|adj_close|adj_open| adj_volume|split_factor|dividend|symbol|exchange|               date|        _dlt_load_id|       _dlt_id|
+-------+--------+------+------+-----------+--------+--------+---------+--------+-----------+------------+--------+------+--------+-------------------+--------------------+--------------+
|379.775|  390.23|379.51|388.56|1.9952846E7|  390.23|  379.51|   388.56| 379.775|1.9952846E7|         1.0|     0.0|  MSFT|    XNAS|2025-03-14 00:00:00|1.7421352563842237E9|Yop916yQPrfOeA|
| 211.25|  213.95|209.58|213.49|6.0107582E7|  213.95|  209.58|   213.49|  211.25|6.0107582E7|         1.0|     0.0|  AAPL|    XNAS|2025-03-14 00:00:00|1.7421352563842237E9|FHdPOLiabPdEBQ|
| 215.94|216.8394|208.42|209.68|6.0306872E7|216.8394|  208.4

In [37]:
from pyspark.sql import functions as F
from pyspark.sql.types import DateType

# Monthly roll-up of the daily stock rows held in `df`.

# Convert `date` to a date column, then tag every row with its "yyyy-MM" bucket.
df = (
    df.withColumn("date", F.to_date(F.col("date"), "yyyy-MM-dd"))
      .withColumn("year_month", F.date_format("date", "yyyy-MM"))
)

# (aggregate function, source column, output alias); the list order fixes the
# order of the result columns.
_monthly_specs = [
    (F.avg, "open", "avg_open"),
    (F.max, "high", "max_high"),
    (F.min, "low", "min_low"),
    (F.avg, "close", "avg_close"),
    (F.max, "volume", "max_volume"),
    (F.avg, "adj_high", "avg_adj_high"),
    (F.min, "adj_low", "min_adj_low"),
    (F.avg, "adj_close", "avg_adj_close"),
    (F.avg, "adj_open", "avg_adj_open"),
    (F.avg, "adj_volume", "avg_adj_volume"),
    (F.avg, "split_factor", "avg_split_factor"),
    (F.avg, "dividend", "avg_dividend"),
]

# One output row per (symbol, month).
monthly_df = df.groupBy("symbol", "year_month").agg(
    *[aggregate(source).alias(alias) for aggregate, source, alias in _monthly_specs]
)

# Preview the first five aggregated rows.
monthly_df.show(n=5)


+------+----------+------------------+--------+-------+------------------+----------+------------------+-----------+------------------+------------------+--------------------+----------------+--------------------+
|symbol|year_month|          avg_open|max_high|min_low|         avg_close|max_volume|      avg_adj_high|min_adj_low|     avg_adj_close|      avg_adj_open|      avg_adj_volume|avg_split_factor|        avg_dividend|
+------+----------+------------------+--------+-------+------------------+----------+------------------+-----------+------------------+------------------+--------------------+----------------+--------------------+
|  MSFT|   2024-06| 436.7971052631579|  456.16| 408.92| 438.3421052631579| 3.41871E7| 440.3619789473684|   408.9234| 438.3421052631579| 436.7821052631579| 1.801949505263158E7|             1.0|                 0.0|
|  MSFT|   2024-12|438.82166666666683|456.1648| 420.66|439.40857142857135| 6.42352E7|443.16800952380953|     420.66|439.40857142857135|438.80023

In [40]:
# Weekly roll-up of the daily stock rows held in `df`, keyed by ISO-8601 week.
#
# F.weekofyear() returns the ISO week number, and ISO weeks can straddle a
# calendar-year boundary: 2024-12-30 belongs to ISO week 1 of 2025, and
# 2021-01-01 belongs to ISO week 53 of 2020. Pairing that week number with the
# calendar year would label those days "2024-1" / "2021-53" and merge them with
# unrelated weeks, so the `year_week` label is built from the ISO
# week-numbering year instead.

# Calendar year and ISO week number of each row (kept as separate columns).
df = df.withColumn("year", F.year(df["date"]))
df = df.withColumn("week", F.weekofyear(df["date"]))

# The ISO week-numbering year is the calendar year of the Thursday that falls in
# the same Monday-based week: truncate to Monday, then move forward three days.
iso_week_year = F.year(F.date_add(F.trunc(F.col("date"), "week"), 3))

# Combine into a "yyyy-ww" label. The week is zero-padded so labels sort
# chronologically as strings, like the zero-padded "yyyy-MM" monthly key.
df = df.withColumn(
    "year_week",
    F.format_string("%d-%02d", iso_week_year, F.col("week")),
)

# Group by 'symbol' and 'year_week' and calculate the required aggregates:
# one output row per (symbol, ISO week).
weekly_df = (
    df.groupBy("symbol", "year_week")
    .agg(
        F.avg("open").alias("avg_open"),
        F.max("high").alias("max_high"),
        F.min("low").alias("min_low"),
        F.avg("close").alias("avg_close"),
        F.max("volume").alias("max_volume"),
        F.avg("adj_high").alias("avg_adj_high"),
        F.min("adj_low").alias("min_adj_low"),
        F.avg("adj_close").alias("avg_adj_close"),
        F.avg("adj_open").alias("avg_adj_open"),
        F.avg("adj_volume").alias("avg_adj_volume"),
        F.avg("split_factor").alias("avg_split_factor"),
        F.avg("dividend").alias("avg_dividend"),
    )
)

# Show the result
weekly_df.show(5)


+------+---------+------------------+--------+-------+------------------+------------+------------------+-----------+------------------+------------------+--------------+----------------+------------+
|symbol|year_week|          avg_open|max_high|min_low|         avg_close|  max_volume|      avg_adj_high|min_adj_low|     avg_adj_close|      avg_adj_open|avg_adj_volume|avg_split_factor|avg_dividend|
+------+---------+------------------+--------+-------+------------------+------------+------------------+-----------+------------------+------------------+--------------+----------------+------------+
|  AAPL|  2024-18|          175.1144|   187.0| 169.11|173.90400000000002|1.63224109E8|           176.828|     169.11|173.90400000000002|175.08700000000002|  8.83852732E7|             1.0|         0.0|
|  AAPL|  2024-21|           190.502|192.8231|186.625|            190.23| 5.0481918E7|          191.8106|    186.625|            190.23|           190.496|  4.17304244E7|             1.0|         

In [41]:
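# NOTE: the BigQuery export below is disabled. Running it fails with
# "Failed to find data source: bigquery" (ClassNotFoundException:
# bigquery.DefaultSource), i.e. the BigQuery data source is not available on
# this Spark session's classpath.
#
# First attempt (`monthly_stock_data` is an unquoted name that is not defined
# in the visible cells):
# monthly_df.write.format('bigquery') \
#     .option('table',monthly_stock_data) \
#     .save() 
#
# Second attempt, with a fully qualified table name and overwrite mode:
# monthly_df.write \
#     .format("bigquery") \
#     .option("table", "de-zoomcamp-project-453801.demo_dataset.monthly_stock") \
#     .mode("overwrite") \
#     .save()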


Py4JJavaError: An error occurred while calling o376.save.
: java.lang.ClassNotFoundException: 
Failed to find data source: bigquery. Please find packages at
https://spark.apache.org/third-party-projects.html
       
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedToFindDataSourceError(QueryExecutionErrors.scala:587)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:675)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:725)
	at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:864)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:256)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.ClassNotFoundException: bigquery.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:471)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:588)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:521)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:661)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:661)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:661)
	... 16 more
