In [31]:
## Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import (dayofmonth, hour, dayofyear,
                                   month, year, weekofyear,
                                   format_number, date_format)
from pyspark.sql.types import (StructField, StructType, StringType,
                               IntegerType, DoubleType, TimestampType)

In [32]:
## Spark session: getOrCreate() hands back the builder's session for the
## 'DatesTimes' application name.
spark = (
    SparkSession.builder
    .appName('DatesTimes')
    .getOrCreate()
)

In [33]:
## Explicit schema for the stock CSV.
## Columns are listed in file order as (name, Spark type); every field is
## declared nullable.
schema = StructType(fields=[
    StructField(column_name, column_type, nullable=True)
    for column_name, column_type in [
        ('date', TimestampType()),
        ('open', DoubleType()),
        ('high', DoubleType()),
        ('low', DoubleType()),
        ('close', DoubleType()),
        ('volume', IntegerType()),
        ('adj_close', DoubleType()),
    ]
])

In [34]:
## Load the stock CSV into a DataFrame.
## Column types come from `schema` (inference is switched off) and the
## file is read with header=True.
df = spark.read.csv(
    'gs://spark-training-data/datasets/appl_stock.csv',
    schema=schema,
    header=True,
    inferSchema=False,
)

In [35]:
## Preview the data, then print the column names and types.
## 20 rows is what show() prints when called without arguments.
df.show(n=20)
df.printSchema()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               date|              open|              high|               low|             close|   volume|         adj_close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|    

In [36]:
## Date Example - inspect the first row as a plain dict.
## The 'date' field comes back as datetime.datetime(2010, 1, 4, 0, 0).
first_row = df.head(1)[0]
first_row.asDict()

{'date': datetime.datetime(2010, 1, 4, 0, 0),
 'open': 213.429998,
 'high': 214.499996,
 'low': 212.38000099999996,
 'close': 214.009998,
 'volume': 123432400,
 'adj_close': 27.727039}

In [39]:
## Date function example from pyspark.sql.functions:
## extract the day of the month from each row's 'date'.
day_of_month = dayofmonth(df['date']).alias('day_of_month')
df.select(day_of_month).show()

+------------+
|day_of_month|
+------------+
|           4|
|           5|
|           6|
|           7|
|           8|
|          11|
|          12|
|          13|
|          14|
|          15|
|          19|
|          20|
|          21|
|          22|
|          25|
|          26|
|          27|
|          28|
|          29|
|           1|
+------------+
only showing top 20 rows



In [52]:
## Example Problem - Yearly Closing Average
## Group the rows by the calendar year of 'date', average 'close' per year,
## and display the average formatted to 2 decimal places.
result = df.groupBy(year(df['date']).alias('year')).mean('close')
result.select(
    'year',
    # Name the formatted column with alias() on the expression itself.
    # Renaming Spark's auto-generated 'format_number(avg(close), 2)' label
    # afterwards is fragile: withColumnRenamed silently does nothing if that
    # generated label does not match exactly.
    format_number('avg(close)', 2).alias('average_closing_price'),
).show()

+----+---------------------+
|year|average_closing_price|
+----+---------------------+
|2015|               120.04|
|2013|               472.63|
|2014|               295.40|
|2012|               576.05|
|2016|               104.60|
|2010|               259.84|
|2011|               364.00|
+----+---------------------+

