In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook (an already-running one is
# reused), registered under the application name 'dates and times'.
spark = (
    SparkSession.builder
    .appName('dates and times')
    .getOrCreate()
)

In [3]:
# Load the stock CSV: the first line supplies the column names and Spark
# is asked to infer each column's type from the data.
sdf = spark.read.csv(
    path='./Data/stock.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the first five rows of the loaded frame.
sdf.show(n=5)

+----------+----------+----------+------------------+------------------+---------+------------------+
|      Date|      Open|      High|               Low|             Close|   Volume|         Adj Close|
+----------+----------+----------+------------------+------------------+---------+------------------+
|2010-01-04|213.429998|214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|214.599998|215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|214.379993|    215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|    211.75|212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
+----------+----------+----------+------------------+------------------+---------+------------------+
only showing top 5 rows



In [5]:
# Look at the Date column on its own before deriving parts from it.
sdf.select(sdf['Date']).show(n=5)

+----------+
|      Date|
+----------+
|2010-01-04|
|2010-01-05|
|2010-01-06|
|2010-01-07|
|2010-01-08|
+----------+
only showing top 5 rows



In [6]:
from pyspark.sql.functions import dayofmonth, hour, minute, year, weekofyear, format_number, date_format, month

In [7]:
# Day of the month taken from Date, shown under a readable header.
sdf.select(
    dayofmonth('Date').alias('Day of Month')
).show(n=5)

+------------+
|Day of Month|
+------------+
|           4|
|           5|
|           6|
|           7|
|           8|
+------------+
only showing top 5 rows



In [8]:
# Month number taken from Date.
sdf.select(
    month('Date').alias('Month')
).show(n=5)

+-----+
|Month|
+-----+
|    1|
|    1|
|    1|
|    1|
|    1|
+-----+
only showing top 5 rows



In [9]:
# Calendar year taken from Date.
sdf.select(
    year('Date').alias('Year')
).show(n=5)

+----+
|Year|
+----+
|2010|
|2010|
|2010|
|2010|
|2010|
+----+
only showing top 5 rows



In [10]:
# Week-of-year number taken from Date.
sdf.select(
    weekofyear('Date').alias('Week of Year')
).show(n=5)

+------------+
|Week of Year|
+------------+
|           1|
|           1|
|           1|
|           1|
|           1|
+------------+
only showing top 5 rows



In [11]:
# Preview the frame with a derived 'Year' column appended; the result is
# only displayed here, `sdf` itself is left unchanged.
sdf.withColumn(
    'Year',
    year(sdf['Date']),
).show(n=10)

+----------+------------------+------------------+------------------+------------------+---------+------------------+----+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|Year|
+----------+------------------+------------------+------------------+------------------+---------+------------------+----+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|2010|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|2010|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|2010|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|2010|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|2010|
|2010-01-11|212.

In [12]:
# Keep a copy of the stock data with the derived 'Year' column so later
# cells can aggregate by year.
new_df = sdf.withColumn('Year', year('Date'))

In [13]:
# Per-year averages of the price columns.
# NOTE: mean() is called without column names, so it also averages the
# grouping key itself; that is why the output carries an 'avg(Year)' column.
(
    new_df
    .select('Year', 'Open', 'Close', 'High', 'Low')
    .groupBy('Year')
    .mean()
    .show()
)

+----+---------+------------------+------------------+------------------+------------------+
|Year|avg(Year)|         avg(Open)|        avg(Close)|         avg(High)|          avg(Low)|
+----+---------+------------------+------------------+------------------+------------------+
|2015|   2015.0|120.17575393253965|120.03999980555547|121.24452385714291| 118.8630954325397|
|2013|   2013.0| 473.1281355634922| 472.6348802857143| 477.6389272301587|468.24710264682557|
|2014|   2014.0| 295.1426195357143| 295.4023416507935|297.56103184523823| 292.9949599801587|
|2012|   2012.0|     576.652720788| 576.0497195640002| 581.8254008040001| 569.9211606079999|
|2016|   2016.0|104.50777772619044|104.60400786904763| 105.4271825436508|103.69027771825397|
|2010|   2010.0| 259.9576190992064| 259.8424600000002|262.36880881349214|256.84761791269847|
|2011|   2011.0|364.06142773412705|364.00432532142867| 367.4235704880951|360.29769878174613|
+----+---------+------------------+------------------+----------------