In [14]:
import pyspark
from sparkvalidator.functions import *
from sparkvalidator.sampler import *

# Build the Spark session used by this notebook (application name 'demo').
spark = (
    pyspark.sql.SparkSession.builder
    .appName('demo')
    .getOrCreate()
)

# Load the Walmart stock CSV with the "header" option switched on; later
# cells address the columns by name (Open, High, Low, Close).
df = (
    spark.read
    .option("header", "true")
    .csv('data/walmart_stock.csv')
)

In [15]:
# NOTE(review): `count`, `max` and `map` below are called with a DataFrame, so
# they are not the Python builtins -- presumably they come from the star
# imports at the top of the notebook and shadow the builtins; confirm.
# NOTE(review): this cell rebinds `df` and finishes with a frame that only has
# the two derived columns, so re-running it without first re-running the
# loading cell will fail on df['Open'].

# Keep only the price columns, each cast to float. The order
# (Open, High, Low, Close) matters: the row function further down reads the
# columns by position.
df = df.select(*[df[name].cast('float') for name in ('Open', 'High', 'Low', 'Close')])

# Sample the DataFrame.
# NOTE(review): the meaning of the arguments (2, False, 0.5) is defined by
# random_sampling, which is not visible here -- confirm before changing them.
df = random_sampling(df, 2, False, 0.5)

# Report how many entries remain after sampling.
entryCount = count(df)
print('total number of entries: ', entryCount)

# Highest value of the 'High' column; the result is displayed with .show().
maxPriceDf = max(df, 'High')
print("max stock price is: ")
maxPriceDf.show()


def daily_price_moves(row):
    """Return (High - Low, Close - Open) for one row.

    Columns are read by position, following the select order used above:
    0 = Open, 1 = High, 2 = Low, 3 = Close.
    """
    intraday_range = row[1] - row[2]
    open_to_close = row[3] - row[0]
    return (intraday_range, open_to_close)


# Derive the max price fluctuation and the final price change for each day.
df = map(df, daily_price_moves, ["max_price_fluctuation", "final_price_change"])
df.show()


total number of entries:  648
max stock price is: 
+---------+
|max(High)|
+---------+
|    90.39|
+---------+

+---------------------+--------------------+
|max_price_fluctuation|  final_price_change|
+---------------------+--------------------+
|   0.8799972534179688|                -0.5|
|                 1.25| 0.06999969482421875|
|   0.5800018310546875| -0.4199981689453125|
|   0.7299995422363281| -0.3899993896484375|
|  0.48999786376953125|  0.3400001525878906|
|   0.5999984741210938|-0.29000091552734375|
|   0.5900001525878906|-0.02000045776367...|
|   0.9799995422363281|  0.6800003051757812|
|   0.5800018310546875|  0.2599983215332031|
|    0.470001220703125| 0.09999847412109375|
|   0.5699996948242188| 0.29000091552734375|
|   0.5799980163574219|-0.15000152587890625|
|    0.970001220703125|  0.8299980163574219|
|   0.9899978637695312| -0.1699981689453125|
|   0.6500015258789062| -0.4600028991699219|
|   0.7299995422363281|-0.29000091552734375|
|   0.2800025939941406| 0.0699996