In [9]:
#Start a simple Spark Session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("test2").master("local[*]").getOrCreate()

In [4]:
#Load the Walmart Stock CSV File, have Spark infer the data types.
df = spark.read.option("header", True).csv("walmart_stock-1.csv", inferSchema=True)

In [5]:
#What are the column names?
print(df.columns)

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']


In [6]:
#What does the Schema look like?
df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [8]:
#Print out the first 5 columns
fiveColDf = df.select(df.columns[:5])
fiveColDf.show()

+----------+------------------+------------------+------------------+------------------+
|      Date|              Open|              High|               Low|             Close|
+----------+------------------+------------------+------------------+------------------+
|2012-01-03|         59.970001|         61.060001|         59.869999|         60.330002|
|2012-01-04|60.209998999999996|         60.349998|         59.470001|59.709998999999996|
|2012-01-05|         59.349998|         59.619999|         58.369999|         59.419998|
|2012-01-06|         59.419998|         59.450001|         58.869999|              59.0|
|2012-01-09|         59.029999|         59.549999|         58.919998|             59.18|
|2012-01-10|             59.43|59.709998999999996|             58.98|59.040001000000004|
|2012-01-11|         59.060001|         59.529999|59.040001000000004|         59.400002|
|2012-01-12|59.790001000000004|              60.0|         59.400002|              59.5|
|2012-01-13|         

In [10]:
#Use describe() to learn about the DataFrame.
df.describe()

DataFrame[summary: string, Open: string, High: string, Low: string, Close: string, Volume: string, Adj Close: string]

In [11]:
#What day had the Peak High in Price?
#df.agg({'High': 'max'}).show()
df.orderBy(df.High.desc()).show(1)

+----------+---------+---------+-----+---------+-------+---------+
|      Date|     Open|     High|  Low|    Close| Volume|Adj Close|
+----------+---------+---------+-----+---------+-------+---------+
|2015-01-13|90.800003|90.970001|88.93|89.309998|8215400|83.825448|
+----------+---------+---------+-----+---------+-------+---------+
only showing top 1 row



In [150]:
#What is the mean of the Close column?
from pyspark.sql.functions import mean
df.select(mean(df.Close)).show()

+-----------------+
|       avg(Close)|
+-----------------+
|72.38844998012726|
+-----------------+



In [151]:
#What is the max and min of the Volume column?
from pyspark.sql.functions import max
from pyspark.sql.functions import min
df.select(max(df.Volume)).show()
df.select(min(df.Volume)).show()

+-----------+
|max(Volume)|
+-----------+
|   80898100|
+-----------+

+-----------+
|min(Volume)|
+-----------+
|    2094900|
+-----------+



In [152]:
#How many days was the Close lower than 60 dollars?
lowerThanSixty = df.where(df.Close < 60)
row_count = lowerThanSixty.count()
print(row_count)


81


In [153]:
#What percentage of the time was the High greater than 80 dollars ? In other words, (Number of Days High>80)/(Total Days in the dataset)
higherThanEighty = df.filter(df.High > 80)
daysHigh = higherThanEighty.count()
totalDays = df.count()
print(daysHigh/totalDays*100)


9.141494435612083


In [154]:
#What is the Pearson correlation between High and Volume?
from pyspark.sql.functions import corr

pearsonCorr = df.corr("High", "Volume", "pearson")
print(pearsonCorr)

-0.3384326061737161


In [14]:
from pyspark.sql.functions import corr


In [155]:
#What is the max High per year?
from pyspark.sql.functions import (dayofmonth, hour,
                                   dayofyear, month,
                                   year, weekofyear,
                                   format_number, date_format)
maxHighPerYear = df.groupBy(year("Date")).agg(max("High")).alias("year").show()

+----------+---------+
|year(Date)|max(High)|
+----------+---------+
|      2015|90.970001|
|      2013|81.370003|
|      2014|88.089996|
|      2012|77.599998|
|      2016|75.190002|
+----------+---------+



In [165]:
from pyspark.sql.functions import avg
#What is the average Close for each Calendar Month?
#In other words, across all the years, what is the average Close price for Jan,Feb, Mar, etc... Your result will have a value for each of these months.
avgClosePerMonth = df.groupBy(month("Date").alias("Month")).agg(avg("Close").alias("Close")).orderBy("Month")
#Create a new column Month from existing Date column
df = df.withColumn("Month", month("Date"))


#Group by month and take average of all other columns
avgAllPerMonth = df.groupBy("Month").agg(avg("High").alias("High"), avg("Low").alias("Low"), avg("Close").alias("Close"), avg("Volume").alias("Volume"), avg("Adj Close").alias("Adj Close"))

#Sort by month
avgAllPerMonth = avgAllPerMonth.orderBy("Month")

#Display only month and avg(Close), the desired columns
avgAllPerMonth.select("Month", "Close").show()


+-----+-----------------+
|Month|            Close|
+-----+-----------------+
|    1|71.44801958415842|
|    2|  71.306804443299|
|    3|71.77794377570092|
|    4|72.97361900952382|
|    5|72.30971688679247|
|    6| 72.4953774245283|
|    7|74.43971943925233|
|    8|73.02981855454546|
|    9|72.18411785294116|
|   10|71.57854545454543|
|   11| 72.1110893069307|
|   12|72.84792478301885|
+-----+-----------------+

