In [28]:
import pyspark
from pyspark.sql import SparkSession 
from pyspark.sql import functions as F

In [4]:
# Creating Spark Session
spark = SparkSession.builder.appName('companyABC').getOrCreate()

# Reading /loading the Dataset from CSV file 
spark_df = spark.read.load("Data/CompanyABC_stock.csv", format="csv", header=True, inferSchema=True)

In [5]:
spark_df.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

In [6]:
spark_df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [7]:
spark_df.describe().show()

+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|summary|              Open|             High|              Low|            Close|           Volume|        Adj Close|
+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|  count|              1258|             1258|             1258|             1258|             1258|             1258|
|   mean| 72.35785375357709|72.83938807631165| 71.9186009594594|72.38844998012726|8222093.481717011|67.23883848728146|
| stddev|  6.76809024470826|6.768186808159218|6.744075756255496|6.756859163732991|  4519780.8431556|6.722609449996857|
|    min|56.389998999999996|        57.060001|        56.299999|        56.419998|          2094900|        50.363689|
|    max|         90.800003|        90.970001|            89.25|        90.470001|         80898100|84.91421600000001|
+-------+------------------+-----------------+--

In [8]:
spark_df.show()

+----------+------------------+------------------+------------------+------------------+--------+------------------+
|      Date|              Open|              High|               Low|             Close|  Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+--------+------------------+
|2012-01-03|         59.970001|         61.060001|         59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|         60.349998|         59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|         59.619999|         58.369999|         59.419998|12768200|         51.825539|
|2012-01-06|         59.419998|         59.450001|         58.869999|              59.0| 8069400|          51.45922|
|2012-01-09|         59.029999|         59.549999|         58.919998|             59.18| 6679300|51.616215000000004|
|2012-01-10|             59.43|59.709998999999996|             5

In [16]:
# Print first 5 rows
spark_df.show(5)

+----------+------------------+---------+---------+------------------+--------+------------------+
|      Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|
+----------+------------------+---------+---------+------------------+--------+------------------+
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|
|2012-01-06|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|
|2012-01-09|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|
+----------+------------------+---------+---------+------------------+--------+------------------+
only showing top 5 rows



In [21]:
""" Create a new DataFrame column called “HV Ratio,” which will 
stimulate the ratio of the High price versus the total Volume of 
stock that was traded for a day."""

spark_df = spark_df.withColumn("HV Ratio", spark_df["High"] / spark_df["Volume"])
spark_df.show()

+----------+------------------+------------------+------------------+------------------+--------+------------------+--------------------+
|      Date|              Open|              High|               Low|             Close|  Volume|         Adj Close|            HV Ratio|
+----------+------------------+------------------+------------------+------------------+--------+------------------+--------------------+
|2012-01-03|         59.970001|         61.060001|         59.869999|         60.330002|12668800|52.619234999999996|4.819714653321546E-6|
|2012-01-04|60.209998999999996|         60.349998|         59.470001|59.709998999999996| 9593300|         52.078475|6.290848613094555E-6|
|2012-01-05|         59.349998|         59.619999|         58.369999|         59.419998|12768200|         51.825539|4.669412994783916E-6|
|2012-01-06|         59.419998|         59.450001|         58.869999|              59.0| 8069400|          51.45922|7.367338463826307E-6|
|2012-01-09|         59.029999|   

In [100]:
# Find out on what day the stock price was the highest
max_price = spark_df.select("Date", "High").sort(F.col('High').desc())
max_price.show(1)

+----------+---------+
|      Date|     High|
+----------+---------+
|2015-01-13|90.970001|
+----------+---------+
only showing top 1 row



In [101]:
# average (mean) closing price

avg_closing_price = spark_df.agg(F.avg('Close'))
avg_closing_price.show()

+-----------------+
|       avg(Close)|
+-----------------+
|72.38844998012726|
+-----------------+



In [102]:
# minimum volumes of stock traded
min_volume = spark_df.agg(F.min('Volume'))
min_volume.show()

# maximum  volumes of stock traded
max_volume = spark_df.agg(F.max('Volume'))
max_volume.show()

+-----------+
|min(Volume)|
+-----------+
|    2094900|
+-----------+

+-----------+
|max(Volume)|
+-----------+
|   80898100|
+-----------+



In [103]:
# number of days closing value was less than 70 dollars
days_under_70 = spark_df.select('Date', 'Close').filter(F.col('Close') < 70)
days_under_70.show()

+----------+------------------+
|      Date|             Close|
+----------+------------------+
|2012-01-03|         60.330002|
|2012-01-04|59.709998999999996|
|2012-01-05|         59.419998|
|2012-01-06|              59.0|
|2012-01-09|             59.18|
|2012-01-10|59.040001000000004|
|2012-01-11|         59.400002|
|2012-01-12|              59.5|
|2012-01-13|59.540001000000004|
|2012-01-17|         59.849998|
|2012-01-18|60.009997999999996|
|2012-01-19|60.610001000000004|
|2012-01-20|61.009997999999996|
|2012-01-23|             60.91|
|2012-01-24|61.389998999999996|
|2012-01-25|         61.470001|
|2012-01-26|         60.970001|
|2012-01-27|60.709998999999996|
|2012-01-30|         61.299999|
|2012-01-31|61.360001000000004|
+----------+------------------+
only showing top 20 rows



In [None]:
# percentage of time High was greater than 80 dollars
high_over_80 = spark_df.select('High').filter(F.col('High') > 80).agg(F.avg(F.col('High')))
high_over_80.show()

+----------------+
|       avg(High)|
+----------------+
|84.7920002956522|
+----------------+



In [78]:
# Load/Write CompanyABC stock.csv file data into CompanyABC_DB database from the SparkSQL Dataframe
spark_df.write.format("jdbc") \
  .mode("append") \
  .option("url", "jdbc:mysql://localhost:3306/CompanyABC_DB") \
  .option("dbtable", "CompanyABC_DB.sba_Spark_Demo") \
  .option("user", "root") \
  .option("password", "password") \
  .save()

In [80]:
# Load/Read both CompanyABC sales datasets (Sales_April_2019.csv and Sales_February_2019.csv) into a single SparkSQL DataFrame

# Creating Spark Session
spark = SparkSession.builder.appName('companyABC Sales Data').getOrCreate()

# list of different file paths
file_paths = ["Data/Sales_April_2019.csv", "Data/Sales_February_2019.csv"]

# Reading /loading the Dataset from CSV file 
sales_df = spark.read.csv(file_paths, header=True, inferSchema=True)

In [81]:
sales_df.columns

['Order ID',
 'Product',
 'Quantity Ordered',
 'Price Each',
 'Order Date',
 'Purchase Address']

In [82]:
sales_df.printSchema()

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)



In [83]:
sales_df.describe().show()

+-------+------------------+------------+-------------------+------------------+--------------+--------------------+
|summary|          Order ID|     Product|   Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+------------------+------------+-------------------+------------------+--------------+--------------------+
|  count|             30275|       30328|              30275|             30275|         30328|               30328|
|   mean|173816.63355904212|        NULL| 1.1239966969446737| 183.7621383980421|          NULL|                NULL|
| stddev|14902.179833784816|        NULL|0.43431295924749175|328.79347229907665|          NULL|                NULL|
|    min|            150502|20in Monitor|                  1|              2.99|02/01/19 01:51|1 14th St, New Yo...|
|    max|            194094|      iPhone|                  7|            1700.0|    Order Date|    Purchase Address|
+-------+------------------+------------+-------------------+---

In [84]:
sales_df.show()

+--------+--------------------+----------------+----------+---------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|     Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+---------------+--------------------+
|  176558|USB-C Charging Cable|               2|     11.95| 4/19/2019 8:46|917 1st St, Dalla...|
|  176559|Bose SoundSport H...|               1|     99.99| 4/7/2019 22:30|682 Chestnut St, ...|
|  176560|        Google Phone|               1|     600.0|4/12/2019 14:38|669 Spruce St, Lo...|
|  176560|    Wired Headphones|               1|     11.99|4/12/2019 14:38|669 Spruce St, Lo...|
|  176561|    Wired Headphones|               1|     11.99| 4/30/2019 9:27|333 8th St, Los A...|
|  176562|USB-C Charging Cable|               1|     11.95|4/29/2019 13:03|381 Wilson St, Sa...|
|  176563|Bose SoundSport H...|               1|     99.99|  4/2/2019 7:46|668 Center St, Se...|
|  176564|USB-C Charging Cable

In [None]:
# create a new column called “Total price” for quantity ordered
sales_df = sales_df.withColumn('Total Price', F.col('Price Each') * F.col('Quantity Ordered'))
sales_df.show()

+--------+--------------------+----------------+----------+---------------+--------------------+-----------+
|Order ID|             Product|Quantity Ordered|Price Each|     Order Date|    Purchase Address|Total Price|
+--------+--------------------+----------------+----------+---------------+--------------------+-----------+
|  176558|USB-C Charging Cable|               2|     11.95| 4/19/2019 8:46|917 1st St, Dalla...|       23.9|
|  176559|Bose SoundSport H...|               1|     99.99| 4/7/2019 22:30|682 Chestnut St, ...|      99.99|
|  176560|        Google Phone|               1|     600.0|4/12/2019 14:38|669 Spruce St, Lo...|      600.0|
|  176560|    Wired Headphones|               1|     11.99|4/12/2019 14:38|669 Spruce St, Lo...|      11.99|
|  176561|    Wired Headphones|               1|     11.99| 4/30/2019 9:27|333 8th St, Los A...|      11.99|
|  176562|USB-C Charging Cable|               1|     11.95|4/29/2019 13:03|381 Wilson St, Sa...|      11.95|
|  176563|Bose Soun

In [88]:
# Load/Write both sales.csv file data into CompanyABC_DB database from the SparkSQL Dataframe
sales_df.write.format("jdbc") \
  .mode("overwrite") \
  .option("url", "jdbc:mysql://localhost:3306/CompanyABC_DB") \
  .option("dbtable", "CompanyABC_DB.sales_data") \
  .option("user", "root") \
  .option("password", "password") \
  .save()