# Ben Harris - DS2002 Capstone

### Libraries

In [None]:
import os
import json
import pymongo
import pyspark.pandas as pd  # This uses Koalas that is included in PySpark version 3.2 or newer.
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, BinaryType
from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType, FloatType, DecimalType

### Global Variables

In [None]:
# Azure MySQL Server Connection Information ###################
# NOTE(review): the user names and passwords in this cell are stored in plain
# text. Consider loading them from a secret store (for example a Databricks
# secret scope) and rotating the values that have been exposed.
# NOTE(review): jdbc_hostname, jdbc_port, src_database and
# connection_properties are not referenced by any other cell visible in this
# notebook; the JDBC view cell repeats the same values as literals.
jdbc_hostname = "nra2je-mysql.mysql.database.azure.com"
jdbc_port = 3306
src_database = "capstone"

connection_properties = {
  "user" : "nra2je",
  "password" : "Wilson23185!",
  "driver" : "org.mariadb.jdbc.Driver"
}

# MongoDB Atlas Connection Information ########################
# Used to build the mongodb+srv connection string for the Atlas cluster.
atlas_cluster_name = "sandbox.4mvrkqg"
atlas_database_name = "capstone"
atlas_user_name = "nra2je"
atlas_password = "Passw0rd123"

# Data Files (CSV) Information ###############################
# dst_database only names the DBFS folder that holds streaming output and
# checkpoints; the tables themselves are created in the capstone2 database.
dst_database = "sales_data"

base_dir = "dbfs:/FileStore/capstone_data"
database_dir = f"{base_dir}/{dst_database}"

# Source data layout: <base_dir>/sales/batch and <base_dir>/sales/stream.
data_dir = f"{base_dir}/sales"
batch_dir = f"{data_dir}/batch"
stream_dir = f"{data_dir}/stream"

# Only sales_stream_dir is read by the streaming cells visible in this
# notebook; the inspector and store stream folders are defined but unused here.
inspectors_stream_dir = f"{stream_dir}/inspector_data"
stores_stream_dir = f"{stream_dir}/stores_data"
sales_stream_dir = f"{stream_dir}/sales"

# Per-layer output folders. The bronze and silver paths are used for the
# Auto Loader schema location and the streaming checkpoints.
sales_output_bronze = f"{database_dir}/fact_sales/bronze"
sales_output_silver = f"{database_dir}/fact_sales/silver"
sales_output_gold   = f"{database_dir}/fact_sales/gold"


# Delete the Streaming Files ##################################
# Recursively removes previous streaming output and checkpoints so the
# streams start from scratch on every run.
dbutils.fs.rm(f"{database_dir}/fact_sales", True) 

# Delete the Database Files ###################################
# database_dir is the parent of the fact_sales folder removed above, so this
# recursive delete also covers it.
dbutils.fs.rm(database_dir, True)

False

### Global Functions

In [None]:
##################################################################################################################
# Use this Function to Fetch a DataFrame from the MongoDB Atlas database server Using PyMongo.
##################################################################################################################
def get_mongo_dataframe(user_id, pwd, cluster_name, db_name, collection, conditions, projection, sort):
    """Fetch documents from a MongoDB Atlas collection into a DataFrame.

    A client is opened for the duration of the call and always closed again,
    even when the query fails.

    Args:
        user_id: Atlas user name. It is interpolated into the connection
            string verbatim, so reserved URI characters must already be
            percent-escaped by the caller.
        pwd: Atlas password (same escaping rule as ``user_id``).
        cluster_name: Cluster host prefix placed before ``.mongodb.net``.
        db_name: Database to query.
        collection: Collection to query.
        conditions: Filter document for ``find``; a falsy value matches
            every document.
        projection: Projection for ``find``; a falsy value returns all
            fields.
        sort: Sort specification handed to ``cursor.sort``; a falsy value
            leaves the cursor unsorted.

    Returns:
        A DataFrame built with the module-level ``pd`` alias from the list
        of matching documents.
    """
    mongo_uri = f"mongodb+srv://{user_id}:{pwd}@{cluster_name}.mongodb.net/{db_name}"

    client = pymongo.MongoClient(mongo_uri)
    try:
        # Apply each optional argument independently. Previously the filter,
        # projection and sort were honoured only in specific combinations and
        # were otherwise silently ignored (e.g. a filter without a projection
        # returned the whole collection).
        cursor = client[db_name][collection].find(conditions or {}, projection or None)
        if sort:
            cursor = cursor.sort(sort)

        # Materialise the cursor before the client is closed.
        dframe = pd.DataFrame(list(cursor))
    finally:
        client.close()

    return dframe

def set_mongo_collection(client, db_name, data_directory, csv_files):
    """Replace MongoDB collections with the contents of CSV files.

    Args:
        client: An open MongoDB client. It is closed before this function
            returns, whether or not the load succeeds, so the caller must
            not reuse it afterwards.
        db_name: Name of the target database.
        data_directory: Directory that contains the CSV files.
        csv_files: Mapping of collection name to CSV file name inside
            ``data_directory``.
    """
    db = client[db_name]

    try:
        for collection_name, csv_file in csv_files.items():
            csv_path = os.path.join(data_directory, csv_file)

            # Read the file before dropping anything, so a missing or
            # unreadable CSV does not leave the collection empty.
            records = pd.read_csv(csv_path).to_dict(orient='records')

            db[collection_name].drop()

            # insert_many rejects an empty document list; an empty CSV
            # simply results in an empty (dropped) collection.
            if records:
                db[collection_name].insert_many(records)
    finally:
        client.close()


### Ingest Reference Data

In [None]:
%sql
-- Start from a clean slate: drop capstone2 together with all of its tables (CASCADE).
DROP DATABASE IF EXISTS capstone2 CASCADE;

In [None]:
%sql
-- Recreate the capstone2 database at an explicit DBFS location and attach
-- descriptive properties to it.
CREATE DATABASE IF NOT EXISTS capstone2
LOCATION "dbfs:/FileStore/capstone_data/capstone2"
WITH DBPROPERTIES (contains_pii = true, purpose = "DS-2002 Capstone");

In [None]:
%sql
-- Expose the dim_date table of the MySQL "capstone" database as a temporary
-- view through Spark's JDBC data source.
-- NOTE(review): host, user and password are repeated here as literals (they
-- duplicate the values in the Global Variables cell) and the password is in
-- plain text; consider sourcing them from a secret store.
CREATE OR REPLACE TEMPORARY VIEW view_date
USING org.apache.spark.sql.jdbc
OPTIONS (
  url "jdbc:mysql://nra2je-mysql.mysql.database.azure.com:3306/capstone",
  dbtable "dim_date",
  user "nra2je",
  password "Wilson23185!" 
)

In [None]:
%sql
-- Make capstone2 the current database, then materialise the JDBC-backed
-- view_date as the capstone2.dim_date table.
-- NOTE(review): the table LOCATION is under /FileStore/lab_data, while the
-- database itself was created under /FileStore/capstone_data; confirm the
-- split is intentional.
USE DATABASE capstone2;

CREATE OR REPLACE TABLE capstone2.dim_date
COMMENT "Date Dimension Table"
LOCATION "dbfs:/FileStore/lab_data/capstone2/dim_date"
AS SELECT * FROM view_date

num_affected_rows,num_inserted_rows


In [None]:
%sql
-- Inspect the column definitions and table metadata of the date dimension.
DESCRIBE EXTENDED capstone2.dim_date;

col_name,data_type,comment
date_key,int,
full_date,date,
date_name,varchar(11),
date_name_us,varchar(11),
date_name_eu,varchar(11),
day_of_week,int,
day_name_of_week,varchar(10),
day_of_month,int,
day_of_year,int,
weekday_weekend,varchar(10),


In [None]:
%sql
-- Preview five rows of the date dimension.
SELECT * FROM capstone2.dim_date LIMIT 5

date_key,full_date,date_name,date_name_us,date_name_eu,day_of_week,day_name_of_week,day_of_month,day_of_year,weekday_weekend,week_of_year,month_name,month_of_year,is_last_day_of_month,calendar_quarter,calendar_year,calendar_year_month,calendar_year_qtr,fiscal_month_of_year,fiscal_quarter,fiscal_year,fiscal_year_month,fiscal_year_qtr
20000101,2000-01-01,2000/01/01,01/01/2000,01/01/2000,7,Saturday,1,1,Weekend,52,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000102,2000-01-02,2000/01/02,01/02/2000,02/01/2000,1,Sunday,2,2,Weekend,52,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000103,2000-01-03,2000/01/03,01/03/2000,03/01/2000,2,Monday,3,3,Weekday,1,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000104,2000-01-04,2000/01/04,01/04/2000,04/01/2000,3,Tuesday,4,4,Weekday,1,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3
20000105,2000-01-05,2000/01/05,01/05/2000,05/01/2000,4,Wednesday,5,5,Weekday,1,January,1,N,1,2000,2000-01,2000Q1,7,3,2000,2000-07,2000Q3


### Fetch Reference Data

In [None]:
# List the files available in the batch source directory.
display(dbutils.fs.ls(batch_dir))

In [None]:
# Load the inspector and store CSV files into MongoDB Atlas collections.
# NOTE(review): batch_dir resolves to dbfs:/FileStore/capstone_data/sales/batch,
# but this path has no "sales" segment; confirm the CSV files really live here.
source_dir = '/dbfs/FileStore/capstone_data/batch'

# Maps target collection name -> CSV file name inside source_dir.
csv_files = {
    "inspectors": 'inspector_data.csv',
    "stores": 'stores_data.csv',
}

# set_mongo_collection(client, db_name, data_directory, csv_files) expects an
# open client as its first argument (and closes it when done), so build one
# from the Atlas settings instead of passing the raw credentials.
atlas_uri = (
    f"mongodb+srv://{atlas_user_name}:{atlas_password}"
    f"@{atlas_cluster_name}.mongodb.net/{atlas_database_name}"
)
mongo_client = pymongo.MongoClient(atlas_uri)

set_mongo_collection(mongo_client, atlas_database_name, source_dir, csv_files)

In [None]:
%scala
// Build the MongoDB Atlas connection string used by the Scala cells that read
// through the MongoDB Spark connector.
// NOTE(review): the user name and password are hard-coded in plain text and
// duplicate the Python variables in the Global Variables cell; consider
// loading them from a secret store.
import com.mongodb.spark._

val userName = "nra2je"
val pwd = "Passw0rd123"
val clusterName = "sandbox.4mvrkqg"
val atlas_uri = s"mongodb+srv://$userName:$pwd@$clusterName.mongodb.net/?retryWrites=true&w=majority"

#### Inspector Dimension

In [None]:
%scala

// Read the "inspectors" collection of the "capstone" database through the
// MongoDB Spark connector and keep only the four dimension columns.
val inspectorsReader = spark.read
  .format("com.mongodb.spark.sql.DefaultSource")
  .option("spark.mongodb.input.uri", atlas_uri)
  .option("database", "capstone")
  .option("collection", "inspectors")

val df_inspector = inspectorsReader
  .load()
  .select("Inspector_ID", "First", "Last", "Email")

display(df_inspector)

In [None]:
%scala
// Print the schema of the inspector DataFrame.
df_inspector.printSchema()

In [None]:
%scala
// Persist the inspector DataFrame as the Delta table capstone2.dim_inspector,
// overwriting any existing contents.
df_inspector.write.format("delta").mode("overwrite").saveAsTable("capstone2.dim_inspector")

In [None]:
%sql
-- Inspect the column definitions and table metadata of the inspector dimension.
DESCRIBE EXTENDED capstone2.dim_inspector

In [None]:
%sql
-- Preview five rows of the inspector dimension.
SELECT * FROM capstone2.dim_inspector LIMIT 5

#### Store Dimension

In [None]:
# Load the store reference data from the batch directory. The first CSV row
# is treated as the header and column types are inferred from the data.
store_csv = f"{batch_dir}/stores_data.csv"

df_store = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true')
    .load(store_csv)
)

display(df_store)

In [None]:
# Print the inferred schema of the store DataFrame.
df_store.printSchema()

In [None]:
# Persist the store DataFrame as the Delta table capstone2.dim_store,
# overwriting any existing contents.
# This cell must run as Python (no %scala magic): df_store is created in a
# Python cell, and variables are not shared between language REPLs.
df_store.write.format("delta").mode("overwrite").saveAsTable("capstone2.dim_store")

In [None]:
%sql
-- Inspect the column definitions and table metadata of the store dimension.
DESCRIBE EXTENDED capstone2.dim_store

In [None]:
%sql
-- Preview five rows of the store dimension.
SELECT * FROM capstone2.dim_store LIMIT 5

#### Verify Dimension Tables

In [None]:
%sql
-- Make capstone2 the current database and list its tables to confirm the
-- dimension tables were created.
USE capstone2;
SHOW TABLES

### Integrate Reference and Real Time Data

In [None]:
# Stream the JSON sales files with Auto Loader ("cloudFiles") and expose the
# stream to SQL cells as the temporary view sales_raw_tempview.
sales_raw_stream = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", sales_output_bronze)
    .option("cloudFiles.inferColumnTypes", "true")
    .option("multiLine", "true")
    .load(sales_stream_dir)
)

sales_raw_stream.createOrReplaceTempView("sales_raw_tempview")

In [None]:
%sql
-- Bronze view: every raw sales column plus two traceability columns, the
-- time the row was received and the file it was read from.
CREATE OR REPLACE TEMPORARY VIEW sales_bronze_tempview AS
SELECT
  *,
  current_timestamp() AS receipt_time,
  input_file_name() AS source_file
FROM sales_raw_tempview

In [None]:
%sql
-- Inspect the bronze view (it is backed by the streaming read of the sales files).
SELECT * FROM sales_bronze_tempview

In [None]:
# Continuously append the bronze view to the Delta table fact_sales_bronze,
# tracking progress in a checkpoint under the bronze output folder.
bronze_checkpoint = f"{sales_output_bronze}/_checkpoint"
bronze_source = spark.table("sales_bronze_tempview")

(bronze_source.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", bronze_checkpoint)
    .table("fact_sales_bronze"))

In [None]:
# Read the bronze Delta table as a stream and expose it to SQL cells as the
# temporary view sales_silver_tempview.
bronze_stream = spark.readStream.table("fact_sales_bronze")
bronze_stream.createOrReplaceTempView("sales_silver_tempview")

In [None]:
%sql
-- Inspect the rows streamed from the bronze table.
SELECT * FROM sales_silver_tempview

In [None]:
%sql
-- Show the columns available in the silver source view before joining.
DESCRIBE EXTENDED sales_silver_tempview

In [None]:
%sql
-- Silver fact view: enrich each streamed sales row with store, inspector and
-- date attributes. All joins are INNER, so a sales row without a matching
-- inspector, store or date key is dropped.
-- store_id is projected as well because the per-store aggregation
-- (capstone2.fact_sales_by_store) selects and groups by it.
CREATE OR REPLACE TEMPORARY VIEW fact_sales_silver_tempview AS (
  SELECT st.Type,
      st.Size,
      st.State,
      st.City,
      st.Address,
      s.Dept,
      i.First,
      i.Last,
      i.Email,
      d.day_name_of_week AS inspected_day_name_of_week,
      d.day_of_month AS inspected_day_of_month,
      d.weekday_weekend AS inspected_weekday_weekend,
      d.month_name AS inspected_month_name,
      d.calendar_quarter AS inspected_calendar_quarter,
      d.calendar_year AS inspected_calendar_year,
      s.Weekly_Sales,
      s.IsHoliday,
      s.Store_id AS store_id
  FROM sales_silver_tempview AS s
  INNER JOIN capstone2.dim_inspector AS i
  ON s.Inspector_id = i.Inspector_id
  INNER JOIN capstone2.dim_store AS st
  ON s.Store_id = st.Store
  INNER JOIN capstone2.dim_date AS d
  ON s.sale_date_key = d.date_key
)

In [None]:
# Continuously append the joined silver view to the Delta table
# fact_sales_silver, tracking progress in a checkpoint under the silver
# output folder.
silver_checkpoint = f"{sales_output_silver}/_checkpoint"
silver_source = spark.table("fact_sales_silver_tempview")

(silver_source.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", silver_checkpoint)
    .table("fact_sales_silver"))

In [None]:
%sql
-- Inspect the rows written to the silver fact table.
SELECT * FROM fact_sales_silver

In [None]:
%sql
-- Inspect the column definitions and table metadata of the silver fact table.
DESCRIBE EXTENDED capstone2.fact_sales_silver

#### Gold Table Aggregations

In [None]:
%sql
-- Gold aggregation: average and total weekly sales per store, rendered as
-- dollar strings with thousands separators and no decimals.
-- Requires a store_id column in capstone2.fact_sales_silver.
-- Fixes: removed the doubled comma after Average_Weekly_Sales, corrected the
-- sWeekly_Sales typo, used format_number (Spark SQL's number formatter), and
-- corrected the table name in the final SELECT.
CREATE OR REPLACE TABLE capstone2.fact_sales_by_store AS (
  SELECT store_id AS Store_Number
    , Address AS Store_Location
    , CONCAT('$', format_number(AVG(Weekly_Sales), 0)) AS Average_Weekly_Sales
    , CONCAT('$', format_number(SUM(Weekly_Sales), 0)) AS Total_Sales
  FROM capstone2.fact_sales_silver
  GROUP BY store_id, Address);

SELECT * FROM capstone2.fact_sales_by_store

<div>
<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }

    .dataframe tbody tr th {
        vertical-align: top;
    }

    .dataframe thead th {
        text-align: right;
    }
</style>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>Store_Number</th>
      <th>Store_Location</th>
      <th>Average_Weekly_Sales</th>
      <th>Total_Sales</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>1</td>
      <td>1601 N State Route 50 Bourbonnais, IL 60914 US</td>
      <td>$21,711</td>
      <td>$222,402,809</td>
    </tr>
    <tr>
      <th>1</th>
      <td>2</td>
      <td>41 W. 87th Street Chicago, IL 60620 US</td>
      <td>$26,898</td>
      <td>$275,382,441</td>
    </tr>
    <tr>
      <th>2</th>
      <td>3</td>
      <td>601 King St Alexandria, VA 22314 US</td>
      <td>$6,373</td>
      <td>$57,586,735</td>
    </tr>
    <tr>
      <th>3</th>
      <td>4</td>
      <td>235 S State St Ann Arbor, MI 48104 US</td>
      <td>$29,161</td>
      <td>$299,543,953</td>
    </tr>
    <tr>
      <th>4</th>
      <td>5</td>
      <td>1081 Pine Plaza Dr Apex, NC 27523 US</td>
      <td>$5,053</td>
      <td>$45,475,689</td>
    </tr>
    <tr>
      <th>5</th>
      <td>6</td>
      <td>48557 Morongo Trail Cabazon, CA 92230 US</td>
      <td>$21,913</td>
      <td>$223,756,131</td>
    </tr>
    <tr>
      <th>6</th>
      <td>7</td>
      <td>5 Bel Air S Pkwy Bel Air, MD 21014 US</td>
      <td>$8,359</td>
      <td>$81,598,275</td>
    </tr>
    <tr>
      <th>7</th>
      <td>8</td>
      <td>305 S 6th St Boise, ID 83702 US</td>
      <td>$13,133</td>
      <td>$129,951,181</td>
    </tr>
    <tr>
      <th>8</th>
      <td>9</td>
      <td>1132 S Clinton St Clinton &amp; Grenshaw Chicago, ...</td>
      <td>$8,773</td>
      <td>$77,789,219</td>
    </tr>
    <tr>
      <th>9</th>
      <td>10</td>
      <td>4400 Sharon Rd Space G41 Charlotte, NC 28211 US</td>
      <td>$26,332</td>
      <td>$271,617,714</td>
    </tr>
    <tr>
      <th>10</th>
      <td>11</td>
      <td>1109 E. 5th St. Suite 130 Austin, TX 78702 US</td>
      <td>$19,277</td>
      <td>$193,962,787</td>
    </tr>
    <tr>
      <th>11</th>
      <td>12</td>
      <td>240 Bloomfield Ave Bloomfield, NJ 07003 US</td>
      <td>$14,867</td>
      <td>$144,287,230</td>
    </tr>
    <tr>
      <th>12</th>
      <td>13</td>
      <td>400 S Baldwin Ave Ste FC-9 Arcadia, CA 91007 US</td>
      <td>$27,355</td>
      <td>$286,517,704</td>
    </tr>
    <tr>
      <th>13</th>
      <td>14</td>
      <td>316 N Michigan Ave Chicago, IL 60601 US</td>
      <td>$28,785</td>
      <td>$288,999,911</td>
    </tr>
    <tr>
      <th>14</th>
      <td>15</td>
      <td>Post Security - Ab Concourse Food Court A/B Te...</td>
      <td>$9,002</td>
      <td>$89,133,684</td>
    </tr>
    <tr>
      <th>15</th>
      <td>16</td>
      <td>919 Pearl St Boulder, CO 80302 US</td>
      <td>$7,863</td>
      <td>$74,252,425</td>
    </tr>
    <tr>
      <th>16</th>
      <td>17</td>
      <td>19179 Bear Valley Rd Ste 4 Apple Valley, CA 92...</td>
      <td>$12,954</td>
      <td>$127,782,139</td>
    </tr>
    <tr>
      <th>17</th>
      <td>18</td>
      <td>598 Massachusetts Ave Cambridge, MA 02139 US</td>
      <td>$15,733</td>
      <td>$155,114,734</td>
    </tr>
    <tr>
      <th>18</th>
      <td>19</td>
      <td>8599 W Grand River Ave Brighton, MI 48116 US</td>
      <td>$20,362</td>
      <td>$206,634,862</td>
    </tr>
    <tr>
      <th>19</th>
      <td>20</td>
      <td>2608 Central Ave SE Albuquerque, NM 87106 US</td>
      <td>$29,508</td>
      <td>$301,397,792</td>
    </tr>
    <tr>
      <th>20</th>
      <td>21</td>
      <td>3600 Coors Blvd NW A-800 Albuquerque, NM 87120 US</td>
      <td>$11,283</td>
      <td>$108,117,879</td>
    </tr>
    <tr>
      <th>21</th>
      <td>22</td>
      <td>695 W Waterloo St Canal Winchester, OH 43110 US</td>
      <td>$15,181</td>
      <td>$147,075,649</td>
    </tr>
    <tr>
      <th>22</th>
      <td>23</td>
      <td>1650 28th St Unit 1224 Boulder, CO 80301 US</td>
      <td>$19,776</td>
      <td>$198,750,618</td>
    </tr>
    <tr>
      <th>23</th>
      <td>24</td>
      <td>528 E Green St Ste 101 Champaign, IL 61820 US</td>
      <td>$18,969</td>
      <td>$194,016,021</td>
    </tr>
    <tr>
      <th>24</th>
      <td>25</td>
      <td>1457 S Carson St Ste 102 Carson City, NV 89701 US</td>
      <td>$10,308</td>
      <td>$101,061,179</td>
    </tr>
    <tr>
      <th>25</th>
      <td>26</td>
      <td>2743 N Elston Ave Chicago, IL 60647 US</td>
      <td>$14,554</td>
      <td>$143,416,394</td>
    </tr>
    <tr>
      <th>26</th>
      <td>27</td>
      <td>101 W State St Baton Rouge, LA 70802 US</td>
      <td>$24,827</td>
      <td>$253,855,917</td>
    </tr>
    <tr>
      <th>27</th>
      <td>28</td>
      <td>12697 N Pennsylvania St Ste 100 Carmel, IN 460...</td>
      <td>$18,715</td>
      <td>$189,263,681</td>
    </tr>
    <tr>
      <th>28</th>
      <td>29</td>
      <td>5001 S Cooper St Ste 125 Arlington, TX 76017 US</td>
      <td>$8,159</td>
      <td>$77,141,554</td>
    </tr>
    <tr>
      <th>29</th>
      <td>30</td>
      <td>1558 E Wooster St Bowling Green, OH 43402 US</td>
      <td>$8,764</td>
      <td>$62,716,885</td>
    </tr>
    <tr>
      <th>30</th>
      <td>31</td>
      <td>9430C Colerain Ave Cincinnati, OH 45251 US</td>
      <td>$19,682</td>
      <td>$199,613,905</td>
    </tr>
    <tr>
      <th>31</th>
      <td>32</td>
      <td>4600 Roswell Rd Unit B110 Atlanta, GA 30342 US</td>
      <td>$16,352</td>
      <td>$166,819,246</td>
    </tr>
    <tr>
      <th>32</th>
      <td>33</td>
      <td>6440 Harrison Ave Ste 300 Cincinnati, OH 45247 US</td>
      <td>$5,728</td>
      <td>$37,160,222</td>
    </tr>
    <tr>
      <th>33</th>
      <td>34</td>
      <td>55 Dodge St Beverly, MA 01915 US</td>
      <td>$13,522</td>
      <td>$138,249,763</td>
    </tr>
    <tr>
      <th>34</th>
      <td>35</td>
      <td>1924 Beacon St Brighton, MA 02135 US</td>
      <td>$13,804</td>
      <td>$131,520,672</td>
    </tr>
    <tr>
      <th>35</th>
      <td>36</td>
      <td>347 Flatbush Ave Brooklyn, NY 11238 US</td>
      <td>$8,584</td>
      <td>$53,412,215</td>
    </tr>
    <tr>
      <th>36</th>
      <td>37</td>
      <td>720 Otay Lakes Rd Chula Vista, CA 91910 US</td>
      <td>$10,297</td>
      <td>$74,202,740</td>
    </tr>
    <tr>
      <th>37</th>
      <td>38</td>
      <td>774 Broadway Brooklyn, NY 11206 US</td>
      <td>$7,492</td>
      <td>$55,159,626</td>
    </tr>
    <tr>
      <th>38</th>
      <td>39</td>
      <td>8120 San Pedro Dr NE Albuquerque, NM 87113 US</td>
      <td>$21,001</td>
      <td>$207,445,542</td>
    </tr>
    <tr>
      <th>39</th>
      <td>40</td>
      <td>15500 Excelsior Dr Bowie, MD 20716 US</td>
      <td>$13,764</td>
      <td>$137,870,310</td>
    </tr>
    <tr>
      <th>40</th>
      <td>41</td>
      <td>9873 Iron Bridge Rd Chesterfield, VA 23832 US</td>
      <td>$17,976</td>
      <td>$181,341,935</td>
    </tr>
    <tr>
      <th>41</th>
      <td>42</td>
      <td>19825 Belmont Chase Dr Ste 130 Ashburn, VA 201...</td>
      <td>$11,443</td>
      <td>$79,565,752</td>
    </tr>
    <tr>
      <th>42</th>
      <td>43</td>
      <td>65 Drum Hill Rd Chelmsford, MA 01824 US</td>
      <td>$13,415</td>
      <td>$90,565,435</td>
    </tr>
    <tr>
      <th>43</th>
      <td>44</td>
      <td>258 Stetson St Cincinnati, OH 45219 US</td>
      <td>$6,039</td>
      <td>$43,293,088</td>
    </tr>
    <tr>
      <th>44</th>
      <td>45</td>
      <td>36 Market Space Annapolis, MD 21401 US</td>
      <td>$11,663</td>
      <td>$112,395,341</td>
    </tr>
  </tbody>
</table>
</div>

In [None]:
%fs rm -r /FileStore/lab_data/