# Data Inception: Using Spark to help Spark

- In this notebook, we will learn how to use Spark to help us work more efficiently.

#### Importing Libraries

In [1]:
# PySpark is the main library for Spark
import pyspark 
# SparkContext is the entry point for Spark functionality
from pyspark import SparkContext 
# SparkSession is the entry point for DataFrame and SQL functionality
from pyspark.sql import SparkSession 
from pyspark import SQLContext

In [2]:
# Provides a way of using operating system dependent functionality
import os 
# Delta is a storage layer for data lakes
from delta.tables import * 
# DeltaTable is the main class for Delta tables
from delta.tables import DeltaTable 
# Provides cryptographic hashing functions
import hashlib 
 # Provides classes for working with dates and times
import datetime
# Provides functions for working with URLs
import urllib.request 
# Provides functions for working with JSON data
import json 
 # Import timedelta and date classes from datetime module
from datetime import timedelta, date
# Provides functions for working with iterables
from itertools import islice 
# Provides access to some variables used or maintained by the interpreter and to functions that interact strongly with the interpreter.
import sys 

In [3]:
import warnings

# Ignore warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Ignore warnings from Apache Spark
warnings.filterwarnings("ignore", message=".*consider reporting.*")
warnings.filterwarnings("ignore", message=".*illegal-access.*")
warnings.filterwarnings("ignore", message=".*default log level.*")

# Create SparkSession

In [4]:
# SparkSession was already imported in the first cell; the import is repeated here.
from pyspark.sql import SparkSession

# Collect the session settings on a builder first:
#   - master "local[1]": run Spark locally
#   - appName: the name given to this application
#   - spark.driver.extraClassPath: add every jar under work/jars to the driver classpath
session_builder = (
    SparkSession.builder
    .master("local[1]")
    .appName("LetSparkWorkForYou")
    .config("spark.driver.extraClassPath", "/home/jovyan/work/jars/*")
)

# Reuse an existing session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()

# Leave the session as the last expression so the notebook displays its details.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Read CSV.

In [5]:
# Read the raw coffee file as CSV, treating its first line as a header.
# No schema is supplied or inferred in this cell.
raw_coffee = (
    spark.read.format("csv")
    .option("header", True)
    .load("/home/jovyan/work/data/raw-coffee.txt")
)

# Give the two columns the names used throughout the rest of the notebook.
coffees = raw_coffee.toDF("name", "roast")

coffees.show()

+-----------+-----+
|       name|roast|
+-----------+-----+
|      yuban|   10|
|  nespresso|   10|
|     ritual|    4|
|four barrel|    5|
+-----------+-----+



# Infer Schema

In [6]:
# Same file as before, but ask Spark to infer the column types from the data
# instead of leaving the reader's defaults in place.
inferring_reader = (
    spark.read.format("csv")
    .option("inferSchema", True)
    .option("header", True)
)

coffeeAndSchema = (
    inferring_reader
    .load("/home/jovyan/work/data/raw-coffee.txt")
    .toDF("name", "roast")
)

coffeeAndSchema.show()

+-----------+-----+
|       name|roast|
+-----------+-----+
|      yuban| 10.0|
|  nespresso| 10.0|
|     ritual|  4.0|
|four barrel|  5.0|
+-----------+-----+



In [7]:
# Print the column names, types and nullability of the DataFrame read with inferSchema
coffeeAndSchema.printSchema()

root
 |-- name: string (nullable = true)
 |-- roast: double (nullable = true)



# Manually build The Schema Pattern

- When working with critical datasets, using strict schemas enables you to ignore (skip) corrupt data, or to fail fast and kick back an exception, when encountering data that doesn’t conform or parse correctly.

In [8]:
# NOTE(review): the wildcard import is kept because later cells use names it
# brings in (StructType, StructField, StringType, DoubleType). Prefer an explicit
# import list once every usage in the notebook is known.
from pyspark.sql.types import *

# Describe the expected columns up front instead of letting Spark infer them:
# `name` is a string and `roast` is a double; the third argument marks each
# field as nullable.
coffee_fields = [
    StructField("name", StringType(), True),
    StructField("roast", DoubleType(), True),
]
schema = StructType(coffee_fields)

# Apply the explicit schema while reading; the header line is still consumed.
strict_reader = spark.read.format("csv").option("header", True).schema(schema)
coffeeAndSchema = strict_reader.load("/home/jovyan/work/data/raw-coffee.txt")

# Display the typed DataFrame
coffeeAndSchema.show()

+-----------+-----+
|       name|roast|
+-----------+-----+
|      yuban| 10.0|
|  nespresso| 10.0|
|     ritual|  4.0|
|four barrel|  5.0|
+-----------+-----+



## SparkSQL

In [9]:
# Register the DataFrame under the name "coffee" so SQL statements can refer to it
# (an existing view with that name is replaced).
coffeeAndSchema.createOrReplaceTempView("coffee")

# Select every row of the view, highest roast first, and display the result.
roast_ranking = spark.sql("SELECT * FROM coffee ORDER BY roast desc")
roast_ranking.show()

+-----------+-----+
|       name|roast|
+-----------+-----+
|      yuban| 10.0|
|  nespresso| 10.0|
|four barrel|  5.0|
|     ritual|  4.0|
+-----------+-----+



## Computing Averages
- Computing an average is straightforward in Spark SQL (as in standard SQL): simply apply the `avg` aggregate function to a column.

In [10]:
# Average the roast column of the "coffee" view and expose it as avg_roast.
avg_roast_query = "SELECT avg(roast) as avg_roast from coffee"
spark.sql(avg_roast_query).show()

+---------+
|avg_roast|
+---------+
|     7.25|
+---------+



## Test 

- Find the Min & Max roast values in the table.
- Sort the table using the `ORDER BY` clause.
- Try sorting the data by coffee name.

Min & Max

In [11]:
# Return one row holding the smallest and the largest roast value in the view.
roast_range_query = "SELECT min(roast) as minimum_roast, max(roast) as maximum_roast from coffee"
spark.sql(roast_range_query).show()

+-------------+-------------+
|minimum_roast|maximum_roast|
+-------------+-------------+
|          4.0|         10.0|
+-------------+-------------+



Sort the table using the ORDER BY clause.

In [12]:
# List every coffee from the lowest roast to the highest.
by_roast_query = "SELECT * from coffee order by roast asc"
spark.sql(by_roast_query).show()

+-----------+-----+
|       name|roast|
+-----------+-----+
|     ritual|  4.0|
|four barrel|  5.0|
|      yuban| 10.0|
|  nespresso| 10.0|
+-----------+-----+



Sort data by coffee name

In [13]:
# List every coffee alphabetically by name.
by_name_query = "SELECT * from coffee order by name asc"
spark.sql(by_name_query).show()

+-----------+-----+
|       name|roast|
+-----------+-----+
|four barrel|  5.0|
|  nespresso| 10.0|
|     ritual|  4.0|
|      yuban| 10.0|
+-----------+-----+



# Writing your First Spark ETL

In [14]:
# Show the kernel's current working directory.
# Uses os.getcwd() (os is imported in the second cell) instead of the bare `pwd`
# automagic, which only resolves when automagic is enabled, no variable named
# `pwd` exists, and the cell holds a single line.
os.getcwd()

'/home/jovyan/work/notebooks'

## Write to CSV

In [16]:
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as f

# --- Run timestamp -----------------------------------------------------------
# Read the clock once so the `updated_at` column and the output directory suffix
# always describe the same moment (two separate reads could straddle midnight).
run_started = datetime.now()
# Truncated to the minute; written into every row as `updated_at`.
UPDATED = run_started.replace(second=0, microsecond=0)

# --- Schema ------------------------------------------------------------------
# Explicit column types for the raw file; both fields are nullable.
custom_schema = StructType([
    StructField("name", StringType(), True),
    StructField("roast", DoubleType(), True),
])

# --- ETL job -----------------------------------------------------------------
# Extract: read the raw file with the explicit schema.
# NOTE(review): earlier cells read raw-coffee.txt; confirm raw-coffee.csv exists.
df = (
    spark.read.format("csv")
    .option("header", True)
    .schema(custom_schema)
    .load("/home/jovyan/work/data/raw-coffee.csv")
)

# Keep only the columns the curated output needs.
new_df = df.select("name", "roast")

# Transform: stamp every row with the run timestamp.
transformed_df = new_df.withColumn("updated_at", f.lit(UPDATED))

# Load: one output directory per calendar day.
timestamp = run_started.strftime("%Y%m%d")
output_dir = f"/home/jovyan/work/data/csv_curated_{timestamp}"

# Overwrite so the cell can be re-run on the same day; with the default save
# mode the second run fails because the directory already exists.
transformed_df.write.mode("overwrite").option("header", "true").csv(output_dir)

## Write to Parquet

In [17]:
# `datetime` and `f` come from the imports in the "Write to CSV" cell.

# --- Run timestamp -----------------------------------------------------------
# Read the clock once so the `updated_at` column and the output directory suffix
# always describe the same moment.
run_started = datetime.now()
# Truncated to the minute; written into every row as `updated_at`.
UPDATED = run_started.replace(second=0, microsecond=0)

# --- Schema ------------------------------------------------------------------
# Explicit column types for the raw file; both fields are nullable.
custom_schema = StructType([
    StructField("name", StringType(), True),
    StructField("roast", DoubleType(), True),
])

# --- ETL job -----------------------------------------------------------------
# Extract: read the raw file with the schema defined in this cell. The cell
# previously passed `schema`, a variable left over from an earlier cell, so
# `custom_schema` was defined but never used.
df = (
    spark.read.format("csv")
    .option("header", True)
    .schema(custom_schema)
    .load("/home/jovyan/work/data/raw-coffee.csv")
)

# Keep only the columns the curated output needs.
new_df = df.select("name", "roast")

# Transform: stamp every row with the run timestamp.
transformed_df = new_df.withColumn("updated_at", f.lit(UPDATED))

# Load: one output directory per calendar day, replaced on re-run.
timestamp = run_started.strftime("%Y%m%d")
output_dir = f"/home/jovyan/work/data/parq_curated_{timestamp}"
transformed_df.write.mode("overwrite").parquet(output_dir)

                                                                                