# Introduction to Spark on JupyterNotebook

#### Importing Libraries

In [1]:
# PySpark is the main library for Spark
import pyspark 
# SparkContext is the entry point for Spark functionality
from pyspark import SparkContext 
# SparkSession is the entry point for DataFrame and SQL functionality
from pyspark.sql import SparkSession 

# SQLContext is the entry point for SparkSQL functionality, providing a way to interact with structured data using SQL queries, as well as integrating with various data sources and data formats. 
#It is required to create Spark DataFrames, which are the primary data structure for working with structured data in PySpark.
from pyspark import SQLContext

In [2]:
# Provides a way of using operating system dependent functionality
import os 
# Delta is a storage layer for data lakes
from delta.tables import * 
# DeltaTable is the main class for Delta tables
from delta.tables import DeltaTable 
# Provides cryptographic hashing functions
import hashlib 
 # Provides classes for working with dates and times
import datetime
# Provides functions for working with URLs
import urllib.request 
# Provides functions for working with JSON data
import json 
 # Import timedelta and date classes from datetime module
from datetime import timedelta, date
# Provides functions for working with iterables
from itertools import islice 
# Provides access to some variables used or maintained by the interpreter and to functions that interact strongly with the interpreter.
import sys 

In [3]:
import warnings

# Ignore warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Ignore warnings from Apache Spark
warnings.filterwarnings("ignore", message=".*consider reporting.*")
warnings.filterwarnings("ignore", message=".*illegal-access.*")
warnings.filterwarnings("ignore", message=".*default log level.*")

# Create SparkSession

In [4]:
# Create SparkSession from builder
from pyspark.sql import SparkSession

# Create a SparkSession and set the extraClassPath configuration
spark = SparkSession.builder.master("local[1]") \
    .appName("GettingStartedWithSpark") \
    .config("spark.driver.extraClassPath", "/home/jovyan/work/jars/*") \
    .getOrCreate()

# Details of the Spark Session
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Basic File Reading and Custom Parsing.

## Read & Analyse

- This next paragraph shows the most basic of Spark methods for file based reading. The zero-bells attached `spark.read.text.` 
- This is useful technique if you are working data for the first time and don’t know how to parse it, or maybe are having issues using other methods for automatic parsing.

In [5]:
## // parse the raw coffee file. this is a simple csv file without headers. 
# The goal is to build core skills: parsing, exploring data

# Read the text file into a DataFrame
df = spark.read.text("/home/jovyan/work/data/raw-coffee.csv")

# Print the schema of the DataFrame
df.printSchema()

root
 |-- value: string (nullable = true)



## Exercise: Basic File Reading and Custom Parsing

In [6]:
df.show()

+--------------+
|         value|
+--------------+
|    name,roast|
|   folgers, 10|
|     yuban, 10|
| nespresso, 10|
|     ritual, 4|
|four barrel, 5|
+--------------+



- The code reads in a DataFrame, and imports two functions split and col from the PySpark sql.functions module.
- The split function takes a string column as input and splits it into an array of substrings using a specified delimiter. In this case, it is splitting the value column on a comma ,.
- The resulting split_col column is then selected with the `select()` method and renamed to name and roast using selectExpr().
- Additionally, `CAST()`  is used to convert the roast column to an integer datatype.
- Any null values are replaced with "unknown" for the name column and 0 for the roast column using the `na.fill()` method.
- The final schema of the resulting DataFrame is printed using the `printSchema()` method.

In [7]:
from pyspark.sql.functions import split, col

converted = df.select(split(col("value"), ",").alias("split_col")) \
       .selectExpr("split_col[0] as name", "CAST(split_col[1] as INT) as roast") \
       .na.fill({'name': 'unknown', 'roast': 0})
       
converted.printSchema()

root
 |-- name: string (nullable = false)
 |-- roast: integer (nullable = false)



In [8]:
converted.show()

+-----------+-----+
|       name|roast|
+-----------+-----+
|       name|    0|
|    folgers|   10|
|      yuban|   10|
|  nespresso|   10|
|     ritual|    4|
|four barrel|    5|
+-----------+-----+



In [9]:
import pandas as pd