# Read & Analyze

## Importing Libraries

In [1]:
import pyspark 
from pyspark import SparkContext 
from pyspark.sql import SparkSession 
from pyspark import SQLContext
import os 
from delta.tables import * 
from delta.tables import DeltaTable 
import hashlib 
import datetime
import urllib.request 
import json 
from datetime import timedelta, date
from itertools import islice 
import sys
from datetime import datetime
from pyspark.sql import functions as f
from pyspark.sql.types import *

## Start Spark Session

In [2]:
# Build the SparkSession used by every cell below.
from pyspark.sql import SparkSession

# getOrCreate() hands back an already-running session if there is one,
# otherwise it starts a new one with the settings chained here:
#   - master "local[1]": run Spark locally
#   - extraClassPath: add every jar under /home/jovyan/work/jars to the driver
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("Read_&_Analyze")
    .config("spark.driver.extraClassPath", "/home/jovyan/work/jars/*")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Show the session details: as the cell's last expression, Jupyter renders the SparkSession's repr
spark

### Read a Plain Text File - Schema Definition

In [14]:
# Explicit schema for the coffee data: a string "name" column and a double
# "roast" column, both nullable.
from pyspark.sql.types import DoubleType, StringType, StructField, StructType

schema = StructType(
    [
        StructField("name", StringType(), nullable=True),
        StructField("roast", DoubleType(), nullable=True),
    ]
)

# NOTE(review): `input_path` is not assigned in any cell visible above this one.
# Confirm it is defined before this cell runs; on a fresh kernel this line would
# otherwise fail with a NameError.
#
# Load the file as CSV using the schema above. header=false means the first
# line of the file is treated as data, not as column names.
coffee_sdf = (
    spark.read.format("csv")
    .option("header", "false")
    .schema(schema)
    .load(input_path)
)

# Preview the loaded rows.
coffee_sdf.show()

+-----------+-----+
|       name|roast|
+-----------+-----+
|    folgers| 10.0|
|      yuban| 10.0|
|  nespresso| 10.0|
|     ritual|  4.0|
|four barrel|  5.0|
+-----------+-----+



In [15]:
# Print the DataFrame's schema as a tree: each column's name, type, and nullability
coffee_sdf.printSchema()

root
 |-- name: string (nullable = true)
 |-- roast: double (nullable = true)



### Create a SQL View

In [17]:
# Register the DataFrame as the temporary view "coffee" so it can be queried
# with SQL; an existing view with the same name is replaced.
coffee_sdf.createOrReplaceTempView("coffee")

# Read every row back through the view, highest roast first.
(
    spark.sql("SELECT * FROM coffee ORDER BY roast desc")
    .show()
)

+-----------+-----+
|       name|roast|
+-----------+-----+
|  nespresso| 10.0|
|      yuban| 10.0|
|    folgers| 10.0|
|four barrel|  5.0|
|     ritual|  4.0|
+-----------+-----+



In [19]:
# Describe the "coffee" view: the output lists each column's name, data type, and comment

spark.sql("desc coffee").show()

+--------+---------+-------+
|col_name|data_type|comment|
+--------+---------+-------+
|    name|   string|   null|
|   roast|   double|   null|
+--------+---------+-------+



In [None]:
## Aggregate SELECT queries on the coffee view

In [20]:
# Average roast over every row of the "coffee" view.
# This is a global aggregate (no GROUP BY), so the result is a single row
# and no ORDER BY is needed.
query = """
    SELECT avg(roast) AS avg_roast
    FROM coffee
"""

spark.sql(query).show()

+---------+
|avg_roast|
+---------+
|      7.8|
+---------+



In [21]:
# Minimum and maximum roast over every row of the "coffee" view.
# Each aggregate carries its own alias so the output columns are labelled for
# what they actually contain (min_roast / max_roast). Like any global
# aggregate without GROUP BY, the result is a single row, so no ORDER BY is needed.
query = """
    SELECT min(roast) AS min_roast,
           max(roast) AS max_roast
    FROM coffee
"""

spark.sql(query).show()

+----------+---------+
|min(roast)|avg_roast|
+----------+---------+
|       4.0|     10.0|
+----------+---------+

