## PySpark SQL
 - module in Spark that provides a higher-level abstration for working with structured data and can be used SQL queries
 - First you should create temporary table or view 
 - once table is created, it can be accessed throughout the SparkSession using sql()

In [8]:
import os
from pyspark.sql import SparkSession

In [9]:
spark = SparkSession.builder.appName('TestApp').getOrCreate()
options = {
    'header' : True,
    'delimiter': ',',
    'inferSchema' : True
}

df = spark.read.options(**options).csv(['./resources/1_sample_zip_code.csv','./resources/1_sample_zip_code.csv'])


# Create temporary table by registering df
df.createOrReplaceTempView("Cities")

# Run SQL query
df2 = spark.sql("SELECT * from Cities")
df2.show()

+--------+-----------+-----+
|zip_code|       city|state|
+--------+-----------+-----+
|   10001|   New York|   NY|
|   90001|Los Angeles|   CA|
|   60601|    Chicago|   IL|
|   77001|    Houston|   TX|
|   85001|    Phoenix|   AZ|
|   10001|   New York|   NY|
|   90001|Los Angeles|   CA|
|   60601|    Chicago|   IL|
|   77001|    Houston|   TX|
|   85001|    Phoenix|   AZ|
+--------+-----------+-----+



In [10]:
groupped_by_df = spark.sql('select city, count(*) from Cities group by city')
groupped_by_df.show()

+-----------+--------+
|       city|count(1)|
+-----------+--------+
|    Phoenix|       2|
|Los Angeles|       2|
|    Chicago|       2|
|    Houston|       2|
|   New York|       2|
+-----------+--------+



In [17]:
df = spark.read.option("header",True).csv("./resources/4_sample_zipcodes.csv")
df.printSchema()
df.show()



root
 |-- RecordNumber: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- State: string (nullable = true)

+------------+-------+-------------------+-------+-----+
|RecordNumber|Country|               City|Zipcode|State|
+------------+-------+-------------------+-------+-----+
|           1|     US|        PARC PARQUE|    704|   PR|
|           2|     US|PASEO COSTA DEL SUR|    704|   PR|
|          10|     US|       BDA SAN LUIS|    709|   PR|
|       49347|     US|               HOLT|  32564|   FL|
|       49348|     US|          HOMOSASSA|  34487|   FL|
|       61391|     US|  CINGULAR WIRELESS|  76166|   TX|
|       61392|     US|         FORT WORTH|  76177|   TX|
|       61393|     US|           FT WORTH|  76177|   TX|
|       54356|     US|        SPRUCE PINE|  35585|   AL|
|       76511|     US|           ASH HILL|  27007|   NC|
|           4|     US|    URB EUGENE RICE|    704|   PR|
|

In [22]:
# Create temporary table
df_another  = spark.read.option("header",True).csv("./resources/4_sample_zipcodes.csv").createOrReplaceTempView("Zipcodes")

In [29]:
df_another.select("country","city","zipcode","state").show(5)
# gives issue because - .createOrReplaceTempView("Zipcodes") 
# does not return a DataFrame. Instead, it registers the DataFrame as a temporary SQL table.

+-------+-------------------+-------+-----+
|country|               city|zipcode|state|
+-------+-------------------+-------+-----+
|     US|        PARC PARQUE|    704|   PR|
|     US|PASEO COSTA DEL SUR|    704|   PR|
|     US|       BDA SAN LUIS|    709|   PR|
|     US|               HOLT|  32564|   FL|
|     US|          HOMOSASSA|  34487|   FL|
+-------+-------------------+-------+-----+
only showing top 5 rows



### so the solution is below

In [30]:
df_another = spark.read.option("header",True).csv("./resources/4_sample_zipcodes.csv")

df_another.createOrReplaceTempView("Zipcodes")

df_another.select("country", "city", "zipcode", "state").show(5)

+-------+-------------------+-------+-----+
|country|               city|zipcode|state|
+-------+-------------------+-------+-----+
|     US|        PARC PARQUE|    704|   PR|
|     US|PASEO COSTA DEL SUR|    704|   PR|
|     US|       BDA SAN LUIS|    709|   PR|
|     US|               HOLT|  32564|   FL|
|     US|          HOMOSASSA|  34487|   FL|
+-------+-------------------+-------+-----+
only showing top 5 rows



### Filter

In [34]:
df_another.select("country", "city", "zipcode", "state").where("state == 'AZ'").show()


+-------+----+-------+-----+
|country|city|zipcode|state|
+-------+----+-------+-----+
|     US|MESA|  85209|   AZ|
|     US|MESA|  85210|   AZ|
+-------+----+-------+-----+

