In [1]:
# Initialise findspark first; the next cell imports pyspark.
import findspark
findspark.init()
# Last expression of the cell: displays the Spark installation path that was found.
findspark.find()

'C:\\apachespark\\spark-2.4.7-bin-hadoop2.7'

In [2]:
from pyspark.sql import SparkSession

# Build the SparkSession used by every later cell; the application name is "demo".
spark = (
    SparkSession.builder
    .appName("demo")
    .getOrCreate()
)

In [3]:
# Evaluate the session object so the notebook displays it as the cell output.
spark

In [4]:
# Read customerData.json with no explicit schema, leaving the column types for
# Spark to infer (the numeric columns show up as long in the schema printed below).
customerDataDF = spark.read.json('customerData.json')

In [6]:
# Print each column's name, type and nullability for the inferred schema.
customerDataDF.printSchema()

root
 |-- age: long (nullable = true)
 |-- deptid: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



# Create the schema programmatically and attach it to the DataFrame

In [7]:
# StructType  -- the schema object: an ordered list of fields
# StructField -- a single field/column of the schema
# Form used below: StructField(name, dataType, nullable)

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for customerData.json: numeric columns are declared as
# integer, and every column is allowed to hold nulls.
schemaCustomerData = StructType([
    StructField("age", IntegerType(), nullable=True),
    StructField("deptid", IntegerType(), nullable=True),
    StructField("gender", StringType(), nullable=True),
    StructField("name", StringType(), nullable=True),
    StructField("salary", IntegerType(), nullable=True),
])

In [8]:
# Re-read the same file, this time applying the hand-written schema instead of
# relying on inferred column types.
customerDataDF2 = spark.read.schema(schemaCustomerData).json('customerData.json')

In [9]:
# Check that the explicit schema was applied: age, deptid and salary should now
# be reported as integer rather than long.
customerDataDF2.printSchema()

root
 |-- age: integer (nullable = true)
 |-- deptid: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: integer (nullable = true)



In [10]:
# Big Data Project Phases
#
# Data Acquisition Phase
#
# File -------------------> SparkDF -------------------> HDFS
# Database ---------------> SparkDF(jdbc) -------------> HDFS
# Streams ----------------> SparkStreaming ------------> HDFS
#
# Data Preprocessing Phase
# Data Transformation Phase
#
# DF --------------------> SparkSQL ------------------------> Transformed Data
#                          SparkDFFunctions
#
# Data View Phase
# DF --------------------> SparkSQL Query-------------------> Realtime Output/Near Realtime Output
#
# Intelligence Layer Phase
# DF --------------------> SparkML --------------------------> Models
#

# The SQL way of Analysing Data in Apache Spark

1. Ensure the data is loaded as a DataFrame
2. Register the DataFrame as a temporary table (view)
3. Use the SparkSession object (`spark`) to run SQL queries against that table

In [11]:
# Load auto-data.csv with the header and inferSchema reader options enabled,
# so the first line supplies the column names and column types are inferred.
autodataDf = spark.read.csv('auto-data.csv', header=True, inferSchema=True)

In [12]:
# Preview the first 5 rows of the car data.
autodataDf.show(5)

+----------+--------+------+-----+---------+-----+---------+---+----+--------+-------+-----+
|      MAKE|FUELTYPE|ASPIRE|DOORS|     BODY|DRIVE|CYLINDERS| HP| RPM|MPG-CITY|MPG-HWY|PRICE|
+----------+--------+------+-----+---------+-----+---------+---+----+--------+-------+-----+
|    subaru|     gas|   std|  two|hatchback|  fwd|     four| 69|4900|      31|     36| 5118|
| chevrolet|     gas|   std|  two|hatchback|  fwd|    three| 48|5100|      47|     53| 5151|
|     mazda|     gas|   std|  two|hatchback|  fwd|     four| 68|5000|      30|     31| 5195|
|    toyota|     gas|   std|  two|hatchback|  fwd|     four| 62|4800|      35|     39| 5348|
|mitsubishi|     gas|   std|  two|hatchback|  fwd|     four| 68|5500|      37|     41| 5389|
+----------+--------+------+-----+---------+-----+---------+---+----+--------+-------+-----+
only showing top 5 rows



In [13]:
# Register the DataFrame as a temporary view named 'autodata' so that it can be
# queried through spark.sql(). createOrReplaceTempView is used instead of
# registerTempTable, which has been deprecated since Spark 2.0 and raises a
# DeprecationWarning. Re-running this cell simply replaces the existing view.

autodataDf.createOrReplaceTempView('autodata')

In [14]:
# List the tables/views visible to this session; 'autodata' should be listed
# as a temporary one.
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        | autodata|       true|
+--------+---------+-----------+



In [16]:
# Query the temporary table with plain SQL: keep only the rows whose FUELTYPE
# is 'gas' and print the result.
spark.sql("select * from autodata where FUELTYPE = 'gas'").show()

+----------+--------+------+-----+---------+-----+---------+---+----+--------+-------+-----+
|      MAKE|FUELTYPE|ASPIRE|DOORS|     BODY|DRIVE|CYLINDERS| HP| RPM|MPG-CITY|MPG-HWY|PRICE|
+----------+--------+------+-----+---------+-----+---------+---+----+--------+-------+-----+
|    subaru|     gas|   std|  two|hatchback|  fwd|     four| 69|4900|      31|     36| 5118|
| chevrolet|     gas|   std|  two|hatchback|  fwd|    three| 48|5100|      47|     53| 5151|
|     mazda|     gas|   std|  two|hatchback|  fwd|     four| 68|5000|      30|     31| 5195|
|    toyota|     gas|   std|  two|hatchback|  fwd|     four| 62|4800|      35|     39| 5348|
|mitsubishi|     gas|   std|  two|hatchback|  fwd|     four| 68|5500|      37|     41| 5389|
|     honda|     gas|   std|  two|hatchback|  fwd|     four| 60|5500|      38|     42| 5399|
|    nissan|     gas|   std|  two|    sedan|  fwd|     four| 69|5200|      31|     37| 5499|
|     dodge|     gas|   std|  two|hatchback|  fwd|     four| 68|5500| 