In [1]:
# Build an RDD whose elements are JSON-formatted strings, one customer record
# per element (keys: ID, Name, Address). Each element is a complete JSON
# object, so the whitespace/newlines inside a record are insignificant.
customer_json_records = [
    """
{"ID": "111-11-1111",
 "Name": "Doug Walters",
 "Address": "1111 Hemlock St., Overland Park, Kansas"}
""",
    """
{"ID": "123-11-1111",
 "Name": "Calamity Jane",
 "Address": "2222 Oak St., Chisolm Trail, Texas"}
""",
    """
{"ID": "222-11-1111",
 "Name": "Sherlock Holmes",
 "Address": "221B Baker Street, London, UK"}
""",
]
# `sc` is the SparkContext supplied by the notebook environment.
customerRDD = sc.parallelize(customer_json_records)

In [2]:
# Peek at the first element of customerRDD; take(1) returns it wrapped in a list,
# which the notebook displays as the cell output.
customerRDD.take(1)

In [3]:
# Turn the JSON strings held in customerRDD into a DataFrame. No schema is
# passed here, so the columns are derived from the JSON content itself.
json_reader = spark.read
df = json_reader.json(customerRDD)
# truncate=False prints full cell values instead of cutting off long strings.
df.show(truncate=False)

In [4]:
# Print the column names and types of df (the DataFrame read from the JSON records when cells run in order).
df.printSchema()

In [5]:
# Wildcard import kept as-is: it supplies StructType, StructField and the
# column type classes used for the schema below.
from pyspark.sql.types import *

# Second way to build the DataFrame: plain tuples plus an explicit schema.
# NOTE: this rebinds both customerRDD and df, replacing the JSON-based objects
# created earlier in the notebook.
customer_rows = [
    ("111-11-1111", "Viv Richards", "101 Antigua Bay, Jamaica", 67, 250000.00),
    ("222-22-2222", "Doug Walters", "111 Adelaide Ave, Sydney", 71, 100000.00),
    ("333-33-3333", "Steffi Graf", "101 Casino Dr, Las Vegas", 53, 500000.00),
]
customerRDD = sc.parallelize(customer_rows)

# One (column name, Spark type, nullable) entry per tuple position.
column_specs = [
    ("ID", StringType(), True),
    ("Name", StringType(), True),
    ("Address", StringType(), False),
    ("Age", IntegerType(), False),
    ("Salary", DoubleType(), True),
]
schema = StructType(
    [StructField(col_name, col_type, is_nullable)
     for col_name, col_type, is_nullable in column_specs]
)

df = spark.createDataFrame(customerRDD, schema)
df.show()


In [6]:
# Print the schema of df again; when cells run in order, df is now the DataFrame
# created with the explicit StructType schema.
df.printSchema()

In [7]:
# Register df as a temporary view named "customers" so it can be queried through spark.sql.
# createOrReplaceTempView replaces any existing temporary view with the same name,
# so this cell is safe to re-run.
df.createOrReplaceTempView("customers")

In [8]:
# Project only the name and address columns from the "customers" view.
name_address_query = """select name, address from customers"""
spark.sql(name_address_query).show()

In [9]:
# Display customers whose names start with "V".
names_starting_with_v_query = """select name, address from customers where name like 'V%'"""
spark.sql(names_starting_with_v_query).show()

In [10]:
# Name and address of every customer, sorted by name in ascending order.
sorted_by_name_query = """select name, address from customers order by name"""
spark.sql(sorted_by_name_query).show()

In [11]:
# Name and address of every customer, sorted by name in descending order.
sorted_by_name_desc_query = """select name, address from customers order by name desc"""
spark.sql(sorted_by_name_desc_query).show()

In [12]:
# Aggregate over the whole "customers" view: one row holding the average age
# and the average salary, exposed as Average_Age and Average_Salary.
averages_query = "select avg(age) as Average_Age, avg(salary) as Average_Salary from customers"
averages_df = spark.sql(averages_query)
averages_df.show()

In [13]:
# The same kind of projection can be written with the DataFrame API instead of SQL.
name_and_age = df.select("Name", "Age")
name_and_age.show()

In [14]:
# Names and addresses of customers older than 60, using the DataFrame API.
# Apply the filter first, while the age column is still part of the frame, and
# project afterwards: filtering after select("Name", "Address") would reference
# a column that the projection has already dropped.
df.filter("age > 60").select("Name", "Address").show()