In [1]:
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.master("local[1]").appName("DataFrame Schema").getOrCreate()

In [3]:
spark


In [4]:
#Schema Enforcement
    #1.Schema ddl Approach

In [5]:
Orders_schema='order_id long,order_date date,customer_id long,order_status string'

In [10]:
orders_df=spark.read.format("csv").schema(Orders_schema).load(r"C:\Users\user\OneDrive\Desktop\Big-data-trendytech\udemy\RetailDB+SalesData\RetailDB SalesData\Orders\part-00000 -no-header.csv")

In [11]:
orders_df.show(3)

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|2013-07-25|      11599|         CLOSED|
|       2|2013-07-25|        256|PENDING_PAYMENT|
|       3|2013-07-25|      12111|       COMPLETE|
+--------+----------+-----------+---------------+
only showing top 3 rows



In [12]:
orders_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [13]:
#2.using StructField and StructType

In [4]:
from pyspark.sql.types import *

In [15]:
orders_schema_struct=StructType([StructField("order_id",LongType()),StructField("order_date",DateType()),StructField("customer_id",LongType()),StructField("order_status",StringType())])

In [16]:
orders_df_struct=spark.read.format("csv").schema(orders_schema_struct).load(r"C:\Users\user\OneDrive\Desktop\Big-data-trendytech\udemy\RetailDB+SalesData\RetailDB SalesData\Orders\part-00000 -no-header.csv")

In [17]:
orders_df_struct.show(3)

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|2013-07-25|      11599|         CLOSED|
|       2|2013-07-25|        256|PENDING_PAYMENT|
|       3|2013-07-25|      12111|       COMPLETE|
+--------+----------+-----------+---------------+
only showing top 3 rows



In [18]:
orders_df_struct.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: date (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [20]:
# Spark will accept data format in "yyyy-mm-dd" and if it is in different format , different versions of spark will behave differently
#inorder to solve this we have two approaches
    # 1.mention the dateformat while creating the dataframe

In [22]:
orders_date1=spark.read.format("csv").schema(orders_schema_struct).option("dateFormat","mm-dd-yyyy").load(r"C:\Users\user\OneDrive\Desktop\Big-data-trendytech\udemy\RetailDB+SalesData\RetailDB SalesData\Orders\dateformat.csv")

In [24]:
orders_date1.show(2)

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|2013-01-25|      11599|         CLOSED|
|       2|2013-01-25|        256|PENDING_PAYMENT|
+--------+----------+-----------+---------------+
only showing top 2 rows



In [25]:
#2 another method is that we can load as string and later we can change using withColumn function

In [26]:
Orders_schema_date='order_id long,order_date string,customer_id long,order_status string'

In [30]:
orders_date2=spark.read.format("csv").schema(Orders_schema_date).load(r"C:\Users\user\OneDrive\Desktop\Big-data-trendytech\udemy\RetailDB+SalesData\RetailDB SalesData\Orders\dateformat.csv")

In [31]:
orders_date2.show(2)

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|07-25-2013|      11599|         CLOSED|
|       2|07-25-2013|        256|PENDING_PAYMENT|
+--------+----------+-----------+---------------+
only showing top 2 rows



In [5]:
from pyspark.sql.functions import *

In [33]:
orders_date3=orders_date2.withColumn("order_date",to_date("order_date","mm-dd-yyyy"))

In [35]:
orders_date3.show(2)

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|2013-01-25|      11599|         CLOSED|
|       2|2013-01-25|        256|PENDING_PAYMENT|
+--------+----------+-----------+---------------+
only showing top 2 rows



In [6]:
#Nested Schema

In [7]:
customer_list=[(1,("jincy","george"),"markham"),(2,("eldo","joseph"),"calgary"),(2,("ethan","eldo"),"edmonton")]

In [8]:
customer_schema="customer_id long,fullname struct< firstname:string,lastname:string>,city string"

In [9]:
df=spark.createDataFrame(customer_list,customer_schema)

In [10]:
df.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- city: string (nullable = true)



In [11]:
df.show()

+-----------+---------------+--------+
|customer_id|       fullname|    city|
+-----------+---------------+--------+
|          1|{jincy, george}| markham|
|          2| {eldo, joseph}| calgary|
|          2|  {ethan, eldo}|edmonton|
+-----------+---------------+--------+



In [12]:
#another method

In [13]:
customer_schema_struct=StructType([StructField("customer_id",LongType()),StructField("fullname",StructType([StructField("firstname",StringType()),StructField("lastname",StringType())])),StructField("city",StringType())])

In [14]:
df_struct=spark.createDataFrame(customer_list,customer_schema_struct)

In [15]:
df_struct.show()

+-----------+---------------+--------+
|customer_id|       fullname|    city|
+-----------+---------------+--------+
|          1|{jincy, george}| markham|
|          2| {eldo, joseph}| calgary|
|          2|  {ethan, eldo}|edmonton|
+-----------+---------------+--------+



In [16]:
df_struct.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- fullname: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- city: string (nullable = true)

