In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the SparkSession for this notebook, or reuse one that already exists
# (getOrCreate), pointed at the "local[1]" master.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("Dataframe examples")
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook displays the session object as this cell's result.
spark

In [4]:
# how to create dataframe from csv files

In [5]:
# Read the Orders CSV into a DataFrame, treating the first row as the header
# and letting Spark infer the column types.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that exact file exists.
order_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(r"C:\Users\user\OneDrive\Desktop\Big-data-trendytech\udemy\RetailDB+SalesData\RetailDB SalesData\Orders\part-00000.csv")
)

In [6]:
# Print the first 5 rows as a text table.
order_df.show(n=5)

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|   order_status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
|       4|2013-07-25 00:00:00|       8827|         CLOSED|
|       5|2013-07-25 00:00:00|      11318|       COMPLETE|
+--------+-------------------+-----------+---------------+
only showing top 5 rows



In [7]:
# Print the column names, types and nullability that inferSchema produced.
order_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [8]:
# To rename a column, e.g. changing order_status to Status

In [9]:
# Rename the order_status column to Status; the result is bound to a new variable.
transformed_df = order_df.withColumnRenamed(
    existing="order_status",
    new="Status",
)

In [10]:
# Confirm the rename: the last column should now be listed as Status.
transformed_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- Status: string (nullable = true)



In [11]:
from pyspark.sql.functions import *

In [12]:
# To create a new column, use withColumn

In [13]:
# Add a column named "date" computed from order_date with to_timestamp.
# NOTE(review): the printed schema shows order_date is already a timestamp, so
# the new column has the same type; if a plain date was intended, confirm
# whether to_date should be used instead.
transformed_df1 = transformed_df.withColumn(
    colName="date",
    col=to_timestamp("order_date"),
)

In [14]:
# Confirm the new "date" column was appended after the existing columns.
transformed_df1.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- date: timestamp (nullable = true)



In [15]:
# Creating DataFrames from different file formats such as JSON, ORC and Parquet

In [16]:
# csv() shorthand: loads the same Orders file that order_df was read from,
# again with a header row and inferred column types.
orders_csv = spark.read.csv(
    r"C:\Users\user\OneDrive\Desktop\Big-data-trendytech\udemy\RetailDB+SalesData\RetailDB SalesData\Orders\part-00000.csv",
    header="true",
    inferSchema="true",
)

In [17]:
# Print the first 5 rows of the csv()-loaded DataFrame.
orders_csv.show(n=5)

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|   order_status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
|       4|2013-07-25 00:00:00|       8827|         CLOSED|
|       5|2013-07-25 00:00:00|      11318|       COMPLETE|
+--------+-------------------+-----------+---------------+
only showing top 5 rows



In [18]:
# Load orders.json into a DataFrame; no schema is passed, so Spark derives it
# from the file contents.
orders_json = spark.read.json(
    r"C:\Users\user\OneDrive\Desktop\Big-data-trendytech\orders.json"
)

In [19]:
# Print the schema derived from the JSON file (nested structs under "data").
orders_json.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- company: struct (nullable = true)
 |    |    |-- contactperson: struct (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- phone: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |-- dateoffinish: string (nullable = true)
 |    |-- dateoforder: string (nullable = true)
 |    |-- price: string (nullable = true)



In [20]:
# To filter rows, use where or filter (they are aliases of each other)

In [21]:
# Keep only the rows for customer 11599, using a SQL-expression predicate with
# where(), and print up to 5 of them.
(
    orders_csv
    .where("customer_id = 11599")
    .show(n=5)
)

+--------+-------------------+-----------+------------+
|order_id|         order_date|customer_id|order_status|
+--------+-------------------+-----------+------------+
|       1|2013-07-25 00:00:00|      11599|      CLOSED|
|   11397|2013-10-03 00:00:00|      11599|    COMPLETE|
|   23908|2013-12-20 00:00:00|      11599|    COMPLETE|
|   53545|2014-06-27 00:00:00|      11599|     PENDING|
|   59911|2013-10-17 00:00:00|      11599|  PROCESSING|
+--------+-------------------+-----------+------------+



In [22]:
# Same predicate as the where() cell, written with filter(); print up to 5 rows.
(
    orders_csv
    .filter("customer_id = 11599")
    .show(n=5)
)

+--------+-------------------+-----------+------------+
|order_id|         order_date|customer_id|order_status|
+--------+-------------------+-----------+------------+
|       1|2013-07-25 00:00:00|      11599|      CLOSED|
|   11397|2013-10-03 00:00:00|      11599|    COMPLETE|
|   23908|2013-12-20 00:00:00|      11599|    COMPLETE|
|   53545|2014-06-27 00:00:00|      11599|     PENDING|
|   59911|2013-10-17 00:00:00|      11599|  PROCESSING|
+--------+-------------------+-----------+------------+



In [23]:
# To register a DataFrame as a temporary view that Spark SQL can query

In [24]:
# Register orders_csv under the view name "orders" (replacing any existing
# view of that name) so SQL statements can refer to it.
orders_csv.createOrReplaceTempView(name="orders")

In [27]:
# Run the customer filter as a SQL statement against the "orders" view;
# the result is a DataFrame.
spark_sql_order = spark.sql(
    "select * from orders where customer_id = 11599"
)

In [28]:
# Print up to 5 rows of the SQL query result.
spark_sql_order.show(n=5)

+--------+-------------------+-----------+------------+
|order_id|         order_date|customer_id|order_status|
+--------+-------------------+-----------+------------+
|       1|2013-07-25 00:00:00|      11599|      CLOSED|
|   11397|2013-10-03 00:00:00|      11599|    COMPLETE|
|   23908|2013-12-20 00:00:00|      11599|    COMPLETE|
|   53545|2014-06-27 00:00:00|      11599|     PENDING|
|   59911|2013-10-17 00:00:00|      11599|  PROCESSING|
+--------+-------------------+-----------+------------+



In [29]:
# To convert spark table/view to dataframe

In [30]:
# Load the "orders" view back into a DataFrame.
orders_df = spark.read.table(tableName="orders")

In [31]:
# Show the DataFrame that was just read back from the "orders" view.
# This must be orders_df; order_df is the earlier frame loaded straight from
# the CSV, and displaying it would not exercise spark.read.table at all.
orders_df.show(5)

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|   order_status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
|       4|2013-07-25 00:00:00|       8827|         CLOSED|
|       5|2013-07-25 00:00:00|      11318|       COMPLETE|
+--------+-------------------+-----------+---------------+
only showing top 5 rows



In [32]:
# To register a DataFrame as a global temporary view: it is shared by all sessions of this Spark application (not by other applications) and is queried through the global_temp database

In [33]:
# Register orders_csv as a global temporary view named "orders_global"
# (replacing any existing global view of that name).
orders_csv.createOrReplaceGlobalTempView(name="orders_global")