In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port','0'). \
config("spark.sql.warehouse.dir",f"/user/itv016269/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

## Created a database

In [2]:
spark.sql("create database if not exists itv016269_databases")

In [3]:
groceries_schema='order_id string, location string,item string, order_date date , quantity int'

In [4]:
grocreies_df=spark.read \
.format('csv') \
.option("header","true") \
.option("dateFormat","dd/mm/yyyy") \
.schema(groceries_schema) \
.load("/public/trendytech/groceries.csv")

In [5]:
grocreies_df.createOrReplaceTempView("groceries")

In [44]:
! hadoop fs -head /public/trendytech/groceries.csv

order_id,location,item,order_date,quantity
o1,Seattle,Bananas,01/01/2017,7
o2,Kent,Apples,02/01/2017,20
o3,Bellevue,Flowers,02/01/2017,10
o4,Redmond,Meat,03/01/2017,40
o5,Seattle,Potatoes,04/01/2017,9
o6,Bellevue,Bread,04/01/2017,5
o7,Redmond,Bread,05/01/2017,5
o8,Issaquah,Onion,05/01/2017,4
o9,Redmond,Cheese,05/01/2017,15
o10,Issaquah,Onion,06/01/2017,4
o11,Renton,Bread,05/01/2017,5
o12,Issaquah,Onion,07/01/2017,4
o13,Sammamish,Bread,07/01/2017,5
o14,Issaquah,Tomato,07/01/2017,6
o15,Issaquah,Meat,08/01/2017,3
o16,Issaquah,Meat,09/01/2017,5
o17,Issaquah,Meat,10/01/2017,6
o18,Bellevue,Bread,11/01/2017,7
o19,Bellevue,Bread,12/01/2017,54
o20,Bellevue,Bread,13/01/2017,34
o21,Bellevue,Bread,14/01/2017,25


In [6]:
grocreies_df.show()

+--------+---------+--------+----------+--------+
|order_id| location|    item|order_date|quantity|
+--------+---------+--------+----------+--------+
|      o1|  Seattle| Bananas|2017-01-01|       7|
|      o2|     Kent|  Apples|2017-01-02|      20|
|      o3| Bellevue| Flowers|2017-01-02|      10|
|      o4|  Redmond|    Meat|2017-01-03|      40|
|      o5|  Seattle|Potatoes|2017-01-04|       9|
|      o6| Bellevue|   Bread|2017-01-04|       5|
|      o7|  Redmond|   Bread|2017-01-05|       5|
|      o8| Issaquah|   Onion|2017-01-05|       4|
|      o9|  Redmond|  Cheese|2017-01-05|      15|
|     o10| Issaquah|   Onion|2017-01-06|       4|
|     o11|   Renton|   Bread|2017-01-05|       5|
|     o12| Issaquah|   Onion|2017-01-07|       4|
|     o13|Sammamish|   Bread|2017-01-07|       5|
|     o14| Issaquah|  Tomato|2017-01-07|       6|
|     o15| Issaquah|    Meat|2017-01-08|       3|
|     o16| Issaquah|    Meat|2017-01-09|       5|
|     o17| Issaquah|    Meat|2017-01-10|       6|


# Creating a Managed Table

In [7]:
spark.sql("create table itv016269_databases.groceries(order_id string, location string, item string, order_date date, quantity int) using csv")

In [8]:
spark.sql("describe table itv016269_databases.groceries").show()

+----------+---------+-------+
|  col_name|data_type|comment|
+----------+---------+-------+
|  order_id|   string|   null|
|  location|   string|   null|
|      item|   string|   null|
|order_date|     date|   null|
|  quantity|      int|   null|
+----------+---------+-------+



In [9]:
spark.sql("insert into itv016269_databases.groceries select * from groceries")

In [10]:
spark.sql("select * from itv016269_databases.groceries").show()

+--------+---------+--------+----------+--------+
|order_id| location|    item|order_date|quantity|
+--------+---------+--------+----------+--------+
|      o1|  Seattle| Bananas|2017-01-01|       7|
|      o2|     Kent|  Apples|2017-01-02|      20|
|      o3| Bellevue| Flowers|2017-01-02|      10|
|      o4|  Redmond|    Meat|2017-01-03|      40|
|      o5|  Seattle|Potatoes|2017-01-04|       9|
|      o6| Bellevue|   Bread|2017-01-04|       5|
|      o7|  Redmond|   Bread|2017-01-05|       5|
|      o8| Issaquah|   Onion|2017-01-05|       4|
|      o9|  Redmond|  Cheese|2017-01-05|      15|
|     o10| Issaquah|   Onion|2017-01-06|       4|
|     o11|   Renton|   Bread|2017-01-05|       5|
|     o12| Issaquah|   Onion|2017-01-07|       4|
|     o13|Sammamish|   Bread|2017-01-07|       5|
|     o14| Issaquah|  Tomato|2017-01-07|       6|
|     o15| Issaquah|    Meat|2017-01-08|       3|
|     o16| Issaquah|    Meat|2017-01-09|       5|
|     o17| Issaquah|    Meat|2017-01-10|       6|


# Creating an External Table

In [11]:
spark.sql("create table itv016269_databases.groceries_ext(order_id string, location string, item string, order_date date, quantity int) using csv location '/public/trendytech/groceries.csv'")

In [12]:
spark.sql("describe table itv016269_databases.groceries_ext").show()

+----------+---------+-------+
|  col_name|data_type|comment|
+----------+---------+-------+
|  order_id|   string|   null|
|  location|   string|   null|
|      item|   string|   null|
|order_date|     date|   null|
|  quantity|      int|   null|
+----------+---------+-------+



In [13]:
spark.sql("use itv016269_databases")
spark.sql("show tables").show()

+-------------------+---------------+-----------+
|           database|      tableName|isTemporary|
+-------------------+---------------+-----------+
|itv016269_databases|cust_transf_ext|      false|
|itv016269_databases|      groceries|      false|
|itv016269_databases|  groceries_ext|      false|
|                   |      groceries|       true|
+-------------------+---------------+-----------+



In [14]:
spark.sql("drop table itv016269_databases.groceries")

In [15]:
spark.sql("drop table itv016269_databases.groceries_ext")

In [16]:
spark.sql("show tables").show()

+-------------------+---------------+-----------+
|           database|      tableName|isTemporary|
+-------------------+---------------+-----------+
|itv016269_databases|cust_transf_ext|      false|
|                   |      groceries|       true|
+-------------------+---------------+-----------+



Since both managed and external tables are dropped, there is only the temporary table existing in this database. Managed tables create a table path in the above mentioned path while creating the spark session. When the managed table is dropped, the file also gets deleted from the path. External table uses the path location and fetches data from there and when the table is dropped, only the metadata gets deleted but the file still exists in the dervied path.

In [17]:
orders_schema='customer_id double, order_date date, order_id int, order_status string'

In [18]:
orders_df=spark.read \
.format('json') \
.option("header","true") \
.schema(orders_schema) \
.load("/public/trendytech/orders_wh.json/part-00000-68544d18-9a34-443f-bf0e-1dd8103ff94e-c000.json")

In [19]:
orders_df.show()

+-----------+----------+--------+---------------+
|customer_id|order_date|order_id|   order_status|
+-----------+----------+--------+---------------+
|    11599.0|2013-07-25|       1|         CLOSED|
|      256.0|2013-07-25|       2|PENDING_PAYMENT|
|    12111.0|2013-07-25|       3|       COMPLETE|
|     8827.0|2013-07-25|       4|         CLOSED|
|    11318.0|2013-07-25|       5|       COMPLETE|
|     7130.0|2013-07-25|       6|       COMPLETE|
|     4530.0|2013-07-25|       7|       COMPLETE|
|     2911.0|2013-07-25|       8|     PROCESSING|
|     5657.0|2013-07-25|       9|PENDING_PAYMENT|
|     5648.0|2013-07-25|      10|PENDING_PAYMENT|
|      918.0|2013-07-25|      11| PAYMENT_REVIEW|
|     1837.0|2013-07-25|      12|         CLOSED|
|     9149.0|2013-07-25|      13|PENDING_PAYMENT|
|     9842.0|2013-07-25|      14|     PROCESSING|
|     2568.0|2013-07-25|      15|       COMPLETE|
|     7276.0|2013-07-25|      16|PENDING_PAYMENT|
|     2667.0|2013-07-25|      17|       COMPLETE|


In [20]:
products_schema='ProductID int, Category string, ProductName string, Description string, Price float, ImageURL string'

In [21]:
products_df=spark.read \
.format('csv') \
.option("header","true") \
.schema(products_schema) \
.load("/public/trendytech/retail_db/products")

In [22]:
products_df.show()

+---------+--------+--------------------+-----------+------+--------------------+
|ProductID|Category|         ProductName|Description| Price|            ImageURL|
+---------+--------+--------------------+-----------+------+--------------------+
|        2|       2|Under Armour Men'...|       null|129.99|http://images.acm...|
|        3|       2|Under Armour Men'...|       null| 89.99|http://images.acm...|
|        4|       2|Under Armour Men'...|       null| 89.99|http://images.acm...|
|        5|       2|Riddell Youth Rev...|       null|199.99|http://images.acm...|
|        6|       2|Jordan Men's VI R...|       null|134.99|http://images.acm...|
|        7|       2|Schutt Youth Recr...|       null| 99.99|http://images.acm...|
|        8|       2|Nike Men's Vapor ...|       null|129.99|http://images.acm...|
|        9|       2|Nike Adult Vapor ...|       null|  50.0|http://images.acm...|
|       10|       2|Under Armour Men'...|       null|129.99|http://images.acm...|
|       11|     

In [23]:
products_df.createOrReplaceTempView("products")

In [24]:
spark.sql("select count(*) as no_of_products from products").show()

+--------------+
|no_of_products|
+--------------+
|          1344|
+--------------+



In [32]:
spark.sql("select distinct(count(category)) as no_of_categories from products group by category ").show()

+----------------+
|no_of_categories|
+----------------+
|               5|
|              48|
|              23|
|              20|
|              24|
+----------------+



In [40]:
spark.sql("select category, productName, price, ImageURL from products order by price desc limit 5").show()

+--------+--------------------+-------+--------------------+
|category|         productName|  price|            ImageURL|
+--------+--------------------+-------+--------------------+
|      10| SOLE E35 Elliptical|1999.99|http://images.acm...|
|       4|  SOLE F85 Treadmill|1799.99|http://images.acm...|
|      10|  SOLE F85 Treadmill|1799.99|http://images.acm...|
|      22|  SOLE F85 Treadmill|1799.99|http://images.acm...|
|      47|"Spalding Beast 6...|1099.99|http://images.acm...|
+--------+--------------------+-------+--------------------+



In [42]:
spark.sql("select category, count(*) as no_of_products from products where price > 100 group by category").show()

+--------+--------------+
|category|no_of_products|
+--------+--------------+
|       7|             6|
|      51|             7|
|      54|             6|
|      11|            19|
|      29|             9|
|      42|             4|
|       3|             5|
|      30|            17|
|      34|            15|
|       8|             5|
|      22|             4|
|      16|            11|
|      35|            15|
|      52|             5|
|      47|            10|
|      43|            23|
|       5|            11|
|      31|            17|
|      18|            14|
|      17|             8|
+--------+--------------+
only showing top 20 rows



In [43]:
spark.sql("select ProductName, Price from products where price >200 and category =5 ").show()

+--------------------+------+
|         ProductName| Price|
+--------------------+------+
|"Goaliath 54"" In...|499.99|
|Fitness Gear 300 ...|209.99|
|Teeter Hang Ups N...|299.99|
+--------------------+------+



In [21]:
customers_schema='cust_id int, cust_fname string, cust_lname string, cust_email string, cust_password string, cust_street string, cust_city string, cust_state string, cust_zipcode int'

In [22]:
customers_df=spark.read \
.format('csv') \
.option("header","true") \
.schema(customers_schema) \
.load("/public/trendytech/retail_db/customers")

In [23]:
customers_df.show()

+-------+-----------+----------+----------+-------------+--------------------+-------------+----------+------------+
|cust_id| cust_fname|cust_lname|cust_email|cust_password|         cust_street|    cust_city|cust_state|cust_zipcode|
+-------+-----------+----------+----------+-------------+--------------------+-------------+----------+------------+
|      2|       Mary|   Barrett| XXXXXXXXX|    XXXXXXXXX|9526 Noble Embers...|    Littleton|        CO|       80126|
|      3|        Ann|     Smith| XXXXXXXXX|    XXXXXXXXX|3422 Blue Pioneer...|       Caguas|        PR|         725|
|      4|       Mary|     Jones| XXXXXXXXX|    XXXXXXXXX|  8324 Little Common|   San Marcos|        CA|       92069|
|      5|     Robert|    Hudson| XXXXXXXXX|    XXXXXXXXX|10 Crystal River ...|       Caguas|        PR|         725|
|      6|       Mary|     Smith| XXXXXXXXX|    XXXXXXXXX|3151 Sleepy Quail...|      Passaic|        NJ|        7055|
|      7|    Melissa|    Wilcox| XXXXXXXXX|    XXXXXXXXX|9453 Hi

Customers by state

In [24]:
customers_df.groupBy('cust_state').count().show()

+----------+-----+
|cust_state|count|
+----------+-----+
|        AZ|  213|
|        SC|   41|
|        LA|   63|
|        MN|   39|
|        NJ|  219|
|        DC|   42|
|        OR|  119|
|        VA|  136|
|        RI|   15|
|        KY|   35|
|        MI|  254|
|        NV|  103|
|        WI|   64|
|        ID|    9|
|        CA| 2012|
|        CT|   73|
|        MT|    7|
|        NC|  150|
|        MD|  164|
|        DE|   23|
+----------+-----+
only showing top 20 rows



In [30]:
from pyspark.sql.functions import *

###5 customers with the most common last name

In [32]:
grouped_df = customers_df.groupBy('cust_lname').agg(count('cust_lname').alias("count")).orderBy(col('count').desc()).limit(5).show()

+----------+-----+
|cust_lname|count|
+----------+-----+
|     Smith| 4626|
|   Johnson|   76|
|  Williams|   69|
|     Jones|   65|
|     Brown|   62|
+----------+-----+



Customers with invalid zipcode

In [34]:
customers_df.filter(length(col('cust_zipcode')) != 5).show()

+-------+----------+----------+----------+-------------+--------------------+-------------+----------+------------+
|cust_id|cust_fname|cust_lname|cust_email|cust_password|         cust_street|    cust_city|cust_state|cust_zipcode|
+-------+----------+----------+----------+-------------+--------------------+-------------+----------+------------+
|      3|       Ann|     Smith| XXXXXXXXX|    XXXXXXXXX|3422 Blue Pioneer...|       Caguas|        PR|         725|
|      5|    Robert|    Hudson| XXXXXXXXX|    XXXXXXXXX|10 Crystal River ...|       Caguas|        PR|         725|
|      6|      Mary|     Smith| XXXXXXXXX|    XXXXXXXXX|3151 Sleepy Quail...|      Passaic|        NJ|        7055|
|      7|   Melissa|    Wilcox| XXXXXXXXX|    XXXXXXXXX|9453 High Concession|       Caguas|        PR|         725|
|      8|     Megan|     Smith| XXXXXXXXX|    XXXXXXXXX|3047 Foggy Forest...|     Lawrence|        MA|        1841|
|      9|      Mary|     Perez| XXXXXXXXX|    XXXXXXXXX| 3616 Quaking St

Number of customers with valid zipcode

In [40]:
customers_df.filter(length(col('cust_zipcode')) == 5).count()

7243

Number of customers in each city of California

In [26]:
customers_df.filter("cust_state == 'CA'").groupBy('cust_city').count().show()

+-------------+-----+
|    cust_city|count|
+-------------+-----+
|       Corona|   14|
|    Pittsburg|    4|
|      Compton|   19|
|    Palo Alto|    6|
|      Hanford|    9|
|      Anaheim|   19|
|       Folsom|    6|
|         Napa|    8|
|     Temecula|    6|
|       Reseda|    6|
|    Encinitas|   17|
|    Oceanside|   24|
|    Cupertino|    9|
|      Oakland|    3|
|        Davis|    9|
|      Fontana|   18|
|Mission Viejo|   26|
|       Madera|    5|
|    Elk Grove|   10|
|  Bakersfield|   41|
+-------------+-----+
only showing top 20 rows

