# A demo notebook : Sample ecommerce dataset from AWS

## This demo illustrates the following 

* Defining spark external tables on data 
* Defining the star schema between the data tables 
* Loading a set of tables into SNAP ( Indexing Process)
* Sample queries on SNAP
NOTE: This example uses a Jupyter notebook, but you can use any SQL client, such as SQuirreL, to connect to SNAP and set it up.

### First, let us set up the notebook

In [None]:
from pyhive import hive
from pprint import pprint
import pandas as pd
import os
def sql(q, explain=False, conn=None):
    """Run a SQL statement and return the result as a pandas DataFrame.

    Parameters
    ----------
    q : str
        SQL text. Every ``{prefix}`` placeholder is replaced with the data
        directory before the statement is sent.
    explain : bool, optional
        Accepted for backward compatibility; it is not used.
    conn : DB-API connection, optional
        Connection to execute on. When omitted, the notebook-level
        connection ``c`` is used.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pandas.read_sql`` for the statement.
    """
    # silly hack to handle filesystem prefix for us when creating local tables
    if "{prefix}" in q:
        # The notebook never defines `cwd`, so the original lookup raised
        # NameError. Honour a notebook-level `cwd` if one exists, otherwise
        # fall back to the current working directory.
        prefix = globals().get("cwd") or os.getcwd()
        q = q.replace('{prefix}', prefix)
    if conn is None:
        conn = c
    return pd.read_sql(q, conn)

def explain(q):
    """Pretty-print the plan reported for query ``q``.

    The query is prefixed with ``explain `` and executed through ``sql``;
    the first value of the result's ``plan`` column is printed with pprint.
    """
    plan_frame = sql("explain " + q)
    pprint(plan_frame['plan'][0])
    


### Connect to SNAP

In [None]:
# Open the connection to the SNAP Thrift server; `c` is the notebook-wide
# connection used by the cells below.
c = hive.Connection(port=10000, host="0.0.0.0")
# Last expression of the cell: list the currently registered tables.
pd.read_sql(sql='show tables', con=c)

## Drop all tables and recreate them

Note: these are external tables, so dropping them deletes only the metadata, not the underlying data.

In [None]:
# Every table this notebook defines; each one is dropped (if present) so the
# cells below can recreate it from scratch.
table_names = ["users","venue","category","ddate","event","listing","sales"]
drop = """
drop table if exists {table_name}
"""
for table_name in table_names:
    statement = drop.format(table_name=table_name)
    pd.read_sql(statement, c)

### External table : Users

In [None]:
# DDL for the `users` table: a CSV-backed table over the pipe-delimited file
# allusers_pipe.txt. The {prefix} placeholder is substituted by sql() before
# the statement is sent.
users = """
create table if not exists users (
	userid integer ,
	username string,
	firstname string,
	lastname string,
	city string,
	state string,
	email string,
	phone string,
	likesports string,
	liketheatre string,
	likeconcerts string,
	likejazz string,
	likeclassical string,
	likeopera string,
	likerock string,
	likevegas string,
	likebroadway string,
	likemusicals string)
    using csv
    options (path "{prefix}/allusers_pipe.txt", delimiter "|")
"""
sql(users)

### External table : Venue

In [None]:
# DDL for the `venue` table: a CSV-backed table over the pipe-delimited file
# venue_pipe.txt. The {prefix} placeholder is substituted by sql().
venue = """
create table if not exists venue(
	venueid integer,
	venuename string,
	venuecity string,
	venuestate string,
	venueseats integer)
    using csv
    options (path "{prefix}/venue_pipe.txt", delimiter "|")
"""
sql(venue)


### External table : Category

In [None]:
# DDL for the `category` table: a CSV-backed table over the pipe-delimited
# file category_pipe.txt. The {prefix} placeholder is substituted by sql().
category="""
create table if not exists category(
	catid integer,
	catgroup string,
	catname string,
	catdesc string)    
    using csv
    options (path "{prefix}/category_pipe.txt", delimiter "|")
"""
sql(category)

### External table: Date

In [None]:
# DDL for the `ddate` (calendar date) table: a CSV-backed table over the
# pipe-delimited file date2008_pipe.txt.
# NOTE(review): unlike the other table cells, the path here is absolute
# rather than built from {prefix} -- confirm this is intentional, since it
# ties the notebook to one machine's directory layout.
ddate = """
create table if not exists ddate(
	dateid integer ,
	caldate date,
	day string,
	week integer,
	month string,
	qtr string,
	year integer,
	holiday string)    
    using csv
    options (path "/data/shared/snap-samples/Redshift/date2008_pipe.txt", delimiter "|")
"""
sql(ddate)

### External table : Event

In [None]:
# DDL for the `event` table: a CSV-backed table over the pipe-delimited file
# allevents_pipe.txt. The {prefix} placeholder is substituted by sql().
event = """
create table if not exists event(
	eventid integer ,
	venueid integer,
	catid integer,
	dateid integer ,
	eventname string,
	starttime timestamp)    
    using csv
    options (path "{prefix}/allevents_pipe.txt", delimiter "|")
"""
sql(event)

### External table: Listing

In [None]:
# DDL for the `listing` table: a CSV-backed table over the pipe-delimited
# file listings_pipe.txt. The {prefix} placeholder is substituted by sql().
listing = """
create table if not exists listing(
	listid integer  ,
	sellerid integer ,
	eventid integer ,
	dateid integer ,
	numtickets integer ,
	priceperticket decimal(8,2),
	totalprice decimal(8,2),
	listtime timestamp)    
    using csv
    options (path "{prefix}/listings_pipe.txt", delimiter "|")
"""

sql(listing)

### External table: Sales

In [None]:
# DDL for the `sales` fact table: a CSV-backed table over the tab-delimited
# file sales_tab.txt. The {prefix} placeholder is substituted by sql().
#
# In this non-raw Python string the "\t" escape becomes a literal tab
# character before the statement is sent.
#
# timestampFormat: the AWS TICKIT sales file stores saletime month-first
# (e.g. "2/18/2008 02:36:48"; Redshift loads it with MM/DD/YYYY HH:MI:SS).
# The previous day-first pattern "d/MM/yyyy" swapped day and month, so the
# pattern is now "M/d/yyyy HH:mm:ss".
sales = """
create table if not exists  sales(
	salesid integer ,
	listid integer,
	sellerid integer,
	buyerid integer,
	eventid integer,
	dateid integer,
	qtysold integer,
	pricepaid decimal(8,2),
	commission decimal(8,2),
	saletime timestamp)
    using csv
    options (
        path "{prefix}/sales_tab.txt"
        ,delimiter "\t"
        ,timestampFormat "M/d/yyyy HH:mm:ss"
        ,inferSchema "true"
        )
"""
sql(sales)


### Define the star schema in SNAP
The star schema in SNAP establishes the join graph between the external tables we defined above

In [None]:
# Declares the join graph rooted at `sales`: sales joins many-to-one to
# listing, event, ddate and users, and event joins many-to-one to category
# and venue.
# NOTE(review): the variable is named create_star_schema but the statement
# starts with `alter star schema` -- confirm which form is intended when the
# schema does not exist yet.
create_star_schema = """alter star schema on sales as
many_to_one join of sales with listing on sales.listid = listing.listid
many_to_one join of sales with event on sales.eventid = event.eventid
many_to_one join of sales with ddate on sales.dateid = ddate.dateid
many_to_one join of sales with users on sales.buyerid = users.userid
many_to_one join of event with category on event.catid = category.catid
many_to_one join of event with venue on event.venueid = venue.venueid
"""
sql(create_star_schema)

### Define the SNAP Index 

In [None]:
# Drop the existing `salessnap` OLAP index on `sales` before it is redefined.
# NOTE(review): no existence guard is visible in the statement; presumably
# this fails when the index has not been created yet -- confirm.
salessnap="""
drop olap index salessnap on sales
"""
sql(salessnap)

#### The index is created with the name "salessnap" and is defined on the star schema "sales" declared in the star schema step above.

#### Dimensions and Metrics
Dimensions are columns that you want to filter, select, or group by.
Metrics are typically numeric fields (for example float or decimal) that you want to aggregate.
Timestamp dimensions are a special kind of dimension defined on Spark date or timestamp columns.


In [None]:
# Definition of the `salessnap` OLAP index on `sales`: four timestamp
# dimensions, six summed metrics with explicit null values, and a list of
# plain dimensions.
#
# The trailing backslashes inside the dimensions list are Python line
# continuations (the string is not raw), so the quoted list reaches the
# server as one line.
#
# NOTE(review): the index path in OPTIONS is absolute -- confirm it exists
# on the machine running SNAP.
salessnap="""

create olap index salessnap on sales
timestamp dimension starttime 
timestamp dimension listtime 
timestamp dimension saletime
timestamp dimension caldate
metric priceperticket aggregator doubleSum is nullable nullvalue "0.0" 
metric totalprice aggregator doubleSum is nullable nullvalue "0.0"
metric numtickets aggregator longSum is nullable nullvalue "0"
metric qtysold aggregator longSum is nullable nullvalue "0"
metric pricepaid aggregator doubleSum is nullable nullvalue "0.0"
metric commission aggregator doubleSum is nullable nullvalue "0.0"
dimension holiday is not nullable 
dimensions "username,city, state, likesports, liketheatre,likeconcerts, likejazz , \
likeclassical, likeopera, likerock, likevegas, likebroadway, likemusicals, \
venuename, venuecity, venuestate, catgroup, catname, catdesc \
, day, week, month, qtr, year, eventname , \
sales.eventid, sales.buyerid, sales.listid, sales.salesid" 

OPTIONS (        
    path "/data/shared/snap-samples/Redshift/snap",
    avgSizePerPartition  "40mb",
    avgNumRowsPerPartition "10000",
    preferredSegmentSize "20mb",
    rowFlushBoundary "10000",
    defaultNullValue "0"
)
"""

sql(salessnap)

### Load data into SNAP

In [None]:
# Issue the `insert overwrite olap index` statement for the salessnap index
# of `sales` (the load step named in this section's heading).
insert=""" insert overwrite olap index salessnap of sales """

sql(insert)

### Query 1

Note: after loading data into SNAP, you can still query the original Spark external tables (here `sales` and `ddate`) using their join keys. At runtime SNAP rewrites the query to use the SNAP index instead of executing the joins on the source data.

In [None]:
# Total quantity sold on 2008-01-05, written against the source tables
# `sales` and `ddate` joined on dateid.
query1 = """
SELECT sum(qtysold) 
FROM   sales as sales, ddate
WHERE  sales.dateid = ddate.dateid 
AND    ddate.caldate = '2008-01-05'
"""
sql(query1)

In [None]:
# Print the plan reported for query1.
explain(query1)

In [None]:
# Top 10 buyers by total quantity purchased: the subquery aggregates `sales`
# per buyerid and keeps the 10 largest totals, then joins to `users` for the
# buyer's first and last name.
query2 = """
SELECT firstname, lastname, total_quantity 
FROM   (SELECT buyerid, sum(qtysold) total_quantity
        FROM  sales
        GROUP BY buyerid
        ORDER BY total_quantity desc limit 10) Q, users
WHERE Q.buyerid = userid
ORDER BY Q.total_quantity desc
"""
sql(query2)

In [None]:
# Aggregate over the full join graph (sales joined to users, event, listing,
# ddate, category and venue), using explicit JOIN syntax with
# backtick-quoted, schema-qualified identifiers. Returns the summed qtysold
# and a row count, kept only when the count is positive.
sq1=""" 

SELECT SUM(`sales`.`qtysold`) AS `sum_qtysold_ok`, 
COUNT(1) AS `x__alias__0` 
FROM `default`.`sales` `sales` 
  JOIN `default`.`users` `users` ON (`sales`.`buyerid` = `users`.`userid`) 
  JOIN `default`.`event` `event` ON (`sales`.`eventid` = `event`.`eventid`) 
  JOIN `default`.`listing` `listing` ON (`sales`.`listid` = `listing`.`listid`) 
  JOIN `default`.`ddate` `ddate` ON (`sales`.`dateid` = `ddate`.`dateid`) 
  JOIN `default`.`category` `category` ON (`event`.`catid` = `category`.`catid`) 
  JOIN `default`.`venue` `venue` ON (`event`.`venueid` = `venue`.`venueid`) 
  HAVING (COUNT(1) > 0)

"""
sql(sq1)

In [None]:
# Top 10 events by total price paid, counting only sales with
# pricepaid > 30; `order by 2` sorts on the second select column (the sum).
t10="""
select  sales.eventid, sum(sales.pricepaid) 
from sales, event
where sales.eventid = event.eventid
and sales.pricepaid > 30
group by sales.eventid
order by 2 desc
limit 10
"""



sql(t10)




### Create a derived view representing a segment of users

In [None]:
# Create (or replace) the view `sportsandjazz`: rows of `salessnap` whose
# likesports and likejazz values are both "TRUE".
sportsandjazz = """
create or replace view sportsandjazz as 
select * from salessnap where likesports="TRUE" and likejazz="TRUE"
"""
sql(sportsandjazz)

In [None]:

# Row count of the sportsandjazz view.
q="""
select count(*) from sportsandjazz
"""
sql(q)

### Compare metrics for all users vs a segment of users

In [None]:
# Per (date, city) comparison of a user segment against all users.
#   allusers  : quantity (q) and revenue (p) for every buyer
#   someusers : quantity (a) and revenue (b) for buyers who like both
#               concerts and jazz
# qratio / pratio are the segment's percentage share of quantity / revenue.
#
# Both CTEs are grouped by (caldate, city), so the join must match on both
# keys. Joining on the date alone paired every segment city with every
# all-users city on the same day and produced meaningless ratios.
q="""
with allusers AS ( 
select caldate adate, city,sum(qtysold) q, sum(pricepaid) p 
from salessnap group by caldate,city)
,
someusers AS (
select caldate sdate,city, sum(qtysold) a, sum(pricepaid) b 
from salessnap where likeconcerts='TRUE' AND likejazz='TRUE' group by caldate,city)

select adate,allusers.city, a, b, round(a/q,2)*100 qratio , round(b/p,2)*100 pratio
from allusers, someusers where adate=sdate and allusers.city=someusers.city
order by pratio desc limit 5000
"""


In [None]:
# Execute the statement currently held in `q` and keep the result frame for
# the pandas cells that follow.
df=sql(q)

In [None]:
# Show the column labels of the result frame.
df.columns

### Combine results from SNAP with Pandas for descriptive analysis

In [None]:
# Summary statistics for the columns a, b, qratio and pratio of the result
# frame.
df[['a','b','qratio','pratio']].describe()

In [None]:
# Send the `export model` statement for `sales`, with '/tmp/a' as the target.
# NOTE(review): the target path is hardcoded -- confirm it is writable where
# SNAP runs.
sql(""" export model on sales to '/tmp/a' """)
