In [269]:
from pyhive import hive
from pprint import pprint
import pandas as pd
import os
from altair import *

import IPython.display
def draw(spec):
    IPython.display.display({
        'application/vnd.vegalite.v1+json': spec.to_dict()
    }, raw=True)

pd.set_option('display.max_colwidth', -1) # dont truncate table columns
cwd = os.getcwd()
cwd="/data/shared/snap-samples/Redshift"

c = hive.Connection(host="0.0.0.0",port=10000,auth='NOSASL')
pd.read_sql('show tables',c)

def sql(q, explain=False) :
    # silly hack to handle filesystem prefix for us when creating local tables
    if "{prefix}" in q:
        q = q.replace('{prefix}',cwd)
    df=pd.read_sql(q,c)
    return df

def explain(q):
    df = sql("explain " + q)
    plan = df['plan'][0]
    pprint(plan)
    

sql('show tables')

In [221]:
table_names = ["users","venue","category","ddate","event","listing","sales"]
drop = """
drop table if exists {table_name}
"""
for table_name in table_names:
    pd.read_sql(
        drop.format(table_name=table_name)
    ,c) 

In [222]:
def snapsql(sql) :
    df=pd.read_sql(sql,c)
    return df

In [223]:
users = """
create table if not exists users (
	userid integer ,
	username string,
	firstname string,
	lastname string,
	city string,
	state string,
	email string,
	phone string,
	likesports string,
	liketheatre string,
	likeconcerts string,
	likejazz string,
	likeclassical string,
	likeopera string,
	likerock string,
	likevegas string,
	likebroadway string,
	likemusicals string)
    using csv
    options (path "{prefix}/allusers_pipe.txt", delimiter "|")
"""
sql(users)

In [224]:
venue = """
create table if not exists venue(
	venueid integer,
	venuename string,
	venuecity string,
	venuestate string,
	venueseats integer)
    using csv
    options (path "{prefix}/venue_pipe.txt", delimiter "|")
"""
sql(venue)


In [225]:
category="""
create table if not exists category(
	catid integer,
	catgroup string,
	catname string,
	catdesc string)    
    using csv
    options (path "{prefix}/category_pipe.txt", delimiter "|")
"""
sql(category)

In [226]:
ddate = """
create table if not exists ddate(
	dateid integer ,
	caldate date,
	day string,
	week integer,
	month string,
	qtr string,
	year integer,
	holiday string)    
    using csv
    options (path "/data/shared/snap-samples/Redshift/date2008_pipe.txt", delimiter "|")
"""
sql(ddate)

In [227]:
event = """
create table if not exists event(
	eventid integer ,
	venueid integer,
	catid integer,
	dateid integer ,
	eventname string,
	starttime timestamp)    
    using csv
    options (path "{prefix}/allevents_pipe.txt", delimiter "|")
"""
sql(event)

In [228]:
listing = """
create table if not exists listing(
	listid integer  ,
	sellerid integer ,
	eventid integer ,
	dateid integer ,
	numtickets integer ,
	priceperticket decimal(8,2),
	totalprice decimal(8,2),
	listtime timestamp)    
    using csv
    options (path "{prefix}/listings_pipe.txt", delimiter "|")
"""

sql(listing)

In [229]:
sales = """
create table if not exists  sales(
	salesid integer ,
	listid integer,
	sellerid integer,
	buyerid integer,
	eventid integer,
	dateid integer,
	qtysold integer,
	pricepaid decimal(8,2),
	commission decimal(8,2),
	saletime timestamp)
    using csv
    options (
        path "{prefix}/sales_tab.txt"
        ,delimiter "\t"
        ,timestampFormat "d/MM/yyyy HH:mm:ss"
        ,inferSchema "true"
        )
"""
sql(sales)


In [230]:
query1 = """
SELECT sum(qtysold) 
FROM   sales as sales, ddate
WHERE  sales.dateid = ddate.dateid 
AND    ddate.caldate = '2008-01-05'
"""
sql(query1)

In [231]:
query2 = """
SELECT firstname, lastname, total_quantity 
FROM   (SELECT buyerid, sum(qtysold) total_quantity
        FROM  sales
        GROUP BY buyerid
        ORDER BY total_quantity desc limit 10) Q, users
WHERE Q.buyerid = userid
ORDER BY Q.total_quantity desc
"""
sql(query2)

In [234]:
create_star_schema = """alter star schema on sales as
many_to_one join of sales with listing on sales.listid = listing.listid
many_to_one join of sales with event on sales.eventid = event.eventid
many_to_one join of sales with ddate on sales.dateid = ddate.dateid
many_to_one join of sales with users on sales.buyerid = users.userid
many_to_one join of event with category on event.catid = category.catid
many_to_one join of event with venue on event.venueid = venue.venueid
"""
sql(create_star_schema)

In [235]:
q=""" drop olap index salessnap on sales """
sql(q)

In [1]:
salessnap="""

create olap index salessnap on sales
timestamp dimension starttime 
timestamp dimension listtime 
timestamp dimension saletime
timestamp dimension caldate
metric priceperticket aggregator doubleSum is nullable nullvalue "0.0" 
metric totalprice aggregator doubleSum is nullable nullvalue "0.0"
metric numtickets aggregator longSum is nullable nullvalue "0"
metric qtysold aggregator longSum is nullable nullvalue "0"
metric pricepaid aggregator doubleSum is nullable nullvalue "0.0"
metric commission aggregator doubleSum is nullable nullvalue "0.0"
dimension holiday is not nullable 
dimensions "username,city, state, likesports, liketheatre,likeconcerts, likejazz , \
likeclassical, likeopera, likerock, likevegas, likebroadway, likemusicals, \
venuename, venuecity, venuestate, catgroup, catname, catdesc \
, day, week, month, qtr, year, eventname , \
sales.eventid, sales.buyerid, sales.listid, sales.salesid" 

OPTIONS (        
    path "/data/shared/snap-samples/Redshift/snap",
    avgSizePerPartition  "40mb",
    avgNumRowsPerPartition "10000",
    preferredSegmentSize "20mb",
    rowFlushBoundary "10000",
    defaultNullValue "0"
)
"""

sql(salessnap)

NameError: name 'sql' is not defined

In [237]:
insert=""" insert overwrite olap index salessnap of sales """

sql(insert)

In [247]:
q1=""" select count(*) from sales  limit 15 """
sql(q1)

In [248]:
sq1=""" 

SELECT SUM(`sales`.`qtysold`) AS `sum_qtysold_ok`, 
COUNT(1) AS `x__alias__0` 
FROM `default`.`sales` `sales` 
  JOIN `default`.`users` `users` ON (`sales`.`buyerid` = `users`.`userid`) 
  JOIN `default`.`event` `event` ON (`sales`.`eventid` = `event`.`eventid`) 
  JOIN `default`.`listing` `listing` ON (`sales`.`listid` = `listing`.`listid`) 
  JOIN `default`.`ddate` `ddate` ON (`sales`.`dateid` = `ddate`.`dateid`) 
  JOIN `default`.`category` `category` ON (`event`.`catid` = `category`.`catid`) 
  JOIN `default`.`venue` `venue` ON (`event`.`venueid` = `venue`.`venueid`) 
  HAVING (COUNT(1) > 0)

"""

In [249]:
sql(sq1)

In [250]:
t10="""
select  sales.eventid, sum(sales.pricepaid) 
from sales, event
where sales.eventid = event.eventid
and sales.pricepaid > 30
group by sales.eventid
order by 2 desc
limit 10
"""
snapsql(t10)

In [251]:
sportsandjazz = """
create or replace view sportsandjazz as 
select * from salessnap where likesports="TRUE" and likejazz="TRUE"
"""
snapsql(sportsandjazz)

In [2]:

q="""
select count(*) from sportsandjazz
"""
snapsql(q)

NameError: name 'snapsql' is not defined

In [170]:
snapsql(""" select count(*) from users """)

In [278]:
q="""
with allusers AS ( 
select caldate adate, city,sum(qtysold) q, sum(pricepaid) p 
from salessnap group by caldate,city)
,
someusers AS (
select caldate sdate,city, sum(qtysold) a, sum(pricepaid) b 
from salessnap where likeconcerts='TRUE' AND likejazz='TRUE' group by caldate,city)

select adate,allusers.city, a, b, round(a/q,2)*100 qratio , round(b/p,2)*100 pratio
from allusers, someusers where adate=sdate order by pratio desc limit 5000
"""


In [279]:
df=sql(q)

DatabaseError: Execution failed on sql: 
with allusers AS ( 
select caldate adate, city,sum(qtysold) q, sum(pricepaid) p 
from salessnap group by caldate,city)
,
someusers AS (
select caldate sdate,city, sum(qtysold) a, sum(pricepaid) b 
from salessnap where likeconcerts='TRUE' AND likejazz='TRUE' group by caldate,city)

select adate,allusers.city, a, b, round(a/q,2)*100 qratio , round(b/p,2)*100 pratio
from allusers, someusers where adate=sdate order by pratio desc 

TSocket read 0 bytes
unable to rollback

In [281]:
df.columns

Index([u'adate', u'a', u'b', u'qratio', u'pratio'], dtype='object')

In [259]:
df[['a','b','qratio','pratio']].describe()

In [275]:
a=Chart(df).mark_bar().encode(x=X('year(a)',
  bin=Bin(maxbins=10)),y='count(*)')
draw(a)

In [None]:
a=Chart(df).mark_line().encode(color=)