In [None]:
!pip install -U -r requirements.txt

In [None]:
import os 
import prestodb 
import pandas 
import sqlalchemy

In [None]:
# Presto connection settings, taken from the process environment.
# Each value is a string, or None when the variable is not set.
PRESTO_SERVER = os.getenv('PRESTO_SERVER')
PRESTO_USER = os.getenv('PRESTO_USER')
PRESTO_PORT = os.getenv('PRESTO_PORT')
PRESTO_CATALOG = os.getenv('PRESTO_CATALOG')

In [None]:
# Fail early with a readable message when a required setting is absent;
# otherwise the literal text "None" would be formatted into the URL and the
# failure would only surface later as an obscure connection error.
_missing = [
    name
    for name, value in (
        ("PRESTO_SERVER", PRESTO_SERVER),
        ("PRESTO_USER", PRESTO_USER),
        ("PRESTO_CATALOG", PRESTO_CATALOG),
    )
    if not value
]
if _missing:
    raise RuntimeError(
        "Missing required environment variable(s): %s" % ", ".join(_missing)
    )

# Honour PRESTO_PORT when it is set (it arrives as a string from the
# environment); fall back to the previously hard-coded port 80 otherwise.
presto_port = int(PRESTO_PORT) if PRESTO_PORT else 80

# SQLAlchemy engine used by every query cell: presto://user@host:port/catalog
engine = sqlalchemy.create_engine(
    "presto://%s@%s:%d/%s" % (PRESTO_USER, PRESTO_SERVER, presto_port, PRESTO_CATALOG)
)

# Connectivity check: list the catalogs visible through this engine.
pandas.read_sql('SHOW CATALOGS', engine)

In [None]:
# Total purchase amount per customer.
# Joins the customer table ("customer-domain") to the transactions table
# ("finance-domain") on customer id and sums o.amount for each customer,
# ordered from the largest total to the smallest.  Because this is an inner
# JOIN, customers without any matching transaction do not appear.
sql = '''

select 
    c.id, 
    c.customername, 
    c.customeraddr, 
    c.mktsegment, 
    c.status, 
    SUM(o.amount) as sum_purchased
FROM "customer-domain".public.customer c 
JOIN "finance-domain".public.transactions o ON c.id = o.customerid
GROUP BY c.id, c.customername, c.customeraddr, c.mktsegment, c.status
ORDER BY sum_purchased DESC

'''

# Run the query through the shared engine and preview the first rows.
df = pandas.read_sql(sql, engine)
df.head()

In [None]:
# Clean and extract the JSON messages on the Kafka queue first, then LEFT JOIN
# them to the customer and finance domains.
#
# The messages are aggregated to ONE row per customer id (CTE `messages`)
# before the join.  Joining the raw per-message rows to the transactions
# multiplies every transaction by the number of messages for that customer,
# which inflates SUM(o.amount); grouping by the message text additionally
# splits each customer into one row per distinct text.  Pre-aggregating avoids
# both, so the result is one row per customer with an un-inflated total.
sql = '''

WITH kafka AS (
    SELECT CAST(JSON_EXTRACT(_message, '$.customer_number') AS INTEGER) AS id,
           JSON_EXTRACT(_message, '$.txt') AS txt
    FROM "notebook-test"
),
messages AS (
    SELECT id,
           COUNT(txt) AS message_count
    FROM kafka
    GROUP BY id
)

SELECT
    c.id,
    c.customername,
    c.mktsegment,
    SUM(o.amount) AS sum_purchased,
    COALESCE(MAX(m.message_count), 0) AS message_count
FROM "customer-domain".public.customer c
JOIN "finance-domain".public.transactions o ON c.id = o.customerid
LEFT JOIN messages m ON o.customerid = m.id
GROUP BY c.id, c.customername, c.mktsegment
ORDER BY sum_purchased DESC

'''

# `messages` has at most one row per id, so MAX() above merely carries that
# single count through the GROUP BY; COALESCE keeps 0 for customers that have
# no messages (the LEFT JOIN yields NULL for them).
df = pandas.read_sql(sql, engine)
df.head()

In [None]:
# Use SQL to build features across domains without making extracts or copies.
#
# Output: one row per active customer (c.status > 0) with tenure in days,
# purchase statistics (max / min / mean / count / sum of o.amount) and the
# number of Kafka messages for that customer.
#
# The Kafka messages are aggregated to one row per customer id (CTE
# `messages`) before the join.  Joining the raw per-message rows would repeat
# every transaction once per message, inflating COUNT(o.amount) and
# SUM(o.amount), and grouping by the message text would split each customer
# into several rows.
sql = '''

WITH kafka AS (
    SELECT CAST(JSON_EXTRACT(_message, '$.customer_number') AS INTEGER) AS id,
           JSON_EXTRACT(_message, '$.txt') AS txt
    FROM "notebook-test"
),
messages AS (
    SELECT id,
           COUNT(txt) AS message_count
    FROM kafka
    GROUP BY id
)

SELECT
    c.id,
    c.customername,
    c.mktsegment,
    day(current_date - c.effectivedate) AS days_active,
    MAX(o.amount) AS max_prch,
    MIN(o.amount) AS min_prch,
    AVG(o.amount) AS mean_prch,
    COUNT(o.amount) AS count_prch,
    SUM(o.amount) AS sum_purchased,
    COALESCE(MAX(m.message_count), 0) AS message_count
FROM "customer-domain".public.customer c
JOIN "finance-domain".public.transactions o ON c.id = o.customerid
LEFT JOIN messages m ON o.customerid = m.id
WHERE c.status > 0
GROUP BY c.id, c.customername, c.mktsegment, day(current_date - c.effectivedate)
ORDER BY sum_purchased DESC

'''

# `messages` has at most one row per id, so MAX() only carries that single
# count through the GROUP BY; COALESCE reports 0 for customers without messages.
df = pandas.read_sql(sql, engine)
df.head()