In [None]:
!pip install -U -r requirements.txt

In [None]:
import json
import os 
import warnings
import pandas
import trino 
from helper import get_sql

# Silence every warning category for the rest of the notebook session.
warnings.simplefilter(action='ignore')

In [None]:
# Connection settings come from the environment so that no host names or
# user names are hard-coded in the notebook.
TRINO_HOSTNAME = os.environ.get('TRINO_HOSTNAME')
TRINO_USERNAME = os.environ.get('TRINO_USERNAME')
# Environment values are strings (or None when unset): convert the port to an
# int and fall back to 8080, the Trino client's default port, when it is unset.
TRINO_PORT = int(os.environ.get('TRINO_PORT', 8080))

Let's make sure we can connect to Starburst and run a query.

In [None]:
# Create the Trino DB-API connection object that every query cell below
# passes to get_sql.
conn = trino.dbapi.connect(host=TRINO_HOSTNAME, port=TRINO_PORT, user=TRINO_USERNAME)

In [None]:
# Smoke test: list the available catalogs through the connection and preview
# the first rows of the result.
sql = "SHOW CATALOGS"
df = get_sql(sql, conn)
df.head(5)

In [None]:
# NOTE(review): unlike the other queries in this notebook, this table name is
# not qualified with a catalog and schema — confirm it resolves as intended.
kafka_sql = 'select * from "messages"'

kafka_raw_df = get_sql(kafka_sql, conn)

# Each `_message` value is decoded with json.loads (assumes every message is a
# JSON object — TODO confirm). The decoded payloads are turned into columns
# with a single DataFrame construction instead of building one pandas.Series
# per row, which is much slower and yields a Series (not a frame) when the
# query returns no rows. Reusing the raw frame's index keeps rows aligned.
kafka_payload_df = pandas.DataFrame(
    kafka_raw_df['_message'].apply(json.loads).tolist(),
    index=kafka_raw_df.index,
)

# Replace the raw JSON text column with the expanded payload columns.
kafka_df = kafka_raw_df.drop(columns=['_message']).join(kafka_payload_df)

kafka_df.head()

In [None]:
# Load the customer table from the customer domain and preview it.
cust_sql = 'select * from "customer-domain".public.customer'
cust_df = get_sql(cust_sql, conn)

cust_df.head(5)

In [None]:
# Load the transactions table from the finance domain.
fin_sql = 'select * from "finance-domain".public.transactions'

fin_df = get_sql(fin_sql, conn)

# `amount` is text containing a dollar sign; strip it and convert to float.
# regex=False is required for a literal match: "$" is the end-of-string anchor
# when treated as a regular expression, and the default for `regex` differs
# between pandas versions.
fin_df["amount"] = (
    fin_df["amount"]
    .str.replace("$", "", regex=False)
    .astype(float)
)
fin_df.head()

Let's explore customer 42's transaction information

In [None]:
# All transactions whose customerid is 42.
fin_df.loc[fin_df['customerid'].eq(42)]

In [None]:
# The customer record whose id is 42.
cust_df.loc[cust_df['id'].eq(42)]

Let's look at customer spend by market segment (`mktsegment`).

In [None]:
# Total spend per customer: a Series of summed `amount`, indexed by `customerid`.
spend_by_customer = fin_df.groupby("customerid")["amount"].sum()

# Attach each total to its customer by matching the customer's `id` column
# against the `customerid` index. Without `on="id"`, DataFrame.join aligns on
# cust_df's row index, which pairs totals with the wrong customers whenever
# the row position differs from the customer id.
# dropna() removes rows with any missing value, which includes customers that
# have no transactions (their `amount` is NaN after the left join).
df = cust_df.join(spend_by_customer, on="id").dropna()

# Highest-spending customers first.
df.sort_values('amount', ascending=False).head()

In [None]:
# Scatter plot of each customer's total spend against their market segment.
df.plot(kind="scatter", x="mktsegment", y="amount", figsize=(12, 6))