# Enriching and updating data using Kafka lookup tables
<!--
  ~ Licensed to the Apache Software Foundation (ASF) under one
  ~ or more contributor license agreements.  See the NOTICE file
  ~ distributed with this work for additional information
  ~ regarding copyright ownership.  The ASF licenses this file
  ~ to you under the Apache License, Version 2.0 (the
  ~ "License"); you may not use this file except in compliance
  ~ with the License.  You may obtain a copy of the License at
  ~
  ~   http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing,
  ~ software distributed under the License is distributed on an
  ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  ~ KIND, either express or implied.  See the License for the
  ~ specific language governing permissions and limitations
  ~ under the License.
  -->

[Lookups](https://druid.apache.org/docs/latest/querying/lookups) are [key/value-pair tables](https://druid.apache.org/docs/latest/querying/datasource#lookup) that are broadcast to query processes and can be updated regularly, either manually or automatically. In this notebook you will extend your knowledge of these tables to lookups populated from an [Apache Kafka topic](https://druid.apache.org/docs/latest/querying/kafka-extraction-namespace) and walk through some simple queries.

## Prerequisites

This tutorial works with Druid 30.0.0 or later.

Launch this tutorial and all prerequisites using the `all-services` profile of the Docker Compose file for Jupyter-based Druid tutorials. For more information, see the Learn Druid repository [readme](https://github.com/implydata/learn-druid).
   

## Initialization

The following cells set up the notebook and learning environment ready for use.

### Set up a connection to Apache Druid

Run the next cell to set up the Druid Python client's connection to Apache Druid.

If successful, the Druid version number will be shown in the output.

In [1]:
import druidapi
import os

# Header dict for JSON request bodies.
druid_headers = {'Content-Type': 'application/json'}

# Take the host name from DRUID_HOST when it is set, otherwise fall back to
# localhost. The port is 8888 in both cases.
druid_host = "http://" + os.environ.get('DRUID_HOST', 'localhost') + ":8888"

print(f"Opening a connection to {druid_host}.")
druid = druidapi.jupyter_client(druid_host)

# Short aliases for the client helpers that later cells call.
display, sql_client, status_client = druid.display, druid.sql, druid.status

# Last expression in the cell, so the notebook renders the Druid version.
status_client.version

Opening a connection to http://router:8888.


'29.0.1'

### Set up a connection to Apache Kafka

<!-- Include these cells if your notebook uses Kafka. -->

Run the next cell to set up the connection to Apache Kafka.

In [2]:
# Kafka bootstrap servers are written as "host:port" with no URL scheme, so the
# localhost fallback must not carry an "http://" prefix. Take the host from
# KAFKA_HOST when it is set; the port is 9092 in both cases.
kafka_host = f"{os.environ.get('KAFKA_HOST', 'localhost')}:9092"

## Create a table using batch ingestion

Run the following cell to create a table using batch ingestion.

The same principles in this notebook also apply to tables receiving events from stream sources.

When completed, you'll see a description of the final table.

In [4]:
# Name of the Druid table that this notebook creates and later drops.
table_name = 'example-flights-kafkalookup'

# SQL-based ingestion statement. It reads one zipped CSV of flight data over
# HTTP through EXTERN, declares the CSV columns with EXTEND, and replaces the
# whole target table (OVERWRITE ALL). Only six columns are kept: "depaturetime"
# is parsed into __time, and the data is partitioned by day.
sql='''
REPLACE INTO "''' + table_name + '''" OVERWRITE ALL
WITH "ext" AS (SELECT *
FROM TABLE(
  EXTERN(
    '{"type":"http","uris":["https://static.imply.io/example-data/flight_on_time/flights/On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2005_11.csv.zip"]}',
    '{"type":"csv","findColumnsFromHeader":true}'
  )
) EXTEND ("depaturetime" VARCHAR, "arrivalime" VARCHAR, "Year" BIGINT, "Quarter" BIGINT, "Month" BIGINT, "DayofMonth" BIGINT, "DayOfWeek" BIGINT, "FlightDate" VARCHAR, "Reporting_Airline" VARCHAR, "DOT_ID_Reporting_Airline" BIGINT, "IATA_CODE_Reporting_Airline" VARCHAR, "Tail_Number" VARCHAR, "Flight_Number_Reporting_Airline" BIGINT, "OriginAirportID" BIGINT, "OriginAirportSeqID" BIGINT, "OriginCityMarketID" BIGINT, "Origin" VARCHAR, "OriginCityName" VARCHAR, "OriginState" VARCHAR, "OriginStateFips" BIGINT, "OriginStateName" VARCHAR, "OriginWac" BIGINT, "DestAirportID" BIGINT, "DestAirportSeqID" BIGINT, "DestCityMarketID" BIGINT, "Dest" VARCHAR, "DestCityName" VARCHAR, "DestState" VARCHAR, "DestStateFips" BIGINT, "DestStateName" VARCHAR, "DestWac" BIGINT, "CRSDepTime" BIGINT, "DepTime" BIGINT, "DepDelay" BIGINT, "DepDelayMinutes" BIGINT, "DepDel15" BIGINT, "DepartureDelayGroups" BIGINT, "DepTimeBlk" VARCHAR, "TaxiOut" BIGINT, "WheelsOff" BIGINT, "WheelsOn" BIGINT, "TaxiIn" BIGINT, "CRSArrTime" BIGINT, "ArrTime" BIGINT, "ArrDelay" BIGINT, "ArrDelayMinutes" BIGINT, "ArrDel15" BIGINT, "ArrivalDelayGroups" BIGINT, "ArrTimeBlk" VARCHAR, "Cancelled" BIGINT, "CancellationCode" VARCHAR, "Diverted" BIGINT, "CRSElapsedTime" BIGINT, "ActualElapsedTime" BIGINT, "AirTime" BIGINT, "Flights" BIGINT, "Distance" BIGINT, "DistanceGroup" BIGINT, "CarrierDelay" BIGINT, "WeatherDelay" BIGINT, "NASDelay" BIGINT, "SecurityDelay" BIGINT, "LateAircraftDelay" BIGINT, "FirstDepTime" VARCHAR, "TotalAddGTime" VARCHAR, "LongestAddGTime" VARCHAR, "DivAirportLandings" VARCHAR, "DivReachedDest" VARCHAR, "DivActualElapsedTime" VARCHAR, "DivArrDelay" VARCHAR, "DivDistance" VARCHAR, "Div1Airport" VARCHAR, "Div1AirportID" VARCHAR, "Div1AirportSeqID" VARCHAR, "Div1WheelsOn" VARCHAR, "Div1TotalGTime" VARCHAR, "Div1LongestGTime" VARCHAR, "Div1WheelsOff" VARCHAR, "Div1TailNum" VARCHAR, "Div2Airport" VARCHAR, "Div2AirportID" VARCHAR, "Div2AirportSeqID" VARCHAR, "Div2WheelsOn" VARCHAR, "Div2TotalGTime" 
VARCHAR, "Div2LongestGTime" VARCHAR, "Div2WheelsOff" VARCHAR, "Div2TailNum" VARCHAR, "Div3Airport" VARCHAR, "Div3AirportID" VARCHAR, "Div3AirportSeqID" VARCHAR, "Div3WheelsOn" VARCHAR, "Div3TotalGTime" VARCHAR, "Div3LongestGTime" VARCHAR, "Div3WheelsOff" VARCHAR, "Div3TailNum" VARCHAR, "Div4Airport" VARCHAR, "Div4AirportID" VARCHAR, "Div4AirportSeqID" VARCHAR, "Div4WheelsOn" VARCHAR, "Div4TotalGTime" VARCHAR, "Div4LongestGTime" VARCHAR, "Div4WheelsOff" VARCHAR, "Div4TailNum" VARCHAR, "Div5Airport" VARCHAR, "Div5AirportID" VARCHAR, "Div5AirportSeqID" VARCHAR, "Div5WheelsOn" VARCHAR, "Div5TotalGTime" VARCHAR, "Div5LongestGTime" VARCHAR, "Div5WheelsOff" VARCHAR, "Div5TailNum" VARCHAR, "Unnamed: 109" VARCHAR))
SELECT
  TIME_PARSE("depaturetime") AS "__time",
  "Reporting_Airline",
  "Tail_Number",
  "Distance",
  "Origin",
  "Dest"
FROM "ext"
PARTITIONED BY DAY
'''

# Submit the ingestion task, block until the client reports the table ready,
# then print the table's column list.
display.run_task(sql)
sql_client.wait_until_ready(f'{table_name}')
display.table(f'{table_name}')

Loading data, status:[SUCCESS]: 100%|██████████| 100.0/100.0 [00:18<00:00,  5.46it/s]


Position,Name,Type
1,__time,TIMESTAMP
2,Reporting_Airline,VARCHAR
3,Tail_Number,VARCHAR
4,Distance,BIGINT
5,Origin,VARCHAR
6,Dest,VARCHAR


## Create a lookup table

Run the following cell to create some helper functions for using the lookup API.

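* `postLookup` posts a lookup definition to the Coordinator's lookups configuration API and raises an error if the response reports one.
* `waitForLookup` polls the lookup status API until the lookup is loaded with no pending nodes, or until the timeout is reached.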

In [None]:
def postLookup(definition):
    """Post a lookup configuration to the Druid lookups configuration API.

    Sends ``definition`` as the JSON body of a POST request to
    ``{druid_host}/druid/coordinator/v1/lookups/config``.

    Args:
        definition: JSON-serializable object holding the lookup configuration
            (an empty dict, or a tier -> lookup name -> definition mapping).

    Raises:
        Exception: if the response has an HTTP error status, or its body
            contains the text "error". The response body is included in the
            message.
    """
    # `requests` is used by this helper but is not imported by any visible
    # cell, so import it here to keep the function self-contained.
    import requests

    response = requests.post(f"{druid_host}/druid/coordinator/v1/lookups/config", json=definition)

    # An HTTP error status is a failure even when the body does not happen to
    # contain the word "error".
    if not response.ok or "error" in response.text:
        raise Exception('Not able to complete the request. \n\n'+response.text)

    print('Successfully submitted the lookup request.')

def waitForLookup(tier, name, ticsMax):
    """Poll the lookup status API until the lookup is fully loaded.

    Requests
    ``{druid_host}/druid/coordinator/v1/lookups/status/{tier}/{name}?detailed=true``
    once per second, printing the latest response with a spinner and a
    countdown, until the response reports ``loaded`` as true with no pending
    nodes.

    Args:
        tier: Name of the lookup tier, for example "__default".
        name: Name of the lookup within that tier.
        ticsMax: Maximum number of one-second waits. Values above 360 are
            capped at 360.

    Raises:
        Exception: if the lookup is still not fully loaded after the final
            check.
    """
    # `requests` and `time` are used by this helper but are not imported by
    # any visible cell, so import them here.
    import json
    import time

    import requests

    # The default time period between checks of lookup definition changes (druid.manager.lookups.period)
    # is two minutes. The notebook environment reduces this for learning purposes.
    #
    # https://druid.apache.org/docs/latest/configuration/#lookups-dynamic-configuration

    ticsWait = 1
    ticsMax = min(ticsMax, 360)
    # The backslash is escaped: "\|" is an invalid escape sequence.
    ticsSpinner = "/-\\|"

    apicall = f"{druid_host}/druid/coordinator/v1/lookups/status/{tier}/{name}?detailed=true"

    def _is_loaded(text):
        """Return True when the status body says loaded with nothing pending."""
        # Parse the JSON rather than comparing raw text, so whitespace or key
        # order in the response cannot cause a false "not loaded".
        try:
            status = json.loads(text)
        except ValueError:
            return False
        return (
            isinstance(status, dict)
            and status.get("loaded") is True
            and not status.get("pendingNodes")
        )

    tics = 0
    x = requests.get(apicall)

    while not _is_loaded(x.text) and tics < ticsMax:
        print(f"{x.text} {ticsSpinner[tics%len(ticsSpinner)]} [ {str(ticsMax-tics)} ]     ", end='\r')
        time.sleep(ticsWait)
        tics += 1
        x = requests.get(apicall)

    # Decide on the final status, not on the tick counter: the lookup may
    # finish loading on the very last poll, which is a success, not a timeout.
    if not _is_loaded(x.text):
        raise Exception(f"\nTimeout waiting for Druid to load the {name} lookup to {tier} tier. Run the cell again.")

    print(f"\nSuccess. {name} lookup in {tier} tier is fully available.")

### Initialize lookups

Run the following cell, which posts an empty JSON object to the configuration API.

In [None]:
# Initialize lookups by posting an empty JSON object to the lookups
# configuration endpoint.
empty_post = {}
postLookup(empty_post)

### Generate some lookup values in a Kafka topic



### Create a lookup table

In this section, you will create a `lookup_post` object that can then be posted to the API as JSON. It will contain:

* The [tier](https://druid.apache.org/docs/latest/querying/lookups#dynamic-configuration) to which the table belongs - this will be the standard '__default'.
* A name for the table
* A definition of the lookup itself

The next cell constructs a `lookup_post` object that can then be passed as JSON to the lookups API.

* The tier name appears at the highest level of the JSON - in the documentation you may see this as `<tierName>`, and it appears as "__default".
Inside this the lookup name is given.
* The lookup table name, seen as `<lookupName>` in documentation, appears inside. Here, the name is given as "example-flights-airportsizes".
* Inside this appears the actual lookup definition, including a version number.

The specific details of the definition depend on the type of lookup being created. In this notebook we will set up `example-flights-airportsizes` using a `map`-type lookup, where the data for the lookup table is put inline in the `POST` request. You will see a number of key / value pairs that map an airport code to its size.

To begin, run the next cell to create the `lookup_post_version` variable, which uses the current time and date to construct a version string.

In [None]:
from datetime import datetime, timezone

# The `datetime` class itself is imported above, so call `datetime.now(...)`
# directly; `datetime.datetime.now()` raises AttributeError with this import.
# The timestamp is taken in UTC because the format string ends in a literal "Z".
lookup_post_version = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

Now run the cell below to create the `lookup_post` object.

In [None]:
lookup_tier = "__default"
lookup_name = "example-flights-airportsizes"
lookup_type = "map"

# Request body for the lookups configuration API. The nesting is
# tier name -> lookup name -> {"version", "lookupExtractorFactory"}.
lookup_post = {
  lookup_tier: {
    lookup_name: {
      "version": lookup_post_version,
      "lookupExtractorFactory": {
        "type": lookup_type,
        # TODO: the airport-code -> airport-size pairs are missing from this
        # cell; add them here. The original literal was never closed, so the
        # empty map below only makes the cell syntactically complete.
        "map": {},
      },
    },
  },
}

In [None]:
# Submit the lookup definition, then poll (up to 30 one-second checks) until
# the lookup is reported as loaded.
postLookup(lookup_post)
waitForLookup(lookup_tier, lookup_name, 30)

## Clean up

Run the following cell to remove the table used in this notebook from the database.

In [None]:
# Use this for batch ingested tables

# Drops the table created by the batch ingestion in this notebook.
druid.datasources.drop(f"{table_name}")

# Use this when doing streaming with the data generator

# NOTE(review): `requests`, `datagen_host`, `datagen_job` and `datagen_topic`
# are not defined in any visible cell of this notebook, which ingests in batch.
# The statements below look like template carry-over and would raise NameError
# as written; confirm whether they belong here before running.
print(f"Stop streaming generator: [{requests.post(f'{datagen_host}/stop/{datagen_job}','')}]")
print(f'Pause streaming ingestion: [{requests.post(f"{druid_host}/druid/indexer/v1/supervisor/{datagen_topic}/suspend","")}]')

print(f'Shutting down running tasks ...')

# Keep asking for the table's running tasks and shut each one down until none
# are reported.
tasks = druid.tasks.tasks(state='running', table=table_name)
while len(tasks)>0:
    for task in tasks:
        print(f"...stopping task [{task['id']}]")
        druid.tasks.shut_down_task(task['id'])
    tasks = druid.tasks.tasks(state='running', table=table_name)

print(f'Reset offsets for re-runnability: [{requests.post(f"{druid_host}/druid/indexer/v1/supervisor/{datagen_topic}/reset","")}]')
print(f'Terminate streaming ingestion: [{requests.post(f"{druid_host}/druid/indexer/v1/supervisor/{datagen_topic}/terminate","")}]')
# NOTE(review): this drops the same table as the batch section at the top of
# the cell.
print(f"Drop datasource: [{druid.datasources.drop(table_name)}]")

## Summary

* You learned this
* Remember this

## Learn more

* Try this out on your own data
* Solve for problem X that isn't covered here
* Read docs pages
* Watch or read something cool from the community
* Do some exploratory stuff on your own

In [None]:
# Here are some useful code elements that you can re-use.
#
# NOTE(review): these snippets are independent examples, not a runnable
# sequence. They reference names that no visible cell defines (`json`, `pd`,
# `plt`, `df1`, `df2`, `datagen_topic`); import or define those before reusing
# a snippet.

# When just wanting to display some SQL results
sql = f'''SELECT * FROM "{table_name}" LIMIT 5'''
display.sql(sql)

# When ingesting data and wanting to describe the schema
# (The table name is passed as an f-string so the real name is used, not the
# literal text "{table_name}".)
display.run_task(sql)
sql_client.wait_until_ready(f'{table_name}')
display.table(f'{table_name}')

# When you want to show the native version of a SQL statement
print(json.dumps(json.loads(sql_client.explain_sql(sql)['PLAN']), indent=2))

# When you want a simple plot
df = pd.DataFrame(sql_client.sql(sql))
df.plot(x='x-axis', y='y-axis', marker='o')
plt.xticks(rotation=45, ha='right')
plt.gca().get_legend().remove()
plt.show()

# When you want to add some query context parameters
req = sql_client.sql_request(sql)
req.add_context("useApproximateTopN", "false")
resp = sql_client.sql_query(req)

# When you want to compare two different sets of results
df3 = df1.compare(df2, keep_equal=True)
df3

# When you want to see some messages from a Kafka topic
from kafka import KafkaConsumer

consumer = KafkaConsumer(bootstrap_servers=kafka_host)
consumer.subscribe(topics=datagen_topic)
count = 0
# The counter is incremented before the check, so the loop prints the first
# four messages and stops when the fifth arrives.
for message in consumer:
    count += 1
    if count == 5:
        break
    print ("%d:%d: v=%s" % (message.partition,
                            message.offset,
                            message.value))
consumer.unsubscribe()