In [1]:
# import packages
import json
import time
import random
import pandas as pd
import psycopg2

from confluent_kafka.admin import AdminClient, NewTopic
from confluent_kafka.admin import AdminClient
from confluent_kafka import Producer, Consumer, KafkaError, TopicPartition, SerializingProducer
from confluent_kafka import KafkaException

from confluent_kafka.serialization import StringSerializer, SerializationContext, MessageField
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.avro import AvroSerializer
from confluent_kafka.serialization import StringDeserializer
from confluent_kafka.schema_registry.avro import AvroDeserializer
from confluent_kafka import DeserializingConsumer

# Kafka Tutorial

## Create Topic
In this section, we are going to look at how to list topics, create topics and define partitions and replication factors for each topic.

list user created topic in our Kafka cluster

In [2]:
# Admin handle used for cluster-level operations (listing and creating topics)
admin_client = AdminClient({'bootstrap.servers': 'broker:9092'})

# Fetch cluster metadata and display the topic names that do not start with '_'
topic_metadata = admin_client.list_topics(timeout=10)
list(filter(lambda name: not name.startswith('_'), topic_metadata.topics))

[]

Create a new topic `trades` with 3 partitions and a replication factor of 1.

In [3]:
# Ask the broker for a 'trades' topic with 3 partitions and a single replica
new_topics = [NewTopic('trades', num_partitions=3, replication_factor=1)]

# create_topics() returns one future per requested topic; result() raises
# if the creation of that topic failed
fs = admin_client.create_topics(new_topics)
for topic, f in fs.items():
    try:
        f.result()
    except Exception as e:
        print("Failed to create topic {}: {}".format(topic, e))
    else:
        print("Topic {} created".format(topic))

Topic trades created


list user created topic again and look at number of partition and replication factor for each

In [4]:
# List every user topic together with its partition layout.
# A timeout is passed (matching the first listing cell) so the cell fails fast
# instead of blocking indefinitely when the broker is unreachable.
topics = admin_client.list_topics(timeout=10).topics
for topic, topic_info in topics.items():
    # Skip topics whose name starts with '_'
    if topic.startswith('_'):
        continue
    print(f"Topic: {topic}, Partitions: {len(topic_info.partitions)}")
    for p_id, p_info in topic_info.partitions.items():
        print(f"  Partition: {p_id}, Leader: {p_info.leader}, Replicas: {p_info.replicas}")

Topic: trades, Partitions: 3
  Partition: 0, Leader: 1, Replicas: [1]
  Partition: 1, Leader: 1, Replicas: [1]
  Partition: 2, Leader: 1, Replicas: [1]


## Producer

In this section, we look at how to define and configure our producer as well as sending messages to our previously created topic in our Kafka cluster.

We will simulate trade data stream from Binance. Following is the description of each field:
- e: Event type (trade)
- E: Event time
- s: Symbol
- t: Trade ID
- p: Price
- q: Quantity
- b: Buyer order ID
- a: Seller order ID
- T: Trade time
- m: Is the buyer the market maker?
- M: Ignore in price

In [5]:
# Three hand-written trade events in the Binance trade-stream layout
# (field meanings are listed in the markdown above), one per symbol pair
trades = [
    {'e': 'trade', 'E': 1713991342890, 's': 'BTCUSDT', 't': 97236, 'p': '0.006390', 'q': '2551',
     'b': 69, 'a': 607, 'T': 1713991342890, 'm': True, 'M': True},
    {'e': 'trade', 'E': 1713991378161, 's': 'BNBUSDT', 't': 77422, 'p': '0.008396', 'q': '2831',
     'b': 307, 'a': 539, 'T': 1713991378161, 'm': False, 'M': True},
    {'e': 'trade', 'E': 1713991429451, 's': 'BNBBTC', 't': 43014, 'p': '0.004122', 'q': '3493',
     'b': 123, 'a': 664, 'T': 1713991429451, 'm': True, 'M': True},
]

In [6]:
def acked(err, msg):
    """
    Delivery-report callback passed to `produce()`.

    Prints one line per message: the message key and the error when delivery
    failed, or the topic, partition and key when it succeeded.

    Parameters:
    err: Error information if the message delivery failed, otherwise None.
    msg: The message that was attempted to be sent.
    """
    if err is not None:
        # Print the key rather than the message object itself, whose repr is
        # not informative; this matches the success branch below.
        print(f"Failed to deliver message: {msg.key()}: {err}")
    else:
        print(f"Message produced: {msg.topic()} {msg.partition()} {msg.key()}")

- Once we have defined and configured our producer, we can use it to ingest stream of messages into our Kafka cluster.
- We format our python dictionary as json so that it can be ingested into our Kafka topic
- We use the symbol pair stored under the 's' key of our dictionary as the message key. The key is used to determine which partition of the topic the message is written to
- We can see that each message here landed in a different partition. Messages with the same key always go to the same partition, but different keys are not guaranteed to map to different partitions

In [7]:
# define and configure our producer
producer = Producer({'bootstrap.servers': 'broker:9092'})

# send each trade to the 'trades' topic, keyed by its symbol pair
for trade in trades:
    producer.produce('trades', key=trade['s'], value=json.dumps(trade), callback=acked)
    # Serve delivery callbacks as we go (same pattern as the Avro producer
    # cells) instead of deferring all of them to flush()
    producer.poll(0)

# Wait for any outstanding messages to be delivered
producer.flush()

Message produced: trades 2 b'BNBUSDT'
Message produced: trades 0 b'BTCUSDT'
Message produced: trades 1 b'BNBBTC'


0

## Consumer

In this section, we look at how to define and configure our consumer, as well as how to read and process messages from the topic we created earlier in our Kafka cluster.

In this interactive code example, we will set the timeout_duration to 5 seconds to ensure that our consumer doesn't run indefinitely. In a real streaming system, the consumer would typically run continuously, processing incoming data in real-time.

In [8]:
def consume_messages(topic_name, consumer_config, timeout_duration=5):
    """
    Consume and print messages from a Kafka topic for a bounded amount of time.

    Parameters:
    topic_name (str): The name of the Kafka topic to subscribe to.
    consumer_config (dict): Configuration settings passed to `Consumer`.
    timeout_duration (int): Duration in seconds to keep polling before stopping.

    Returns:
    None. The payload, partition and offset of every received message are printed.
    """
    # Initialize the consumer
    consumer = Consumer(consumer_config)

    try:
        # Subscribing inside the try block guarantees that close() runs even
        # when subscribe() itself fails
        consumer.subscribe([topic_name])

        # Use the monotonic clock so that wall-clock adjustments cannot
        # shorten or extend the polling window
        deadline = time.monotonic() + timeout_duration
        while time.monotonic() <= deadline:
            msg = consumer.poll(timeout=1.0)  # Poll for a message (timeout in seconds)

            if msg is None:
                print("No message available within the timeout period")
                continue

            if msg.error():
                if msg.error().code() == KafkaError._PARTITION_EOF:
                    # End of partition event
                    print(f'Reached end of {msg.partition()} at offset {msg.offset()}')
                else:
                    print(f'Error: {msg.error()}')
                continue

            # msg.value() is None for a message produced without a value;
            # print None in that case instead of failing on .decode()
            payload = msg.value()
            text = payload.decode("utf-8") if payload is not None else None
            print(f'Message: {text}')
            print(f'Partition: {msg.partition()}')
            print(f'Offset: {msg.offset()}')

    finally:
        # Close the consumer
        consumer.close()

Our first consumer `consumer1` in consumer group `group1` consumes all 3 messages produced by our producer earlier and then waits for more messages until the timeout is reached.

In [9]:
# Settings for `consumer1`, a member of consumer group `group1`
consumer_config = {
    'bootstrap.servers': 'broker:9092',
    'group.id': 'group1',
    'client.id': 'consumer1',
    'auto.offset.reset': 'earliest',
}

# Read from the 'trades' topic for 5 seconds
consume_messages(topic_name='trades', consumer_config=consumer_config, timeout_duration=5)

Message: {"e": "trade", "E": 1713991378161, "s": "BNBUSDT", "t": 77422, "p": "0.008396", "q": "2831", "b": 307, "a": 539, "T": 1713991378161, "m": false, "M": true}
Partition: 2
Offset: 0
Message: {"e": "trade", "E": 1713991342890, "s": "BTCUSDT", "t": 97236, "p": "0.006390", "q": "2551", "b": 69, "a": 607, "T": 1713991342890, "m": true, "M": true}
Partition: 0
Offset: 0
Message: {"e": "trade", "E": 1713991429451, "s": "BNBBTC", "t": 43014, "p": "0.004122", "q": "3493", "b": 123, "a": 664, "T": 1713991429451, "m": true, "M": true}
Partition: 1
Offset: 0
No message available within the timeout period
No message available within the timeout period
No message available within the timeout period
No message available within the timeout period
No message available within the timeout period


Since our second consumer `consumer2` is also in consumer group `group1`, it doesn't consume any more messages that have already been consumed by `consumer1`. It just waits for new messages until the timeout is reached.

In [10]:
# Settings for `consumer2`, a second member of the same group `group1`
consumer_config = {
    'bootstrap.servers': 'broker:9092',
    'group.id': 'group1',
    'client.id': 'consumer2',
    'auto.offset.reset': 'earliest',
}

# Read from the 'trades' topic for 5 seconds
consume_messages(topic_name='trades', consumer_config=consumer_config, timeout_duration=5)

No message available within the timeout period
No message available within the timeout period
No message available within the timeout period
No message available within the timeout period
No message available within the timeout period


Our third consumer `consumer3` consumes all 3 messages again because it is in consumer group `group2`, which is different from the group that `consumer1` and `consumer2` are in.

In [11]:
# Settings for `consumer3`, which belongs to a different group, `group2`
consumer_config = {
    'bootstrap.servers': 'broker:9092',
    'group.id': 'group2',
    'client.id': 'consumer3',
    'auto.offset.reset': 'earliest',
}

# Read from the 'trades' topic for 5 seconds
consume_messages(topic_name='trades', consumer_config=consumer_config, timeout_duration=5)

Message: {"e": "trade", "E": 1713991378161, "s": "BNBUSDT", "t": 77422, "p": "0.008396", "q": "2831", "b": 307, "a": 539, "T": 1713991378161, "m": false, "M": true}
Partition: 2
Offset: 0
Message: {"e": "trade", "E": 1713991429451, "s": "BNBBTC", "t": 43014, "p": "0.004122", "q": "3493", "b": 123, "a": 664, "T": 1713991429451, "m": true, "M": true}
Partition: 1
Offset: 0
Message: {"e": "trade", "E": 1713991342890, "s": "BTCUSDT", "t": 97236, "p": "0.006390", "q": "2551", "b": 69, "a": 607, "T": 1713991342890, "m": true, "M": true}
Partition: 0
Offset: 0
No message available within the timeout period
No message available within the timeout period
No message available within the timeout period
No message available within the timeout period
No message available within the timeout period


# Schema Registry

## Avro Producer

In this section, we demonstrate how to set up a Kafka producer that sends trade data serialized with Avro.

In [12]:
# Callback to handle message delivery results
def acked(err, msg):
    """
    Report the outcome of a single message delivery.

    Parameters:
    err: Error information if the message delivery failed, otherwise None.
    msg: The message that was attempted to be sent.
    """
    # Success path first: report where the message ended up
    if err is None:
        print(f"Message produced: {msg.topic()} {msg.partition()} {msg.key()}")
        return

    # Failure path: report the key of the message together with the error
    print(f"Failed to deliver message: {msg.key()}: {err}")
    
# Avro record schema for one trade event: every field listed here is
# required, because none of them declares a default
trade_schema_str = '''
{
    "namespace": "com.data605.kafka",
    "name": "Trade",
    "type": "record",
    "fields": [
        {"name": "e", "type": "string"},
        {"name": "E", "type": "long"},
        {"name": "s", "type": "string"},
        {"name": "t", "type": "int"},
        {"name": "p", "type": "string"},
        {"name": "q", "type": "string"},
        {"name": "b", "type": "int"},
        {"name": "a", "type": "int"},
        {"name": "T", "type": "long"},
        {"name": "m", "type": "boolean"},
        {"name": "M", "type": "boolean"}
    ]
}
'''

# Client for the Schema Registry service
schema_registry_conf = {'url': 'http://schema-registry:8081'}
schema_registry_client = SchemaRegistryClient(schema_registry_conf)

# Value serializer: our trades are already plain dicts, so the to_dict hook
# simply hands the object back unchanged
avro_serializer = AvroSerializer(
    schema_registry_client,
    trade_schema_str,
    to_dict=lambda obj, ctx: obj,
)

# Producer settings: keys are sent as UTF-8 strings, values as Avro
producer_conf = {
    'bootstrap.servers': 'broker:9092',
    'value.serializer': avro_serializer,
    'key.serializer': StringSerializer('utf_8'),
}

# Producer that applies the serializers above to every message
producer = SerializingProducer(producer_conf)

If the trade data does not conform to the Avro schema, a serialization error occurs, indicating missing required fields or schema mismatches.

In [13]:
# One record that deliberately violates the schema: the required field 'E'
# is absent (its value sits under the unknown key 'AC' instead)
trades = [
    {
        'e': 'trade',
        'AC': 1713991342890,
        's': 'BTCUSDT',
        't': 97236,
        'p': '0.006390',
        'q': '2551',
        'b': 69,
        'a': 607,
        'T': 1713991342890,
        'm': True,
        'M': True,
    },
]

# Try to publish every record to the 'trades-avro' topic; a serialization
# failure surfaces as a KafkaException and is printed
for trade in trades:
    try:
        producer.produce(topic='trades-avro', key=trade['s'], value=trade, on_delivery=acked)
        producer.poll(0)  # let the client run pending delivery callbacks
    except KafkaException as e:
        print(f"An error occurred: {e}")

# Wait for any outstanding messages to be delivered
producer.flush()

An error occurred: KafkaError{code=_VALUE_SERIALIZATION,val=-161,str="no value and no default for E"}


0

Only when the trade data conforms to the Avro schema are the messages successfully produced to the 'trades-avro' topic.

In [14]:
# Three schema-conforming trade records, one per symbol pair
trades = [
    {
        'e': 'trade', 'E': 1713991342890, 's': 'BTCUSDT', 't': 97236,
        'p': '0.006390', 'q': '2551', 'b': 69, 'a': 607,
        'T': 1713991342890, 'm': True, 'M': True,
    },
    {
        'e': 'trade', 'E': 1713991378161, 's': 'BNBUSDT', 't': 77422,
        'p': '0.008396', 'q': '2831', 'b': 307, 'a': 539,
        'T': 1713991378161, 'm': False, 'M': True,
    },
    {
        'e': 'trade', 'E': 1713991429451, 's': 'BNBBTC', 't': 43014,
        'p': '0.004122', 'q': '3493', 'b': 123, 'a': 664,
        'T': 1713991429451, 'm': True, 'M': True,
    },
]

# Publish every record to the 'trades-avro' topic using Avro serialization
for trade in trades:
    try:
        producer.produce(topic='trades-avro', key=trade['s'], value=trade, on_delivery=acked)
        producer.poll(0)  # let the client run pending delivery callbacks
    except KafkaException as e:
        print(f"An error occurred: {e}")

# Wait for any outstanding messages to be delivered
producer.flush()

Message produced: trades-avro 0 b'BTCUSDT'
Message produced: trades-avro 0 b'BNBUSDT'
Message produced: trades-avro 0 b'BNBBTC'


0

## Read Schema
In this section, we look at how to list schema in our schema registry and read a specific schema.

In [15]:
# Client used by the cells below to query the Schema Registry
schema_registry_url = 'http://schema-registry:8081'
schema_registry_client = SchemaRegistryClient(dict(url=schema_registry_url))

In [16]:
def list_subjects():
    """
    Print and return the subjects registered in the Schema Registry.

    Returns the value of `get_subjects()`, or None (after printing the error)
    when the lookup fails.
    """
    try:
        available = schema_registry_client.get_subjects()
    except Exception as e:
        print(f"Error listing subjects: {e}")
        return None

    print("Available subjects:", available)
    return available

# Show which subjects exist so far
subjects = list_subjects()

Available subjects: ['trades-avro-value']


In [17]:
def get_schema_by_subject(subject_name):
    """
    Fetch the latest registered schema for a subject.

    Parameters:
    subject_name (str): The Schema Registry subject to look up.

    Returns:
    The `schema` attribute of the latest registered version, or None (after
    printing the error) when the lookup fails.
    """
    try:
        # Retrieve the latest schema for the specified subject
        schema_metadata = schema_registry_client.get_latest_version(subject_name)
        return schema_metadata.schema
    except Exception as e:
        print(f"Failed to fetch schema by subject: {e}")
        return None

subject_name = 'trades-avro-value'  # Replace with your subject name
trades_avro_schema = get_schema_by_subject(subject_name)
# The lookup returns None on failure (the error has already been printed), so
# only dereference the result when a schema actually came back
if trades_avro_schema is not None:
    print(trades_avro_schema.schema_str)

{"type":"record","name":"Trade","namespace":"com.data605.kafka","fields":[{"name":"e","type":"string"},{"name":"E","type":"long"},{"name":"s","type":"string"},{"name":"t","type":"int"},{"name":"p","type":"string"},{"name":"q","type":"string"},{"name":"b","type":"int"},{"name":"a","type":"int"},{"name":"T","type":"long"},{"name":"m","type":"boolean"},{"name":"M","type":"boolean"}]}


## Avro Consumer
This section outlines the setup for consuming and deserializing trade data from a Kafka topic using Avro deserialization.

In [18]:
# Value deserializer: the from_dict hook returns the decoded dict unchanged,
# so consumers receive each trade as a plain dict
avro_deserializer = AvroDeserializer(
    schema_registry_client,
    trade_schema_str,
    from_dict=lambda obj, ctx: obj,
)

# Consumer settings: keys are decoded as UTF-8 strings, values as Avro
consumer_conf = {
    'bootstrap.servers': 'broker:9092',
    'group.id': 'avro_group1',
    'auto.offset.reset': 'earliest',
    'key.deserializer': StringDeserializer('utf_8'),
    'value.deserializer': avro_deserializer,
}

# Build the consumer and attach it to the Avro topic
consumer = DeserializingConsumer(consumer_conf)
consumer.subscribe(['trades-avro'])

In [19]:
# Poll for roughly 5 seconds and print every deserialized trade
try:
    start_time = time.time()
    while time.time() - start_time <= 5:
        msg = consumer.poll(1.0)
        if msg is None:
            # Nothing arrived within the 1-second poll window
            continue
        if msg.error():
            print(f"Consumer error: {msg.error()}")
        else:
            # Show the Python types produced by the key/value deserializers
            print(type(msg.key()), type(msg.value()))
            print(f"Received trade: {msg.key()}: {msg.value()}")
except KeyboardInterrupt:
    print("Stopping consumer")
finally:
    # Close down consumer to commit final offsets.
    consumer.close()

<class 'str'> <class 'dict'>
Received trade: BTCUSDT: {'e': 'trade', 'E': 1713991342890, 's': 'BTCUSDT', 't': 97236, 'p': '0.006390', 'q': '2551', 'b': 69, 'a': 607, 'T': 1713991342890, 'm': True, 'M': True}
<class 'str'> <class 'dict'>
Received trade: BNBUSDT: {'e': 'trade', 'E': 1713991378161, 's': 'BNBUSDT', 't': 77422, 'p': '0.008396', 'q': '2831', 'b': 307, 'a': 539, 'T': 1713991378161, 'm': False, 'M': True}
<class 'str'> <class 'dict'>
Received trade: BNBBTC: {'e': 'trade', 'E': 1713991429451, 's': 'BNBBTC', 't': 43014, 'p': '0.004122', 'q': '3493', 'b': 123, 'a': 664, 'T': 1713991429451, 'm': True, 'M': True}


# PostgreSQL database

In this section, we look at how to create a database and a table in PostgreSQL with a predefined schema, how to insert records into our table, and how to use a Kafka consumer to read records from our Kafka cluster, validate them, and ingest them into our table.

## Create database
we create a database called `trades`

In [20]:
def create_database(conn_params, dbname):
    """
    Creates a new database in the PostgreSQL server if it does not exist yet.

    Parameters:
    conn_params (dict): A dictionary containing the connection parameters.
                        It is not modified; a copy pointing at the default
                        `postgres` database is used for the connection.
    dbname (str): The name of the database.

    Notes:
    - A failure of the CREATE DATABASE statement is printed, not raised.
      Errors from connecting or from the existence check propagate.
    - The name is quoted as an SQL identifier, so the database is created
      with exactly the spelling that the existence check looks for.
    """
    # Local import: psycopg2 is already imported at the top of the notebook,
    # this only brings its SQL-composition helpers into scope.
    from psycopg2 import sql

    # Connect to the default database (postgres) to issue commands
    conn_params_default = {**conn_params, 'dbname': 'postgres'}
    conn = psycopg2.connect(**conn_params_default)
    try:
        # Autocommit mode, so CREATE DATABASE is not issued inside a
        # transaction block
        conn.autocommit = True
        with conn.cursor() as cur:
            # Check if the database already exists
            cur.execute("SELECT 1 FROM pg_database WHERE datname=%s", (dbname,))
            if cur.fetchone():
                print(f"Database '{dbname}' already exists.")
                return

            try:
                # Identifiers cannot be passed as query parameters, so compose
                # the statement with a properly quoted identifier instead of
                # interpolating the raw string
                cur.execute(sql.SQL("CREATE DATABASE {}").format(sql.Identifier(dbname)))
                print(f"Database '{dbname}' created successfully.")
            except psycopg2.Error as e:
                print(f"An error occurred: {e}")
    finally:
        # Always release the connection, even when a query raised
        conn.close()

In [21]:
# Server-level connection settings (no database name yet)
conn_params = dict(
    user="postgres",
    password="postgres",
    host="pgdatabase",
    port="5432",
)

# Make sure the 'trades' database exists
create_database(conn_params=conn_params, dbname="trades")

Database 'trades' created successfully.


## Create table

we create a table called `binance` in our `trades` database to store our stream of Binance trade data from our Kafka cluster

In [22]:
# Connection settings that target the `trades` database
conn_params = dict(
    dbname="trades",
    user="postgres",
    password="postgres",
    host="pgdatabase",
    port="5432",
)

In [23]:
def create_table(conn_params, table_name):
    """
    Creates a table in the PostgreSQL database with a predefined schema.

    The statement uses CREATE TABLE IF NOT EXISTS, so calling this for a table
    that already exists is not an error.

    Parameters:
    conn_params (dict): A dictionary containing the connection parameters.
    table_name (str): The name of the table to create. It is quoted as an SQL
                      identifier, so it is used exactly as spelled.

    Notes:
    Any `psycopg2.Error` is printed, not raised.
    """
    # Local import: psycopg2 is already imported at the top of the notebook,
    # this only brings its SQL-composition helpers into scope.
    from psycopg2 import sql

    # Compose the statement with a quoted identifier instead of interpolating
    # the raw table name into the SQL text
    create_table_command = sql.SQL("""
        CREATE TABLE IF NOT EXISTS {} (
            event_type TEXT,
            event_time BIGINT,
            symbol TEXT,
            trade_id BIGINT,
            price NUMERIC,
            quantity NUMERIC,
            buyer_order_id BIGINT,
            seller_order_id BIGINT,
            trade_time BIGINT,
            is_buyer_maker BOOLEAN,
            ignore_in_price BOOLEAN
        );
    """).format(sql.Identifier(table_name))

    conn = None
    try:
        conn = psycopg2.connect(**conn_params)
        # `with conn` commits on success and rolls back on error, but it does
        # NOT close the connection; that happens in `finally`
        with conn:
            with conn.cursor() as cur:
                cur.execute(create_table_command)
        print(f"Table '{table_name}' created successfully")
    except psycopg2.Error as e:
        # Handle exceptions that occur during the creation of the table
        print(f"Failed to create table: {e}")
    finally:
        if conn is not None:
            conn.close()

In [24]:
# Make sure the 'binance' table exists in the `trades` database
create_table(conn_params=conn_params, table_name='binance')

Table 'binance' created successfully


## Insert sample data into the table

we insert sample of trade data from Binance into our `binance` table

In [25]:
# Data to be inserted
data = {
    'e': 'trade',
    'E': 1714015066671,
    's': 'BNBBTC',
    't': 74686,
    'p': '0.006215',
    'q': '2112',
    'b': 88,
    'a': 544,
    'T': 1714015066671,
    'm': False,
    'M': True
}

# SQL query to insert data; the named placeholders are filled from the keys
# of the trade dictionary
insert_query = """
    INSERT INTO binance 
        (
            event_type, event_time, symbol, trade_id, price, quantity, buyer_order_id, 
            seller_order_id, trade_time, is_buyer_maker, ignore_in_price
        )
    VALUES (%(e)s, %(E)s, %(s)s, %(t)s, %(p)s, %(q)s, %(b)s, %(a)s, %(T)s, %(m)s, %(M)s);
"""

# insert data into the 'binance' table
conn = psycopg2.connect(**conn_params)
try:
    # `with conn` commits on success and rolls back on error, but it does NOT
    # close the connection, so it is closed explicitly below
    with conn:
        with conn.cursor() as cur:
            cur.execute(insert_query, data)
            print('Record inserted')
finally:
    conn.close()

Record inserted


## Kafka consumer reads data from the Kafka stream and insert it into the table

We define and configure our consumer. The consumer reads the messages from our Kafka cluster, validates them, and ingests the valid ones into our PostgreSQL table in real time.

In [26]:
def validate_trade(trade):
    """
    Validate a single trade record.

    Parameters:
    trade (dict): A dictionary containing a trade record.

    Returns:
    tuple[bool, str]: (True, "Valid trade") when every check passes, otherwise
    (False, <reason>) describing the first check that failed.
    """
    # A non-dict payload (e.g. a JSON list or string) is invalid, not a crash
    if not isinstance(trade, dict):
        return False, "Trade record must be a dictionary"

    required_keys = {'e', 'E', 's', 't', 'p', 'q', 'b', 'a', 'T', 'm', 'M'}

    # Check for missing keys
    if not required_keys.issubset(trade.keys()):
        return False, "Missing required fields"

    # Check event type
    if trade['e'] != 'trade':
        return False, "Invalid event type, must be 'trade'"

    # Validate timestamps and IDs. bool is a subclass of int in Python, so it
    # is excluded explicitly; otherwise True would pass as a positive integer.
    for key in ['E', 't', 'b', 'a', 'T']:
        value = trade[key]
        if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
            return False, f"Field {key} must be a positive integer"

    # Validate price and quantity to be positive, finite numbers
    for key in ['p', 'q']:
        value = trade[key]
        if isinstance(value, bool):
            return False, f"Field {key} must be a numeric value"
        try:
            val = float(value)
        except (TypeError, ValueError):
            # TypeError covers None and other non-numeric, non-string types
            return False, f"Field {key} must be a numeric value"
        # The chained comparison is False for zero, negatives, NaN and +inf
        if not 0 < val < float('inf'):
            return False, f"Field {key} must be a positive number"

    # Check boolean fields
    if not isinstance(trade['m'], bool) or not isinstance(trade['M'], bool):
        return False, "Fields 'm' and 'M' must be boolean values"

    return True, "Valid trade"

# Example: run the validator on one well-formed trade record
trade = {
    'e': 'trade', 'E': 1713991342890, 's': 'BTCUSDT', 't': 97236,
    'p': '0.006390', 'q': '2551', 'b': 69, 'a': 607,
    'T': 1713991342890, 'm': True, 'M': True,
}

# Report the outcome of the validation
valid, message = validate_trade(trade)
print("Trade is valid. Insert into database." if valid else f"Validation failed: {message}")


Trade is valid. Insert into database.


In [27]:
# Configuration for the consumer
consumer_config = {
    'bootstrap.servers': 'broker:9092',
    'group.id': 'group_db',
    'auto.offset.reset': 'earliest'
}

# Initialize the consumer
consumer = Consumer(consumer_config)
topic_name = 'trades'

conn = None
try:
    # Subscribing inside the try block guarantees that the consumer is closed
    # even when subscribe() or the database connection fails
    consumer.subscribe([topic_name])
    conn = psycopg2.connect(**conn_params)

    # Monotonic clock: the 5-second window is not affected by wall-clock changes
    start_time = time.monotonic()

    with conn.cursor() as cursor:
        while time.monotonic() - start_time <= 5:  # Stop after 5 seconds
            msg = consumer.poll(timeout=1.0)  # Poll for a message (timeout in seconds)

            if msg is None:
                continue  # No message available within the timeout period

            if msg.error():
                if msg.error().code() == KafkaError._PARTITION_EOF:
                    # End of partition event
                    print(f'Reached end of {msg.partition()} at offset {msg.offset()}')
                else:
                    print(f'Error: {msg.error()}')
                continue

            # Decode the payload; one malformed or empty message is skipped
            # instead of aborting the whole loop
            raw_value = msg.value()
            try:
                if raw_value is None:
                    raise ValueError("message has no value")
                message_data = json.loads(raw_value.decode('utf-8'))
            except ValueError as e:
                # UnicodeDecodeError and json.JSONDecodeError are both ValueErrors
                print(f'Skipping undecodable message at offset {msg.offset()}: {e}')
                continue
            print(f'Received message: {message_data} from partition {msg.partition()}')

            # Validate the trade record; anything that is not a JSON object
            # is rejected up front
            if isinstance(message_data, dict):
                valid, message = validate_trade(message_data)
            else:
                valid, message = False, "Payload is not a JSON object"

            # Insert the record into the database if it is valid
            if valid:
                print("Trade is valid. Inserting into database.")
                cursor.execute(insert_query, message_data)
                print('Record inserted')
                # Commit per record so each accepted trade is durable immediately
                conn.commit()
            else:
                print(f"Validation failed: {message}")
finally:
    try:
        consumer.close()
    finally:
        # psycopg2's `with connection` would not close the connection, so it
        # is closed explicitly; an uncommitted insert is discarded
        if conn is not None:
            conn.close()

Received message: {'e': 'trade', 'E': 1713991378161, 's': 'BNBUSDT', 't': 77422, 'p': '0.008396', 'q': '2831', 'b': 307, 'a': 539, 'T': 1713991378161, 'm': False, 'M': True} from partition 2
Trade is valid. Inserting into database.
Record inserted
Received message: {'e': 'trade', 'E': 1713991429451, 's': 'BNBBTC', 't': 43014, 'p': '0.004122', 'q': '3493', 'b': 123, 'a': 664, 'T': 1713991429451, 'm': True, 'M': True} from partition 1
Trade is valid. Inserting into database.
Record inserted
Received message: {'e': 'trade', 'E': 1713991342890, 's': 'BTCUSDT', 't': 97236, 'p': '0.006390', 'q': '2551', 'b': 69, 'a': 607, 'T': 1713991342890, 'm': True, 'M': True} from partition 0
Trade is valid. Inserting into database.
Record inserted


## Explore data from PostgreSQL database

In this section, we explore the data that our consumer ingested into the `binance` table.

In [28]:
# Load the `sql` IPython extension, which provides the %sql / %%sql magics used below
%load_ext sql
# Connect to the Postgres server; the URL names no database
# NOTE(review): the credentials are embedded in the URL; fine for a local tutorial stack, avoid elsewhere
%sql postgresql://postgres:postgres@pgdatabase

In [29]:
%%sql
-- Show every database on the Postgres server;
-- the `trades` database created earlier should be among them
SELECT datname
FROM pg_database;

 * postgresql://postgres:***@pgdatabase
4 rows affected.


datname
postgres
trades
template1
template0


In [30]:
# Open a connection to the `trades` database; the %%sql cells below run against it
%sql postgresql://postgres:postgres@pgdatabase/trades

In [31]:
%%sql
-- Show the ordinary (base) tables of the trades database, leaving out the
-- system schemas; the `binance` table should be listed
SELECT table_schema, table_name
FROM information_schema.tables
WHERE table_schema NOT IN ('pg_catalog', 'information_schema', 'priv')
  AND table_type = 'BASE TABLE';

   postgresql://postgres:***@pgdatabase
 * postgresql://postgres:***@pgdatabase/trades
1 rows affected.


table_schema,table_name
public,binance


In [32]:
%%sql
-- Show the column names and data types of the binance table
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_name = 'binance';

   postgresql://postgres:***@pgdatabase
 * postgresql://postgres:***@pgdatabase/trades
11 rows affected.


column_name,data_type
ignore_in_price,boolean
event_time,bigint
trade_time,bigint
is_buyer_maker,boolean
trade_id,bigint
price,numeric
quantity,numeric
buyer_order_id,bigint
seller_order_id,bigint
symbol,text


In [33]:
%%sql
-- Number of records currently stored in the binance table
SELECT count(*)
FROM binance;

   postgresql://postgres:***@pgdatabase
 * postgresql://postgres:***@pgdatabase/trades
1 rows affected.


count
4


In [34]:
%%sql
-- Show every record of the binance table, columns in table-definition order
SELECT
    event_type, event_time, symbol, trade_id,
    price, quantity,
    buyer_order_id, seller_order_id,
    trade_time, is_buyer_maker, ignore_in_price
FROM binance;

   postgresql://postgres:***@pgdatabase
 * postgresql://postgres:***@pgdatabase/trades
4 rows affected.


event_type,event_time,symbol,trade_id,price,quantity,buyer_order_id,seller_order_id,trade_time,is_buyer_maker,ignore_in_price
trade,1714015066671,BNBBTC,74686,0.006215,2112,88,544,1714015066671,False,True
trade,1713991378161,BNBUSDT,77422,0.008396,2831,307,539,1713991378161,False,True
trade,1713991429451,BNBBTC,43014,0.004122,3493,123,664,1713991429451,True,True
trade,1713991342890,BTCUSDT,97236,0.00639,2551,69,607,1713991342890,True,True
