## Kafka Ingestion

### Required Imports + Subscribing to topic

In [11]:
from confluent_kafka import Consumer
import uuid
import os
from dotenv import load_dotenv
import snowflake.connector
import os
load_dotenv()

# Kafka connection settings, read from the process environment after
# load_dotenv() has run.
KAFKA_SERVER = os.getenv('KAFKA_SERVER')
KAFKA_USERNAME = os.getenv('KAFKA_USERNAME')
KAFKA_PASSWORD = os.getenv('KAFKA_PASSWORD')
KAFKA_TOPIC_NAME = os.getenv('KAFKA_TOPIC_NAME')

# Fail fast with a readable message instead of handing None to the consumer
# configuration or to subscribe() when a setting is absent or empty.
_missing_kafka_settings = [
    name
    for name, value in (
        ('KAFKA_SERVER', KAFKA_SERVER),
        ('KAFKA_USERNAME', KAFKA_USERNAME),
        ('KAFKA_PASSWORD', KAFKA_PASSWORD),
        ('KAFKA_TOPIC_NAME', KAFKA_TOPIC_NAME),
    )
    if not value
]
if _missing_kafka_settings:
    raise RuntimeError(
        'Missing required environment variable(s): '
        + ', '.join(_missing_kafka_settings)
    )

# Not read anywhere in this cell; kept so the name stays defined for other cells.
values = []

# Consumer with `confluent_kafka`
c = Consumer({
    'bootstrap.servers': KAFKA_SERVER,
    # uuid1() makes the group id different on every execution of this cell.
    'group.id': 'deleton' + str(uuid.uuid1()),
    'security.protocol': 'SASL_SSL',
    'sasl.mechanisms': 'PLAIN',
    'sasl.username': KAFKA_USERNAME,
    'sasl.password': KAFKA_PASSWORD,
    'session.timeout.ms': 6000,
    'heartbeat.interval.ms': 1000,
    'fetch.wait.max.ms': 6000,
    'auto.offset.reset': 'latest',
    'enable.auto.commit': 'false',
    # 86,400,000 ms = 24 hours.
    'max.poll.interval.ms': '86400000',
    'topic.metadata.refresh.interval.ms': '-1',
    'client.id': 'id-002-005',
})

c.subscribe([KAFKA_TOPIC_NAME])
"""Kafka consumer ready and consumes messages"""

### Testing Kafka Messages

In [114]:
import json
import re
# Upper bound on poll attempts (not on messages received): every iteration
# counts, including polls that time out without a message.
MAX_POLLS = 500000
# Seconds each poll() call waits for a message before returning None.
POLL_TIMEOUT_SECONDS = 0.5

# Separator between the timestamp and the payload of an INFO log line;
# `{kind}` is filled with 'Ride' or 'Telemetry'.
INFO_MARKER_TEMPLATE = ' mendoza v9: [INFO]: {kind} - '
# An integer or decimal number. The earlier pattern r'\d+.?\d+' required at
# least two digits (so single-digit readings were dropped) and its unescaped
# '.' matched any character.
NUMBER_PATTERN = re.compile(r'\d+(?:\.\d+)?')


def _parse_info_log(log_text):
    """Split a Ride/Telemetry INFO log line into its parts.

    Args:
        log_text: The text stored under the 'log' key of a decoded message.

    Returns:
        A tuple ``(kind, timestamp, readings)`` where ``kind`` is 'Ride' or
        'Telemetry', ``timestamp`` is the text before the INFO marker and
        ``readings`` is the list of numeric substrings (as strings) found
        after it. Returns None when ``log_text`` is not a string or contains
        neither marker.
    """
    if not isinstance(log_text, str):
        return None
    for kind in ('Ride', 'Telemetry'):
        marker = INFO_MARKER_TEMPLATE.format(kind=kind)
        timestamp, found, payload = log_text.partition(marker)
        if found:
            return kind, timestamp, NUMBER_PATTERN.findall(payload)
    return None


for i in range(MAX_POLLS):
    kafka_message = c.poll(POLL_TIMEOUT_SECONDS)

    # poll() returns None when nothing arrived within the timeout.
    if kafka_message is None:
        continue

    # Error events carry no log payload; report them instead of decoding them.
    if kafka_message.error() is not None:
        print(f'Kafka error: {kafka_message.error()}')
        continue

    raw_value = kafka_message.value()
    if raw_value is None:
        continue
    log = raw_value.decode('utf-8')

    if 'INFO' in log:  # only INFO lines carry Ride/Telemetry readings
        log_record = json.loads(log)
        parsed = _parse_info_log(log_record.get('log'))
        if parsed is not None:
            log_kind, timestamp, duration_and_resistance = parsed
            # print(log_kind, timestamp, duration_and_resistance)

    if 'SYSTEM' in log:
        # NOTE(review): the SYSTEM payloads captured below this cell include
        # names, addresses and e-mail addresses; clear the output before sharing.
        print(log)

{"log": "2022-10-04 10:47:40.576790 mendoza v9: [SYSTEM] data = {\"user_id\":4526,\"name\":\"Charlene Brown\",\"gender\":\"female\",\"address\":\"Flat 98y,Carolyn haven,East Valeriemouth,S5T 0TT\",\"date_of_birth\":-842140800000,\"email_address\":\"charlene.b72@gmail.com\",\"height_cm\":166,\"weight_kg\":71,\"account_create_date\":1643760000000,\"bike_serial\":\"SN0000\",\"original_source\":\"google ads\"}\n"}
{"log": "2022-10-04 10:56:21.286263 mendoza v9: [SYSTEM] data = {\"user_id\":4526,\"name\":\"Charlene Brown\",\"gender\":\"female\",\"address\":\"Flat 98y,Carolyn haven,East Valeriemouth,S5T 0TT\",\"date_of_birth\":-842140800000,\"email_address\":\"charlene.b72@gmail.com\",\"height_cm\":166,\"weight_kg\":71,\"account_create_date\":1643760000000,\"bike_serial\":\"SN0000\",\"original_source\":\"google ads\"}\n"}
{"log": "2022-10-04 11:05:04.002255 mendoza v9: [SYSTEM] data = {\"user_id\":4526,\"name\":\"Charlene Brown\",\"gender\":\"female\",\"address\":\"Flat 98y,Carolyn haven,Eas

%4|1664884561.019|SESSTMOUT|id-002-005#consumer-2| [thrd:main]: Consumer group session timed out (in join-state steady) after 6030 ms without a successful response from the group coordinator (broker 6, last error was Success): revoking assignment and rejoining group


{"log": "2022-10-04 11:56:20.246964 mendoza v9: [SYSTEM] data = {\"user_id\":4527,\"name\":\"Mr Stuart Austin\",\"gender\":\"male\",\"address\":\"Studio 73,Stuart turnpike,South Owenshire,SA4N 2PJ\",\"date_of_birth\":384912000000,\"email_address\":\"mr.s@gmail.com\",\"height_cm\":159,\"weight_kg\":48,\"account_create_date\":1645660800000,\"bike_serial\":\"SN0000\",\"original_source\":\"direct\"}\n"}


KeyboardInterrupt: 

### TODO: 

* Read Kafka data into a DataFrame (pandas or PySpark?)
* Write data into Snowflake

In [13]:
# Snowflake connection settings, read from the process environment; any
# variable that is not set yields None.
# NOTE(review): `USER` is often already defined by the operating system, and
# load_dotenv() by default keeps existing variables - confirm the intended
# Snowflake user is what ends up here.
# NOTE(review): SCHEMA is read here, but the connection cell passes a literal
# schema name instead.
USER, ACCOUNT, PASSWORD = (
    os.getenv(setting) for setting in ('USER', 'ACCOUNT', 'PASSWORD')
)
WAREHOUSE, DATABASE, SCHEMA = (
    os.getenv(setting) for setting in ('WAREHOUSE', 'DATABASE', 'SCHEMA')
)

In [16]:
# Open a Snowflake session from the settings loaded above and keep a cursor
# (`cs`) for running statements. The schema is given as a literal rather
# than taken from the SCHEMA variable.
snowflake_settings = {
    'user': USER,
    'password': PASSWORD,
    'account': ACCOUNT,
    'warehouse': WAREHOUSE,
    'database': DATABASE,
    'schema': 'ZOOKEEPERS_BATCH_PRODUCTION',
}
conn = snowflake.connector.connect(**snowflake_settings)
cs = conn.cursor()
"""Already created schema to use"""

In [113]:
# Run SHOW TABLES on the open cursor and display at most TABLE_PREVIEW_LIMIT
# rows of the result as the cell output.
TABLE_PREVIEW_LIMIT = 100
query = "SHOW TABLES"
result = cs.execute(query)
result.fetchmany(TABLE_PREVIEW_LIMIT)

[]