## Kafka Ingestion

### Required imports and Kafka topic subscription

In [11]:
from confluent_kafka import Consumer
import uuid
import os
from dotenv import load_dotenv
import snowflake.connector
import os
# Load variables from a local .env file into the process environment so the
# os.getenv() lookups below can see them.
load_dotenv()

KAFKA_SERVER = os.getenv('KAFKA_SERVER')
KAFKA_USERNAME = os.getenv('KAFKA_USERNAME')
KAFKA_PASSWORD = os.getenv('KAFKA_PASSWORD')
KAFKA_TOPIC_NAME = os.getenv('KAFKA_TOPIC_NAME')

# Fail fast with a readable message: os.getenv() returns None for an unset
# variable, and that None would otherwise be handed to the consumer
# configuration and to subscribe() below.
_missing_kafka_settings = [
    name
    for name, value in (
        ('KAFKA_SERVER', KAFKA_SERVER),
        ('KAFKA_USERNAME', KAFKA_USERNAME),
        ('KAFKA_PASSWORD', KAFKA_PASSWORD),
        ('KAFKA_TOPIC_NAME', KAFKA_TOPIC_NAME),
    )
    if not value
]
if _missing_kafka_settings:
    raise RuntimeError(
        'Missing required environment variable(s): '
        + ', '.join(_missing_kafka_settings)
    )

# Placeholder list; nothing in this cell appends to it.
values = []

# Consumer with `confluent_kafka`
c = Consumer({
    'bootstrap.servers': KAFKA_SERVER,
    # A fresh UUID suffix on every run means every run joins a brand-new
    # consumer group.
    'group.id': 'deleton' + str(uuid.uuid1()),
    'security.protocol': 'SASL_SSL',
    'sasl.mechanisms': 'PLAIN',
    'sasl.username': KAFKA_USERNAME,
    'sasl.password': KAFKA_PASSWORD,
    'session.timeout.ms': 6000,
    'heartbeat.interval.ms': 1000,
    'fetch.wait.max.ms': 6000,
    'auto.offset.reset': 'latest',
    'enable.auto.commit': 'false',
    # 86,400,000 ms = 24 hours between polls before the client gives up.
    'max.poll.interval.ms': '86400000',
    'topic.metadata.refresh.interval.ms': "-1",
    "client.id": 'id-002-005',
})

c.subscribe([KAFKA_TOPIC_NAME])
# Bare string left as the cell's last expression so the notebook displays it
# as a status message once the subscription call has returned.
"""Kafka consumer ready and consumes messages"""

### Testing Kafka Messages

In [111]:
import json
import re
MAX_POLLS = 50        # poll attempts, not messages: empty polls count too
POLL_TIMEOUT_S = 0.5  # seconds each poll() call may wait for a message
LOG_PREFIX = ' mendoza v9: [INFO]: '
# Integers or decimals. The dot is escaped and the fraction is optional, so a
# single-digit reading is kept and two numbers separated by one character are
# not fused into one match (both could happen with r'\d+.?\d+').
NUMBER_PATTERN = re.compile(r'\d+(?:\.\d+)?')


def parse_log_line(log_line):
    """Parse one ``Ride`` or ``Telemetry`` log line.

    Args:
        log_line: Text shaped like
            ``<timestamp> mendoza v9: [INFO]: <kind> - <payload>``.

    Returns:
        ``(kind, timestamp, numbers)`` where ``kind`` is ``'Ride'`` or
        ``'Telemetry'``, ``timestamp`` is the text before the marker and
        ``numbers`` is the list of numeric strings found in the payload.
        ``None`` when the line carries neither marker.
    """
    for kind in ('Ride', 'Telemetry'):
        timestamp, marker, payload = log_line.partition(f'{LOG_PREFIX}{kind} - ')
        if marker:
            return kind, timestamp, NUMBER_PATTERN.findall(payload)
    return None


for _ in range(MAX_POLLS):
    kafka_message = c.poll(POLL_TIMEOUT_S)

    if kafka_message is None:  # nothing arrived within the timeout
        continue
    if kafka_message.error():
        # The object reports a client/broker error instead of a log payload,
        # so there is nothing to decode.
        print(f'Kafka error: {kafka_message.error()}')
        continue

    raw_value = kafka_message.value()
    if raw_value is None:
        continue
    log = raw_value.decode('utf-8')

    if 'INFO' not in log:  # only INFO lines are of interest
        continue

    record = json.loads(log)
    new_log = record.get('log') if isinstance(record, dict) else None
    if not isinstance(new_log, str):
        # No usable 'log' field; a substring test on None would raise.
        continue

    parsed = parse_log_line(new_log)
    if parsed is None:
        continue

    kind, timestamp, readings = parsed
    # Ride lines are parsed but not shown; only telemetry is printed.
    # NOTE(review): the meaning of each number is not visible here - confirm
    # the field order against the producer before naming columns.
    if kind == 'Telemetry':
        print(timestamp, readings)

2022-10-04 10:41:31.070420 ['132', '51', '14.31786487']
2022-10-04 10:41:32.071779 ['132', '52', '14.75785543']
2022-10-04 10:41:33.073167 ['131', '51', '14.31786487']
2022-10-04 10:41:34.074526 ['131', '53', '15.20140731']
2022-10-04 10:41:35.075889 ['131', '56', '16.55272673']
2022-10-04 10:41:36.077258 ['133', '55', '16.09890976']
2022-10-04 10:41:37.078631 ['134', '55', '16.09890976']
2022-10-04 10:41:38.080004 ['134', '54', '15.64844827']
2022-10-04 10:41:39.081435 ['134', '48', '13.02004103']
2022-10-04 10:41:40.082800 ['133', '53', '15.20140731']
2022-10-04 10:41:41.084143 ['133', '51', '14.31786487']
2022-10-04 10:41:42.085407 ['133', '48', '13.02004103']
2022-10-04 10:41:43.086746 ['133', '51', '14.31786487']
2022-10-04 10:41:44.088108 ['133', '53', '15.20140731']
2022-10-04 10:41:45.089558 ['133', '53', '15.20140731']
2022-10-04 10:41:46.090907 ['132', '51', '14.31786487']
2022-10-04 10:41:47.092258 ['132', '49', '13.448875699999999']
2022-10-04 10:41:48.093610 ['131', '51', 

### TODO: 

* Read the Kafka data into a DataFrame (pandas or PySpark?)
* Write the data into Snowflake

In [13]:
def snowflake_setting(name):
    """Return the Snowflake connection setting ``name`` from the environment.

    ``SNOWFLAKE_<name>`` is consulted first; the bare ``<name>`` is the
    fallback, so existing configuration keeps working unchanged.

    Generic names such as ``USER`` are commonly already defined by the
    operating system or shell, and ``load_dotenv()`` leaves existing
    variables untouched unless ``override=True`` is passed, so a ``USER``
    entry in ``.env`` may be silently ignored. The prefixed name avoids
    that collision.

    Args:
        name: Unprefixed variable name, e.g. ``'USER'``.

    Returns:
        The configured value, or ``None`` when neither variable is set.
    """
    return os.environ.get(f'SNOWFLAKE_{name}') or os.environ.get(name)


USER = snowflake_setting('USER')
ACCOUNT = snowflake_setting('ACCOUNT')
PASSWORD = snowflake_setting('PASSWORD')
WAREHOUSE = snowflake_setting('WAREHOUSE')
DATABASE = snowflake_setting('DATABASE')
SCHEMA = snowflake_setting('SCHEMA')

In [16]:
# Open a Snowflake session from the settings read from the environment and
# keep a cursor around for the query cells.
# NOTE(review): the schema is a fixed literal here, so the SCHEMA value read
# from the environment is not used by this connection - confirm that is
# intended.
connection_settings = {
    'user': USER,
    'password': PASSWORD,
    'account': ACCOUNT,
    'warehouse': WAREHOUSE,
    'database': DATABASE,
    'schema': 'ZOOKEEPERS_BATCH_PRODUCTION',
}
conn = snowflake.connector.connect(**connection_settings)
cs = conn.cursor()
# Bare string kept as the cell's final expression (shown as its output).
"""Already created schema to use"""

In [18]:
# List the tables visible to the current session; the cell displays at most
# the first 100 rows returned.
query = "SHOW TABLES"
result = cs.execute(query)
tables_preview = result.fetchmany(100)
tables_preview

[]