In [45]:
from confluent_kafka import Consumer
from datetime import datetime
import uuid
import os
from dotenv import load_dotenv
import snowflake.connector
import os
import json
import re
from statistics import mean
# Load variables from a .env file (if one is present) into the environment.
load_dotenv()

# Kafka connection settings; each is None when the variable is not set.
KAFKA_SERVER = os.getenv('KAFKA_SERVER')
KAFKA_USERNAME = os.getenv('KAFKA_USERNAME')
KAFKA_PASSWORD = os.getenv('KAFKA_PASSWORD')
KAFKA_TOPIC_NAME = os.getenv('KAFKA_TOPIC_NAME')

# Snowflake connection settings; each is None when the variable is not set.
# NOTE(review): USER is often already defined by the operating system shell,
# which may take precedence over the value in .env - confirm the intended
# Snowflake user is actually picked up.
USER = os.getenv('USER')
ACCOUNT = os.getenv('ACCOUNT')
PASSWORD = os.getenv('PASSWORD')
WAREHOUSE = os.getenv('WAREHOUSE')
DATABASE = os.getenv('DATABASE')
# NOTE(review): SCHEMA is read here but not used by the visible connection code.
SCHEMA = os.getenv('SCHEMA')



In [46]:
def subscribe_to_kafka_topic():
    """Create a Kafka consumer and subscribe it to KAFKA_TOPIC_NAME.

    Every call uses a new consumer group id ('deleton' followed by a uuid1),
    with 'auto.offset.reset' set to 'latest' and auto-commit disabled.

    Returns:
        The subscribed Consumer instance.
    """
    group_id = 'deleton' + str(uuid.uuid1())

    settings = {
        'bootstrap.servers': KAFKA_SERVER,
        'group.id': group_id,
        # Authentication against the broker.
        'security.protocol': 'SASL_SSL',
        'sasl.mechanisms': 'PLAIN',
        'sasl.username': KAFKA_USERNAME,
        'sasl.password': KAFKA_PASSWORD,
        # Timing and offset behaviour.
        'session.timeout.ms': 6000,
        'heartbeat.interval.ms': 1000,
        'fetch.wait.max.ms': 6000,
        'auto.offset.reset': 'latest',
        'enable.auto.commit': 'false',
        'max.poll.interval.ms': '86400000',
        'topic.metadata.refresh.interval.ms': "-1",
        "client.id": 'id-002-005',
    }

    consumer = Consumer(settings)
    consumer.subscribe([KAFKA_TOPIC_NAME])
    return consumer


In [47]:
def extract_values_from_log(string):
    """Extract the numerical values from a Kafka log, in order of appearance.

    Unsigned integers ("30") and decimals ("1.25") are recognised. A leading
    minus sign is not treated as part of the number.

    Args:
        string: The text to scan.

    Returns:
        A list of the matched numbers as strings (empty if none are found).
    """
    # The decimal point must be escaped: an unescaped '.' matches ANY
    # character, which glued separate values such as "1-2" or "12 34" into a
    # single match.
    regexp = r'\d+(?:\.\d+)?'
    return re.findall(regexp, string)


In [48]:
def wait_for_system_log(c):
    """Block until a message whose text contains 'SYSTEM' arrives, then return it.

    Used at start-up to skip the ride already in progress, as that rider's
    user data is not retrievable.

    Args:
        c: Consumer to poll. Messages must provide value() and error().

    Returns:
        The first polled message whose UTF-8 decoded payload contains 'SYSTEM'.
    """
    while True:
        kafka_message = c.poll(0.5)

        # poll() yields None when nothing arrived within the timeout; a
        # message with error() set is an error event rather than a log.
        if kafka_message is None or kafka_message.error():
            continue

        payload = kafka_message.value()
        if payload is None:  # nothing to decode
            continue

        if 'SYSTEM' in payload.decode('utf-8'):
            return kafka_message


In [49]:
def convert_unix_to_date(unix_timestamp):
    """Convert a unix timestamp in milliseconds to a 'YYYY-MM-DD' UTC date string.

    Args:
        unix_timestamp: Milliseconds since 1970-01-01 00:00:00 UTC. May be
            negative for dates before 1970.

    Returns:
        The UTC calendar date formatted as '%Y-%m-%d'.
    """
    from datetime import timedelta

    # Epoch arithmetic replaces datetime.utcfromtimestamp, which is deprecated
    # and can fail for negative timestamps on some platforms.
    epoch = datetime(1970, 1, 1)
    converted_to_date = (epoch + timedelta(milliseconds=unix_timestamp)).strftime('%Y-%m-%d')

    return converted_to_date


In [50]:
def split_full_name(name):
    """Split a full name into (first_name, last_name), dropping a leading title.

    A leading 'Mr', 'Mrs', 'Miss', 'Ms' or 'Dr' (with or without a trailing
    period) is excluded. The first remaining word is the first name and the
    second remaining word is the last name; any further words are ignored.

    Args:
        name: The full name as a single string.

    Returns:
        A (first_name, last_name) tuple. last_name is None when only one name
        was given; both are None when no name remains after the title.
    """
    titles = ['Mr', 'Mrs', 'Miss', 'Ms', 'Dr']

    # split() with no argument collapses repeated whitespace, so a doubled
    # space does not produce an empty name part.
    name_parts = name.split()

    if name_parts and name_parts[0].rstrip('.') in titles:  # exclude user titles from snowflake
        name_parts = name_parts[1:]

    first_name = name_parts[0] if name_parts else None
    last_name = name_parts[1] if len(name_parts) > 1 else None

    return first_name, last_name


In [51]:
def flatten_list(address_list):
    """Return a copy of address_list with each direct sub-list expanded in place.

    Only one level is expanded, and only elements whose type is exactly list;
    every other element is copied across unchanged.
    """
    return [
        item
        for entry in address_list
        for item in (entry if type(entry) is list else (entry,))
    ]


def split_address(address):
    """Split an address string into (house_number, street_name, region, postcode).

    The address is split on commas and surrounding whitespace is removed from
    every part. If that gives fewer than four parts, the first part is split
    once more at its first whitespace, so '12 high street' becomes the house
    number '12' and the street 'high street'.

    Args:
        address: The address as a single comma-separated string.

    Returns:
        A (house_number, street_name, region, postcode) tuple, with street
        name and region title-cased. Parts beyond the fourth are ignored.

    Raises:
        IndexError: If fewer than four parts can be obtained.
    """
    # Strip each part so "a, b" does not store " b" with a leading space.
    address_list = [part.strip() for part in address.split(',')]

    if len(address_list) < 4:
        # House number and street share the first part; separate them.
        address_list = address_list[0].split(None, 1) + address_list[1:]

    house_number = address_list[0]
    street_name = address_list[1].title()
    region = address_list[2].title()
    postcode = address_list[3]

    return house_number, street_name, region, postcode


In [52]:
def clean_user_data(user_dictionary):
    """Normalise a raw user record in place and return it.

    The two timestamp fields are converted to date strings, 'name' is split
    into 'first_name' and 'last_name', 'address' is split into 'house_number',
    'street_name', 'region' and 'postcode', and 'gender' is title-cased.

    Args:
        user_dictionary: Dict with at least the keys 'account_create_date',
            'date_of_birth', 'name', 'address' and 'gender'.

    Returns:
        The same dictionary object, updated.
    """
    for date_key in ('account_create_date', 'date_of_birth'):
        user_dictionary[date_key] = convert_unix_to_date(user_dictionary[date_key])

    name_parts = split_full_name(user_dictionary['name'])
    user_dictionary['first_name'], user_dictionary['last_name'] = name_parts

    (
        user_dictionary['house_number'],
        user_dictionary['street_name'],
        user_dictionary['region'],
        user_dictionary['postcode'],
    ) = split_address(user_dictionary['address'])

    user_dictionary['gender'] = user_dictionary['gender'].title()

    return user_dictionary


In [53]:
"""connect to snowflake to make queries"""
# Open the Snowflake connection and the cursor used to run the INSERT queries.
# NOTE(review): the schema is hard-coded here even though a SCHEMA setting is
# read from the environment - confirm which one is intended.
conn = snowflake.connector.connect(**{
    'user': USER,
    'password': PASSWORD,
    'account': ACCOUNT,
    'warehouse': WAREHOUSE,
    'database': DATABASE,
    'schema': 'ZOOKEEPERS_BATCH_PRODUCTION',
})
cs = conn.cursor()

'connect to snowflake to make queries'

In [55]:
# Wait for the ride in progress at start-up to finish: its [SYSTEM] log, which
# carries the user data, was sent before this consumer subscribed.
wait_for_first_user = True
# Set once a [SYSTEM] log has been parsed, so data is not sent to Snowflake
# before any user data exists.
first_user_collected = False

# Per-ride accumulators. They are emptied after every upload so that one
# ride's readings never leak into the next ride's totals and averages.
resistance_list = []
power_list = []
heart_rate_list = []
rpm_list = []
duration = None  # most recent duration from a Ride log; None until one is seen

c = subscribe_to_kafka_topic()

try:
    while True:

        if wait_for_first_user:
            # Returns the first [SYSTEM] message; it is processed below.
            kafka_message = wait_for_system_log(c)
            wait_for_first_user = False
        else:
            kafka_message = c.poll(0.5)

        # Skip poll timeouts (None) and Kafka error events.
        if kafka_message is None or kafka_message.error():
            continue

        payload = kafka_message.value()
        if payload is None:  # nothing to decode
            continue

        log = payload.decode('utf-8')

        if 'SYSTEM' in log:
            first_user_collected = True

            system_log = json.loads(log).get('log')
            split_log = system_log.split(' mendoza v9: [SYSTEM] data = ')
            begin_timestamp = split_log[0][:-7]  # remove milliseconds from timestamp for snowflake compatibility

            # Drop the final character after the JSON text before parsing.
            dictionary_string = split_log[1][:-1]
            user_dictionary = json.loads(dictionary_string)

            user_dictionary = clean_user_data(user_dictionary)

        elif 'INFO' in log:  # only check for strings with INFO

            new_log = json.loads(log).get('log')

            if 'Ride' in new_log:  # process strings with Ride info
                new_log = new_log.split(' mendoza v9: [INFO]: Ride - ')

                log_values = extract_values_from_log(new_log[1])

                duration = int(float(log_values[0]))
                resistance_list.append(int(log_values[1]))

            elif 'Telemetry' in new_log:
                new_log = new_log.split(' mendoza v9: [INFO]: Telemetry - ')

                log_values = extract_values_from_log(new_log[1])

                heart_rate_list.append(int(log_values[0]))
                rpm_list.append(int(log_values[1]))
                power_list.append(round(float(log_values[2]), 3))

        elif 'new ride' in log and first_user_collected:  # New user is starting, so load collected data into snowflake
            # mean() raises on an empty list and duration is unset without a
            # Ride log, so only upload when every metric has a reading.
            has_ride_data = duration is not None and all(
                (power_list, rpm_list, heart_rate_list, resistance_list)
            )

            if has_ride_data:
                total_power = sum(power_list)
                mean_power = mean(power_list)
                mean_rpm = mean(rpm_list)
                mean_heart_rate = mean(heart_rate_list)
                mean_resistance = mean(resistance_list)

                cs.execute(
                    "INSERT INTO users(user_id, first_name, last_name, gender, date_of_birth, "
                    "height_cm, weight_kg, house_name, street, region, postcode, email, account_created) "
                    "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (
                        user_dictionary['user_id'], user_dictionary['first_name'],
                        user_dictionary['last_name'], user_dictionary['gender'],
                        user_dictionary['date_of_birth'], user_dictionary['height_cm'],
                        user_dictionary['weight_kg'], user_dictionary['house_number'],
                        user_dictionary['street_name'], user_dictionary['region'],
                        user_dictionary['postcode'], user_dictionary['email_address'],
                        user_dictionary['account_create_date'],
                    ),
                )
                print('made insert into users')

                cs.execute(
                    "INSERT INTO rides(user_id, begin_timestamp, total_duration_sec, total_power, mean_power, mean_resistance, mean_rpm, mean_heart_rate) "
                    "VALUES(%s,%s,%s,%s,%s,%s,%s,%s)",
                    (
                        user_dictionary['user_id'], begin_timestamp, duration,
                        total_power, mean_power, mean_resistance,
                        mean_rpm, mean_heart_rate,
                    ),
                )
                print('made insert into rides')

            # Start the next ride with empty accumulators.
            resistance_list.clear()
            power_list.clear()
            heart_rate_list.clear()
            rpm_list.clear()
            duration = None
finally:
    # Runs when the cell is interrupted or an error escapes the loop, so the
    # consumer is closed rather than left open.
    c.close()
            
            
                


made insert into users
made insert into rides
made insert into users
made insert into rides
made insert into users
made insert into rides
