In [1]:
import os
from dotenv import load_dotenv

import requests
import json
from kafka import KafkaProducer, KafkaConsumer

# Definitions

In [2]:
# Load variables from a local .env file (python-dotenv) so the API keys
# never have to be written into the notebook itself.
load_dotenv()

OPEN_WEATHER_KEY = os.getenv('OPEN_WEATHER_KEY')
FREE_WEATHER_KEY = os.getenv('FREE_WEATHER_KEY')

# os.getenv returns None for an unset variable, and the f-strings below would
# then embed the literal text "None" in the URL. Surface that immediately.
for _key_name, _key_value in (('OPEN_WEATHER_KEY', OPEN_WEATHER_KEY),
                              ('FREE_WEATHER_KEY', FREE_WEATHER_KEY)):
    if not _key_value:
        print(f"Warning: environment variable {_key_name} is not set")

# Coordinates of the observed location, shared by all three providers.
LATITUDE = -7.95
LONGITUDE = 112.61

# Variables requested in the `current` parameter of the Open-Meteo query.
OPEN_METEO_CURRENT_FIELDS = (
    'temperature_2m',
    'apparent_temperature',
    'relative_humidity_2m',
    'rain',
    'precipitation',
    'weather_code',
    'cloud_cover',
    'showers',
    'wind_speed_10m',
    'wind_direction_10m',
    'pressure_msl',
    'surface_pressure',
    'wind_gusts_10m',
)

OPEN_WEATHER_URL = (
    "https://api.openweathermap.org/data/2.5/weather"
    f"?lat={LATITUDE}&lon={LONGITUDE}&appid={OPEN_WEATHER_KEY}"
)
OPEN_METEO_URL = (
    "https://api.open-meteo.com/v1/forecast"
    f"?latitude={LATITUDE}&longitude={LONGITUDE}"
    f"&current={','.join(OPEN_METEO_CURRENT_FIELDS)}"
    "&timezone=Asia%2FBangkok&forecast_days=1"
)
FREE_WEATHER_URL = (
    "https://api.weatherapi.com/v1/current.json"
    f"?key={FREE_WEATHER_KEY}&q={LATITUDE},{LONGITUDE}&aqi=no"
)

# Helpers

In [3]:
def get_open_weather(timeout=10):
    '''
        GET OPEN_WEATHER_URL and return the decoded JSON body.

        Ideally GET every 15 minutes.

        Args:
            timeout: Seconds to wait on the server before giving up.

        Returns:
            The parsed JSON payload, or None when the request raises, the
            response status is not 200, or the body cannot be parsed.
    '''
    try:
        res = requests.get(OPEN_WEATHER_URL, timeout=timeout)
        if res.status_code != 200:
            # No payload to hand back: report the status and return None
            # explicitly so the caller never sees an unbound result.
            print(res.status_code)
            return None
        return res.json()
    except Exception as e:
        # Best-effort fetch: network and decoding errors are reported,
        # not propagated.
        print(e)
        return None

def get_open_meteo(timeout=10):
    '''
        GET OPEN_METEO_URL and return the decoded JSON body.

        Ideally GET at every chance (but new data is pushed every 15 minutes).
        Based on the API limit, you can send up to 3 requests per minute.

        Args:
            timeout: Seconds to wait on the server before giving up.

        Returns:
            The parsed JSON payload, or None when the request raises, the
            response status is not 200, or the body cannot be parsed.
    '''
    try:
        res = requests.get(OPEN_METEO_URL, timeout=timeout)
        if res.status_code != 200:
            # No payload to hand back: report the status and return None
            # explicitly so the caller never sees an unbound result.
            print(res.status_code)
            return None
        return res.json()
    except Exception as e:
        # Best-effort fetch: network and decoding errors are reported,
        # not propagated.
        print(e)
        return None

def get_free_weather(timeout=10):
    '''
        GET FREE_WEATHER_URL and return the decoded JSON body.

        Ideally GET at every chance (but new data is pushed every 15 minutes).
        Based on the API limit, you can send up to 22 requests per minute.

        Args:
            timeout: Seconds to wait on the server before giving up.

        Returns:
            The parsed JSON payload, or None when the request raises, the
            response status is not 200, or the body cannot be parsed.
    '''
    try:
        res = requests.get(FREE_WEATHER_URL, timeout=timeout)
        if res.status_code != 200:
            # No payload to hand back: report the status and return None
            # explicitly so the caller never sees an unbound result.
            print(res.status_code)
            return None
        return res.json()
    except Exception as e:
        # Best-effort fetch: network and decoding errors are reported,
        # not propagated.
        print(e)
        return None

# Kafka Producer

Topics:
- hourly
- real-time

In [4]:
# Kafka topic names used when publishing.
HOURLY_TOPIC = "hourly"        # topic the hourly publish cell sends to
REAL_TIME_TOPIC = "real-time"  # second topic listed for this pipeline

In [12]:
def _to_json_bytes(value):
    '''Serialize a message value to UTF-8 encoded JSON bytes.'''
    return json.dumps(value).encode('utf-8')


# Producer connected to the broker at localhost:29092; every value passed
# to send() goes through _to_json_bytes first.
producer = KafkaProducer(
    bootstrap_servers='localhost:29092',
    value_serializer=_to_json_bytes,
)

# Get data then publish

In [22]:
ow_data = get_open_weather()

if ow_data is None:
    # The fetch helper returns None on failure; sending it would publish a
    # JSON `null` record to the topic, so skip the send instead.
    future = None
    print('OpenWeather fetch returned no data; nothing was published')
else:
    # send() is asynchronous; `future` resolves once the broker acknowledges.
    future = producer.send(HOURLY_TOPIC, ow_data)

# Debugging

In [8]:
# Smoke test: push a tiny fixed payload to the 'test' topic.
future = producer.send(
    topic='test',
    value={'hello': 'world'},
)

In [15]:
# Debug consumer for the 'free-weather' topic.
consumer = KafkaConsumer(
    'free-weather',
    bootstrap_servers=['localhost:29092'],
    auto_offset_reset='earliest',
    # Offsets are only stored if consumer.commit() is called explicitly.
    enable_auto_commit=False,
    client_id='test_fw_consumer',
    group_id='i_fw_you',
    # Decode as UTF-8 to mirror the producer's `.encode('utf-8')`; an ASCII
    # decode would raise UnicodeDecodeError on any non-ASCII payload.
    value_deserializer=lambda m: json.loads(m.decode('utf-8')),
    # remove this parameter for continuous listening
    consumer_timeout_ms=1000
)

In [17]:
# Iterate the consumer directly. A separate consumer.poll() call beforehand
# would hand back (and thereby consume) a batch of records that this loop
# would then never see.
for message in consumer:
    # message.value has already been passed through the consumer's
    # value_deserializer; message.key is left as delivered.
    print(
        f"{message.topic}:{message.partition}:{message.offset}: "
        f"key={message.key} value={message.value}"
    )
    # The consumer is created with enable_auto_commit=False, so call
    # consumer.commit() here if progress should be persisted.

In [1]:
# findspark.init() has to run before the pyspark imports below so that the
# local Spark installation is on sys.path.
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Must be set before the SparkContext is instantiated.
SparkContext.setSystemProperty('spark.executor.memory', '2g')

conf = SparkConf().setAppName('RDV1').setMaster('local')

# getOrCreate() reuses an already-running session/context, so re-executing
# this cell does not fail with "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

# Display the session summary as the cell output.
spark