In [1]:
import json
import os
import sys

from pyspark import SparkContext, SparkConf
from pyspark.sql import Row
from pyspark.streaming.kafka import KafkaUtils

from common.job import Job

In [2]:
topic = 'encounter-obs-orders'  # key into the 'kafka' section of the job config
kafka_config = Job.getConfig()['kafka'][topic]
ssc = Job.getStreamingContext()


def _after_rows(stream, envelope_name):
    """Return a DStream of Rows built from the 'after' image of one table's events.

    stream        -- DStream of decoded change-event dicts; each is expected to
                     carry msg['schema']['name'] and msg['payload']['after'].
    envelope_name -- value of msg['schema']['name'] that selects the source table.

    Events whose payload['after'] is None are skipped: Row(**None) would raise
    TypeError and fail the whole micro-batch.
    NOTE(review): a null 'after' presumably means a delete event -- confirm that
    ignoring those is acceptable for the downstream job.
    """
    return stream \
        .filter(lambda msg: msg['schema']['name'] == envelope_name) \
        .map(lambda msg: msg['payload']['after']) \
        .filter(lambda after: after is not None) \
        .map(lambda after: Row(**after))


# msg[1] is the value half of each record delivered by the direct stream.
# Records with a None value are dropped before decoding because
# json.loads(None) raises TypeError.
kafka_stream = KafkaUtils \
    .createDirectStream(ssc, topics=kafka_config['topics'], kafkaParams=kafka_config['config']) \
    .filter(lambda msg: msg[1] is not None) \
    .map(lambda msg: json.loads(msg[1]))

# One Row stream per source table, selected by the event's schema name.
obs_stream = _after_rows(kafka_stream, 'dbserver1.openmrs.obs.Envelope')
orders_stream = _after_rows(kafka_stream, 'dbserver1.openmrs.orders.Envelope')

In [None]:
from streaming.encounter_job import EncounterJob
from pyspark.sql import SparkSession, Row, SQLContext, Window
job = EncounterJob()

# Split observations by whether they are attached to an encounter; the two
# groups take different paths below.
obs_with_enc_stream = obs_stream.filter(lambda row: row['encounter_id'] is not None)
obs_with_null_enc_stream = obs_stream.filter(lambda row: row['encounter_id'] is None)

# Key orders and encounter-bound obs as (encounter_id, person_id) tuples.
# Fresh names are used instead of rebinding orders_stream / obs_with_enc_stream,
# so re-evaluating these lines never feeds already-mapped tuples back in.
# NOTE(review): orders whose encounter_id is None all share the key None and are
# collapsed into a single entry by reduceByKey below -- confirm that orders
# always carry an encounter.
orders_by_enc = orders_stream.map(lambda row: (row['encounter_id'], row['patient_id']))
obs_by_enc = obs_with_enc_stream.map(lambda row: (row['encounter_id'], row['person_id']))

# Keep one (encounter_id, person_id) entry per encounter id in the batch.
# Which duplicate survives is arbitrary; presumably every record of an
# encounter carries the same person -- verify.
distinct_encounters = obs_by_enc.union(orders_by_enc) \
    .reduceByKey(lambda kept, _duplicate: kept)

# Flip to (person_id, encounter_id) so the records can be grouped per patient.
person_encounters = distinct_encounters.map(lambda pair: (pair[1], pair[0]))

# Obs without an encounter still contribute their patient: (person_id, None).
obs_with_null_enc = obs_with_null_enc_stream.map(lambda row: (row['person_id'], None))

# One Row per patient: person_id plus the list of that patient's encounter ids.
# The None placeholders are removed with an explicit identity check rather than
# filter(None.__ne__, ...), which relies on NotImplemented being truthy
# (deprecated since Python 3.9).
enc_obs_orders = person_encounters.union(obs_with_null_enc) \
    .groupByKey() \
    .map(lambda kv: Row(person_id=kv[0],
                        encounters=[enc_id for enc_id in kv[1] if enc_id is not None]))

# Hand every micro-batch RDD to the job.
enc_obs_orders.foreachRDD(lambda rdd: job.run_microbatch(rdd))

# Start consuming and block until the streaming context terminates.
ssc.start()
ssc.awaitTermination()

---Micro-Batch--- 
Starting calculations for flat_enc_obs_orders Mon Mar 16 21:07:48 2020
Took 2.084018 seconds
