# kafkaReceiveDataPy
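This notebook receives JSON messages from Kafka on the topic named by `TOPIC` (for example 'gps') and appends them to the Cassandra table of the same name in the `raw_instrument` keyspace. It appears to be adapted from an example that read the topic 'test' and stored it in the `sent_received` table of the `test_time` keyspace (created by cassandra_init.script in startup_script.sh); the schema of that original table is shown below for reference.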

```
CREATE KEYSPACE test_time WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1};

CREATE TABLE test_time.sent_received(
 time_sent TEXT,
 time_received TEXT,
PRIMARY KEY (time_sent)
);
```

A message that gives the current time is received every second. 

In [1]:
# Kafka topic to consume. The same value selects the instrument schema and
# names the Cassandra table the stream is written to. Only one topic is active
# per run; switch instruments by editing this value ('dmi', 'imu', 'lidar' or
# 'gps') rather than stacking assignments, where only the last one counts.
TOPIC = 'gps'

## Add dependencies

In [2]:
import os
import time

# Arguments PySpark passes to spark-submit. This is set before the
# SparkContext is created in the next cell:
#   * the Spark UI is pinned to port 4040
#   * --packages lists the Kafka streaming and Cassandra connector artifacts
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--conf spark.ui.port=4040 '
    '--packages '
    'org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.0,'
    'com.datastax.spark:spark-cassandra-connector_2.11:2.0.0-M3 '
    'pyspark-shell'
)

## Load modules and start SparkContext
Note that SparkContext must be started to effectively load the package dependencies. Two cores are used, since one is needed for running the Kafka receiver.

In [3]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row

# Local master with two threads; the Cassandra connector is pointed at a node
# on localhost.
conf = (
    SparkConf()
    .setAppName("Streaming test")
    .setMaster("local[2]")
    .set("spark.cassandra.connection.host", "127.0.0.1")
)

# getOrCreate keeps this cell re-runnable: a second SparkContext(conf=conf) in
# the same kernel raises, whereas getOrCreate hands back the running context.
# On a fresh kernel it creates the context from `conf` exactly as before.
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

# The streaming imports stay after context creation, as in the original cell.
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

In [4]:
import argparse
import csv
import time
import json

from utils import parse_line_into_schema

from fastavro import writer, reader, parse_schema
import avro_schemas

In [5]:
# When True, the instrument is taken from TOPIC instead of being parsed from
# the -i/--instrument command-line argument (see the configuration cell below).
debug = True

In [6]:
# PARSE ARGS AND SET PATHS, TOPICS
#
# Picks the instrument (from TOPIC in debug mode, otherwise from the
# -i/--instrument argument) and derives from it:
#   filepath      - CSV output path for that instrument
#   schema        - the matching schema object from avro_schemas
#   field_names   - names listed in schema['fields']
#   TOPIC         - normalised instrument/topic name
#   parsed_schema - result of fastavro's parse_schema(schema)
KEYSPACE = 'raw_instrument'
DATA_OUT_DIR = '/data/'
DMI_OUT_FILEPATH = DATA_OUT_DIR + 'dmi.csv'
IMU_OUT_FILEPATH = DATA_OUT_DIR + 'imu.csv'
LIDAR_OUT_FILEPATH = DATA_OUT_DIR + 'lidar.csv'
GPS_OUT_FILEPATH = DATA_OUT_DIR + 'gps.csv'
INSTRUMENTS = ['imu', 'dmi', 'lidar', 'gps']

# Instruments are tried in this order with a substring match and the first hit
# wins, mirroring the original if/elif chain.
_INSTRUMENT_OUT_FILEPATHS = [
    ('dmi', DMI_OUT_FILEPATH),
    ('imu', IMU_OUT_FILEPATH),
    ('lidar', LIDAR_OUT_FILEPATH),
    ('gps', GPS_OUT_FILEPATH),
]

if debug:
    instrument = TOPIC
else:
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--instrument', help='which instrument data to save to disk {0}'.format(INSTRUMENTS))
    args = parser.parse_args()
    # The option is not required, so args.instrument may be None; fall back to
    # an empty string so the problem is reported by the check below.
    instrument = (args.instrument or '').lower()

for _name, _out_filepath in _INSTRUMENT_OUT_FILEPATHS:
    if _name in instrument:
        filepath = _out_filepath
        # Looked up only for the matching instrument, as before.
        schema = getattr(avro_schemas, _name)
        field_names = [d['name'] for d in schema['fields']]
        TOPIC = _name
        break
else:
    # Report the value that was actually checked. `args` does not exist in
    # debug mode, and a bare `assert False` disappears under `python -O`.
    raise ValueError('ERROR: ARGUMENT {0!r} NOT IN INSTRUMENTS {1}'.format(instrument, INSTRUMENTS))

parsed_schema = parse_schema(schema)


In [7]:
# Display the parsed schema of the selected instrument.
parse_schema(schema)

{'__fastavro_parsed': True,
 'fields': [{'name': 'coordinate_id', 'type': 'int'},
  {'name': 'gps_timestamp', 'type': 'string'},
  {'name': 'latitude', 'type': ['float', 'null']},
  {'name': 'lat_dir', 'type': 'string'},
  {'name': 'longitude', 'type': ['float', 'null']},
  {'name': 'lon_dir', 'type': 'string'},
  {'name': 'num_sats', 'type': ['int', 'null']},
  {'name': 'horizontal_dil', 'type': ['float', 'null']},
  {'name': 'altitude', 'type': ['float', 'null']},
  {'name': 'altitude_units', 'type': 'string'},
  {'name': 'geo_sep_units', 'type': 'string'},
  {'name': 'age_gps_data', 'type': 'string'},
  {'name': 'unused', 'type': 'string'}],
 'name': 'gps.avro.sidewalk_rig',
 'type': 'record'}

In [8]:
# Field names collected from the selected schema's 'fields' list.
field_names

['coordinate_id',
 'gps_timestamp',
 'latitude',
 'lat_dir',
 'longitude',
 'lon_dir',
 'num_sats',
 'horizontal_dil',
 'altitude',
 'altitude_units',
 'geo_sep_units',
 'age_gps_data',
 'unused']

In [9]:
# Number of fields in the selected schema.
len(field_names)

13

## SaveToCassandra function
Takes an RDD of rows and, if it is not empty, appends it to the Cassandra table `TOPIC` in keyspace `KEYSPACE`.

In [10]:
def saveToCassandra(rows):
    """Append one batch of the stream to Cassandra.

    `rows` is the RDD passed in by `foreachRDD`. Empty batches are skipped;
    otherwise the RDD is turned into a DataFrame and appended to the table
    named by `TOPIC` in keyspace `KEYSPACE`.
    """
    if rows.isEmpty():
        return

    frame = sqlContext.createDataFrame(rows)
    writer = frame.write.format("org.apache.spark.sql.cassandra")
    writer = writer.mode('append').options(table=TOPIC, keyspace=KEYSPACE)
    writer.save()

## Create streaming task
* Receive data from the Kafka topic `TOPIC` in batches of five seconds
* Decode the value of each message from JSON
* Save each RDD in the DStream to Cassandra, and also print it on screen

In [11]:
# row = '1556477374944823040,26,3821,3751,150,113'.split(',')
# record = parse_line_into_schema(row, schema)
# record

In [12]:
# Streaming context on the existing SparkContext with a 5-second batch interval.
ssc = StreamingContext(sc, 5)

# Kafka stream for TOPIC: connection string "127.0.0.1:2181", consumer group
# "spark-streaming-consumer", and a count of 1 for the topic.
kvs = KafkaUtils.createStream(ssc, "127.0.0.1:2181", "spark-streaming-consumer", {TOPIC: 1})

# Each stream element is indexable; element [1] carries the JSON payload.
data = kvs.map(lambda x: json.loads(x[1]))

# The decoded messages are stored unchanged, so no extra per-record map is
# needed (the previous identity map only added a stage).
# Alternative kept from earlier experiments:
# rows = data.map(lambda x: parse_line_into_schema(x, schema))
rows = data

# Persist every batch and echo it to the notebook output.
rows.foreachRDD(saveToCassandra)
rows.pprint()

## Start streaming

In [13]:
# Start the streaming job; the batches printed by rows.pprint() appear below.
ssc.start()

-------------------------------------------
Time: 2019-04-29 01:19:25
-------------------------------------------





-------------------------------------------
Time: 2019-04-29 01:19:30
-------------------------------------------
{u'geo_sep_units': u'M', u'lon_dir': u'W', u'lat_dir': u'N', u'altitude': u'45.4', u'longitude': u'-122.91595', u'unused': u'', u'gps_timestamp': u'20:23:51', u'age_gps_data': u'', u'num_sats': u'06', u'latitude': u'46.909256666666664', u'horizontal_dil': u'1.7', u'altitude_units': u'M', u'coordinate_id': 1556500197366183168}
{u'geo_sep_units': u'M', u'lon_dir': u'W', u'lat_dir': u'N', u'altitude': u'45.4', u'longitude': u'-122.91594666666667', u'unused': u'', u'gps_timestamp': u'20:23:51.200000', u'age_gps_data': u'', u'num_sats': u'06', u'latitude': u'46.909256666666664', u'horizontal_dil': u'1.7', u'altitude_units': u'M', u'coordinate_id': 1556500199367689984}
{u'geo_sep_units': u'M', u'lon_dir': u'W', u'lat_dir': u'N', u'altitude': u'45.5', u'longitude': u'-122.915945', u'unused': u'', u'gps_timestamp': u'20:23:51.400000', u'age_gps_data': u'', u'num_sats': u'06', u'lat

## Stop streaming

In [14]:
# Stop streaming gracefully but keep the SparkContext alive
# (stopSparkContext=False): the cells below still use sqlContext.
ssc.stop(stopSparkContext=False,stopGraceFully=True)

-------------------------------------------
Time: 2019-04-29 01:19:50
-------------------------------------------
{u'geo_sep_units': u'M', u'lon_dir': u'W', u'lat_dir': u'N', u'altitude': u'49.7', u'longitude': u'-122.91576333333333', u'unused': u'', u'gps_timestamp': u'20:24:53', u'age_gps_data': u'', u'num_sats': u'05', u'latitude': u'46.90936666666666', u'horizontal_dil': u'2.4', u'altitude_units': u'M', u'coordinate_id': 1556500784915683072}
{u'geo_sep_units': u'M', u'lon_dir': u'W', u'lat_dir': u'N', u'altitude': u'49.7', u'longitude': u'-122.91576333333333', u'unused': u'', u'gps_timestamp': u'20:24:53.200000', u'age_gps_data': u'', u'num_sats': u'05', u'latitude': u'46.90936666666666', u'horizontal_dil': u'2.4', u'altitude_units': u'M', u'coordinate_id': 1556500786918374144}
{u'geo_sep_units': u'M', u'lon_dir': u'W', u'lat_dir': u'N', u'altitude': u'49.7', u'longitude': u'-122.915765', u'unused': u'', u'gps_timestamp': u'20:24:53.400000', u'age_gps_data': u'', u'num_sats': u'05'

## Get Cassandra table content

In [15]:
# Show which topic (and therefore which Cassandra table) the next cells read.
TOPIC

'gps'

In [16]:
# Read the Cassandra table named by TOPIC (keyspace KEYSPACE) into a DataFrame
# and print its first rows.
data = (
    sqlContext.read
    .format("org.apache.spark.sql.cassandra")
    .options(table=TOPIC, keyspace=KEYSPACE)
    .load()
)
data.show()

+-------------------+------------+--------+--------------+-------------+---------------+--------------+-------+---------+-------+-----------+--------+------+
|      coordinate_id|age_gps_data|altitude|altitude_units|geo_sep_units|  gps_timestamp|horizontal_dil|lat_dir| latitude|lon_dir|  longitude|num_sats|unused|
+-------------------+------------+--------+--------------+-------------+---------------+--------------+-------+---------+-------+-----------+--------+------+
|1556500776903388928|            |    49.7|             M|            M|20:24:52.200000|           2.4|      N| 46.90937|      W| -122.91576|       5|      |
|1556499477659725824|            |    45.0|             M|            M|20:23:19.800000|           1.5|      N| 46.90932|      W| -122.91586|       6|      |
|1556500784915683072|            |    49.7|             M|            M|       20:24:53|           2.4|      N|46.909367|      W|-122.915764|       5|      |
|1556499479662646016|            |    45.0|         

## Get Cassandra table content using SQL

In [17]:
# Expose the DataFrame to Spark SQL. The view is literally called "TOPIC"; it
# is not named after the value of the TOPIC variable.
# createOrReplaceTempView replaces registerTempTable, which is deprecated
# since Spark 2.0.
data.createOrReplaceTempView("TOPIC")
data.printSchema()
data = sqlContext.sql("select * from TOPIC")
data.show()

root
 |-- coordinate_id: long (nullable = true)
 |-- age_gps_data: string (nullable = true)
 |-- altitude: float (nullable = true)
 |-- altitude_units: string (nullable = true)
 |-- geo_sep_units: string (nullable = true)
 |-- gps_timestamp: string (nullable = true)
 |-- horizontal_dil: float (nullable = true)
 |-- lat_dir: string (nullable = true)
 |-- latitude: float (nullable = true)
 |-- lon_dir: string (nullable = true)
 |-- longitude: float (nullable = true)
 |-- num_sats: integer (nullable = true)
 |-- unused: string (nullable = true)

+-------------------+------------+--------+--------------+-------------+---------------+--------------+-------+---------+-------+-----------+--------+------+
|      coordinate_id|age_gps_data|altitude|altitude_units|geo_sep_units|  gps_timestamp|horizontal_dil|lat_dir| latitude|lon_dir|  longitude|num_sats|unused|
+-------------------+------------+--------+--------------+-------------+---------------+--------------+-------+---------+-------+------

In [18]:
# Number of rows returned by the SQL query above.
data.count()

18