In [1]:
import json
import pandas as pd
import v3io.dataplane

### Docs
- https://github.com/v3io/v3io-py

### Load Sample Data

In [2]:
# Load the sample dataset from the Parquet file "data.parquet" (relative path) into a DataFrame.
df = pd.read_parquet("data.parquet")

### Create V3IO Stream

In [3]:
# Create the V3IO dataplane client; no explicit arguments are passed.
# NOTE(review): presumably the endpoint and credentials come from the environment — confirm.
v3io_client = v3io.dataplane.Client()

In [4]:
# Create a single-shard stream in the 'bigdata' container.
# Accepted response statuses:
#   204 = successful stream creation
#   409 = stream already exists (so this cell can be re-run against an existing stream)
v3io_client.stream.create(
    container='bigdata',
    stream_path='/my-test-stream',
    shard_count=1,
    raise_for_status=[204, 409],
)

<v3io.dataplane.response.Response at 0x7f42b0fc4dd0>

In [5]:
# Check that stream exists on file system: list /v3io/bigdata and keep only the
# lines that mention the stream name.
# NOTE(review): presumably the 'bigdata' container is mounted at /v3io/bigdata — confirm.
!ls -la /v3io/bigdata | grep my-test-stream

drwxrwxr-x 2   52 staff 0 Dec 14 21:39 my-test-stream


### Format Chunks in same style as Gesa
Gesa's tables will be coming in via Kafka. The information we are interested in is located under `event["message"]["data"]`.

Additionally, V3IO streams expect the outer-most key to be `data`. Therefore, we are formatting the records such that
our actual data is located under `event["data"]["message"]["data"]`. We will be feeding the stream a list of data
formatted this way.

In [6]:
def format_records_stream_kafka(df: pd.DataFrame) -> list:
    """
    Format records in correct format to simulate receiving from Kafka stream. Also
    converts any datetime fields into strings so that they can be JSON-serialized.

    The input dataframe is never modified: when datetime columns are present the
    conversion is done on a copy, so the function is safe to call on a slice of a
    larger dataframe and safe to call repeatedly on the same dataframe.

    :param df: Dataframe to format

    :return: List with one dict per row, each of the form
        ``{"data": <JSON string of {"message": {"data": <row as dict>}}>}``
    """
    # Match every datetime64 flavour (any resolution, tz-naive or tz-aware), not only
    # "datetime64[ns]"; any datetime column left unconverted would reach json.dumps
    # as a Timestamp and raise TypeError.
    datetime_columns = [
        column
        for column, dtype in df.dtypes.items()
        if pd.api.types.is_datetime64_any_dtype(dtype)
    ]

    formatted = df
    if datetime_columns:
        # Work on a copy so the caller's dataframe keeps its datetime dtypes
        formatted = df.copy()
        for column in datetime_columns:
            # NOTE(review): astype(str) appears to choose its output format from the
            # values in the column, so formatting may vary between chunks — confirm.
            formatted[column] = formatted[column].astype(str)

    # V3IO stream expects a list of dicts with the key "data" - Kafka message starts with "message" key
    return [
        {"data": json.dumps({"message": {"data": row}})}
        for row in formatted.to_dict(orient="records")
    ]

In [7]:
# Data formatting sample: format the whole dataframe and display only the first
# formatted record as the cell output.
format_records_stream_kafka(df)[0]

{'data': '{"message": {"data": {"sepal length (cm)": 5.1, "sepal width (cm)": 3.5, "petal length (cm)": 1.4, "petal width (cm)": 0.2, "label": 0}}}'}

### Chunk Data into dataframes of `n` rows
Because we will be working with a lot of data, we will need to write it into the stream in chunks. I have found that `n = 1000` is a good compromise between size and speed - feel free to experiment.

In [8]:
# Chunk row size: maximum number of dataframe rows formatted and written to the
# stream in a single call.
n = 1000

### Format + Write Chunks to Stream - ACTUAL WORK

In [9]:
# Split the dataframe into consecutive slices of at most `n` rows each.
chunk_df = [df[start:start + n] for start in range(0, len(df), n)]

# Format every slice and write it to the stream as one batch of records.
for data in chunk_df:
    records = format_records_stream_kafka(data)
    v3io_client.stream.put_records(
        container='bigdata',
        stream_path='/my-test-stream',
        records=records,
    )

### Verify Stream has been Written to

In [10]:
# Ask shard 0 for the location of its earliest record; the location is read
# from `response.output.location` by the following get_records cell.
response = v3io_client.stream.seek(
    container='bigdata',
    stream_path='/my-test-stream',
    shard_id=0,
    seek_type='EARLIEST',
)

In [11]:
# Read records from shard 0, starting at the location held by the current `response`.
# This rebinds `response`, so the previous response object is no longer reachable by that name.
# NOTE(review): re-running this cell on its own would feed it the get_records
# response instead — confirm that response still exposes `output.location`.
response = v3io_client.stream.get_records(
    container='bigdata',
    stream_path='/my-test-stream',
    shard_id=0,
    location=response.output.location,
)

In [12]:
# Print every record returned by get_records; each record's `data` payload is
# decoded as UTF-8 text before printing.
for record in response.output.records:
    print(record.data.decode('utf-8'))

{"message": {"data": {"sepal length (cm)": 5.1, "sepal width (cm)": 3.5, "petal length (cm)": 1.4, "petal width (cm)": 0.2, "label": 0}}}
{"message": {"data": {"sepal length (cm)": 4.9, "sepal width (cm)": 3.0, "petal length (cm)": 1.4, "petal width (cm)": 0.2, "label": 0}}}
{"message": {"data": {"sepal length (cm)": 4.7, "sepal width (cm)": 3.2, "petal length (cm)": 1.3, "petal width (cm)": 0.2, "label": 0}}}
{"message": {"data": {"sepal length (cm)": 4.6, "sepal width (cm)": 3.1, "petal length (cm)": 1.5, "petal width (cm)": 0.2, "label": 0}}}
{"message": {"data": {"sepal length (cm)": 5.0, "sepal width (cm)": 3.6, "petal length (cm)": 1.4, "petal width (cm)": 0.2, "label": 0}}}
{"message": {"data": {"sepal length (cm)": 5.4, "sepal width (cm)": 3.9, "petal length (cm)": 1.7, "petal width (cm)": 0.4, "label": 0}}}
{"message": {"data": {"sepal length (cm)": 4.6, "sepal width (cm)": 3.4, "petal length (cm)": 1.4, "petal width (cm)": 0.3, "label": 0}}}
{"message": {"data": {"sepal lengt

### Clean Up

In [13]:
# Delete the test stream that this notebook created in the 'bigdata' container.
v3io_client.stream.delete(
    container='bigdata',
    stream_path='/my-test-stream',
)

<v3io.dataplane.response.Response at 0x7f430548ad10>