In [31]:
from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc
from snowflake.snowpark.dataframe_reader import *
from snowflake.snowpark.functions import *
from snowflake.snowpark.window import *
from snowflake.snowpark.types import StringType, StructType, StructField

In [32]:
connection_parameters = {
    "account": "",
    "user": "",
    "password": "",
    "warehouse": "", # optional
    "database":"COMMONS",
    "schema":"UTILS"
}  
session = Session.builder.configs(connection_parameters).create()  

#### Use custom_package_usage_config to do the magic

custom_package_usage_config helps to get or set configuration parameters related to usage of custom Python packages in Snowflake.

If enabled, pure Python packages that are not available in Snowflake will be installed locally via pip and made available as an import (see add_import for more information on imports). You can speed up this process by mentioning a remote stage path as cache_path where unsupported pure Python packages will be persisted. To use a specific version of pip, you can set the environment variable PIP_PATH to point to your pip executable. 
To use custom Python packages which are not purely Python, specify the force_push configuration parameter (*note that using non-pure Python packages is not recommended!*).

This feature is **experimental** as of now and not yet ready for production.

In [33]:
session.custom_package_usage_config = {"enabled": True, "force_push": True}
session.add_packages("amazon.ion==0.12.0","snowflake-snowpark-python")

The following packages are not available in Snowflake: ['amazon.ion==0.12.0'].
If you are adding package(s) unavailable in Snowflake, it is highly recommended that you include the 'cache_path' configuration parameter in order to reduce latency.


#### Generate sample ion files from the TPCH CUSTOMER data
The below lines of code uses the snowflake connection to read from the sample TPCH_SF1 dataset provided and uses the customer table to generate 5 files corresponding to 5 rows of the data.
The files are stored in your local working directory in a folder `./customer_binary_data`. Create this folder or uncomment the code that creates the directory

In [44]:
import amazon.ion.simpleion as ion
import os

from json import loads, dumps
df = session.table('SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.CUSTOMER').select('C_NATIONKEY','C_MKTSEGMENT', 'C_ACCTBAL').limit(5).to_pandas()
result = df.to_json(orient="records")
jsonvals = loads(result)
print(type(jsonvals))
print(dumps(jsonvals, indent=4))

# Create the directory if it does not exist
#os.makedirs('./customer_binary_data')

for i in range(len(jsonvals)):
    ionval = ion.loads(str(jsonvals[i]))
    print(ionval)
    with open(f"./customer_binary_data/customer{i}.ion", "wb") as f:
        f.write(ion.dumps(ionval))
        f.close()

<class 'list'>
[
    {
        "C_NATIONKEY": 14,
        "C_MKTSEGMENT": "HOUSEHOLD",
        "C_ACCTBAL": 9957.56
    },
    {
        "C_NATIONKEY": 15,
        "C_MKTSEGMENT": "BUILDING",
        "C_ACCTBAL": 742.46
    },
    {
        "C_NATIONKEY": 16,
        "C_MKTSEGMENT": "BUILDING",
        "C_ACCTBAL": 2526.92
    },
    {
        "C_NATIONKEY": 10,
        "C_MKTSEGMENT": "AUTOMOBILE",
        "C_ACCTBAL": 7975.22
    },
    {
        "C_NATIONKEY": 12,
        "C_MKTSEGMENT": "MACHINERY",
        "C_ACCTBAL": 2504.74
    }
]
{'C_NATIONKEY': 14, 'C_MKTSEGMENT': IonPySymbol(text='HOUSEHOLD', sid=None, location=None), 'C_ACCTBAL': Decimal('9957.56')}
{'C_NATIONKEY': 15, 'C_MKTSEGMENT': IonPySymbol(text='BUILDING', sid=None, location=None), 'C_ACCTBAL': Decimal('742.46')}
{'C_NATIONKEY': 16, 'C_MKTSEGMENT': IonPySymbol(text='BUILDING', sid=None, location=None), 'C_ACCTBAL': Decimal('2526.92')}
{'C_NATIONKEY': 10, 'C_MKTSEGMENT': IonPySymbol(text='AUTOMOBILE', sid=None, locat

#### Create a stage to store your the customer ion data

In [13]:
query = "create or replace stage commons.utils.customer_binary_data" +\
        " directory = (enable = true)" +\
        " copy_options = (on_error='skip_file')"
        
session.sql(query).collect()


[Row(status='Stage area CUSTOMER_BINARY_DATA successfully created.')]

Upload the local files into the stage

In [14]:
put_result = session.file.put("./customer_binary_data/*", "@customer_binary_data/",auto_compress=True, overwrite=True)
put_result[0].status

'UPLOADED'

#### A quick local experiment on reading ion files
This is a code snippet to test on how to read the ion files locally. You may skip this.

In [21]:
import amazon.ion.simpleion as ion
import pandas as pd
from snowflake.snowpark.types import IntegerType, StringType, StructField
def read_ion(file_name):
    with open(file_name, "rb") as file_handle:
        return ion.load(file_handle)
ionval = read_ion('customer_binary_data/customer0.ion')
rows = []
rows.append( (ionval['C_NATIONKEY'], ionval['C_MKTSEGMENT'].text, ionval['C_ACCTBAL']))
schema = StructType([
    StructField("C_NATIONKEY", IntegerType()), 
    StructField("C_MKTSEGMENT", StringType()),
    StructField("C_ACCTBAL", IntegerType()) 
    ])
session.createDataFrame(rows, schema).collect()


[Row(C_NATIONKEY=14, C_MKTSEGMENT='HOUSEHOLD', C_ACCTBAL=9958)]

#### Snowflake snowpark code to read from the stage and parse the ion files

The below code reads from the stage file by file and is written into a table

In [23]:
import io
# List all files in the stage
files = session.sql('LIST @customer_binary_data').collect()

rows = []
schema = StructType([
    StructField("C_NATIONKEY", IntegerType()), 
    StructField("C_MKTSEGMENT", StringType()),
    StructField("C_ACCTBAL", IntegerType()) 
    ])

# # Iterate over the files and read them
for file in files:
    # Get the stream for the file
    stream = session.file.get_stream(file['name'], decompress=True)

    # Read the stream
    fd  = io.BytesIO(stream.read())
    print(fd)
    ionval = ion.load(fd)
    rows.append( (ionval['C_NATIONKEY'], ionval['C_MKTSEGMENT'].text, ionval['C_ACCTBAL']))

df = session.createDataFrame(rows, schema)
df.write.save_as_table("CUSTOMER_RESULT", mode="overwrite", table_type="transient")


<_io.BytesIO object at 0x16a4114e0>
<_io.BytesIO object at 0x172349710>
<_io.BytesIO object at 0x157e71030>
<_io.BytesIO object at 0x172310590>
<_io.BytesIO object at 0x157e71030>


In [24]:
session.table("CUSTOMER_RESULT").collect()

[Row(C_NATIONKEY=14, C_MKTSEGMENT='HOUSEHOLD', C_ACCTBAL=9958),
 Row(C_NATIONKEY=15, C_MKTSEGMENT='BUILDING', C_ACCTBAL=742),
 Row(C_NATIONKEY=16, C_MKTSEGMENT='BUILDING', C_ACCTBAL=2527),
 Row(C_NATIONKEY=10, C_MKTSEGMENT='AUTOMOBILE', C_ACCTBAL=7975),
 Row(C_NATIONKEY=12, C_MKTSEGMENT='MACHINERY', C_ACCTBAL=2505)]

#### Snowflake python stored procedure to read and store ion data in table

This is same as above code but is wrapped in a python stored procedure.

In [37]:
@sproc(name="test_ion", is_permanent=False, replace=True)
def test_ion(session: Session) -> bool:
    
    import amazon.ion.simpleion as ion
    import io
    # List all files in the stage
    files = session.sql('LIST @customer_binary_data').collect()

    rows = []
    schema = StructType([
        StructField("C_NATIONKEY", IntegerType()), 
        StructField("C_MKTSEGMENT", StringType()),
        StructField("C_ACCTBAL", IntegerType()) 
        ])

    # # Iterate over the files and read them
    for file in files:
        # Get the stream for the file
        stream = session.file.get_stream("@"+file['name'], decompress=True)

        # Read the stream
        fd  = io.BytesIO(stream.read())
        print(fd)
        ionval = ion.load(fd)
        rows.append( (ionval['C_NATIONKEY'], ionval['C_MKTSEGMENT'].text, ionval['C_ACCTBAL']))

    df = session.createDataFrame(rows, schema)
    df.write.save_as_table("CUSTOMER_RESULT", mode="overwrite", table_type="transient")

    return True

In [38]:
test_ion()

True

In [39]:
session.table("CUSTOMER_RESULT").collect()

[Row(C_NATIONKEY=14, C_MKTSEGMENT='HOUSEHOLD', C_ACCTBAL=9958),
 Row(C_NATIONKEY=15, C_MKTSEGMENT='BUILDING', C_ACCTBAL=742),
 Row(C_NATIONKEY=16, C_MKTSEGMENT='BUILDING', C_ACCTBAL=2527),
 Row(C_NATIONKEY=10, C_MKTSEGMENT='AUTOMOBILE', C_ACCTBAL=7975),
 Row(C_NATIONKEY=12, C_MKTSEGMENT='MACHINERY', C_ACCTBAL=2505)]

In [2]:
session.close()