### Utility Functions

In [1]:
import glob
import itertools
import os
import posixpath
import random
import socket
import struct

import boto3


# Shared S3 client; `upload` below sends every file through it.
CLIENT = boto3.client('s3')

def upload(table, bucket, path, client=None):
    """Recursively upload every file under the directory ``table`` to S3.

    All directory entries are visited, including hidden ones (names that
    start with ``.``). Each file is stored under ``path`` followed by its
    location relative to ``table``; key segments are always joined with
    ``/`` regardless of the local operating system.

    Args:
        table: Local directory to upload.
        bucket: Destination bucket name, passed through to ``upload_file``.
        path: Key prefix under which the directory contents are stored.
        client: Object exposing ``upload_file(filename, bucket, key)``.
            Defaults to the module-level ``CLIENT``.

    Raises:
        AssertionError: If ``table`` is not an existing directory.
    """
    if client is None:
        client = CLIENT
    assert os.path.isdir(table), f'{table!r} is not a directory'

    # os.listdir (unlike a glob pattern built from `table`) is not confused by
    # directory names containing glob metacharacters such as '[' or '*'.
    # Sorting keeps the upload order deterministic.
    for name in sorted(os.listdir(table)):
        item = os.path.join(table, name)
        # S3 keys use '/' separators, so never build them with os.path.join.
        key = posixpath.join(path, name)
        if os.path.isdir(item):
            upload(item, bucket, key, client)
        elif os.path.isfile(item):
            client.upload_file(item, bucket, key)
        # Anything else (broken symlink, socket, ...) cannot be uploaded
        # and is skipped instead of failing the whole transfer.


def ipv4():
    """Return a random IPv4 address in dotted-quad notation.

    The address is drawn from the 32-bit integers 1 through 0xFFFFFFFF,
    so ``0.0.0.0`` is never produced.
    """
    address = random.randint(1, 0xFFFFFFFF)
    # Four bytes, most significant first (network byte order).
    packed = address.to_bytes(4, byteorder='big')
    return socket.inet_ntoa(packed)


def port():
    """Return a random integer port number between 0 and 65535 inclusive."""
    port_count = 1 << 16
    return random.randrange(port_count)

### Prepare CSV File

In [2]:
import csv


# Fixed seed for the module-level RNG that ipv4() and port() draw from, so
# that re-running this cell regenerates exactly the same file.
SEED = 42
# Number of data rows written after the header.
ROW_COUNT = 100_000

random.seed(SEED)
with open('../../data/security.csv', 'w', newline='') as csvfile:
    writer = csv.writer(
        csvfile,
        delimiter=',',
    )
    writer.writerow(['src_ip', 'src_port', 'dst_ip', 'dst_port'])
    # Stream the rows through a generator; each row is one random
    # source/destination address-and-port pair.
    writer.writerows(
        [ipv4(), port(), ipv4(), port()] for _ in range(ROW_COUNT)
    )

### Set Up Spark Session

In [3]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip


# Configure a session named 'CreateDeltaTables' with the Delta Lake package,
# SQL extension and catalog implementation.
builder = (
    SparkSession.builder
    .appName('CreateDeltaTables')
    .config('spark.jars.packages', 'io.delta:delta-core_2.12:2.2.0')
    .config('spark.sql.extensions', 'io.delta.sql.DeltaSparkSessionExtension')
    .config(
        'spark.sql.catalog.spark_catalog',
        'org.apache.spark.sql.delta.catalog.DeltaCatalog',
    )
)

# NOTE(review): configure_spark_with_delta_pip presumably sets
# 'spark.jars.packages' itself from the installed delta-spark version, which
# would override the hard-coded value above -- confirm and drop one of them.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

23/04/28 13:42:03 WARN Utils: Your hostname, aix.local resolves to a loopback address: 127.0.0.1; using 192.168.2.100 instead (on interface en0)
23/04/28 13:42:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/Users/shin/.local/share/virtualenvs/sharing-examples-hEeTnWv9/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/shin/.ivy2/cache
The jars for the packages stored in: /Users/shin/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c5d33e39-c713-47a4-8fc0-8ce4332d0353;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.2.0 in central
	found io.delta#delta-storage;2.2.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
:: resolution report :: resolve 276ms :: artifacts dl 12ms
	:: modules in use:
	io.delta#delta-core_2.12;2.2.0 from central in [default]
	io.delta#delta-storage;2.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0 

23/04/28 13:42:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### Load CSV

In [4]:
# Read the generated CSV, treating the first line as the header and letting
# Spark infer the column types.
# NOTE(review): the name `avocado` does not describe this src/dst ip-and-port
# data; it is kept only because later cells may reference it -- confirm and
# rename.
avocado = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true')
    .load('../../data/security.csv')
)
# Preview the first five records, one field per line, without truncation.
avocado.show(5, truncate=False, vertical=True)

                                                                                

-RECORD 0-------------------
 src_ip   | 176.181.219.134 
 src_port | 3836            
 dst_ip   | 60.4.181.119    
 dst_port | 176             
-RECORD 1-------------------
 src_ip   | 203.151.28.53   
 src_port | 31453           
 dst_ip   | 74.158.3.78     
 dst_port | 55840           
-RECORD 2-------------------
 src_ip   | 95.231.173.163  
 src_port | 50204           
 dst_ip   | 36.213.129.216  
 dst_port | 14130           
-RECORD 3-------------------
 src_ip   | 245.213.200.240 
 src_port | 53350           
 dst_ip   | 190.191.119.251 
 dst_port | 13394           
-RECORD 4-------------------
 src_ip   | 31.190.212.201  
 src_port | 15051           
 dst_ip   | 255.123.161.96  
 dst_port | 64589           
only showing top 5 rows

