In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col
# Reuse the kernel's Spark session if one is already running, otherwise start one.
spark = SparkSession.builder.appName("Jupyter").getOrCreate()

# Single place to change where the raw CSV extracts live.
DATA_DIR = "/home/iceberg/data"

# Raw events. Only the header option is set (no schema / inferSchema), so the
# columns keep whatever types the CSV reader assigns by default.
df_events = spark.read.option("header", "true").csv(f"{DATA_DIR}/events.csv")

# Device lookup, trimmed to the join key plus the three descriptive columns,
# which are renamed from *_type to *_family. Column order is kept as
# browser, os, device, device_id so the joined frame's layout is unchanged.
df_devices = (
    spark.read.option("header", "true").csv(f"{DATA_DIR}/devices.csv")
    .select(
        col("browser_type").alias("browser_family"),
        col("os_type").alias("os_family"),
        col("device_type").alias("device_family"),
        "device_id",
    )
)

# Inner join keeps only events whose device_id exists in the device lookup.
# NOTE(review): DATE_TRUNC('day', ...) yields a midnight timestamp (the recorded
# output shows values such as "2021-03-08 00:00:00"), not a DATE, while the
# CREATE TABLE cells declare event_date as DATE -- confirm which is intended.
df = (
    df_events
    .join(df_devices, ["device_id"], "inner")
    .withColumn("event_date", expr("DATE_TRUNC('day', event_time)"))
)

df.show(10)

+---------+-----------+--------+--------------------+---+--------------------+--------------+---------+-------------+-------------------+
|device_id|    user_id|referrer|                host|url|          event_time|browser_family|os_family|device_family|         event_date|
+---------+-----------+--------+--------------------+---+--------------------+--------------+---------+-------------+-------------------+
|532630305| 1037710827|    NULL| www.zachwilson.tech|  /|2021-03-08 17:27:...|         Other|    Other|        Other|2021-03-08 00:00:00|
|532630305|  925588856|    NULL|    www.eczachly.com|  /|2021-05-10 11:26:...|         Other|    Other|        Other|2021-05-10 00:00:00|
|532630305|-1180485268|    NULL|admin.zachwilson....|  /|2021-02-17 16:19:...|         Other|    Other|        Other|2021-02-17 00:00:00|
|532630305|-1044833855|    NULL| www.zachwilson.tech|  /|2021-09-24 15:53:...|         Other|    Other|        Other|2021-09-24 00:00:00|
|532630305|  747494706|    NULL| w

In [15]:
# Redistribute the joined events into 10 partitions keyed on event_date, order
# the rows inside each partition by (event_date, host, browser_family), and
# cast event_time from its loaded type to a timestamp.
# Named `sorted_df` rather than `sorted` so the Python builtin `sorted()` is not
# shadowed for the rest of the kernel session. The chain is parenthesised
# instead of relying on trailing backslashes, which break if a line is added.
sorted_df = (
    df.repartition(10, col("event_date"))
    .sortWithinPartitions(col("event_date"), col("host"), col("browser_family"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

sorted_df.show(10)



+-----------+-----------+--------+--------------------+--------------------+--------------------+--------------+---------+------------------+-------------------+
|  device_id|    user_id|referrer|                host|                 url|          event_time|browser_family|os_family|     device_family|         event_date|
+-----------+-----------+--------+--------------------+--------------------+--------------------+--------------+---------+------------------+-------------------+
|  532630305| 1129583063|    NULL|admin.zachwilson....|                   /|2021-01-07 09:21:...|         Other|    Other|             Other|2021-01-07 00:00:00|
|  532630305|-1180485268|    NULL|    www.eczachly.com|                   /|2021-01-07 18:45:...|         Other|    Other|             Other|2021-01-07 00:00:00|
|  532630305| 1129583063|    NULL|    www.eczachly.com|                   /|2021-01-07 21:57:...|         Other|    Other|             Other|2021-01-07 00:00:00|
| 1088283544| -648945006|   

                                                                                

In [11]:
%%sql

-- Ensure the `bootcamp` database exists before any table DDL runs.
-- IF NOT EXISTS makes this safe to re-run.
CREATE DATABASE IF NOT EXISTS bootcamp

In [12]:
%%sql

-- Remove any existing bootcamp.events table; IF EXISTS makes this a no-op
-- when the table is absent.
DROP TABLE IF EXISTS bootcamp.events

In [13]:
%%sql

-- Iceberg table for joined event/device rows, partitioned by event_date.
-- NOTE(review): no cell visible in this notebook writes to bootcamp.events;
-- confirm where it is populated.
CREATE TABLE IF NOT EXISTS bootcamp.events (
    url STRING,
    referrer STRING,
    browser_family STRING,
    os_family STRING,
    device_family STRING,
    host STRING,
    event_time TIMESTAMP,
    event_date DATE
)
USING iceberg
PARTITIONED BY (event_date);


In [16]:
%%sql

-- Target table for the sorted write; same column list as bootcamp.events and
-- partitioned by event_date.
-- NOTE(review): the recorded SELECT * output for this table also contains
-- device_id and user_id and shows event_date as a timestamp, and the recorded
-- `.files` listing shows partition = Row(event_date=None). That suggests the
-- later overwrite replaced this definition -- confirm whether this DDL is
-- still needed or the write should conform to it.
CREATE TABLE IF NOT EXISTS bootcamp.events_sorted (
    url STRING,
    referrer STRING,
    browser_family STRING,
    os_family STRING,
    device_family STRING,
    host STRING,
    event_time TIMESTAMP,
    event_date DATE
)
USING iceberg
PARTITIONED BY (event_date);

In [17]:
%%sql


-- Target table for the unsorted write; identical column list and partitioning
-- to bootcamp.events_sorted so the two can be compared file-for-file.
-- NOTE(review): the DataFrame written to this table later carries more columns
-- than are declared here (it is never trimmed to this list) -- confirm the
-- intended schema.
CREATE TABLE IF NOT EXISTS bootcamp.events_unsorted (
    url STRING,
    referrer STRING,
    browser_family STRING,
    os_family STRING,
    device_family STRING,
    host STRING,
    event_time TIMESTAMP,
    event_date DATE
)
USING iceberg
PARTITIONED BY (event_date);

In [18]:

# Baseline frame: 4 partitions keyed on event_date, with event_time cast to a
# timestamp. No ordering is applied inside the partitions.
start_df = (
    df.repartition(4, col("event_date"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Same rows and partitioning, but ordered within each partition by
# (event_date, browser_family, host).
first_sort_df = start_df.sortWithinPartitions(
    col("event_date"),
    col("browser_family"),
    col("host"),
)

# Write both variants so their on-disk footprints can be compared.
# NOTE(review): the recorded `.files` listing for events_sorted shows
# partition = Row(event_date=None) and ten column ids, which suggests that
# saveAsTable in overwrite mode replaced the table definition from the
# CREATE TABLE cells rather than writing into it -- confirm this is intended.
start_df.write.mode("overwrite").saveAsTable("bootcamp.events_unsorted")
first_sort_df.write.mode("overwrite").saveAsTable("bootcamp.events_sorted")

                                                                                

In [19]:
%%sql

-- Compare total data-file bytes and data-file counts for the sorted and
-- unsorted tables, read from each table's `files` metadata table.
-- The label column is aliased explicitly; without an alias the result column
-- takes its name from the first branch's literal ('sorted'), which reads as if
-- it described both rows.
SELECT SUM(file_size_in_bytes) AS size, COUNT(1) AS num_files, 'sorted' AS table_name
FROM demo.bootcamp.events_sorted.files

UNION ALL

SELECT SUM(file_size_in_bytes) AS size, COUNT(1) AS num_files, 'unsorted' AS table_name
FROM demo.bootcamp.events_unsorted.files

size,num_files,sorted
5091394,4,sorted
5553012,4,unsorted


In [21]:
%%sql

-- List every data file of events_sorted with its per-file metadata
-- (path, partition, record_count, file_size_in_bytes, per-column sizes and
-- lower/upper bounds, ...), one row per file.
SELECT *
FROM demo.bootcamp.events_sorted.files

content,file_path,file_format,spec_id,partition,record_count,file_size_in_bytes,column_sizes,value_counts,null_value_counts,nan_value_counts,lower_bounds,upper_bounds,key_metadata,split_offsets,equality_ids,sort_order_id,readable_metrics
0,s3://warehouse/bootcamp/events_sorted/data/00000-72-c93bb32e-4418-4f27-8a77-1806ea9558b7-0-00001.parquet,PARQUET,1,Row(event_date=None),89391,1031403,"{1: 107448, 2: 61005, 3: 11437, 4: 12908, 5: 10692, 6: 7365, 7: 426434, 8: 2274, 9: 77406, 10: 310046}","{1: 89391, 2: 89391, 3: 89391, 4: 89391, 5: 89391, 6: 89391, 7: 89391, 8: 89391, 9: 89391, 10: 89391}","{1: 0, 2: 46359, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 1}",{},"{1: bytearray(b'/'), 2: bytearray(b'52.20.78.240'), 3: bytearray(b'%E3%82%A6%E3%82%'), 4: bytearray(b'Android'), 5: bytearray(b'17MB150WB'), 6: bytearray(b'aashish.techcrea'), 7: bytearray(b' \xba\xe7\xb8\xa8\xb8\x05\x00'), 8: bytearray(b'\x00\xa0&\xb4\xa8\xb8\x05\x00'), 9: bytearray(b'-100210680'), 10: bytearray(b'-1000095488')}","{1: bytearray(b'/zzageqnf.php?Fp'), 2: bytearray(b'zachwilson.tech'), 3: bytearray(b'webprosbot'), 4: bytearray(b'iOS'), 5: bytearray(b'vivo $2'), 6: bytearray(b'zachwilson.techd'), 7: bytearray(b'\xe8\xb0\x1b\x8ec\x03\x06\x00'), 8: bytearray(b'\x00\xe0dqO\x03\x06\x00'), 9: bytearray(b'999535123'), 10: bytearray(b'999884938')}",,[4],,0,"Row(browser_family=Row(column_size=11437, value_count=89391, null_value_count=0, nan_value_count=None, lower_bound='%E3%82%A6%E3%82%', upper_bound='webprosbot'), device_family=Row(column_size=10692, value_count=89391, null_value_count=0, nan_value_count=None, lower_bound='17MB150WB', upper_bound='vivo $2'), device_id=Row(column_size=77406, value_count=89391, null_value_count=0, nan_value_count=None, lower_bound='-100210680', upper_bound='999535123'), event_date=Row(column_size=2274, value_count=89391, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2021, 1, 12, 0, 0), upper_bound=datetime.datetime(2023, 8, 20, 0, 0)), event_time=Row(column_size=426434, value_count=89391, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2021, 1, 12, 0, 1, 19, 764000), upper_bound=datetime.datetime(2023, 8, 20, 23, 59, 41, 89000)), 
host=Row(column_size=7365, value_count=89391, null_value_count=0, nan_value_count=None, lower_bound='aashish.techcrea', upper_bound='zachwilson.techd'), os_family=Row(column_size=12908, value_count=89391, null_value_count=0, nan_value_count=None, lower_bound='Android', upper_bound='iOS'), referrer=Row(column_size=61005, value_count=89391, null_value_count=46359, nan_value_count=None, lower_bound='52.20.78.240', upper_bound='zachwilson.tech'), url=Row(column_size=107448, value_count=89391, null_value_count=0, nan_value_count=None, lower_bound='/', upper_bound='/zzageqnf.php?Fp'), user_id=Row(column_size=310046, value_count=89391, null_value_count=1, nan_value_count=None, lower_bound='-1000095488', upper_bound='999884938'))"
0,s3://warehouse/bootcamp/events_sorted/data/00001-73-c93bb32e-4418-4f27-8a77-1806ea9558b7-0-00001.parquet,PARQUET,1,Row(event_date=None),99232,1164694,"{1: 142161, 2: 67344, 3: 11896, 4: 16525, 5: 11505, 6: 9100, 7: 475847, 8: 2355, 9: 86496, 10: 336994}","{1: 99232, 2: 99232, 3: 99232, 4: 99232, 5: 99232, 6: 99232, 7: 99232, 8: 99232, 9: 99232, 10: 99232}","{1: 0, 2: 49299, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 58}",{},"{1: bytearray(b'""/?""""<?=print(93'), 2: bytearray(b'""https://www.goo'), 3: bytearray(b') Bot'), 4: bytearray(b'Android'), 5: bytearray(b'13 Pro Max'), 6: bytearray(b'abhishekanand.te'), 7: bytearray(b'(\x83\xb2EX\xb8\x05\x00'), 8: bytearray(b'\x00 \xc9<X\xb8\x05\x00'), 9: bytearray(b'-100210680'), 10: bytearray(b'-1000370060')}","{1: bytearray(b'/zz.php'), 2: bytearray(b'zachwilson.tech'), 3: bytearray(b'webprosbot'), 4: bytearray(b'iOS'), 5: bytearray(b'vivo $2'), 6: bytearray(b'zsavi524.techcrf'), 7: bytearray(b'\x88\xb8\x07P;\x03\x06\x00'), 8: bytearray(b""\x00 \xb65\'\x03\x06\x00""), 9: bytearray(b'999535123'), 10: bytearray(b'999956796')}",,[4],,0,"Row(browser_family=Row(column_size=11896, value_count=99232, null_value_count=0, nan_value_count=None, lower_bound=') Bot', upper_bound='webprosbot'), device_family=Row(column_size=11505, value_count=99232, null_value_count=0, nan_value_count=None, lower_bound='13 Pro Max', upper_bound='vivo $2'), device_id=Row(column_size=86496, value_count=99232, null_value_count=0, nan_value_count=None, lower_bound='-100210680', upper_bound='999535123'), event_date=Row(column_size=2355, value_count=99232, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2021, 1, 8, 0, 0), upper_bound=datetime.datetime(2023, 8, 18, 0, 0)), event_time=Row(column_size=475847, value_count=99232, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2021, 1, 8, 0, 2, 29, 513000), upper_bound=datetime.datetime(2023, 8, 18, 23, 59, 0, 901000)), host=Row(column_size=9100, 
value_count=99232, null_value_count=0, nan_value_count=None, lower_bound='abhishekanand.te', upper_bound='zsavi524.techcrf'), os_family=Row(column_size=16525, value_count=99232, null_value_count=0, nan_value_count=None, lower_bound='Android', upper_bound='iOS'), referrer=Row(column_size=67344, value_count=99232, null_value_count=49299, nan_value_count=None, lower_bound='""https://www.goo', upper_bound='zachwilson.tech'), url=Row(column_size=142161, value_count=99232, null_value_count=0, nan_value_count=None, lower_bound='""/?""""<?=print(93', upper_bound='/zz.php'), user_id=Row(column_size=336994, value_count=99232, null_value_count=58, nan_value_count=None, lower_bound='-1000370060', upper_bound='999956796'))"
0,s3://warehouse/bootcamp/events_sorted/data/00002-74-c93bb32e-4418-4f27-8a77-1806ea9558b7-0-00001.parquet,PARQUET,1,Row(event_date=None),93955,1353937,"{1: 345899, 2: 86601, 3: 10943, 4: 12869, 5: 12116, 6: 8576, 7: 447220, 8: 2018, 9: 86856, 10: 336422}","{1: 93955, 2: 93955, 3: 93955, 4: 93955, 5: 93955, 6: 93955, 7: 93955, 8: 93955, 9: 93955, 10: 93955}","{1: 0, 2: 48227, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0}",{},"{1: bytearray(b'""/?""""<?=print(93'), 2: bytearray(b'""https://www.goo'), 3: bytearray(b') Bot'), 4: bytearray(b'Android'), 5: bytearray(b'ALP-AL00'), 6: bytearray(b'ablumhardt.techc'), 7: bytearray(b'\x18\xe8_\xb2\xf3\xb7\x05\x00'), 8: bytearray(b'\x00@\x94\xa7\xf3\xb7\x05\x00'), 9: bytearray(b'-1000866068'), 10: bytearray(b'-1000675882')}","{1: bytearray(b'/zz/address.php@'), 2: bytearray(b'zachwilson.tech'), 3: bytearray(b'webprosbot'), 4: bytearray(b'webOS'), 5: bytearray(b'vivo $2'), 6: bytearray(b'zzz.techcreator/'), 7: bytearray(b'HE\xdbM\xb3\x03\x06\x00'), 8: bytearray(b'\x00`\xc2\xe8\x9f\x03\x06\x00'), 9: bytearray(b'998961543'), 10: bytearray(b'999956796')}",,[4],,0,"Row(browser_family=Row(column_size=10943, value_count=93955, null_value_count=0, nan_value_count=None, lower_bound=') Bot', upper_bound='webprosbot'), device_family=Row(column_size=12116, value_count=93955, null_value_count=0, nan_value_count=None, lower_bound='ALP-AL00', upper_bound='vivo $2'), device_id=Row(column_size=86856, value_count=93955, null_value_count=0, nan_value_count=None, lower_bound='-1000866068', upper_bound='998961543'), event_date=Row(column_size=2018, value_count=93955, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2021, 1, 3, 0, 0), upper_bound=datetime.datetime(2023, 8, 24, 0, 0)), event_time=Row(column_size=447220, value_count=93955, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2021, 1, 3, 0, 3, 1, 119000), upper_bound=datetime.datetime(2023, 8, 24, 23, 8, 20, 509000)), 
host=Row(column_size=8576, value_count=93955, null_value_count=0, nan_value_count=None, lower_bound='ablumhardt.techc', upper_bound='zzz.techcreator/'), os_family=Row(column_size=12869, value_count=93955, null_value_count=0, nan_value_count=None, lower_bound='Android', upper_bound='webOS'), referrer=Row(column_size=86601, value_count=93955, null_value_count=48227, nan_value_count=None, lower_bound='""https://www.goo', upper_bound='zachwilson.tech'), url=Row(column_size=345899, value_count=93955, null_value_count=0, nan_value_count=None, lower_bound='""/?""""<?=print(93', upper_bound='/zz/address.php@'), user_id=Row(column_size=336422, value_count=93955, null_value_count=0, nan_value_count=None, lower_bound='-1000675882', upper_bound='999956796'))"
0,s3://warehouse/bootcamp/events_sorted/data/00003-75-c93bb32e-4418-4f27-8a77-1806ea9558b7-0-00001.parquet,PARQUET,1,Row(event_date=None),122235,1541360,"{1: 284335, 2: 87432, 3: 11501, 4: 16872, 5: 13540, 6: 9324, 7: 558656, 8: 2154, 9: 110112, 10: 442464}","{1: 122235, 2: 122235, 3: 122235, 4: 122235, 5: 122235, 6: 122235, 7: 122235, 8: 122235, 9: 122235, 10: 122235}","{1: 0, 2: 53009, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 8}",{},"{1: bytearray(b'/'), 2: bytearray(b'3.220.57.224'), 3: bytearray(b') Bot'), 4: bytearray(b'Android'), 5: bytearray(b'$2'), 6: bytearray(b'accc.techcreator'), 7: bytearray(b'@n.\xbd\xdf\xb7\x05\x00'), 8: bytearray(b'\x00\xe0\xbc\x89\xdf\xb7\x05\x00'), 9: bytearray(b'-1001669954'), 10: bytearray(b'-1000015881')}","{1: bytearray(b'/zz.php'), 2: bytearray(b'zachwilson.tech'), 3: bytearray(b'webprosbot'), 4: bytearray(b'iOS'), 5: bytearray(b'vivo $2'), 6: bytearray(b'zachwilson.techd'), 7: bytearray(b'\xd8\xaf\x9a\xe8\x9f\x03\x06\x00'), 8: bytearray(b'\x00\x00\xeb\xca\x8b\x03\x06\x00'), 9: bytearray(b'998766634'), 10: bytearray(b'999882344')}",,[4],,0,"Row(browser_family=Row(column_size=11501, value_count=122235, null_value_count=0, nan_value_count=None, lower_bound=') Bot', upper_bound='webprosbot'), device_family=Row(column_size=13540, value_count=122235, null_value_count=0, nan_value_count=None, lower_bound='$2', upper_bound='vivo $2'), device_id=Row(column_size=110112, value_count=122235, null_value_count=0, nan_value_count=None, lower_bound='-1001669954', upper_bound='998766634'), event_date=Row(column_size=2154, value_count=122235, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2021, 1, 2, 0, 0), upper_bound=datetime.datetime(2023, 8, 23, 0, 0)), event_time=Row(column_size=558656, value_count=122235, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2021, 1, 2, 0, 14, 23, 80000), upper_bound=datetime.datetime(2023, 8, 23, 23, 59, 57, 399000)), host=Row(column_size=9324, 
value_count=122235, null_value_count=0, nan_value_count=None, lower_bound='accc.techcreator', upper_bound='zachwilson.techd'), os_family=Row(column_size=16872, value_count=122235, null_value_count=0, nan_value_count=None, lower_bound='Android', upper_bound='iOS'), referrer=Row(column_size=87432, value_count=122235, null_value_count=53009, nan_value_count=None, lower_bound='3.220.57.224', upper_bound='zachwilson.tech'), url=Row(column_size=284335, value_count=122235, null_value_count=0, nan_value_count=None, lower_bound='/', upper_bound='/zz.php'), user_id=Row(column_size=442464, value_count=122235, null_value_count=8, nan_value_count=None, lower_bound='-1000015881', upper_bound='999882344'))"


In [22]:
%%sql

-- Preview a handful of rows from the sorted table. The LIMIT keeps this a
-- cheap sanity check instead of an unbounded scan of the whole table; without
-- an ORDER BY, which rows come back is not guaranteed.
SELECT *
FROM demo.bootcamp.events_sorted
LIMIT 10

                                                                                

device_id,user_id,referrer,host,url,event_time,browser_family,os_family,device_family,event_date
589185851,-21136712,,admin.zachwilson.tech,/,2021-01-12 18:49:28.425000,Chrome,Linux,Other,2021-01-12 00:00:00
589185851,-414920062,,admin.zachwilson.tech,/,2021-01-12 18:54:30.995000,Chrome,Linux,Other,2021-01-12 00:00:00
589185851,-414920062,,admin.zachwilson.tech,/,2021-01-12 19:56:56.809000,Chrome,Linux,Other,2021-01-12 00:00:00
589185851,-694958230,,admin.zachwilson.tech,/,2021-01-12 20:08:15.964000,Chrome,Linux,Other,2021-01-12 00:00:00
-843023486,-2116612468,,www.eczachly.com,/,2021-01-12 01:13:53.762000,Chrome,Mac OS X,Other,2021-01-12 00:00:00
-843023486,-2116612468,https://www.eczachly.com/,www.eczachly.com,/blog,2021-01-12 01:13:56.914000,Chrome,Mac OS X,Other,2021-01-12 00:00:00
-843023486,-2116612468,https://www.eczachly.com/blog,www.eczachly.com,/graphs,2021-01-12 01:13:58.899000,Chrome,Mac OS X,Other,2021-01-12 00:00:00
-843023486,-2116612468,https://www.eczachly.com/graphs,www.eczachly.com,/graphs,2021-01-12 01:14:01.017000,Chrome,Mac OS X,Other,2021-01-12 00:00:00
-843023486,-2116612468,https://www.eczachly.com/graphs,www.eczachly.com,/about,2021-01-12 01:14:02.615000,Chrome,Mac OS X,Other,2021-01-12 00:00:00
-843023486,-2116612468,https://www.eczachly.com/about,www.eczachly.com,/contact,2021-01-12 01:14:40.193000,Chrome,Mac OS X,Other,2021-01-12 00:00:00


In [90]:
%%sql
-- Total bytes and number of data files for bootcamp.events.
-- NOTE(review): this cell's execution count (90) is out of sequence and no
-- visible cell writes to bootcamp.events, so the recorded result depends on
-- state created elsewhere -- confirm before relying on it after a fresh run.
SELECT SUM(file_size_in_bytes) as size, COUNT(1) as num_files FROM demo.bootcamp.events.files;

size,num_files
3145713,5


In [None]:
%%sql 
-- Count the data files of bootcamp.matches_bucketed.
-- NOTE(review): bootcamp.matches_bucketed is not created anywhere in the cells
-- shown here -- presumably it comes from another notebook; verify it exists
-- before running this cell on a fresh environment.
SELECT COUNT(1) FROM bootcamp.matches_bucketed.files

count(1)
3665
