In [4]:
# Read the CSV file
root_path = '/home/iceberg/data'

input_path = f'{root_path}/gem/appsflyer/installs_report/app_id={{id123456789,com.appsflyer.referrersender}}/event_date=2024-05-21'

df = spark.read.format('csv').options(header='true').load(input_path)

df = spark.sql("""
select event_name, event_time, install_time, af_prt, media_source, campaign, af_c_id, country_code, appsflyer_id, platform
from {installs_report}
where af_c_id is not null
""", installs_report = df)


# Show the data
df.show(5)

+----------+-------------------+-------------------+------+----------------+--------------------+----------+------------+--------------------+--------+
|event_name|         event_time|       install_time|af_prt|    media_source|            campaign|   af_c_id|country_code|        appsflyer_id|platform|
+----------+-------------------+-------------------+------+----------------+--------------------+----------+------------+--------------------+--------+
|   install|2024-05-21 23:30:52|2024-05-21 23:30:52|  NULL|Apple Search Ads|912_VHDL_SearchTa...|1576541798|          VN|1716308509853-509...|     ios|
|   install|2024-05-21 23:20:35|2024-05-21 23:20:35|  NULL|Apple Search Ads|912_VHDL_SearchTa...|1576541798|          VN|1716307585459-568...|     ios|
|   install|2024-05-21 23:00:50|2024-05-21 23:00:50|  NULL|Apple Search Ads|912_VHDL_FA Plan_...|1576814064|          VN|1716303912074-904...|     ios|
|   install|2024-05-21 22:30:44|2024-05-21 22:30:44|  NULL|Apple Search Ads|912_VHDL_Sea

In [37]:
df = spark.read.parquet('s3a://lakehouse/gem/raw/appsflyer/installs_report/partition_date=2024-05-21')
df.show(1)

+---------------------+---------------------+-------------------+-------------------+----------+------+------------+----------+--------+---------+-------+--------+-----------+-----+--------+----------+----------+-------------+-------------+----------------+-------------------+-------------------------+---------------------+-----------------------+-----------------------+-------------------+-------------------------+---------------------+-----------------------+-----------------------+-------------------+-------------------------+---------------------+-----------------------+-----------------------+------+------------+-----+--------+-----------+--------------+-----+--------+-------+----------+--------------------+--------------+----+----------+----------------+----+----+--------+-----------+----------+-----------+-----------+--------------------+--------------------+--------------+---------------------------+--------------------+----------------------+----------------------+------------

In [49]:
in_app_events = spark.read.parquet('s3a://lakehouse/gem/raw/appsflyer/in_app_events_report/partition_date=2024-05-21')
in_app_events.show(1)

in_app_events.printSchema()

in_app_events.select('event_name').distinct().show(100, False)

+---------------------+---------------------+-------------------+-------------------+--------------------+--------------------+-------------+----------------------+-----------------+------------+--------------------+------+------------+----------+--------+--------+-------+--------+-----------+-----+--------+----------+----------+--------------+-------------+-------------+----------------+-------------------+-------------------------+---------------------+-----------------------+-----------------------+-------------------+-------------------------+---------------------+-----------------------+-----------------------+-------------------+-------------------------+---------------------+-----------------------+-----------------------+------+------------+-----+--------------+-----------+----+------------+----+--------------+--------------+----------+--------------------+--------------+----+----------+----------------+----+----+--------+-----------+----------+-----------+-----------+---------

In [52]:
in_app_events.where("event_name = 'af_purchase'").show(1, False)

+---------------------+---------------------+-------------------+-------------------+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+----------------------+------------------+------------+--------------------+------+------------+----------+--------+--------+-----

In [60]:
in_app_events = spark.read.parquet('s3a://lakehouse/gem/raw/appsflyer/in_app_events_report/partition_date=2024-05-21')

df = spark.sql("""
with in_app_events as (
  select *, '{{product_id}}' as product_id, to_date(event_time) as partition_date, customer_user_id as user_id
  from {in_app_events}
)
select  product_id, user_id, event_revenue_currency as currency
      , min(cast(event_time as timestamp)) as first_purchase_time, max(cast(event_time as timestamp)) as last_purchase_time
      , count(event_time) as number_of_purchases, sum(event_revenue) as revenue, sum(event_revenue_usd) as revenue_usd
      , partition_date
from in_app_events
where event_name = 'af_purchase' and event_revenue is not null and user_id is not null
group by product_id, user_id, event_revenue_currency, partition_date
""", in_app_events = in_app_events)

df.show(5, False)

df.count()

+------------+-------------------+--------+-------------------+-------------------+-------------------+--------+------------------+--------------+
|product_id  |user_id            |currency|first_purchase_time|last_purchase_time |number_of_purchases|revenue |revenue_usd       |partition_date|
+------------+-------------------+--------+-------------------+-------------------+-------------------+--------+------------------+--------------+
|{product_id}|1241363582532603904|VND     |2024-05-21 11:55:28|2024-05-21 16:37:07|7                  |313000.0|12.294880405806936|2024-05-21    |
|{product_id}|1199550335929909248|VND     |2024-05-21 11:03:30|2024-05-21 11:09:26|48                 |2.2824E7|896.5855687118487 |2024-05-21    |
|{product_id}|1232240346301456384|VND     |2024-05-21 16:15:05|2024-05-21 16:23:46|2                  |45000.0 |1.7673395648417247|2024-05-21    |
|{product_id}|1193843416465596416|VND     |2024-05-21 10:39:54|2024-05-21 10:39:54|1                  |10000.0 |0.3928

6

In [58]:
df = spark.read.parquet("s3a://lakehouse/gem/enriched/user/activity/partition_date=2024-05-21")
df.show(1)
df.printSchema()

+----------+-------------------+------------+----------+------+------------+-----------+------------+--------+----------+------------+-------------------+-------------------+-------------------+-------------+-----+
|product_id|            user_id|install_time|install_id|agency|media_source|campaign_id|country_code|platform|os_version|device_model|  registration_time|   first_login_time|    last_login_time|session_count|level|
+----------+-------------------+------------+----------+------+------------+-----------+------------+--------+----------+------------+-------------------+-------------------+-------------------+-------------+-----+
|       gem|1174568175868424192|        NULL|      NULL|  NULL|        NULL|       NULL|        NULL|    NULL|      NULL|        NULL|2024-05-21 10:06:01|2024-05-21 10:06:01|2024-05-21 11:01:43|            3| NULL|
+----------+-------------------+------------+----------+------+------------+-----------+------------+--------+----------+------------+------

In [61]:
df = spark.read.parquet("s3a://lakehouse/gem/enriched/user/purchase/partition_date=2024-05-21")
df.show(1)
df.printSchema()

+----------+-------------------+--------+-------------------+-------------------+-------------------+--------+-----------------+
|product_id|            user_id|currency|first_purchase_time| last_purchase_time|number_of_purchases| revenue|      revenue_usd|
+----------+-------------------+--------+-------------------+-------------------+-------------------+--------+-----------------+
|       gem|1199550335929909248|     VND|2024-05-21 11:03:30|2024-05-21 11:09:26|                 48|2.2824E7|896.5855687118487|
+----------+-------------------+--------+-------------------+-------------------+-------------------+--------+-----------------+
only showing top 1 row

root
 |-- product_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- first_purchase_time: timestamp (nullable = true)
 |-- last_purchase_time: timestamp (nullable = true)
 |-- number_of_purchases: long (nullable = true)
 |-- revenue: double (nullable = true)
 |-- reven

In [64]:
df = spark.read.parquet("s3a://lakehouse/gem/enriched/user_profile")
df.where('media_source is not null').show(1)
df.printSchema()

+----------+-------------------+-------------------+--------------------+------+--------------------+-----------+------------+--------+----------+-------------+-------------------+-------------------+-------------------+-------------+-----+-------------------+-------------------+-------------------+------------------+
|product_id|            user_id|       install_time|          install_id|agency|        media_source|campaign_id|country_code|platform|os_version| device_model|  registration_time|   first_login_time|    last_login_time|session_count|level|first_purchase_time| last_purchase_time|number_of_purchases|       revenue_usd|
+----------+-------------------+-------------------+--------------------+------+--------------------+-----------+------------+--------+----------+-------------+-------------------+-------------------+-------------------+-------------+-----+-------------------+-------------------+-------------------+------------------+
|       gem|1241363582532603904|2024-05-