## 총 매출이 가장 많은 사용자 10명 찾기

In [0]:
def _load_csv(path):
    """Read the CSV at ``path`` with a header row and Spark-inferred column types."""
    reader = spark.read.format("csv")
    reader = reader.option("header", "true")
    reader = reader.option("inferSchema", "true")
    return reader.load(path)


# The three source tables, all read from the same `readonly/` S3 prefix.
df_user_session_channel = _load_csv("s3a://s3-geospatial/readonly/user_session_channel.csv")
df_session_timestamp = _load_csv("s3a://s3-geospatial/readonly/session_timestamp.csv")
df_session_transaction = _load_csv("s3a://s3-geospatial/readonly/session_transaction.csv")

In [0]:
# Register each DataFrame as a temp view so it can be referenced by name
# from spark.sql(); an existing view with the same name is replaced.
for view_name, frame in (
    ("user_session_channel", df_user_session_channel),
    ("session_timestamp", df_session_timestamp),
    ("session_transaction", df_session_transaction),
):
    frame.createOrReplaceTempView(view_name)

In [0]:
# Preview the first 5 rows of the user / session / channel table.
df_user_session_channel.show(n=5)

+------+--------------------+--------+
|userid|           sessionid| channel|
+------+--------------------+--------+
|   184|c41dd99a69df04044...|   Naver|
|    80|fdc0eb412a84fa549...| Organic|
|   251|0a54b19a13b6712dc...|Facebook|
|   264|a914ecef9c12ffdb9...|  Google|
|   744|05ae14d7ae387b933...|Facebook|
+------+--------------------+--------+
only showing top 5 rows



In [0]:
# Preview the first 5 rows of the session / timestamp table.
df_session_timestamp.show(n=5)

+--------------------+--------------------+
|           sessionid|                  ts|
+--------------------+--------------------+
|7cdace91c487558e2...|2019-05-01 00:13:...|
|94f192dee566b018e...|2019-05-01 00:49:...|
|7ed2d3454c5eea711...|2019-05-01 10:18:...|
|f1daf122cde863010...|2019-05-01 13:10:...|
|fd0efcca272f704a7...|2019-05-01 13:45:...|
+--------------------+--------------------+
only showing top 5 rows



In [0]:
# Preview the first 5 rows of the session / refunded / amount table.
df_session_transaction.show(n=5)

+--------------------+--------+------+
|           sessionid|refunded|amount|
+--------------------+--------+------+
|00029153d12ae1c9a...|   false|    85|
|008909bd27b680698...|   false|    13|
|0107acb41ef20db22...|   false|    16|
|018544a2c48077d2c...|   false|    39|
|020c38173caff0203...|   false|    61|
+--------------------+--------+------+
only showing top 5 rows



In [0]:
# Per-user revenue, keeping the 10 users with the largest gross total:
#   revenue     - sum of every transaction amount joined to the user's sessions
#   net_revenue - the same sum restricted to rows where refunded is false
# GROUP BY 1 / ORDER BY 2 refer to the select-list positions (userid, revenue).
top_revenue_sql = """
    SELECT userid,
        SUM(str.amount) revenue,
        SUM(CASE WHEN str.refunded = False THEN str.amount END) net_revenue
    FROM user_session_channel usc
    JOIN session_transaction str ON usc.sessionid = str.sessionid
    GROUP BY 1
    ORDER BY 2 DESC
    LIMIT 10"""
top_rev_user_df = spark.sql(top_revenue_sql)

In [0]:
# Print the per-user revenue / net_revenue result of the top-10 query.
top_rev_user_df.show()

+------+-------+-----------+
|userid|revenue|net_revenue|
+------+-------+-----------+
|   989|    743|        743|
|   772|    556|        556|
|  1615|    506|        506|
|   654|    488|        488|
|  1651|    463|        463|
|   973|    438|        438|
|   262|    422|        422|
|  1099|    421|        343|
|  2682|    414|        414|
|   891|    412|        412|
+------+-------+-----------+



In [0]:
# Same top-10 question answered with a window function: total each user's
# transaction amounts, rank the totals in descending order, and keep the
# first 10 rows when ordered by that rank.
ranked_revenue_sql = """
SELECT
  userid,
  SUM(amount) total_amount, 
 	RANK() OVER (ORDER BY SUM(amount) DESC) rank
FROM session_transaction st
JOIN user_session_channel usc ON st.sessionid = usc.sessionid
GROUP	BY userid
ORDER BY rank
LIMIT 10"""
top_rev_user_df_rank = spark.sql(ranked_revenue_sql)

In [0]:
# Print the ranked per-user totals (userid, total_amount, rank).
top_rev_user_df_rank.show()

+------+------------+----+
|userid|total_amount|rank|
+------+------------+----+
|   989|         743|   1|
|   772|         556|   2|
|  1615|         506|   3|
|   654|         488|   4|
|  1651|         463|   5|
|   973|         438|   6|
|   262|         422|   7|
|  1099|         421|   8|
|  2682|         414|   9|
|   891|         412|  10|
+------+------------+----+

