In [1]:
# get spark session, 2g mem per executor
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import os

# set python env
os.environ['PYSPARK_PYTHON'] = "/opt/conda3/envs/lab2/bin/python"
spark = SparkSession.builder \
    .appName("CalculateUserScoreAndLevel") \
    .master("spark://node01:10077") \
    .enableHiveSupport()\
    .config("spark.driver.memory", "2g") \
    .config("spark.executor.memory", "2g") \
    .config("spark.cores.max", "3") \
    .config("spark.sql.shuffle.partitions", "12") \
    .config("spark.sql.autoBroadcastJoinThreshold", "-1") \
    .getOrCreate()

sc = spark.sparkContext

23/05/31 14:07:27 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/31 14:07:28 WARN spark.SparkContext: Please ensure that the number of slots available on your executors is limited by the number of cores to task cpus and not another custom resource. If cores is not the limiting resource then dynamic allocation will not work properly!


In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
event_score_data = [
    ('view', 1.0),
    ('click', 4.0),
    ('long_view', 7.0),
    ('add_to_favorites', 20.0),
    ('purchase', 30.0),
    ('search_view', 5.0),
    ('search_click', 10.0)
]

schema = StructType([
    StructField("event_type", StringType(), True),
    StructField("score", DoubleType(), True)
])

event_score_df = spark.createDataFrame(event_score_data, schema)
event_score_df.show()
event_score_df.createOrReplaceTempView("event_score")

                                                                                

+----------------+-----+
|      event_type|score|
+----------------+-----+
|            view|  1.0|
|           click|  4.0|
|       long_view|  7.0|
|add_to_favorites| 20.0|
|        purchase| 30.0|
|     search_view|  5.0|
|    search_click| 10.0|
+----------------+-----+



In [4]:
# define map functions 
from datetime import datetime, timedelta

today_string = datetime.today().strftime('%Y-%m-%d')
history_string = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
print(today_string)
print(history_string)

time_factor = 0.9

2023-05-31
2023-05-30


In [12]:
user_score = spark.sql(f"""
with all_users as(
    select
        distinct user_id
    from
        user_ods
), history_user_score as(
    select
        a.user_id,
        if(b.user_id is null,0,b.score) as score
    from
        all_users a
    left_join
        user_score_and_rec_level b on a.user_id = b.user_id
), today_user_score as(
    select
        a.user_id,
        sum(if(c.score is null,0,c.score)) as score
    from
        all_users a
    left join
        event_ods b on a.user_id = b.user_id and b.timestamp = '{today_string}'
    left join
        event_score c on b.event_type = c.event_type
    group by
        a.user_id
    order by
        score desc
)
select 
    a.user_id,
    b.score + c.score as score,
    '{today_string}' as date
from
    all_users a
left_join
    history_user_score b on a.user_id = b.user_id
left_join
    today_user_score c on a.user_id = c.user_id
order by
    score desc
""")
user_score.show()

ParseException: 
no viable alternative at input 'with all_users as(\n    select\n        distinct user_id\n    from\n        user_ods\n), history_user_score as(\n    select\n        a.user_id,\n        if(b.user_id is null,0,b.score) as score\n    from\n        all_users a\n    left_join'(line 13, pos 4)

== SQL ==

with all_users as(
    select
        distinct user_id
    from
        user_ods
), history_user_score as(
    select
        a.user_id,
        if(b.user_id is null,0,b.score) as score
    from
        all_users a
    left_join
----^^^
        user_score_and_rec_level b on a.user_id = b.user_id
), today_user_score as(
    select
        a.user_id,
        sum(if(c.score is null,0,c.score)) as score
    from
        all_users a
    left join
        event_ods b on a.user_id = b.user_id and b.timestamp = '2023-05-31'
    left join
        event_score c on b.event_type = c.event_type
    group by
        a.user_id
    order by
        score desc
)
select 
    a.user_id,
    b.score + c.score as score,
    '2023-05-31' as date
from
    all_users a
left_join
    history_user_score b on a.user_id = b.user_id
left_join
    today_user_score c on a.user_id = c.user_id
order by
    score desc


In [14]:
user_score_schema = StructType([
    StructField("item_id", StringType(), True),
    StructField("score", DoubleType(), True),
    StructField("date", StringType(), True)
])
user_score = spark.createDataFrame([],user_score_schema)
spark.createDataFrame([],user_score_schema).createOrReplaceTempView("user_score_and_rec_level")

In [17]:
user_score.write.mode("overwrite").partitionBy("date").saveAsTable("user_score_and_rec_level")

In [16]:
spark.sql("show tables").show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| default|     cattle_prod_idf|      false|
| default|      cattle_prod_tf|      false|
| default|   cattle_prod_tfidf|      false|
| default|cattle_prod_word_...|      false|
| default|           event_ods|      false|
| default|     item_fresh_list|      false|
| default|       item_hot_list|      false|
| default|            item_ods|      false|
| default|          item_order|      false|
| default|          item_score|      false|
| default|        item_tag_ods|      false|
| default|     item_word_count|      false|
| default|       item_word_idf|      false|
| default|        item_word_tf|      false|
| default|     item_word_tfidf|      false|
| default|             tag_ods|      false|
| default|                test|      false|
| default|               test2|      false|
| default|user_item_action_...|      false|
| default|            user_ods| 

In [None]:
spark.stop()