In [1]:
# get spark session, 2g mem per executor
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import os

# set python env
os.environ['PYSPARK_PYTHON'] = "/opt/conda3/envs/lab2/bin/python"
spark = SparkSession.builder \
    .appName("CalculateCattleProdTFIDF") \
    .master("spark://node01:10077") \
    .enableHiveSupport()\
    .config("spark.driver.memory", "2g") \
    .config("spark.executor.memory", "2g") \
    .config("spark.cores.max", "3") \
    .config("spark.sql.shuffle.partitions", "12") \
    .config("spark.sql.autoBroadcastJoinThreshold", "-1") \
    .getOrCreate()

sc = spark.sparkContext

23/05/31 13:50:12 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/31 13:50:13 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/05/31 13:50:13 WARN spark.SparkContext: Please ensure that the number of slots available on your executors is limited by the number of cores to task cpus and not another custom resource. If cores is not the limiting resource then dynamic allocation will not work properly!


In [2]:
from datetime import datetime

date_string = datetime.today().strftime('%Y-%m-%d')

In [3]:
# compute IDF
item_word_idf = spark.sql('''
select
    key_word,
    count(distinct item_id) as item_num_has_word,
    max(a.item_num) as item_num_all,
    log10(max(a.item_num)/ count(distinct item_id)) as idf,
    max(date) as date
from
    cattle_prod_word_count,(
        select
            count(distinct item_id) as item_num
        from
            cattle_prod_word_count
    ) as a
group by
    key_word
order by
    idf desc
''')
item_word_idf.createOrReplaceTempView("cattle_prod_idf")
item_word_idf.show()
item_word_idf.write.mode("overwrite").partitionBy("date").saveAsTable("cattle_prod_idf")
spark.sql("show tables").show()

23/05/31 13:50:30 WARN session.SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
                                                                                

+--------+-----------------+------------+------------------+----------+
|key_word|item_num_has_word|item_num_all|               idf|      date|
+--------+-----------------+------------+------------------+----------+
|    400g|                1|         112|2.0492180226701815|2023-05-31|
|    不二|                1|         112|2.0492180226701815|2023-05-31|
|  中草药|                1|         112|2.0492180226701815|2023-05-31|
|    之选|                1|         112|2.0492180226701815|2023-05-31|
|    乳酪|                1|         112|2.0492180226701815|2023-05-31|
|    低温|                1|         112|2.0492180226701815|2023-05-31|
|    回味|                1|         112|2.0492180226701815|2023-05-31|
|    尽享|                1|         112|2.0492180226701815|2023-05-31|
|    悠长|                1|         112|2.0492180226701815|2023-05-31|
|    户外|                1|         112|2.0492180226701815|2023-05-31|
|    牛腿|                1|         112|2.0492180226701815|2023-05-31|
|    生于|     

                                                                                

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| default|     cattle_prod_idf|      false|
| default|cattle_prod_word_...|      false|
| default|           event_ods|      false|
| default|     item_fresh_list|      false|
| default|       item_hot_list|      false|
| default|            item_ods|      false|
| default|          item_order|      false|
| default|          item_score|      false|
| default|        item_tag_ods|      false|
| default|     item_word_count|      false|
| default|       item_word_idf|      false|
| default|        item_word_tf|      false|
| default|     item_word_tfidf|      false|
| default|             tag_ods|      false|
| default|                test|      false|
| default|               test2|      false|
| default|user_item_action_...|      false|
| default|            user_ods|      false|
| default|        user_tag_ods|      false|
| default|vet_twitte_conten...| 

In [4]:
# compute TF
item_word_tf = spark.sql(f'''
with item_word_total_num as(
    select
        item_id,
        sum(word_count) as word_total
    from
        cattle_prod_word_count
    group by
        item_id
), item_all as(
    select
        distinct id as item_id
    from
        item_ods
), word_all as(
    select
        distinct key_word
    from
        cattle_prod_word_count
), item_word_all as(
    select
        item_id,
        key_word
    from
        item_all,
        word_all
)
select 
    a.item_id,
    a.key_word,
    if(b.item_id is null or c.word_count is null or b.item_id = 0, 0, c.word_count/b.word_total) as tf,
    '{date_string}' as date
from
    item_word_all a
left join
    item_word_total_num b on a.item_id = b.item_id
left join
    cattle_prod_word_count c on a.item_id = c.item_id and a.key_word = c.key_word
order by
    tf desc
''')
item_word_tf.createOrReplaceGlobalTempView("cattle_prod_tf")
item_word_tf.write.mode("overwrite").partitionBy("date").saveAsTable("cattle_prod_tf")
spark.sql("show tables").show()

                                                                                

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| default|     cattle_prod_idf|      false|
| default|      cattle_prod_tf|      false|
| default|cattle_prod_word_...|      false|
| default|           event_ods|      false|
| default|     item_fresh_list|      false|
| default|       item_hot_list|      false|
| default|            item_ods|      false|
| default|          item_order|      false|
| default|          item_score|      false|
| default|        item_tag_ods|      false|
| default|     item_word_count|      false|
| default|       item_word_idf|      false|
| default|        item_word_tf|      false|
| default|     item_word_tfidf|      false|
| default|             tag_ods|      false|
| default|                test|      false|
| default|               test2|      false|
| default|user_item_action_...|      false|
| default|            user_ods|      false|
| default|        user_tag_ods| 

In [5]:
# compute tf-idf
item_word_tfidf = spark.sql('''
select 
    tf.item_id,
    tf.key_word,
    tf.tf * idf.idf as tfidf,
    tf.date
from
    item_word_tf as tf
left join
    item_word_idf as idf on idf.key_word = tf.key_word
order by
    tfidf desc
''')
item_word_tfidf.createOrReplaceGlobalTempView("cattle_prod_tfidf")
item_word_tfidf.show()
item_word_tfidf.write.mode("overwrite").saveAsTable("cattle_prod_tfidf")
spark.sql("show tables").show()

                                                                                

+-------+--------+------------------+----------+
|item_id|key_word|             tfidf|      date|
+-------+--------+------------------+----------+
|     80|  黑山羊|0.8788292987844551|2023-05-02|
|     39|    延边|0.8788292987844551|2023-05-02|
|    245|咨询服务|0.8637466566534813|2023-05-02|
|    236|    牛类|0.8137711620101613|2023-05-02|
|     79|    牛骨|0.7784859668964614|2023-05-02|
|     78|    红烧|0.7784859668964614|2023-05-02|
|    243|    建设|0.7532822561009616|2023-05-02|
|    279|消化不良|0.7532822561009616|2023-05-02|
|    242|    乳牛|0.7165559425356435|2023-05-02|
|     76|    牛排|0.6781426350084676|2023-05-02|
|    225|  抗生素|0.6672736859112526|2023-05-02|
|    253|    奶粉|0.6672736859112526|2023-05-02|
|    243|    牛棚|0.6672736859112526|2023-05-02|
|    280|  胃肠炎|0.6591219740883414|2023-05-02|
|    214|    储藏|0.6591219740883414|2023-05-02|
|     12|  南犬牛|0.6591219740883414|2023-05-02|
|    232|  牛舒安|0.6591219740883414|2023-05-02|
|     32|      拉|0.6458392973391155|2023-05-02|
|     33|  巴拉巴|

                                                                                

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| default|     cattle_prod_idf|      false|
| default|      cattle_prod_tf|      false|
| default|   cattle_prod_tfidf|      false|
| default|cattle_prod_word_...|      false|
| default|           event_ods|      false|
| default|     item_fresh_list|      false|
| default|       item_hot_list|      false|
| default|            item_ods|      false|
| default|          item_order|      false|
| default|          item_score|      false|
| default|        item_tag_ods|      false|
| default|     item_word_count|      false|
| default|       item_word_idf|      false|
| default|        item_word_tf|      false|
| default|     item_word_tfidf|      false|
| default|             tag_ods|      false|
| default|                test|      false|
| default|               test2|      false|
| default|user_item_action_...|      false|
| default|            user_ods| 

In [6]:
spark.stop()