In [1]:
# mysql connection
import getpass
import os

import mysql.connector

# Open the MySQL connection used to read the `items` table.
# The password is no longer hard-coded: it is read from the COWSTUDIO_DB_PASSWORD
# environment variable, falling back to an interactive prompt when that is unset.
# NOTE(review): the password previously committed in this notebook should be rotated.
mydb = mysql.connector.connect(
  host="cowstudio.wayne-lee.cn",
  user="cowstudio",
  password=os.environ.get("COWSTUDIO_DB_PASSWORD") or getpass.getpass("MySQL password for cowstudio: "),
  database="cowstudio"
)

In [2]:
# get spark session, 2g mem per executor
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import os

# Tell PySpark which Python interpreter to use (the "lab2" conda environment).
os.environ['PYSPARK_PYTHON'] = "/opt/conda3/envs/lab2/bin/python"

# Build (or reuse) the session against the standalone master.
spark = (
    SparkSession.builder
    .appName("tiem_tfidf")
    .master("spark://node01:10077")
    .config("spark.driver.memory", "2g")                   # driver heap
    .config("spark.executor.memory", "2g")                 # 2g per executor
    .config("spark.cores.max", "3")                        # cap on total cores for this app
    .config("spark.sql.shuffle.partitions", "12")          # partitions used by SQL shuffles
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")  # -1 turns automatic broadcast joins off
    .getOrCreate()
)

# The underlying SparkContext, used by the RDD-based cells.
sc = spark.sparkContext

23/04/20 09:25:25 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/20 09:25:26 WARN spark.SparkContext: Please ensure that the number of slots available on your executors is limited by the number of cores to task cpus and not another custom resource. If cores is not the limiting resource then dynamic allocation will not work properly!


In [3]:
# import jieba and set stop word set
import jieba
import jieba.analyse

# Read the stop-word file from HDFS (one RDD element per line) ...
stop_words_rdd = sc.textFile("hdfs:///user/spark_temp/stopwords.dat")
# ... and pull every line back to the driver as a set for fast membership tests.
stop_words_set = {stop_word for stop_word in stop_words_rdd.collect()}

                                                                                

In [4]:
# define map functions 
from datetime import datetime

# Today's local date as 'YYYY-MM-DD'; evaluated once when this cell runs and
# attached to every word-count row by split_key_set_date.
date_string = format(datetime.today(), '%Y-%m-%d')
# cut item name and description
def cut_name_and_desc(item):
    """Tokenise an item's name and description with jieba.

    Args:
        item: an ``(id, name, description)`` tuple. ``name`` and
            ``description`` may be ``None`` (e.g. a NULL column); they are
            then treated as empty text.

    Returns:
        ``(id, name_cut, name_cut_pure, desc_cut, desc_cut_pure)`` where each
        ``*_cut`` is the set of distinct tokens of the field and each
        ``*_cut_pure`` is that set with the members of ``stop_words_set``
        removed.
    """
    item_id, name, desc = item
    # Tokenise each field exactly once and reuse the result; `or ""` keeps
    # jieba from failing on a missing (None) field.
    name_cut = set(jieba.cut(name or ""))
    desc_cut = set(jieba.cut(desc or ""))
    name_cut_pure = name_cut - stop_words_set
    desc_cut_pure = desc_cut - stop_words_set
    return (item_id, name_cut, name_cut_pure, desc_cut, desc_cut_pure)

# map item's cut list to word count
def to_count(item):
    """Yield one ``((id, word), 1)`` pair per stop-word-free token of an item.

    ``item`` is the 5-tuple produced by ``cut_name_and_desc``. Only the two
    filtered token sets are used: first every word of the name, then every
    word of the description. A word present in both is therefore emitted twice.
    """
    item_id, _name_cut, name_words, _desc_cut, desc_words = item
    for words in (name_words, desc_words):
        yield from (((item_id, word), 1) for word in words)
        
# transform ((id, key), count) into (id, key, count, date)
def split_key_set_date(item):
    """Flatten a reduced ``((id, key), count)`` pair and append the run date.

    Returns the tuple ``(id, key, count, date)`` where ``date`` is the
    module-level ``date_string``.
    """
    (item_id, word), count = item
    return item_id, word, count, date_string

In [5]:
# Fetch id, name and description of every item from MySQL and turn the rows into an RDD.
# The query returns the rows in random order (ORDER BY RAND()).
cur = mydb.cursor()
try:
    cur.execute("SELECT id,name,description from items order by RAND() ")
    result = cur.fetchall()
finally:
    # Release the cursor even when the query fails.
    cur.close()

# Distribute the fetched (id, name, description) tuples across the cluster.
all_items = sc.parallelize(result)
print(all_items.count())



433


                                                                                

In [6]:
# cut item's name and description (map is lazy: nothing runs until an action is called)
cut_items = all_items.map(cut_name_and_desc)
# Count the *mapped* RDD so the tokenisation actually executes and is checked here;
# counting all_items again would not touch cut_items at all.
print(cut_items.count())

433


In [7]:
# Word count: emit ((item_id, word), 1) pairs, sum them per key, then flatten
# each result into an (item_id, word, count, date) row.
item_word_count = (
    cut_items
    .flatMap(to_count)
    .reduceByKey(lambda left, right: left + right)
    .map(split_key_set_date)
)
print(item_word_count.count())



4638


                                                                                

In [8]:
# create a table for wordcount
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Column layout of the word-count rows; every column is nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("item_id", IntegerType()),
        ("key_word", StringType()),
        ("word_count", IntegerType()),
        ("date", StringType()),
    )
])

# Convert the word-count RDD into a DataFrame (the name is rebound from the RDD
# to the DataFrame), expose it to Spark SQL as a temp view and preview it.
item_word_count = spark.createDataFrame(item_word_count, schema)
item_word_count.createOrReplaceTempView("item_word_count")
item_word_count.show()

+-------+--------+----------+----------+
|item_id|key_word|word_count|      date|
+-------+--------+----------+----------+
|    113|    黑牛|         2|2023-04-20|
|    113|适应能力|         1|2023-04-20|
|    113|    适宜|         1|2023-04-20|
|    113|东北地区|         1|2023-04-20|
|    113|    营养|         1|2023-04-20|
|    113|    肉质|         1|2023-04-20|
|    113|    品种|         1|2023-04-20|
|    113|    生长|         1|2023-04-20|
|     89|    牛肉|         2|2023-04-20|
|     89|  矿物质|         1|2023-04-20|
|     89|    细嫩|         1|2023-04-20|
|     89|营养成分|         1|2023-04-20|
|     89|味道鲜美|         1|2023-04-20|
|    333|    出租|         2|2023-04-20|
|    333|    租赁|         1|2023-04-20|
|    333|    灵活|         1|2023-04-20|
|    333|    服务|         1|2023-04-20|
|    371|      牛|         2|2023-04-20|
|    371|      质|         1|2023-04-20|
|    371|    结实|         1|2023-04-20|
+-------+--------+----------+----------+
only showing top 20 rows



In [None]:
# Write the DataFrame to MySQL (table `item_word_count`, save mode "overwrite").
# The password is no longer hard-coded: it is read from the COWSTUDIO_DB_PASSWORD
# environment variable, falling back to an interactive prompt when that is unset.
item_word_count.write.format("jdbc") \
    .option("url", "jdbc:mysql://cowstudio.wayne-lee.cn:3306/cowstudio") \
    .option("driver", "com.mysql.cj.jdbc.Driver") \
    .option("dbtable", "item_word_count") \
    .option("user", "cowstudio") \
    .option("password", os.environ.get("COWSTUDIO_DB_PASSWORD") or getpass.getpass("MySQL password for cowstudio: ")) \
    .save(mode="overwrite")


[Stage 14:>                                                         (0 + 3) / 3]

In [9]:
# compute IDF
# One output row per key_word, read from the `item_word_count` temp view:
#   item_num_has_word - number of distinct items that contain the word
#   item_num_all      - total number of distinct items; it comes from the one-row
#                       subquery `a`, which is joined to every row without a condition
#   idf               - log10(item_num_all / item_num_has_word)
#   date              - the largest `date` value among the word's rows
# Rows are sorted by idf, highest first.
item_word_idf = spark.sql('''
select
    key_word,
    count(distinct item_id) as item_num_has_word,
    max(a.item_num) as item_num_all,
    log10(max(a.item_num)/ count(distinct item_id)) as idf,
    max(date) as date
from
    item_word_count,(
        select
            count(distinct item_id) as item_num
        from
            item_word_count
    ) as a
group by
    key_word
order by
    idf desc
''')
# Register the result as the temp view `item_word_idf` and preview it.
item_word_idf.createOrReplaceTempView("item_word_idf")
item_word_idf.show()



+--------+-----------------+------------+------------------+----------+
|key_word|item_num_has_word|item_num_all|               idf|      date|
+--------+-----------------+------------+------------------+----------+
|    吉利|                1|         433|2.6364878963533656|2023-04-20|
|    每日|                1|         433|2.6364878963533656|2023-04-20|
|    后代|                1|         433|2.6364878963533656|2023-04-20|
|      嘴|                1|         433|2.6364878963533656|2023-04-20|
|    山地|                1|         433|2.6364878963533656|2023-04-20|
|    书籍|                1|         433|2.6364878963533656|2023-04-20|
|    弹牙|                1|         433|2.6364878963533656|2023-04-20|
|产品质量|                1|         433|2.6364878963533656|2023-04-20|
|    强化|                1|         433|2.6364878963533656|2023-04-20|
|    优秀|                1|         433|2.6364878963533656|2023-04-20|
|    感染|                1|         433|2.6364878963533656|2023-04-20|
|    储藏|       

                                                                                

In [10]:
# compute TF
# For every (item_id, key_word) row of `item_word_count`:
#   tf = word_count / (sum of word_count over all rows of the same item)
# The per-item totals are computed in the CTE `item_word_num` and joined back on item_id.
item_word_tf = spark.sql('''
with item_word_num as(
    select
        item_id,
        sum(word_count) as word_total_count
    from
        item_word_count
    group by
        item_id
)
select 
    item_word_count.item_id,
    item_word_count.key_word,
    item_word_count.word_count / item_word_num.word_total_count as tf
from
    item_word_count
left join 
    item_word_num on item_word_count.item_id = item_word_num.item_id
''')
# Register as a session temp view, like the other views in this notebook.
# A global temp view would only be reachable as `global_temp.item_word_tf`
# and would raise on re-running this cell because the view already exists.
item_word_tf.createOrReplaceTempView("item_word_tf")
item_word_tf.show()

+-------+--------+-------------------+
|item_id|key_word|                 tf|
+-------+--------+-------------------+
|     14|    牧场|0.10526315789473684|
|     14|    夏日|0.10526315789473684|
|     14|  松花江|0.10526315789473684|
|     14|    繁殖|0.05263157894736842|
|     14|    优良|0.05263157894736842|
|     14|    观赏|0.05263157894736842|
|     14|      月|0.05263157894736842|
|     14|    适合|0.05263157894736842|
|     14|健康状况|0.05263157894736842|
|     14|    家畜|0.05263157894736842|
|     14|    怀孕|0.05263157894736842|
|     14|    母牛|0.10526315789473684|
|     14|      黑|0.10526315789473684|
|     14|    这头|0.05263157894736842|
|     18|    吉林|0.10526315789473684|
|     18|    牛奶|0.05263157894736842|
|     18|    养生|0.05263157894736842|
|     18|    优质|0.05263157894736842|
|     18|    饮用|0.05263157894736842|
|     18|    适合|0.05263157894736842|
+-------+--------+-------------------+
only showing top 20 rows



In [11]:
# compute tf-idf
# NOTE(review): unfinished draft, kept commented out. The `if(tf.tf is )`
# expression below is incomplete, so the query fails to parse (see the
# ParseException captured in this cell's output).
# TODO: complete the tf-idf expression and re-enable the cell.
# item_word_tfidf = spark.sql('''
# select 
#     tf.item_id,
#     tf.key_word,
#     if(tf.tf is )
# from
#     item_word_idf as idf
# left join
#     item_word_tf as tf on idf.key_word = tf.key_word
# ''')

ParseException: 
extraneous input 'is' expecting {')', ','}(line 5, pos 13)

== SQL ==

select 
    tf.item_id,
    tf.key_word,
    if(tf.tf is )
-------------^^^
from
    item_word_idf as idf
left join
    item_word_tf as tf on idf.key_word = tf.key_word


In [5]:
# close spark session and the MySQL connection opened in the first cell
try:
    spark.stop()
finally:
    # Release the database connection even if stopping Spark fails.
    mydb.close()