In [1]:
# get spark session, 2g mem per executor
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import os

# Select the Python interpreter PySpark should use (the lab2 conda env).
os.environ['PYSPARK_PYTHON'] = "/opt/conda3/envs/lab2/bin/python"

# Build (or reuse) a Hive-enabled session against the standalone master.
spark = (
    SparkSession.builder
    .appName("ProcessVetTwitteText")
    .master("spark://node01:10077")
    .enableHiveSupport()
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .config("spark.cores.max", "3")
    .config("spark.sql.shuffle.partitions", "12")
    # -1 turns automatic broadcast joins off.
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .getOrCreate()
)

# RDD-level entry point used by the cells below.
sc = spark.sparkContext

23/05/31 13:30:09 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/31 13:30:10 WARN spark.SparkContext: Please ensure that the number of slots available on your executors is limited by the number of cores to task cpus and not another custom resource. If cores is not the limiting resource then dynamic allocation will not work properly!


In [2]:
# import jieba and set stop word set
import jieba
import jieba.analyse

# Read the stop-word file from HDFS (one RDD element per line) and pull it
# to the driver as a set for fast membership tests / set difference.
stop_words_rdd = sc.textFile("hdfs:///user/spark_temp/stopwords.dat")
stop_words_set = set()
stop_words_set.update(stop_words_rdd.collect())

                                                                                

In [3]:
# define map functions 
from datetime import datetime

# Run date as YYYY-MM-DD, evaluated once when this cell runs and stamped on
# every output row by split_key_set_date.
date_string = f"{datetime.today():%Y-%m-%d}"
# cut item name and description
def cut_name_and_desc(item):
    """Tokenize one item's name and description with jieba.

    Args:
        item: A 3-element row ``(id, name, description)``. ``name`` and
            ``description`` may be ``None`` (e.g. NULL columns); they are
            then treated as empty text.

    Returns:
        A 5-tuple ``(id, name_cut, name_cut_pure, desc_cut, desc_cut_pure)``.
        Each ``*_cut`` is the set of distinct tokens of the field and each
        ``*_cut_pure`` is that set with ``stop_words_set`` removed.

    NOTE(review): tokens are de-duplicated per row (sets), so a word repeated
    inside one name/description contributes only once to the downstream
    counts -- confirm this is intended.
    """
    item_id, name, desc = item
    # Tokenize each field exactly once and derive the filtered set from it;
    # `or ""` keeps jieba from failing on None input.
    name_cut = set(jieba.cut(name or ""))
    desc_cut = set(jieba.cut(desc or ""))
    name_cut_pure = name_cut - stop_words_set
    desc_cut_pure = desc_cut - stop_words_set
    return (item_id, name_cut, name_cut_pure, desc_cut, desc_cut_pure)

# emit one ((id, word), 1) pair per stop-word-free title token
def title_to_count(item):
    """Yield ``((id, word), 1)`` for every word in the item's filtered title set.

    Args:
        item: The 5-tuple ``(id, name_cut, name_cut_pure, desc_cut,
            desc_cut_pure)``; only ``id`` and ``name_cut_pure`` are used.

    Yields:
        ``((id, word), 1)`` pairs, one per element of ``name_cut_pure``.
    """
    item_id, _name_cut, title_words, _desc_cut, _desc_words = item
    yield from (((item_id, word), 1) for word in title_words)
        
# emit one ((id, word), 1) pair per stop-word-free description token
def content_to_count(item):
    """Yield ``((id, word), 1)`` for every word in the item's filtered description set.

    Args:
        item: The 5-tuple ``(id, name_cut, name_cut_pure, desc_cut,
            desc_cut_pure)``; only ``id`` and ``desc_cut_pure`` are used.

    Yields:
        ``((id, word), 1)`` pairs, one per element of ``desc_cut_pure``.
    """
    item_id, _name_cut, _title_words, _desc_cut, body_words = item
    yield from (((item_id, word), 1) for word in body_words)
        
# transform ((id, key), count) into (id, key, count, date)
def split_key_set_date(item):
    """Flatten a reduced word-count pair and append the run date.

    Args:
        item: A pair ``((id, key), count)``.

    Returns:
        The 4-tuple ``(id, key, count, date)`` where ``date`` is the
        module-level ``date_string``.
    """
    (item_id, word), count = item
    return item_id, word, count, date_string

In [4]:
# Load (id, name, description) rows for the 'twitte' category as an RDD.
all_items = spark.sql(
    "select id,name,description from item_ods where category = 'twitte' "
).rdd
# Row count as a quick sanity check on the query.
print(all_items.count())

23/05/31 13:30:28 WARN session.SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.

0


                                                                                

In [5]:
# cut item's name and description
# The tokenized RDD feeds both the title and the content word-count
# pipelines, so cache it to avoid re-running jieba for each downstream action.
cut_items = all_items.map(cut_name_and_desc).cache()
# Count the tokenized RDD (not all_items again): this actually runs the map,
# surfacing tokenization errors here and materializing the cache.
print(cut_items.count())

0


In [6]:
# Word count per (item id, word) for titles: emit pairs, sum them, then
# flatten each result to (id, word, count, date).
item_title_word_count = (
    cut_items
    .flatMap(title_to_count)
    .reduceByKey(lambda left, right: left + right)
    .map(split_key_set_date)
)
print(item_title_word_count.count())

# Same pipeline for the description text.
item_content_word_count = (
    cut_items
    .flatMap(content_to_count)
    .reduceByKey(lambda left, right: left + right)
    .map(split_key_set_date)
)
print(item_content_word_count.count())

                                                                                

0
0


In [7]:
# create a table for wordcount
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Row layout shared by both word-count tables; every column is nullable.
schema = (
    StructType()
    .add("item_id", IntegerType(), True)
    .add("key_word", StringType(), True)
    .add("word_count", IntegerType(), True)
    .add("date", StringType(), True)
)

# Turn each word-count RDD into a DataFrame and register it as a temp view.
# NOTE(review): the RDD variable names are rebound to DataFrames here, so
# re-running this cell alone feeds a DataFrame back into createDataFrame --
# re-run the word-count cell first.
item_title_word_count = spark.createDataFrame(item_title_word_count, schema)
item_title_word_count.createOrReplaceTempView("vet_twitte_title_word_count")

item_content_word_count = spark.createDataFrame(item_content_word_count, schema)
item_content_word_count.createOrReplaceTempView("vet_twitte_content_word_count")

In [8]:
# Write the DataFrames to Hive, partitioned by the run date.
# NOTE(review): mode("overwrite") with saveAsTable presumably replaces the
# whole table, including partitions from earlier dates -- confirm that
# keeping only the latest run is intended.
(
    item_title_word_count.write
    .mode("overwrite")
    .partitionBy("date")
    .saveAsTable("vet_twitte_title_word_count")
)
(
    item_content_word_count.write
    .mode("overwrite")
    .partitionBy("date")
    .saveAsTable("vet_twitte_content_word_count")
)

In [9]:
# Stop the Spark session at the end of the notebook; cells above must be
# re-run from the session-creation cell before Spark can be used again.
spark.stop()