In [1]:
import findspark
findspark.init('/home/hadoop/spark-2.2.2-bin-hadoop2.7')
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
# Configure the application and attach to the standalone cluster master.
conf = SparkConf().setAppName("miniProject").setMaster("spark://master:7077")
# getOrCreate returns the already-running context when this cell is re-run;
# calling SparkContext(conf=conf) a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
# SQL entry point used by the rest of the notebook to read parquet data.
sqlcontext = SQLContext(sc)

In [2]:
# The four part files live in the same HDFS directory and share the same
# file-name suffix; only the zero-padded part index differs. Building the
# paths from one template avoids repeating the full URI four times.
TOKENS_DIR = "hdfs://master:9000/user/hadoop/all_hangul_tokens"
TOKENS_PART_TEMPLATE = "part-{:05d}-b55b78ed-216f-4c7d-b173-a63291f97ba0-c000.snappy.parquet"
token_paths = [
    "{}/{}".format(TOKENS_DIR, TOKENS_PART_TEMPLATE.format(part_index))
    for part_index in range(4)
]
token_df = sqlcontext.read.parquet(*token_paths)
# Preview the two columns used by the following cells.
token_df.select('title_id', 'tokens').show(5)

+--------+--------------------+
|title_id|              tokens|
+--------+--------------------+
|  112931|[정주행, 하시는, 분들, 추천...|
|  112931|[만, 렙이, 레이, 드, 못,...|
|  112931|[아랑소드, 용사, 시절, 에피...|
|  112931|[본격, 드래곤, 이, 자기, ...|
|  112931|[본격, 세계, 를구한, 드래곤...|
+--------+--------------------+
only showing top 5 rows



In [4]:
import pyspark.sql.functions as F
def split_udf(col):
    """Return `col` with its first and last character removed.

    Despite the historical name this is no longer a Python UDF: it builds a
    native Spark SQL substring expression, so rows are not serialised to
    Python workers and a null input yields null instead of raising a
    TypeError inside the worker.

    Parameters
    ----------
    col : Column or str
        String column (or its name) to trim.

    Returns
    -------
    Column
        Characters 2 .. len-1 of `col`; empty string when `col` has fewer
        than three characters.
    """
    if not hasattr(col, 'substr'):
        # Accept a column name as well as a Column, as F.udf wrappers do.
        col = F.col(col)
    # Spark's substring is 1-based: start at position 2, keep length-2 chars.
    return col.substr(F.lit(2), F.length(col) - 2)


# `tokens` is treated as a bracketed string such as "[a, b, c]": drop the
# enclosing brackets, then split on ", " to obtain an array of tokens.
token_df = token_df.select('title_id', 'tokens').withColumn(
    'split', F.split(split_udf(token_df.tokens), ', ')
)
# One row per (title_id, token) with a literal 1 alongside; tokens of a
# single character (or empty) are discarded.
mapped_df = (
    token_df
    .select(token_df.title_id, F.explode(token_df.split).alias('token'), F.lit(1))
    .filter(F.length('token') > 1)
)
mapped_df.show(5)

+--------+-----+---+
|title_id|token|  1|
+--------+-----+---+
|  112931|  정주행|  1|
|  112931|  하시는|  1|
|  112931|   분들|  1|
|  112931|   추천|  1|
|  112931|   한번|  1|
+--------+-----+---+
only showing top 5 rows



In [11]:
# Number of occurrences of each token within each title, listed per title
# from most to least frequent.
token_count = (
    mapped_df
    .groupBy('title_id', 'token')
    .count()
    .orderBy('title_id', F.desc('count'))
)
# The default save mode raises if the target path already exists, which makes
# this cell fail on every re-run. The output is fully derived from mapped_df,
# so replacing it is safe and keeps the cell idempotent.
token_count.write.mode('overwrite').parquet('hdfs://master:9000/user/hadoop/token_count')

In [14]:
# Reload the per-title token counts from the parquet dataset on HDFS and
# preview the first rows.
token_count_path = 'hdfs://master:9000/user/hadoop/token_count'
token_count = sqlcontext.read.format('parquet').load(token_count_path)
token_count.show(5)

+--------+-----+-----+
|title_id|token|count|
+--------+-----+-----+
|  119874|  수집이|    3|
|  119874| 전쟁터가|    3|
|  119874|  푸시고|    3|
|  119874|  미간이|    3|
|  119874|  바다에|    3|
+--------+-----+-----+
only showing top 5 rows



In [16]:
# Total occurrences of each token across all titles. The dict form of agg
# produces the same `sum(count)` column name as GroupedData.sum('count').
tokens_grouped = token_count.groupBy('token')
token_count_sum = tokens_grouped.agg({'count': 'sum'})
token_count_sum.show(5)

+-----+----------+
|token|sum(count)|
+-----+----------+
|  도발을|        64|
|  듯요전|         3|
|  시다고|        98|
|  손대는|        41|
|   외려|        35|
+-----+----------+
only showing top 5 rows

