## pyspark_wordcount

In [1]:
from pyspark.sql import (
    Row,
    SparkSession)
import pyspark.sql.functions as F

In [2]:
spark=(
    SparkSession
    .builder
    .appName("word-count")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/30 05:43:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df=spark.read.text("file:///workspace/data/word_count.txt")

In [6]:
words=df.select(
    F.explode(
        F.split(F.col("value"),' ')
    ).alias("word")
)
words.show()

+-----------+
|       word|
+-----------+
|      Spark|
|         is|
|          a|
|       fast|
|distributed|
| processing|
|     engine|
|      Spark|
|         is|
|     widely|
|       used|
|         in|
|       data|
|engineering|
|       Data|
|  engineers|
|        use|
|      Spark|
|        for|
|      large|
+-----------+
only showing top 20 rows



In [12]:
count=words.select(
   F.col("word")
).groupBy(F.col("word")).agg(
    F.count("*").alias("count")
).orderBy(F.col("count").desc())
count.show()

+-----------+-----+
|       word|count|
+-----------+-----+
|      Spark|   41|
|         is|   18|
|       data|   18|
|           |   14|
| processing|   10|
|        can|    9|
|       Data|    7|
|        for|    7|
|         in|    7|
|engineering|    7|
|  important|    5|
|        API|    4|
|distributed|    4|
|  streaming|    4|
|        use|    4|
|       jobs|    4|
|    systems|    4|
|        run|    4|
|     Hadoop|    4|
|      makes|    3|
+-----------+-----+
only showing top 20 rows



### 더 보완 해야할 것이 있을까?

1. 대소문자 문제
2. 빈 문자열 문제
3. `,`,`.` 등 문제

3가지 문제를 해결해보고 개수를 다시확인해보자!

In [13]:
words=df.select(
    F.explode(
        F.split(F.col("value"),r"\s+")
    ).alias("word")
).filter(F.col("word")!="")
words.show()

+-----------+
|       word|
+-----------+
|      Spark|
|         is|
|          a|
|       fast|
|distributed|
| processing|
|     engine|
|      Spark|
|         is|
|     widely|
|       used|
|         in|
|       data|
|engineering|
|       Data|
|  engineers|
|        use|
|      Spark|
|        for|
|      large|
+-----------+
only showing top 20 rows



In [32]:
# groupBy를 실행한 직후는 GroupdData객체, 이 객체는 메서드 제공
# 단, 이름 변경불가, 한번에 하나만 계산가능 
count=(
    words
    .groupBy("word")
    .count()
    .orderBy(F.desc("count"))
)
count.show()

+-----------+-----+
|       word|count|
+-----------+-----+
|      Spark|   41|
|         is|   18|
|       data|   18|
| processing|   10|
|        can|    9|
|       Data|    7|
|        for|    7|
|         in|    7|
|engineering|    7|
|  important|    5|
|        API|    4|
|distributed|    4|
|  streaming|    4|
|        use|    4|
|       jobs|    4|
|    systems|    4|
|        run|    4|
|     Hadoop|    4|
|      makes|    3|
|         be|    3|
+-----------+-----+
only showing top 20 rows

