# Working with text

In this notebook:
- Reading multiple text files into a data frame.
- Processing the text to calculate occurances of unique words.
- Filtering words by length and displaying $N$ most common ones.

In [1]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("Text").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [4]:
spark

---

In [5]:
raw_books = spark.read.text("data/gutenberg_books/")
raw_books.printSchema()
raw_books.count()

root
 |-- value: string (nullable = true)



77910

In [6]:
raw_books.show(n=10, truncate=88)

+-----------------------------------------------------------------------+
|                                                                  value|
+-----------------------------------------------------------------------+
|                                                                       |
|      The Project Gutenberg EBook of Moby Dick; or The Whale, by Herman|
|                                                               Melville|
|                                                                       |
|This eBook is for the use of anyone anywhere at no cost and with almost|
|no restrictions whatsoever.  You may copy it, give it away or re-use it|
|    under the terms of the Project Gutenberg License included with this|
|                                   eBook or online at www.gutenberg.org|
|                                                                       |
|                                                                       |
+-------------------------------------

In [7]:
raw_books.select(F.split("value", pattern=" ")).show(n=10, truncate=88)

+---------------------------------------------------------------------------------------+
|                                                                    split(value,  , -1)|
+---------------------------------------------------------------------------------------+
|                                                                                     []|
|         [The, Project, Gutenberg, EBook, of, Moby, Dick;, or, The, Whale,, by, Herman]|
|                                                                             [Melville]|
|                                                                                     []|
|[This, eBook, is, for, the, use, of, anyone, anywhere, at, no, cost, and, with, almost]|
| [no, restrictions, whatsoever., , You, may, copy, it,, give, it, away, or, re-use, it]|
|        [under, the, terms, of, the, Project, Gutenberg, License, included, with, this]|
|                                             [eBook, or, online, at, www.gutenberg.org]|
|         

In [8]:
word_count = (
    raw_books.select(F.split("value", pattern=" ").alias("words_arr"))
    .select(F.explode("words_arr").alias("word"))
    .select(F.lower("word").alias("word"))
    .select(F.regexp_extract("word", pattern="[a-z]*", idx=0).alias("word"))
    .filter(F.col("word") != "")
    .groupby("word")
    .count()
)

word_count.printSchema()

root
 |-- word: string (nullable = false)
 |-- count: long (nullable = false)



In [9]:
longest_most_common_words = word_count.filter(F.length("word") > 5).orderBy(
    "count", ascending=False
)

longest_most_common_words.show(n=20, truncate=88)

+---------+-----+
|     word|count|
+---------+-----+
|   before| 1275|
|   little| 1208|
|   should| 1110|
|elizabeth|  721|
|  without|  680|
|   though|  672|
|  thought|  630|
|   myself|  594|
|   seemed|  566|
|  nothing|  557|
|  herself|  549|
|gutenberg|  527|
|  himself|  519|
|  through|  510|
|  project|  502|
|   holmes|  460|
|   almost|  458|
|delicious|  453|
|  between|  446|
|  however|  445|
+---------+-----+
only showing top 20 rows



                                                                                

In [10]:
first_letter_count = (
    word_count.filter(F.length("word") > 1)
    .select(F.substring("word", pos=0, len=1).alias("fl"), "count")
    .groupby("fl")
    .agg(F.sum("count").alias("count"))
)

first_letter_count.printSchema()

root
 |-- fl: string (nullable = false)
 |-- count: long (nullable = true)



In [11]:
first_letter_count.orderBy("count").show(n=7)

+---+-----+
| fl|count|
+---+-----+
|  x|   10|
|  z|   47|
|  q| 2018|
|  j| 2535|
|  k| 3703|
|  v| 5214|
|  u| 9240|
+---+-----+
only showing top 7 rows



In [12]:
word_count.filter(F.substring("word", pos=0, len=1) == "x").filter(
    F.length("word") > 1
).show(n=20)

+------+-----+
|  word|count|
+------+-----+
| xxxix|    1|
|   xvi|    1|
|xerxes|    2|
|    xi|    3|
|   xii|    3|
+------+-----+



In [13]:
word_count.filter(F.substring("word", pos=0, len=1) == "z").show(n=20)

+----------+-----+
|      word|count|
+----------+-----+
|       zig|    1|
| zoroaster|    1|
| zeuglodon|    1|
|      zeal|    6|
|zealanders|    1|
|   zealand|    9|
|   zoology|    2|
|     zones|    3|
|      zone|    5|
|    zephyr|    1|
|  zogranda|    1|
|    zodiac|    5|
|       zay|    1|
|     zoned|    2|
|    zigzag|    3|
|      zest|    3|
|      zero|    2|
+----------+-----+



In [14]:
first_letter_count.orderBy("count", ascending=False).show(n=7)

+---+------+
| fl| count|
+---+------+
|  t|106609|
|  a| 68835|
|  s| 55688|
|  h| 52504|
|  w| 52335|
|  o| 45164|
|  i| 40028|
+---+------+
only showing top 7 rows

