In [2]:
from pyspark.sql import SparkSession

# Get the SparkSession used by the rest of the notebook; getOrCreate() hands
# back an already-running session when there is one instead of starting anew.
spark = SparkSession.builder.appName(
    'Analyzing the vocabulary of Pride and Prejudice'
).getOrCreate()

In [3]:
import requests 
import tempfile

# Download the plain-text edition of the book from Project Gutenberg.
url = 'https://www.gutenberg.org/cache/epub/1342/pg1342.txt'

# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() aborts on an HTTP error instead of letting an error
# page flow into the word counts below.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text

# Spark reads from a path, so stage the text in a temporary file.
# - encoding='utf-8': do not depend on the platform's default encoding, which
#   may be unable to represent every character in the book.
# - newline='': write line endings exactly as downloaded (no OS translation).
# - delete=False: the file must outlive the `with` block because it is read
#   after the handle is closed; nothing in this cell removes it afterwards.
with tempfile.NamedTemporaryFile(
    delete=False, mode='w', encoding='utf-8', newline=''
) as temp_file:
    temp_file.write(text)
    temp_path = temp_file.name

# One row per line of the file, in a single string column named `value`.
book = spark.read.text(temp_path)



In [4]:
# Print the schema of `book` as a tree (a single string column, `value`).
book.printSchema()

root
 |-- value: string (nullable = true)



In [5]:
# Same information as the schema, as a list of (column name, type name) pairs.
print(book.dtypes)

[('value', 'string')]


In [6]:
# Show the first 25 rows, truncating each value to 50 characters

book.show(25,truncate=50, vertical=False)

+--------------------------------------------------+
|                                             value|
+--------------------------------------------------+
|The Project Gutenberg eBook of Pride and Prejudice|
|                                                  |
|This ebook is for the use of anyone anywhere in...|
|most other parts of the world at no cost and wi...|
|whatsoever. You may copy it, give it away or re...|
|of the Project Gutenberg License included with ...|
|at www.gutenberg.org. If you are not located in...|
|you will have to check the laws of the country ...|
|                          before using this eBook.|
|                                                  |
|                        Title: Pride and Prejudice|
|                                                  |
|                               Author: Jane Austen|
|                                                  |
|          Release date: June 1, 1998 [eBook #1342]|
|                Most recently updated: June 1

In [7]:
from pyspark.sql.functions import split

# Break each line of the book on single spaces, giving one array of tokens
# per row in a column named `line`. Runs of spaces yield empty-string tokens.
lines = book.select(
    split(book['value'], ' ').alias('line')
)
lines.show(n=5)

+--------------------+
|                line|
+--------------------+
|[The, Project, Gu...|
|          [, , , , ]|
|[This, ebook, is,...|
|[most, other, par...|
|[whatsoever., You...|
+--------------------+
only showing top 5 rows



In [8]:
from pyspark.sql.functions import explode, col

# Flatten the token arrays: every array element becomes its own row in a
# column named `word`.
words = lines.select(
    explode(lines['line']).alias('word')
)
words.show(n=5)

+---------+
|     word|
+---------+
|      The|
|  Project|
|Gutenberg|
|    eBook|
|       of|
+---------+
only showing top 5 rows



In [9]:
from pyspark.sql.functions import lower

# Lower-case every token so that counting is case-insensitive.
words_lower = words.select(
    lower(words['word']).alias('word_lower')
)
words_lower.show(n=5)

+----------+
|word_lower|
+----------+
|       the|
|   project|
| gutenberg|
|     ebook|
|        of|
+----------+
only showing top 5 rows



In [10]:
from pyspark.sql.functions import regexp_extract

# Keep only the first run of lowercase ASCII letters in each token (group 0 is
# the whole match). Tokens containing no such run come out as empty strings,
# as the blank rows in the preview show.
first_letter_run = regexp_extract(words_lower['word_lower'], '[a-z]+', 0)
words_clean = words_lower.select(first_letter_run.alias('word'))
words_clean.show(n=15)

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|         |
|         |
|         |
|         |
|         |
|     this|
|    ebook|
+---------+
only showing top 15 rows



In [11]:
# Discard the empty strings left behind by tokens that held no letters.
words_nonull = words_clean.where(words_clean['word'] != '')
words_nonull.show(n=15)

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
+---------+
only showing top 15 rows



In [12]:
# Print the schema of the cleaned words (a single string column, `word`).
words_nonull.printSchema()

root
 |-- word: string (nullable = false)



In [15]:
# Count how many times each distinct word occurs, then display the most
# frequent ones first (show() prints the top 20 rows by default).
results = words_nonull.groupBy('word').count()
results.orderBy('count', ascending=False).show()

+----+-----+
|word|count|
+----+-----+
| the| 4842|
|  to| 4399|
|  of| 3959|
| and| 3785|
| her| 2281|
|   i| 2105|
|   a| 2080|
|  in| 2039|
| was| 1877|
| she| 1744|
|that| 1639|
|  it| 1597|
| not| 1526|
| you| 1444|
|  he| 1359|
| his| 1302|
|  be| 1280|
|  as| 1240|
| had| 1186|
|with| 1148|
+----+-----+
only showing top 20 rows



In [16]:
# Save the word counts as CSV. mode='overwrite' replaces the output of a
# previous run; with the default save mode this cell raises an error whenever
# the target path already exists, so the notebook could only be run once.
results.write.csv('data/vocabulary_count.csv', mode='overwrite')