In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("wordcount").getOrCreate()

In [3]:
df = spark.read.text(r"D:\my_project\word_count_with_Apache_Spark\Monetary-policy-in-English-2023_24-Full-text.txt")
df.show()

+--------------------+
|               value|
+--------------------+
|Unofficial Transl...|
|                    |
|     Monetary Policy|
|         for 2023/24|
|                    |
|   Nepal Rastra Bank|
|      Central Office|
|Baluwatar, Kathmandu|
|           July 2023|
|                    |
| \f\fMonetary Policy|
|         for 2023/24|
|                    |
|        Delivered by|
|Governor Mr. Maha...|
|                  On|
|        23 July 2023|
|                    |
|   Nepal Rastra Bank|
|      Central Office|
+--------------------+
only showing top 20 rows



In [4]:
import pyspark.sql.functions as F


# Split lines into words
words = df.select(F.explode(F.split(df.value, " ")).alias("word"))

# Clean the words (optional): Remove punctuation and convert to lowercase
cleaned_words = words.withColumn("word", F.lower(F.regexp_replace("word", "[^a-zA-Z]", "")))

# Filter out empty strings
cleaned_words = cleaned_words.filter(cleaned_words.word != '')

# Count the words
word_counts = cleaned_words.groupBy("word").count().orderBy("count", ascending=False)

# Show the top 20 words by count
word_counts.show(20, truncate=False)


+--------+-----+
|word    |count|
+--------+-----+
|the     |983  |
|to      |456  |
|of      |442  |
|and     |365  |
|in      |270  |
|for     |167  |
|be      |162  |
|a       |140  |
|percent |135  |
|policy  |125  |
|on      |112  |
|monetary|110  |
|rate    |105  |
|has     |98   |
|sector  |91   |
|is      |90   |
|as      |90   |
|by      |90   |
|foreign |80   |
|bank    |78   |
+--------+-----+
only showing top 20 rows



In [5]:
spark.stop()