# Example 1: Word Count

The classic MapReduce example: counting how often each word occurs in text data.

This demonstrates:
- Working with text data
- String operations
- Grouping and aggregation
- Sorting results

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Obtain the SparkSession for this notebook (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName("WordCount")
    .getOrCreate()
)

In [None]:
# Sample sentences; each one becomes a single-column row of the DataFrame.
sentences = [
    "PySpark is great for big data processing",
    "Big data requires distributed computing",
    "PySpark makes big data processing easy",
    "Spark is fast and powerful",
    "Data processing with PySpark is fun",
]
# createDataFrame expects row-like records, so wrap every sentence in a 1-tuple.
text_data = [(sentence,) for sentence in sentences]

df = spark.createDataFrame(text_data, schema=["text"])
print("Original text:")
df.show(truncate=False)

In [None]:
# Split text into words
# Each line is lower-cased and trimmed, then split on runs of whitespace.
# The split pattern is a regular expression, so r"\s+" collapses repeated
# spaces and also handles tabs; splitting on a single " " would emit
# empty-string "words" for consecutive, leading or trailing spaces.
tokens = F.split(F.trim(F.lower(F.col("text"))), r"\s+")

words_df = (
    df.select(F.explode(tokens).alias("word"))
    # A blank line (or leading non-space whitespace) still yields an empty
    # token; drop those so they are not counted as words.
    .filter(F.col("word") != "")
)

print("\nIndividual words:")
words_df.show(10)

In [None]:
# Tally occurrences: bucket the rows by word, then count the rows per bucket.
grouped_words = words_df.groupBy("word")
word_counts = grouped_words.count()

print("\nWord counts (unsorted):")
word_counts.show()

In [None]:
# Sort by count (descending), breaking ties alphabetically by word.
# Without the secondary key, words with equal counts come back in an
# arbitrary order, so the "top 10" could differ from run to run.
word_counts_sorted = word_counts.orderBy(
    F.col("count").desc(),
    F.col("word").asc(),
)

print("\nTop 10 most frequent words:")
word_counts_sorted.show(10)

In [None]:
# Drop very common words from the sorted counts and display what remains.
stop_words = ["is", "for", "and", "with", "the", "a", "an"]

# Boolean column: true for rows whose word is one of the stop words.
is_stop_word = F.col("word").isin(stop_words)
filtered_counts = word_counts_sorted.where(~is_stop_word)

print("\nWord counts (excluding stop words):")
filtered_counts.show()

In [None]:
# Summary statistics:
#   total_words  - number of rows in words_df (one row per word occurrence)
#   unique_words - number of rows in word_counts (one row per distinct word)
total_words, unique_words = words_df.count(), word_counts.count()

print(f"\nTotal words: {total_words}")
print(f"Unique words: {unique_words}")

In [None]:
# Stop the SparkSession created earlier in this notebook now that the example is done.
spark.stop()