In [62]:
from pyspark.sql import SparkSession
import re

In [63]:
# Build the Spark session for this notebook, or reuse one that already
# exists (getOrCreate), using the app name and the local[4] master URL below.
spark = (
    SparkSession.builder
    .appName("Starting Letter Count")
    .master("local[4]")
    .getOrCreate()
)

In [64]:
# Load sample.txt (path relative to the working directory) as an RDD;
# the next cell treats each element as one line of text.
rdd = spark.sparkContext.textFile(name='sample.txt')

In [65]:
# Turn every line of text into (first_letter, 1) pairs, ready to be summed
# per starting letter:
#   1. split each line on whitespace into tokens;
#   2. remove every character that is not an ASCII letter and lower-case
#      what is left;
#   3. discard tokens that end up empty (e.g. pure digits or punctuation);
#   4. emit the token's first letter together with a count of 1.
#
# Tokens come from str.split(), so they never contain whitespace, and after
# step 2 they hold only lower-case ASCII letters. No strip(), isdigit()
# check or second lower() is therefore needed.
#
# NOTE: non-ASCII letters are removed in step 2 as well, so a word such as
# "élan" is counted under its first ASCII letter ('l').
mapped_rdd = (
    rdd.flatMap(lambda line: line.split())
    .map(lambda word: re.sub(r'[^a-zA-Z]', '', word).lower())
    .filter(lambda word: word != '')  # keep only tokens that still have letters
    .map(lambda word: (word[0], 1))
)

In [66]:
# Sanity check: show the first five (letter, 1) pairs.
mapped_rdd.take(num=5)

[('t', 1), ('p', 1), ('g', 1), ('e', 1), ('o', 1)]

In [67]:
# Add up the 1s emitted for each starting letter, then order the resulting
# (letter, total) pairs from the largest total to the smallest.
reduced_rdd = (
    mapped_rdd
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: pair[1], ascending=False)
)

In [68]:
# Show the five starting letters with the highest totals.
reduced_rdd.take(num=5)

[('t', 11585), ('a', 7983), ('s', 6048), ('h', 5877), ('w', 5479)]

In [69]:
# Bring the full list of (letter, count) pairs back to the driver and print
# one line per letter, in the order produced by the sort above.
result = reduced_rdd.collect()
for pair in result:
    letter, count = pair
    print(f"Letter: {letter}, Count: {count}")

Letter: t, Count: 11585
Letter: a, Count: 7983
Letter: s, Count: 6048
Letter: h, Count: 5877
Letter: w, Count: 5479
Letter: i, Count: 4303
Letter: o, Count: 4184
Letter: c, Count: 4178
Letter: b, Count: 3915
Letter: m, Count: 3689
Letter: f, Count: 2754
Letter: l, Count: 2525
Letter: d, Count: 2426
Letter: p, Count: 2323
Letter: g, Count: 1815
Letter: n, Count: 1707
Letter: r, Count: 1341
Letter: y, Count: 1318
Letter: e, Count: 1217
Letter: u, Count: 807
Letter: k, Count: 489
Letter: j, Count: 418
Letter: v, Count: 392
Letter: q, Count: 183
Letter: x, Count: 42
Letter: z, Count: 1


In [53]:
# Shut down the Spark session now that the results have been collected
# and printed.
spark.stop()