In [None]:
import os
import string
from pyspark.sql import SparkSession
from google.colab import drive

# Location of the input text on the mounted Google Drive.
file_path = "/content/drive/MyDrive/inputfile.txt"

# Mount Google Drive (forcing a remount) so the input file is reachable.
drive.mount('/content/drive', force_remount=True)

# Build (or reuse) the Spark session for this word-count job and keep a
# handle on its SparkContext for the RDD API.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("WordCount")
    .getOrCreate()
)
sc = spark.sparkContext

# Load the input file into an RDD.
text_file = sc.textFile(file_path)

# Translation table that deletes every ASCII punctuation character.
# Kept at module level (as in the original design) to avoid serialization
# issues when the function below is used in Spark transformations.
translator = str.maketrans(dict.fromkeys(string.punctuation))


def process_line(line):
    """Return the words of *line*, lowercased and stripped of punctuation.

    Punctuation characters are removed (not replaced by spaces) before the
    line is split on runs of whitespace, so "stop-me" becomes "stopme".
    """
    lowered = line.lower()
    without_punctuation = lowered.translate(translator)
    return without_punctuation.split()

# Process words, count occurrences, and print a sample of the result.
# The job runs inside try/finally so the Spark session is stopped even when
# an action fails (for example, when the input file cannot be read).
try:
    word_counts = (
        text_file.flatMap(process_line)  # One record per word in each line
        # str.split() with no separator never yields empty strings, so this
        # filter is only a safeguard.
        .filter(lambda word: word)
        .map(lambda word: (word, 1))
        .reduceByKey(lambda a, b: a + b)
    )

    # take(30) returns up to 30 (word, count) pairs; the result is not
    # sorted by count, so this is a sample rather than a "top 30".
    for word, count in word_counts.take(30):
        print(f"'{word}': {count}")
finally:
    # Always release the Spark session, whether or not the job succeeded.
    spark.stop()


Mounted at /content/drive
'watermelon': 1
'kiwi': 4
'banana': 6
'pineapple': 1
'blueberry': 1
'apple': 1
'currant': 1
'date': 1
'boysenberry': 1
'strawberry': 1
'papaya': 1
'orange': 2
'mango': 3
'fig': 1
'gooseberry': 1
'olive': 1
'tangerine': 1
'apricot': 1
'ackee': 1
'durian': 1
