In [3]:
import os
import string
from pyspark.sql import SparkSession
from google.colab import drive

# Mount Google Drive into the Colab filesystem so the input file is reachable.
drive.mount('/content/drive', force_remount=True)

# Create (or reuse) a Spark session using the "local[2]" master.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("WordCount")
    .getOrCreate()
)
sc = spark.sparkContext

# Input text lives on the mounted Drive; read it as an RDD of lines.
file_path = "/content/drive/MyDrive/France.txt"
text_file = sc.textFile(file_path)

# Translation table that deletes every character in string.punctuation.
# It is built once at module level so process_line only references a plain
# dict when the function is shipped to Spark workers.
translator = str.maketrans(dict.fromkeys(string.punctuation))


def process_line(line):
    """Split one line of text into normalized words.

    The line is lowercased, every character in ``string.punctuation`` is
    deleted (so "don't" becomes "dont"), and the result is split on runs of
    whitespace.

    Args:
        line: A single line of input text.

    Returns:
        A list of lowercase words; empty if the line has no word characters.
    """
    lowered = line.lower()
    without_punctuation = lowered.translate(translator)
    # split() with no separator collapses whitespace and never yields "".
    words = without_punctuation.split()
    return words

# Count how often each word occurs, print the first WORDS_TO_SHOW words in
# alphabetical order (not the most frequent ones), and always shut the Spark
# session down afterwards.
WORDS_TO_SHOW = 30

try:
    # process_line() ends in str.split() with no separator, which never
    # yields empty strings, so no extra filter for empty words is needed.
    word_counts = (text_file.flatMap(process_line)  # One record per word
                            .map(lambda word: (word, 1))
                            .reduceByKey(lambda a, b: a + b))

    # Keys are unique after reduceByKey, so the N smallest pairs by word are
    # exactly the rows sortByKey().take(N) would return, without having to
    # sort the whole RDD first.
    first_words = word_counts.takeOrdered(WORDS_TO_SHOW,
                                          key=lambda pair: pair[0])

    for word, count in first_words:
        print(f"'{word}': {count}")
finally:
    # Stop the session even if the job above fails.
    spark.stop()


Mounted at /content/drive
'1789': 1
'1815': 1
'19th': 1
'2018': 1
'20th': 2
'a': 21
'about': 3
'accompanied': 1
'addressing': 1
'aerospace': 1
'affairs': 2
'affected': 1
'ages': 2
'agricultural': 2
'agriculture': 1
'aimed': 1
'allows': 1
'along': 2
'alpine': 1
'alps': 1
'also': 3
'an': 2
'and': 57
'andorra': 1
'another': 1
'architectural': 1
'architecture': 1
'are': 3
'areas': 1
'array': 1
