In [105]:
#Import required libraries
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

import tkinter as tk
from tkinter import filedialog

# Get or create the Spark session used by this notebook. Driver/executor
# memory, executor cores and the network/heartbeat timeouts are set explicitly.
spark = (
    SparkSession.builder
    .appName("example")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "2")
    .config("spark.network.timeout", "800s")
    .config("spark.executor.heartbeatInterval", "60s")
    .getOrCreate()
)

In [106]:
# Take the SparkContext from the session; it is the entry point used for the
# RDD operations in the rest of the notebook.
sc = spark.sparkContext

In [107]:
# Ask the user to pick the input text file through a file dialog.
# A Tk root window has to exist for the dialog; it is hidden so that only the
# dialog itself is shown.
root = tk.Tk()
root.withdraw()  # Hide the empty main window

try:
    file_path = filedialog.askopenfilename()
finally:
    # Always tear the Tk root down, even if the dialog raises.
    root.destroy()

# Cancelling the dialog yields an empty value. Stop here with a clear message
# instead of handing an empty path to Spark in a later cell.
if not file_path:
    raise ValueError("No file was selected; re-run this cell and choose a text file.")

In [108]:
# Read the selected file into an RDD of text lines (one element per line).
RDD = sc.textFile(file_path)

In [109]:
# Peek at the first five lines to confirm the file was read as expected.
RDD.take(5)

['',
 '                         *THE THREE MUSKETEERS*',
 '',
 '                                  _By_',
 '']

In [110]:
# Break every line into whitespace-separated tokens and flatten the result
# into a single RDD of tokens.
# NOTE(review): only whitespace is used as a separator, so punctuation and
# markup characters stay attached to the tokens and are counted as part of them.
RDD_split = RDD.flatMap(lambda line: line.split())

In [111]:
# Lower-case every token so that differently capitalised spellings of a word
# end up under the same key.
RDD_split_case = RDD_split.map(lambda token: token.lower())

In [112]:
# Preview the first ten lower-cased tokens.
RDD_split_case.take(10)

['*the',
 'three',
 'musketeers*',
 '_by_',
 '*alexandre',
 'dumas,',
 'pere*',
 '_first',
 'volume',
 'of']

In [113]:
# Total number of tokens in the text, counting repeats.
RDD_split_case.count()

232286

In [114]:
# Turn each token into a (token, 1) pair so the ones can be summed per token.
RDD_split_case_pair = RDD_split_case.map(lambda token: (token, 1))

In [115]:
# Preview the first ten (token, 1) pairs.
RDD_split_case_pair.take(10)

[('*the', 1),
 ('three', 1),
 ('musketeers*', 1),
 ('_by_', 1),
 ('*alexandre', 1),
 ('dumas,', 1),
 ('pere*', 1),
 ('_first', 1),
 ('volume', 1),
 ('of', 1)]

In [116]:
# Add up the ones for each distinct token, giving (token, occurrences) pairs.
RDD_split_case_pair_reduce = RDD_split_case_pair.reduceByKey(
    lambda left, right: left + right
)

In [117]:
# Preview ten of the aggregated (token, occurrences) pairs.
RDD_split_case_pair_reduce.take(10)

[('three', 279),
 ('musketeers*', 1),
 ('*alexandre', 1),
 ('pere*', 1),
 ('_first', 1),
 ('volume', 1),
 ('of', 6457),
 ('series_', 1),
 ('contents', 6),
 ('preface', 3)]

In [118]:
# Order the (token, occurrences) pairs by their count, most frequent first.
RDD_split_case_pair_reduce_sorted = RDD_split_case_pair_reduce.sortBy(
    lambda pair: pair[1],
    ascending=False,
)

In [119]:
# Show the ten most frequent tokens.
RDD_split_case_pair_reduce_sorted.take(10)

[('the', 13565),
 ('to', 6577),
 ('of', 6457),
 ('and', 5457),
 ('a', 4664),
 ('in', 3304),
 ('i', 3035),
 ('that', 2986),
 ('his', 2959),
 ('you', 2820)]

In [120]:
# Number of distinct tokens. Counting does not depend on ordering, so count the
# reduced (token, occurrences) RDD directly instead of the sorted one: the
# result is identical and the job does not have to perform the sort first.
total_unique_words = RDD_split_case_pair_reduce.count()

In [121]:
# Display the number of distinct tokens.
total_unique_words

22576

In [122]:
# Sum of all per-token occurrence counts. Addition is order-independent, so
# sum over the reduced RDD directly instead of the sorted one: the result is
# identical and the job does not have to perform the sort first.
total_sum = RDD_split_case_pair_reduce.values().sum()

In [123]:
# Display the summed occurrence counts.
total_sum

232286

In [124]:
# Shut down the SparkContext now that the analysis is finished.
sc.stop()