# Create Spark

## Import Libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql import functions as func

## Spark Conf

In [2]:
# Spark configuration for this assignment: run on the local master using all
# available cores ("local[*]") with explicit memory/core settings.
conf = (
    SparkConf()
    .setAppName("Assignment 25072024")
    .setMaster("local[*]")
    .set("spark.executor.memory", "4g")
    .set("spark.driver.memory", "4g")
    .set("spark.executor.cores", "4")
    .set("spark.driver.cores", "5")
)

## SparkSession

In [3]:
# Build the session from the SparkConf defined above, or reuse an existing one.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

## SparkContext

In [4]:
# Handle to the session's SparkContext; used below to create RDDs from text files.
sc = spark.sparkContext

# Exercise 1
- Given a dataset of 1000 arbitrary numbers in a text file
- Find all prime numbers in the given dataset
- Save the result to a new text file

## Prime-number check function

In [5]:
from math import ceil, isqrt, sqrt


def is_prime(n):
    """Return True if ``n`` is a prime number, otherwise False.

    Uses trial division by every integer from 2 up to the integer square
    root of ``n``.

    Args:
        n: Integer to test. Values less than or equal to 1 are not prime.

    Returns:
        bool: True when ``n`` has no divisor in ``[2, isqrt(n)]``.
    """
    if n <= 1:
        return False
    # isqrt() is the exact floor of the square root, so the candidate divisors
    # never reach n itself. With the previous bound, ceil(sqrt(n)) + 1, the
    # loop tested 2 % 2 and wrongly reported 2 as not prime.
    for i in range(2, isqrt(n) + 1):
        if n % i == 0:
            return False
    return True

## Sequential (non-random) numbers

### Generate a sorted list of 10,000,000 numbers

In [6]:
# Sorted dataset 1..10,000,000. Kept as a list because the next cell reports
# len(numbers).
numbers = list(range(1, 10000001))

# One number per line. The with-statement closes the file on exit, so no
# explicit close() is needed; writelines() batches the writes.
with open('input_data/numbers.txt', 'w') as file:
    file.writelines(f"{number}\n" for number in numbers)

In [7]:
# Sanity check: number of values generated for the sorted dataset.
len(numbers)

10000000

### RDD

In [8]:
# Load the generated file, one RDD element per line. Uses the same relative
# path the file was written to, instead of a machine-specific absolute path.
numbers_rdd = sc.textFile('input_data/numbers.txt')

In [9]:
# Convert every text line to an integer.
numbers_rdd = numbers_rdd.map(int)

In [10]:
# Keep only the elements for which is_prime() returns True.
prime_numbers_rdd = numbers_rdd.filter(is_prime)

In [11]:
# Redistribute the primes across 12 partitions before saving.
prime_numbers_rdd = prime_numbers_rdd.repartition(numPartitions=12)

In [12]:
# Persist the primes found in the sorted dataset as text.
prime_numbers_rdd.saveAsTextFile("output_data/prime_numbers_rdd.txt")

## Random numbers

In [13]:
import random

# 10,000,000 random integers in [1, 10,000,000], matching the size of the
# sorted dataset. The previous range(1, 10000000) produced one value fewer.
numbers_random = [random.randint(1, 10000000) for _ in range(10000000)]

# One number per line; writelines() batches the writes.
with open('input_data/numbers_random.txt', 'w') as file:
    file.writelines(f"{number}\n" for number in numbers_random)

In [14]:
# Load the random-number file, one RDD element per line.
numbers_rdd_random = sc.textFile('input_data/numbers_random.txt')

In [15]:
# Convert every text line to an integer.
numbers_rdd_random = numbers_rdd_random.map(int)

In [16]:
# Keep only the elements for which is_prime() returns True.
prime_numbers_rdd_random = numbers_rdd_random.filter(is_prime)

In [17]:
# Redistribute the primes across 12 partitions before saving.
prime_numbers_rdd_random = prime_numbers_rdd_random.repartition(12)

In [18]:
# Persist the primes found in the random dataset as text.
prime_numbers_rdd_random.saveAsTextFile(path="output_data/prime_numbers_rdd_random.txt")

# Exercise 2
- Generate a text file of 10,000 lines, where each line contains a (key, value) pair
- Calculate the average value for each key
- Apply both the `groupByKey()` and `reduceByKey()` functions

## Generate a text file with 10,000 lines

In [19]:
import random


def _keyed_line(line_no):
    """Build one "key, value" line whose key is a random integer in [1, 100]."""
    key = random.randint(1, 100)
    return f"{key}, this is line number {line_no}\n"


# Write lines numbered 1..10000 to textLine.txt.
with open('textLine.txt', 'w+') as out:
    out.writelines(_keyed_line(line_no) for line_no in range(1, 10001))

In [20]:
def _split_key_value(line):
    """Parse a "key, value" line into (int key, stripped value text)."""
    fields = line.split(',')
    return int(fields[0].strip()), fields[1].strip()


# Raw lines of the generated file, then the parsed (key, value) pairs.
rdd = sc.textFile('textLine.txt')
key_value_rdd = rdd.map(_split_key_value)

In [21]:
def _key_and_value_length(line):
    """Parse a "key, value" line into (int key, length of the stripped value)."""
    fields = line.split(',')
    return int(fields[0].strip()), len(fields[1].strip())


# The numeric value averaged per key below is the length of each line's text.
key_lenValue_rdd = rdd.map(_key_and_value_length)

In [22]:
# Preview the first five (key, value length) pairs.
key_lenValue_rdd.take(5)

[(76, 21), (3, 21), (93, 21), (74, 21), (100, 21)]

## Group by key

In [23]:
def _sum_and_count(values):
    """Collapse the grouped values of one key into (sum, count)."""
    return sum(values), len(values)


def _mean_from_sum_count(sum_count):
    """Turn a (sum, count) pair into the average."""
    total, count = sum_count
    return total / count


# groupByKey approach: gather all values per key, then reduce each group to
# (sum, count) and finally to the average.
grouped_rdd = key_lenValue_rdd.groupByKey()
sum_count_rdd = grouped_rdd.mapValues(_sum_and_count)
avg_rdd_groupByKey = sum_count_rdd.mapValues(_mean_from_sum_count)

In [24]:
# Preview five (key, (sum, count)) entries from the groupByKey approach.
sum_count_rdd.take(5)

[(76, (2149, 90)),
 (74, (2602, 109)),
 (100, (2332, 98)),
 (98, (2148, 90)),
 (78, (2508, 105))]

In [25]:
# Preview five (key, average) entries from the groupByKey approach.
avg_rdd_groupByKey.take(5)

[(76, 23.877777777777776),
 (74, 23.871559633027523),
 (100, 23.79591836734694),
 (98, 23.866666666666667),
 (78, 23.885714285714286)]

## Reduce by key

In [26]:
def _with_unit_count(value):
    """Pair a value with a count of 1 so sums and counts can be merged together."""
    return value, 1


def _merge_sum_count(left, right):
    """Add two (sum, count) pairs component-wise."""
    return left[0] + right[0], left[1] + right[1]


def _mean_of(sum_count):
    """Turn a (sum, count) pair into the average."""
    total, count = sum_count
    return total / count


# reduceByKey approach: merge (value, 1) pairs per key into (sum, count),
# then divide to obtain the average.
sum_count_rdd = key_lenValue_rdd.mapValues(_with_unit_count).reduceByKey(_merge_sum_count)
avg_rdd = sum_count_rdd.mapValues(_mean_of)

In [27]:
# Preview five (key, (sum, count)) entries from the reduceByKey approach.
sum_count_rdd.take(5)

[(76, (2149, 90)),
 (74, (2602, 109)),
 (100, (2332, 98)),
 (98, (2148, 90)),
 (78, (2508, 105))]

In [28]:
# Preview five (key, average) entries from the reduceByKey approach.
avg_rdd.take(5)

[(76, 23.877777777777776),
 (74, 23.871559633027523),
 (100, 23.79591836734694),
 (98, 23.866666666666667),
 (78, 23.885714285714286)]