# AUTHOR: T1 TIDE

## Create Spark

### Import Libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql import functions as func

### Spark Conf

In [2]:
# Spark configuration for this notebook: a local master using all available
# cores ("local[*]"), with explicit memory/core settings for driver and executor.
conf = (
    SparkConf()
    .setAppName("Assignment 25072024")
    .setMaster("local[*]")
    .setAll([
        ("spark.executor.memory", "4g"),
        ("spark.driver.memory", "4g"),
        ("spark.executor.cores", "4"),
        ("spark.driver.cores", "5"),
    ])
)

### SparkSession

In [3]:
# Create the session from `conf`, or reuse one that is already running.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

### SparkContext

In [4]:
# SparkContext of the session; the RDD cells below read files via sc.textFile().
sc = spark.sparkContext

## Exercise 1
- Given a dataset of 1000 arbitrary numbers in a text file
- Find all prime numbers in the given dataset
- Save result under a new text file

### Check Prime number function

In [5]:
from math import sqrt,ceil
def is_prime(n):
    """Return True if ``n`` is a prime number, False otherwise.

    Uses trial division with exact integer arithmetic. Any value below 2
    (0, 1 and negative numbers) is reported as not prime.

    Args:
        n: The integer to test.

    Returns:
        bool: True when ``n`` has no divisor other than 1 and itself.
    """
    if n < 2:
        return False
    # 2 and 3 are prime. Handling them up front guarantees the loop below
    # never tests a number against itself (which made 2 look composite).
    if n < 4:
        return True
    if n % 2 == 0:
        return False
    # Only odd candidates up to sqrt(n) can be the smallest divisor;
    # comparing i * i with n avoids floating-point square roots.
    i = 3
    while i * i <= n:
        if n % i == 0:
            return False
        i += 2
    return True

### Sequential (non-random) input

#### Generate a sorted list of 10,000,000 numbers

In [6]:
# Text file that receives the sequential numbers, one per line.
numbers_file = "input_data/numbers.txt"

In [7]:
# Sorted sequence 1..10,000,000. Kept as a list because a later cell
# reports len(numbers).
numbers = list(range(1, 10000001))

# Write one number per line. The `with` block closes the file on exit, so no
# explicit close() is needed, and plain write mode suffices (nothing is read back).
with open(numbers_file, 'w') as file:
    file.writelines(f"{number}\n" for number in numbers)

In [8]:
# Sanity check: how many numbers were generated.
len(numbers)

10000000

#### RDD

In [9]:
# Output location passed to saveAsTextFile() for the primes of the sequential input.
prime_numbers_rdd_output_path = "output_data/prime_numbers_rdd.txt"

In [10]:
# Load the file as an RDD of text lines (one string per line).
numbers_rdd = sc.textFile(numbers_file)

In [11]:
# Parse every text line into an integer.
numbers_rdd = numbers_rdd.map(int)

In [12]:
# Keep only the values for which is_prime() returns True.
prime_numbers_rdd = numbers_rdd.filter(is_prime)

In [13]:
# Redistribute the primes over 12 partitions before saving.
# NOTE(review): this happens after the filter, so the primality test itself
# runs on the partitioning produced by sc.textFile(); repartitioning before
# the filter may parallelise the expensive step better (confirm by measuring).
prime_numbers_rdd = prime_numbers_rdd.repartition(12)

In [14]:
# Write the primes out as text. Despite the ".txt" suffix, Spark creates a
# directory at this path with one part file per partition.
# NOTE(review): presumably fails if the path already exists; verify before re-running.
prime_numbers_rdd.saveAsTextFile(path=prime_numbers_rdd_output_path)

### Random

In [15]:
# Text file that receives the randomly drawn numbers, one per line.
numbers_random_file = "input_data/numbers_random.txt"

In [16]:
import random

# Draw 10,000,000 random integers in [1, 10,000,000], the same count as the
# sequential dataset. range(1, 10000000) produced one value fewer.
numbers_random = [random.randint(1, 10000000) for _ in range(10000000)]

# Write one number per line. Plain write mode suffices (nothing is read back)
# and the `with` block closes the file.
with open(numbers_random_file, 'w') as file:
    file.writelines(f"{number}\n" for number in numbers_random)

In [17]:
# Output location passed to saveAsTextFile() for the primes of the random input.
numbers_random_rdd_output_path = "output_data/prime_numbers_rdd_random.txt"

In [18]:
# Load the random-number file as an RDD of text lines.
numbers_rdd_random = sc.textFile(numbers_random_file)

In [19]:
# Parse every text line into an integer.
numbers_rdd_random = numbers_rdd_random.map(int)

In [20]:
# Keep only the values for which is_prime() returns True (duplicates are kept).
prime_numbers_rdd_random = numbers_rdd_random.filter(is_prime)

In [21]:
# Redistribute the primes over 12 partitions before saving.
prime_numbers_rdd_random = prime_numbers_rdd_random.repartition(numPartitions=12)

In [22]:
# Write the primes out as text; Spark creates a directory at this path with
# one part file per partition.
prime_numbers_rdd_random.saveAsTextFile(numbers_random_rdd_output_path)

## Exercise 2
- Generate a text file of 10,000 lines, each line contains a pair of (key,value)
- Calculate the average value for each key
- Apply the `groupByKey()` and `reduceByKey()` functions

### Generate a text file with 10,000 lines

In [23]:
import random

# Destination of the generated "key, text" lines.
textFile_path = "input_data/textLine.txt"

# Emit 10,000 lines, each starting with a random key in [1, 100] followed by
# a sentence carrying the 1-based line number.
with open(textFile_path, 'w+') as file:
    file.writelines(
        f"{random.randint(1, 100)}, this is line number {i}\n"
        for i in range(1, 10001)
    )

In [24]:
# Load the generated "key, text" file as an RDD of lines.
rdd = sc.textFile(textFile_path)

In [25]:
# Peek at the first five raw lines.
rdd.take(5)

['87, this is line number 1',
 '25, this is line number 2',
 '23, this is line number 3',
 '17, this is line number 4',
 '14, this is line number 5']

In [26]:
def convert_to_key_value(value):
    """Parse a "key, text" line into an ``(int key, str text)`` tuple.

    The line is split on commas; the first field becomes the integer key and
    the second field, stripped of surrounding whitespace, becomes the value.
    Any further comma-separated fields are ignored.
    """
    fields = value.split(',')
    key = int(fields[0].strip())
    text = fields[1].strip()
    return (key, text)

def convert_to_key_lenValue(value):
    """Parse a "key, text" line into ``(int key, length of the stripped text)``.

    The line is split on commas; the first field becomes the integer key and
    the character count of the stripped second field becomes the value.
    Any further comma-separated fields are ignored.
    """
    fields = value.split(',')
    key = int(fields[0].strip())
    text_length = len(fields[1].strip())
    return (key, text_length)

In [27]:
# (key, text) pairs parsed from each line.
key_value_rdd = rdd.map(convert_to_key_value)

In [28]:
# (key, text length) pairs; the length is the numeric value averaged per key below.
key_lenValue_rdd = rdd.map(convert_to_key_lenValue)

In [29]:
# Peek at the first five (key, text length) pairs.
key_lenValue_rdd.take(5)

[(87, 21), (25, 21), (23, 21), (17, 21), (14, 21)]

### Group by key

In [30]:
def sum_count(value):
    """Return ``(sum, count)`` for a sized collection of numbers."""
    total = sum(value)
    count = len(value)
    return (total, count)

def avg_cal_groupByKey(value):
    """Return the average for a ``(sum, count)`` pair, i.e. ``sum / count``."""
    total = value[0]
    count = value[1]
    return total / count

In [31]:
# Collect all text lengths belonging to the same key into one group.
grouped_rdd = key_lenValue_rdd.groupByKey()

# Collapse each key's group into a (sum, count) pair.
sum_count_rdd = grouped_rdd.mapValues(sum_count)

# Divide sum by count to obtain the per-key average.
avg_rdd_groupByKey = sum_count_rdd.mapValues(avg_cal_groupByKey)

In [32]:
# Inspect five (key, (sum, count)) entries from the groupByKey pipeline.
sum_count_rdd.take(5)

[(14, (2197, 92)),
 (82, (2604, 109)),
 (100, (2603, 109)),
 (54, (2049, 86)),
 (94, (2221, 93))]

In [33]:
# Inspect five (key, average) entries from the groupByKey pipeline.
avg_rdd_groupByKey.take(5)

[(14, 23.880434782608695),
 (82, 23.889908256880734),
 (100, 23.880733944954127),
 (54, 23.825581395348838),
 (94, 23.881720430107528)]

In [34]:
# Output location passed to saveAsTextFile() for the groupByKey averages.
avg_groupByKey_output_path = "output_data/avg_groupByKey.txt"

In [35]:
# Persist the (key, average) pairs computed with groupByKey as text.
avg_rdd_groupByKey.saveAsTextFile(path=avg_groupByKey_output_path)

### Reduce by key

In [36]:
def value_and_count(value):
    """Pair ``value`` with an initial count of 1, giving ``(value, 1)``."""
    return (value, 1)

def sum_lenValue(value_1, value_2):
    """Add two ``(sum, count)`` pairs element-wise and return the new pair."""
    combined_sum = value_1[0] + value_2[0]
    combined_count = value_1[1] + value_2[1]
    return combined_sum, combined_count

def avg_cal_reduceByKey(value):
    """Return the average for a ``(sum, count)`` pair, i.e. ``sum / count``."""
    total = value[0]
    count = value[1]
    return total / count

In [37]:
# Pair every text length with a count of 1, then add the pairs per key to
# obtain (sum, count) for each key.
sum_count_rdd = (
    key_lenValue_rdd
    .mapValues(value_and_count)
    .reduceByKey(sum_lenValue)
)

# Divide sum by count to obtain the per-key average.
avg_rdd_reduceByKey = sum_count_rdd.mapValues(avg_cal_reduceByKey)

In [38]:
# Inspect five (key, (sum, count)) entries from the reduceByKey pipeline.
sum_count_rdd.take(5)

[(14, (2197, 92)),
 (82, (2604, 109)),
 (100, (2603, 109)),
 (54, (2049, 86)),
 (94, (2221, 93))]

In [39]:
# Inspect five (key, average) entries from the reduceByKey pipeline.
avg_rdd_reduceByKey.take(5)

[(14, 23.880434782608695),
 (82, 23.889908256880734),
 (100, 23.880733944954127),
 (54, 23.825581395348838),
 (94, 23.881720430107528)]

In [40]:
# Output location passed to saveAsTextFile() for the reduceByKey averages.
avg_reduceByKey_output_path = "output_data/avg_reduceByKey.txt"

In [41]:
# Persist the (key, average) pairs computed with reduceByKey as text.
avg_rdd_reduceByKey.saveAsTextFile(path=avg_reduceByKey_output_path)