# DSCI 417 – Homework 01

**Lauren Forti**

In [0]:
# imports
import math
from pyspark.sql import SparkSession
from pyspark.mllib.random import RandomRDDs

In [0]:
# Spark entry points: getOrCreate() hands back the existing SparkSession if
# there is one (otherwise it builds one); its SparkContext is what the
# RDD-based cells work with.
spark = (
  SparkSession
  .builder
  .getOrCreate()
)
sc = spark.sparkContext

## Problem 1: Terminology

1. Scala
2. SparkSession
3. SparkContext
4. Resilient Distributed Dataset
5. Partitions
6. Transformation
7. Action
8. Transformation
9. Action
10. Transformation
11. Action
12. List
13. Master node
14. Workers
15. Driver
16. Executor

## Problem 2: Working with a Numerical RDD

In [0]:
# Build an RDD of 1.2 million samples drawn uniformly from [0, 1], with the
# generator seed fixed at 1.
random_rdd = RandomRDDs.uniformRDD(sc, size=1_200_000, seed=1)

# Descriptive statistics as (label, RDD action) pairs, in display order.
# Each action only runs when it is called inside the loop.
random_summary = [
  ('Sum:     ', random_rdd.sum),
  ('Mean:    ', random_rdd.mean),
  ('Std Dev: ', random_rdd.stdev),
  ('Minimum: ', random_rdd.min),
  ('Maximum: ', random_rdd.max),
]
for label, compute in random_summary:
  print(label, compute())

In [0]:
# Per-partition element counts: glom() turns every partition into a single
# list, so mapping len over the result yields one size per partition.
partition_list = (
  random_rdd
  .glom()
  .map(len)
  .collect()
)

# Number of partitions the RDD is split into.
partition_num = random_rdd.getNumPartitions()

# Report the partition count first, then the list of sizes.
print('Number of Partitions: ', partition_num)
print('Size of Partitions:\n', partition_list)

## Problem 3: Transformations

In [0]:
# Multiply every element of random_rdd by 10.
scaled_rdd = random_rdd.map(lambda value: value * 10)

# Descriptive statistics of the scaled values as (label, RDD action) pairs,
# evaluated and printed one at a time in display order.
scaled_summary = [
  ('Sum:     ', scaled_rdd.sum),
  ('Mean:    ', scaled_rdd.mean),
  ('Std Dev: ', scaled_rdd.stdev),
  ('Minimum: ', scaled_rdd.min),
  ('Maximum: ', scaled_rdd.max),
]
for label, compute in scaled_summary:
  print(label, compute())

In [0]:
# Natural logarithm of every scaled element (math.log with a single argument
# is the base-e log).
log_rdd = scaled_rdd.map(math.log)

# Descriptive statistics of the log-transformed values as (label, RDD action)
# pairs, evaluated and printed one at a time in display order.
log_summary = [
  ('Sum:     ', log_rdd.sum),
  ('Mean:    ', log_rdd.mean),
  ('Std Dev: ', log_rdd.stdev),
  ('Minimum: ', log_rdd.min),
  ('Maximum: ', log_rdd.max),
]
for label, compute in log_summary:
  print(label, compute())

## Problem 4: Calculating SSE

In [0]:
# Load the pairs data file into an RDD of text lines.
pairs_path = '/FileStore/tables/pairs_data.txt'
pairs_raw = sc.textFile(pairs_path)

# Report how many elements (lines) were read.
pairs_line_count = pairs_raw.count()
print(pairs_line_count)

In [0]:
# Peek at the raw text: fetch up to five elements and print them one per line.
raw_preview = pairs_raw.take(5)
for line in raw_preview:
  print(line)

In [0]:
# function to split strings into float tuples
def process_line(row):
  """Parse one whitespace-delimited text line into a pair of floats.

  The line is split on any run of whitespace, so repeated spaces, tabs, and
  leading or trailing whitespace do not produce empty fields. Only the first
  two fields are used; any further fields are ignored.

  Args:
    row: A line of text holding at least two numeric fields.

  Returns:
    A tuple (first, second) with the first two fields converted to float.

  Raises:
    ValueError: If the line has fewer than two fields, or if either of the
      first two fields cannot be converted to float.
  """
  # split() with no argument collapses consecutive whitespace; split(' ')
  # would yield '' entries for repeated spaces, which float() rejects.
  items = row.split()
  if len(items) < 2:
    raise ValueError(f'expected at least two fields, got {len(items)}: {row!r}')
  return (float(items[0]), float(items[1]))

# Convert every raw text line into a tuple of floats via process_line.
pairs = pairs_raw.map(process_line)

# Preview the parsed result: up to five tuples, one per line.
parsed_preview = pairs.take(5)
for pair in parsed_preview:
  print(pair)

In [0]:
# Sum of squared errors: square the difference between the two elements of
# each pair, then add the squares up across the whole RDD.
squared_errors = pairs.map(lambda pair: (pair[0] - pair[1]) ** 2)
SSE = squared_errors.sum()

# Report the result.
print(f'The value of SSE is {SSE}')

## Problem 5: Calculating r-Squared

In [0]:
# Average of the first element of every pair.
first_values = pairs.map(lambda pair: pair[0])
mean = first_values.mean()

# Report the result.
print(f'The mean is {mean}')

In [0]:
# Total sum of squares: squared deviation of each pair's first element from
# the previously computed mean, summed over the whole RDD.
squared_deviations = pairs.map(lambda pair: (pair[0] - mean) ** 2)
SST = squared_deviations.sum()

# Report the result.
print(f'The value of SST is {SST}')

In [0]:
# r-squared is one minus the ratio of SSE to SST.
error_ratio = SSE / SST
r2 = 1 - error_ratio

# Report the result.
print(f'The r-Squared score is {r2}')

## Problem 6: NASA Server Logs

In [0]:
# Load the NASA server log file into an RDD of text lines.
nasa_path = '/FileStore/tables/NASA_server_logs_Aug_1995.txt'
nasa = sc.textFile(nasa_path)

# Report how many elements (lines) were read.
nasa_line_count = nasa.count()
print(nasa_line_count)

In [0]:
# Peek at the log: fetch up to five elements and print them one per line.
log_preview = nasa.take(5)
for line in log_preview:
  print(line)

In [0]:
# Count the log lines mentioning each request type by keeping only the lines
# that contain the method name and counting what is left.
# NOTE(review): this is a plain substring test on the whole line, so a method
# name appearing anywhere in the line is counted too - confirm that is
# acceptable for this log format.
num_get = nasa.filter(lambda line: 'GET' in line).count()
num_post = nasa.filter(lambda line: 'POST' in line).count()
num_head = nasa.filter(lambda line: 'HEAD' in line).count()

# Report the three totals.
print(f'Number of GET requests:  {num_get}')
print(f'Number of POST requests: {num_post}')
print(f'Number of HEAD requests: {num_head}')