# DSCI 417 – Homework 02

**Lauren Forti**

In [0]:
# imports
import pandas as pd
from string import punctuation
from pyspark.sql import SparkSession

In [0]:
# create objects
# getOrCreate() hands back a SparkSession from the builder; its SparkContext
# (sc) is the entry point used by the sc.textFile(...) calls in later cells
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

## Problem 1: Word Count

In [0]:
# read in dataset as RDD
ws_lines = sc.textFile('/FileStore/tables/shakespeare_complete.txt')

# every character treated as a word separator is mapped to a space, so a line
# can be tokenized with a single split instead of one flatMap pass per separator
delim_to_space = str.maketrans({c : ' ' for c in '-_.,:|\t'})

def split_line(line):
  """Split one line of text into raw tokens on space, - _ . , : | and tab.

  Empty strings are kept here (consecutive separators produce them); they are
  filtered out after cleaning.
  """
  return line.translate(delim_to_space).split(' ')

def clean_token(token):
  """Normalize a raw token and return the cleaned, lowercase string.

  Leading/trailing punctuation is trimmed first, then leading/trailing digits,
  then every apostrophe is removed and the result is lowercased.
  """
  # order matters: trimming punctuation first exposes digits hidden behind it
  token = token.strip(punctuation).strip('0123456789')
  return token.replace("'", '').lower()

# tokenize strings
ws_words = (
  ws_lines
  # split each line into raw tokens
  .flatMap(split_line)
  # remove punctuation, 0-9 and apostrophes; convert to lowercase
  .map(clean_token)
  # filter out empty strings
  .filter(lambda x : x != '')
)

# ws_words feeds several actions in this cell and the following ones; keep it
# cached so the text file is not re-read and re-tokenized for each of them
ws_words.persist()

# create RDD with each distinct word
dist_words = ws_words.distinct()

# output # of words
print('Total Number of Words:   ', ws_words.count())
print('Number of Distinct Words:', dist_words.count())

In [0]:
# sample the words without replacement, using a fraction of 0.0001
ws_sample = ws_words.sample(False, 0.0001)
# bring the sampled words back to the driver, then print them as a list
sampled_words = ws_sample.collect()
print(sampled_words)

## Problem 2: Longest Words

In [0]:
# pick the "larger" of two strings: the longer one wins, ties on length go to
# the string that compares greater
def compare_str(x,y):
  """Return whichever of x and y is longer.

  When both have the same length, the one that is greater under ordinary
  string comparison is returned (for equal strings either is the same value).
  """
  # the (length, value) key makes length the primary criterion and the
  # string comparison the tie-breaker
  return max(x, y, key = lambda s : (len(s), s))

# fold the distinct words pairwise with compare_str; the survivor is the longest word
longest_word = dist_words.reduce(compare_str)
print('The longest word is', longest_word)

In [0]:
# order the distinct words from longest to shortest
longest = dist_words.sortBy(lambda w : len(w), ascending = False)

# print the first 20 elements as a numbered list (1., 2., ...)
for n, row in enumerate(longest.take(20), start = 1):
  print(f'{n}.', row)

## Problem 3: Word Frequency

In [0]:
# pair every word with a count of 1 -> (word, 1)
pairs = ws_words.map(lambda word : (word, 1))

# add up the 1s per word, then order the (word, count) pairs by count, largest first
word_counts = (
  pairs
  .reduceByKey(lambda a, b : a + b)
  .sortBy(lambda pair : pair[1], ascending = False)
)

# take(20) returns a Python list holding the first 20 (word, count) pairs
word_list = word_counts.take(20)

# tabulate the pairs as a pandas DataFrame and render it
word_df = pd.DataFrame(word_list, columns = ['Word', 'Count'])
display(word_df)

Word,Count
the,27379
and,26082
i,20717
to,19661
of,17474
a,14723
you,13630
my,12489
in,10996
that,10915


## Problem 4: Removing Stop Words

In [0]:
# load the stop word file as an RDD (one element per line)
sw_rdd = sc.textFile('/FileStore/tables/stopwords.txt')

# report how many elements were read
print(sw_rdd.count())

# print a random sample drawn without replacement, using a fraction of 0.05
print(sw_rdd.sample(False, 0.05).collect())

# collect() returns a Python list; keep it for the stop word filtering step
sw = sw_rdd.collect()

In [0]:
# the filter below runs one membership test per word; a set answers each test
# in constant time, whereas the list in sw would be scanned element by element
sw_set = set(sw)

# create RDD w/ stop words removed
ws_words_f = (
  ws_words
  .filter(lambda x : x not in sw_set)
)

# create RDD w/ distinct elements
dist_words_f = (
  ws_words_f.distinct()
)

# print # of distinct non-stop words
print('Number of Distinct Non-Stop Words:', dist_words_f.count())

In [0]:
# pair every non-stop word with a count of 1 -> (word, 1)
pairs_f = ws_words_f.map(lambda word : (word, 1))

# add up the 1s per word, then order the (word, count) pairs by count, largest first
word_counts_f = (
  pairs_f
  .reduceByKey(lambda a, b : a + b)
  .sortBy(lambda pair : pair[1], ascending = False)
)

# take(20) returns a Python list holding the first 20 (word, count) pairs
word_list_f = word_counts_f.take(20)

# tabulate the pairs as a pandas DataFrame and render it
word_f_df = pd.DataFrame(word_list_f, columns = ['Word', 'Count'])
display(word_f_df)

Word,Count
will,4977
thy,4034
thee,3180
lord,3062
king,2871
good,2834
sir,2763
well,2553
enter,2350
love,2109


## Problem 5: Diamonds Dataset

In [0]:
# load the diamonds text file as an RDD of lines
diamonds_raw = sc.textFile('/FileStore/tables/diamonds.txt')

# report the number of lines that were read
n_lines = diamonds_raw.count()
print(n_lines)

In [0]:
# peek at the first 5 raw lines, printed one per line
first_rows = diamonds_raw.take(5)
for row in first_rows:
  print(row)

In [0]:
# tokenize rows
def process_row(row):
  """Split a tab-separated line and convert its first 10 fields to typed values.

  Fields 0, 4, 5, 7, 8 and 9 are converted to float, field 6 to int, and
  fields 1-3 are kept as strings. Any fields beyond the tenth are ignored.

  Returns a list of 10 values in the original field order.
  Raises IndexError if the line has fewer than 10 fields and ValueError if a
  numeric field cannot be parsed.
  """
  # one converter per field position; str leaves the text fields unchanged
  converters = (float, str, str, str, float, float, int, float, float, float)
  tokens = row.split('\t')
  # index into tokens (rather than zip) so a short row still raises IndexError
  return [convert(tokens[i]) for i, convert in enumerate(converters)]

# get header info (the first line of the file)
header = diamonds_raw.take(1)[0]

# process each row
diamonds = (
  diamonds_raw
  # remove the header row and blank lines; the comparison must be an exact
  # match, because `x not in header` on two strings is a substring test and
  # would also drop any data line that happens to be a fragment of the header
  .filter(lambda x : x != header and x.strip() != '')
  # process each element
  .map(process_row)
)

# the parsed rows are reused by later cells, so keep them cached
diamonds.persist()

# output first 5 rows
for row in diamonds.take(5):
  print(row)

## Problem 6: Grouped Means

In [0]:
# per-cut running totals: cut -> (carat sum, price sum, row count)
cut_totals = (
  diamonds
  # element 1 is the cut, element 0 the carat and element 6 the price
  .map(lambda d : (d[1], (d[0], d[6], 1)))
  # add the two 3-tuples position by position
  .reduceByKey(lambda a, b : tuple(p + q for p, q in zip(a, b)))
)

def to_means(kv):
  """Turn (cut, (carat_sum, price_sum, n)) into (cut, n, mean_carat, mean_price)."""
  cut, (carat_sum, price_sum, n) = kv
  # both means are rounded to 2 decimal places
  return (cut, n, round(carat_sum / n, 2), round(price_sum / n, 2))

cut_summary = cut_totals.map(to_means)

# bring the summary rows back to the driver as a list
cut_summary2 = cut_summary.collect()

# tabulate the summary as a pandas DataFrame and render it
cut_df = pd.DataFrame(
  cut_summary2,
  columns = ['Cut', 'Count', 'Mean_Carat', 'Mean_Price']
)
display(cut_df)

Cut,Count,Mean_Carat,Mean_Price
Premium,13791,0.89,4584.26
Good,4906,0.85,3928.86
Very Good,12082,0.81,3981.76
Fair,1610,1.05,4358.76
Ideal,21551,0.7,3457.54
