In [None]:
import pyspark
from pyspark.sql import SparkSession
# Application name handed to the builder via appName() below.
# NOTE(review): "accidentes" looks carried over from another notebook; this
# one counts words in books -- confirm the intended name.
app_name = "accidentes"
# Master URL handed to the builder via master() below (Spark local mode).
master = "local[*]"
# Build the session, or reuse one that already exists (getOrCreate).
spark = (SparkSession.builder
    .master(master)
    # NOTE(review): Spark documents spark.driver.cores as a cluster-mode
    # setting -- confirm it has any effect with a local master.
    .config("spark.driver.cores", 1)
    .appName(app_name)
    .getOrCreate() )
# SparkContext behind the session; later cells create RDDs with sc.parallelize.
sc = spark.sparkContext
print ('SparkContext created')

In [None]:
# Download the Huckleberry Finn text from the web and keep it in `data`
# as one decoded string.
import urllib.request
url = 'https://www.gutenberg.org/files/76/76-0.txt'  # huckleberry.txt on Project Gutenberg
# The context manager closes the HTTP response even if read() raises.
with urllib.request.urlopen(url) as response:
    raw_bytes = response.read()
data = raw_bytes.decode('utf-8')
# Number of characters downloaded.
len(data)

In [None]:
# Break the downloaded text into lines. The split is on '\n' only, so any
# '\r' stays attached to the end of its line.
lines = data.split('\n')
len (lines)

In [None]:
# Create the RDD from `lines` (one element per line) and count its elements.
book = sc.parallelize(lines)
book.count()

In [None]:
# Peek at the first element of the raw RDD.
book.first()

In [None]:
# Translation table for clean_line: a tab becomes a space, every character in
# the third argument is deleted. Built once instead of on every call.
_CLEAN_LINE_TABLE = str.maketrans('\t', ' ', '\ufeff\n\r()\'",.*')


def clean_line(line):
    r"""
    Strip noise characters from one line of text.

    - Tabs (\t) are replaced by a single space.
    - The byte-order mark (\ufeff), line breaks (\n, \r) and the
      punctuation characters ( ) ' " , . * are deleted.

    The byte-order mark is removed wherever it appears, not only when it is
    immediately followed by \r, so a first line such as '\ufeffTitle' is
    cleaned as well.

    Parameters:
        line: the string to clean.

    Returns:
        A new string with the characters above replaced or removed.
    """
    # One pass over the string instead of a chain of str.replace calls.
    return line.translate(_CLEAN_LINE_TABLE)

In [None]:
# Clean every line, then discard the lines that end up as the empty string.
cleaned_book = (
    book
    .map(lambda raw_line: clean_line(raw_line))
    .filter(lambda text: text != '')
)
cleaned_book.count()

In [None]:
# Peek at the first line that survived cleaning.
cleaned_book.first()

In [None]:
import re
def normalize_tokenize(line):
    """
    Normalize and tokenize one line of text.

    Normalize: convert the line to lowercase.
    Tokenize: split it into tokens on runs of whitespace.

    Parameters:
        line: the string to tokenize.

    Returns:
        A list of lowercase tokens. An empty or whitespace-only line gives
        an empty list, so no empty-string token is ever produced.
    """
    # str.split() with no separator splits on runs of whitespace and ignores
    # leading/trailing whitespace, so no regex pre-pass or strip() is needed.
    return line.lower().split()
# Flatten the per-line token lists into a single RDD of tokens and count them.
tokens = cleaned_book.flatMap (normalize_tokenize)
tokens.count()

In [None]:
# Peek at the first token.
tokens.first()

In [None]:
# Keep only the tokens longer than three characters, then count what is left.
reduced_tokens = tokens.filter (lambda s: len(s) > 3)
reduced_tokens.count()

In [None]:
# Peek at the first token that passed the length filter.
reduced_tokens.first()

In [None]:
# Pair every token with a count of 1, ready to be summed per word.
counts = reduced_tokens.map (lambda x: (x, 1))
counts.first()

In [None]:
# Add up the 1s of each word to get its number of occurrences.
reduced_counts = counts.reduceByKey(lambda total, increment: total + increment)
reduced_counts.take(4)

In [None]:
# First 4 pairs in the natural ascending order of the (word, count) tuples,
# i.e. ordered by word.
reduced_counts.takeOrdered(4)

In [None]:
# The 4 pairs with the lowest frequency (ascending by count).
reduced_counts.takeOrdered (4, key=lambda x: x[1])

In [None]:
# The 8 pairs with the highest frequency: the key negates the count, so the
# ascending takeOrdered puts the largest counts first.
reduced_counts.takeOrdered (8, key=lambda x: -x[1])

In [None]:
# Highest frequency again, this time asking top() for the 8 largest counts
# instead of negating the key.
reduced_counts.top (8, key=lambda x: x[1])

In [None]:
# Exclude the highest-frequency words (500 occurrences or more): they are
# very common but carry little meaning.
huckleberry_book = reduced_counts.filter(lambda pair: pair[1] < 500)
huckleberry_book.takeOrdered(8, key=lambda pair: -pair[1])

In [None]:
# Download the Hamlet text and split it into lines on '\n'.
# Note: `data` is rebound here to a list of lines.
hamlet_url = 'https://www.gutenberg.org/files/2265/2265.txt'
# The context manager closes the HTTP response even if read() raises.
with urllib.request.urlopen(hamlet_url) as response:
    data = response.read().decode('utf-8').split('\n')

In [None]:
# Build the word-frequency RDD for the Hamlet text in a single chain:
#   1. clean each line and drop the ones left empty
#   2. lowercase and tokenize
#   3. keep only the tokens longer than 3 characters
#   4. count the occurrences of each word
shakespeare_book = (
    sc.parallelize(data)
    .map(clean_line)
    .filter(lambda text: text != '')
    .flatMap(normalize_tokenize)
    .filter(lambda token: len(token) > 3)
    .map(lambda token: (token, 1))
    .reduceByKey(lambda total, increment: total + increment)
)
shakespeare_book.count()

In [None]:
# Peek at the first (word, count) pair of the Hamlet RDD.
shakespeare_book.first()

In [None]:
# The 4 Hamlet pairs with the lowest frequency (ascending by count).
shakespeare_book.takeOrdered (4, key=lambda x: x[1])

In [None]:
# Join the two books on the word to find the words used in both of them,
# then count how many there are.
common_words = huckleberry_book.join(shakespeare_book)
common_words.count()

In [None]:
# First 8 common words in natural ascending order, i.e. ordered by word.
common_words.takeOrdered (8)

In [None]:
# The 8 common words with the highest first-side count; huckleberry_book is
# the left side of the join, so x[1][0] is its frequency.
common_words.takeOrdered (8, key=lambda x: -x[1][0])

In [None]:
# The 8 common words with the smallest combined frequency (ascending by the
# sum of the frequencies in both books).
common_words.takeOrdered (8, key=lambda x: x[1][0] + x[1][1])

In [None]:
# The 8 common words with the largest combined frequency, using top().
common_words.top (8, key=lambda x: x[1][0] + x[1][1])

In [None]:
# Largest combined frequency again, via takeOrdered with the sum negated.
common_words.takeOrdered (8, key=lambda x: -1 * (x[1][0] + x[1][1]))

In [None]:
# Words that are unique to huckleberry_book: left-outer-join it against the
# Hamlet counts and keep the words whose Hamlet side is None.
hamlet_book = shakespeare_book
unique_huckleberry_book = (
    huckleberry_book
    .leftOuterJoin(hamlet_book)
    .filter(lambda pair: pair[1][1] is None)
    .map(lambda pair: pair[0])
)
unique_huckleberry_book.count()

In [None]:
# Show 8 of the words found only in huckleberry_book.
unique_huckleberry_book.take (8)

In [None]:
# Words that are unique to hamlet_book.
# The right-hand side has to be the complete Huckleberry Finn counts
# (reduced_counts), not huckleberry_book: the latter has had every word with
# 500 or more occurrences filtered out, so those very common words would find
# no match and be misreported as appearing only in Hamlet.
unique_hamlet_book = (
    hamlet_book
    .leftOuterJoin(reduced_counts)
    # A None right-hand value means the word never occurs in Huckleberry Finn.
    .filter(lambda pair: pair[1][1] is None)
    .map(lambda pair: pair[0])
)
unique_hamlet_book.count()

In [None]:
# Show 6 of the words found only in hamlet_book.
unique_hamlet_book.take (6)

In [None]:
# TODO: find the words that are unique to hamlet_book using rightOuterJoin
