In [1]:
import pyspark
from pyspark.sql import SparkSession
app_name = "accidentes"
master = "local[*]"
spark = (SparkSession.builder
    .master(master)
    .config("spark.driver.cores", 1)
    .appName(app_name)
    .getOrCreate() )
sc = spark.sparkContext
print ('SparkContext created')

SparkContext created


In [2]:
#Carga el fichero en el vector lines2 leyendolo desde la web
import urllib.request
url = 'https://www.gutenberg.org/files/76/76-0.txt' #huckleberry.txt en el proyecto Gutemberg
response = urllib.request.urlopen(url)
data = response.read()
data = data.decode('utf-8')
len(data)

606285

In [3]:
lines = data.split('\n')
len (lines)

12326

In [9]:
#Crea el RDD a partir de lines
book = sc.parallelize(lines)
book.count()

12326

In [10]:
book.first()

'\ufeff\r'

In [11]:
def clean_line(line):
    """
    Remove \ufeff\r characters
    Remove \t \n \r
    Remove additional characters
    """
    return line.replace('\ufeff\r', '').\
        replace('\t', ' ').replace('\n', '').replace('\r', '').\
        replace('(', '').replace(')', '').replace("'", '').\
        replace('"', '').replace(',', ''). replace('.', '').\
        replace('*', '')

In [12]:
# Remove characters and empty lines
cleaned_book = book.map(lambda x: clean_line (x))\
                   .filter (lambda x: x != '')
cleaned_book.count()

9655

In [13]:
cleaned_book.first()

'The Project Gutenberg EBook of Adventures of Huckleberry Finn Complete'

In [18]:
import re
def normalize_tokenize(line):
    """
    Normalize: lowercase
    tokenize: split in tokens
    """
    return re.sub('\s+', ' ', line).strip().lower().split(' ')  #['the','project',....]
tokens = cleaned_book.flatMap (normalize_tokenize)
tokens.count()

114194

In [19]:
tokens.first()

'the'

In [20]:
reduced_tokens = tokens.filter (lambda s: len(s) > 3)
reduced_tokens.count()

58043

In [21]:
reduced_tokens.first()

'project'

In [22]:
counts = reduced_tokens.map (lambda x: (x, 1))
counts.first()

('project', 1)

In [23]:
reduced_counts = counts.reduceByKey (
                    lambda accumulator , value : accumulator + value)  # conmutativa y asociativa
reduced_counts.take(4)

[('project', 83), ('gutenberg', 24), ('ebook', 10), ('adventures', 9)]

In [24]:
# ordered by natural key (word)
reduced_counts.takeOrdered(4)

[('#76]', 1), ('$5000', 1), ('1500', 1), ('2001', 1)]

In [25]:
# ordered by frequency
reduced_counts.takeOrdered (4, key=lambda x: x[1])

[('author:', 1), ('date:', 1), ('february', 1), ('language:', 1)]

In [26]:
# reverse order by frequency
reduced_counts.takeOrdered (8, key=lambda x: -x[1])

[('that', 1021),
 ('they', 690),
 ('with', 572),
 ('then', 565),
 ('there', 539),
 ('them', 471),
 ('down', 459),
 ('said', 458)]

In [27]:
# reverse order by frequency, other way
reduced_counts.top (8, key=lambda x: x[1])

[('that', 1021),
 ('they', 690),
 ('with', 572),
 ('then', 565),
 ('there', 539),
 ('them', 471),
 ('down', 459),
 ('said', 458)]

In [28]:
# exclude top n words with top high frequecy but meaningless
huckleberry_book = reduced_counts.filter(
          lambda x: x[1] < 500)
huckleberry_book.takeOrdered (8, key=lambda x: -x[1])

[('them', 471),
 ('down', 459),
 ('said', 458),
 ('when', 421),
 ('about', 416),
 ('would', 392),
 ('come', 366),
 ('what', 349)]

In [29]:
hamlet_url = 'https://www.gutenberg.org/files/2265/2265.txt'
response = urllib.request.urlopen(hamlet_url)
data = response.read().decode('utf-8').split('\n')

In [32]:
'''
Creates a RDD for hamlet book
Removes characters, empty lines
Tokenize
Removes stop words
Counts frequecy
'''
shakespeare_book = sc.parallelize (data).\
      map (clean_line).\
      filter (lambda x: x != '').\
      flatMap (normalize_tokenize).\
      filter (lambda x: len(x) > 3).\
      map (lambda x: (x, 1)).\
      reduceByKey (
          lambda accum, val: accum + val)
shakespeare_book.count()

5715

In [33]:
shakespeare_book.first()

('project', 36)

In [34]:
shakespeare_book.takeOrdered (4, key=lambda x: x[1])

[('produced', 1), ('tools', 1), ('developed', 1), ('improved', 1)]

In [35]:
'''
Perform join operation to find out what words
are used in both books
'''
common_words = huckleberry_book.join (shakespeare_book)
common_words.count()

1405

In [36]:
# ordering by word
common_words.takeOrdered (8)

[('2001', (1, 2)),
 ('about', (416, 24)),
 ('above', (17, 3)),
 ('accept', (1, 1)),
 ('access', (10, 1)),
 ('accident', (1, 2)),
 ('according', (6, 3)),
 ('account', (16, 2))]

In [37]:
common_words.takeOrdered (8, key=lambda x: -x[1][0])   

[('them', (471, 60)),
 ('said', (458, 12)),
 ('when', (421, 56)),
 ('about', (416, 24)),
 ('would', (392, 69)),
 ('come', (366, 99)),
 ('what', (349, 202)),
 ('didnt', (332, 1))]

In [38]:
# ordering by the sum of the frequencies in both books
common_words.takeOrdered (8, key=lambda x: x[1][0] + x[1][1])

[('comrade', (1, 1)),
 ('loses', (1, 1)),
 ('finds', (1, 1)),
 ('ominous', (1, 1)),
 ('wounded', (1, 1)),
 ('explanatory', (1, 1)),
 ('prayers', (1, 1)),
 ('passages', (1, 1))]

In [39]:
common_words.top (8, key=lambda x: x[1][0] + x[1][1])

[('this', (282, 309)),
 ('what', (349, 202)),
 ('them', (471, 60)),
 ('when', (421, 56)),
 ('said', (458, 12)),
 ('come', (366, 99)),
 ('your', (200, 265)),
 ('would', (392, 69))]

In [40]:
common_words.takeOrdered (8, key=lambda x: -1 * (x[1][0] + x[1][1]))

[('this', (282, 309)),
 ('what', (349, 202)),
 ('them', (471, 60)),
 ('when', (421, 56)),
 ('said', (458, 12)),
 ('come', (366, 99)),
 ('your', (200, 265)),
 ('would', (392, 69))]

In [41]:
# words that are unique to huckleberry_book
hamlet_book = shakespeare_book
unique_huckleberry_book = huckleberry_book.\
    leftOuterJoin (hamlet_book).\
        filter (lambda x: x[1][1] is None).\
        map (lambda x: x[0])
unique_huckleberry_book.count()

8514

In [42]:
unique_huckleberry_book.take (8)

['adventures',
 'huckleberry',
 'twain',
 'samuel',
 'clemens',
 'anyone',
 'anywhere',
 'restrictions']

In [47]:
# words that are unique to hamlet_book
unique_hamlet_book = hamlet_book.\
    leftOuterJoin (huckleberry_book).\
        filter (lambda x: x[1][1] is None).\
        map (lambda x: x[0])
unique_hamlet_book.count()

4310

In [44]:
unique_hamlet_book.take (6)

['proofing',
 'developed',
 'there',
 'improved',
 '#100',
 'https://wwwgutenbergorg/ebooks/100']

In [None]:
# words that are unique to hamlet_book using rightOuterJoin ????
