# Download Datasets

In [0]:
%sh
# Download the sample datasets used throughout this notebook.
# The later cells read them from /databricks/driver, so download there explicitly
# instead of relying on the shell's default working directory.
# set -e aborts the cell at the first failing command.
set -e
cd /databricks/driver

base_url='https://raw.githubusercontent.com/masfworld/datahack_docker/master/zeppelin/data'

# --fail makes curl exit non-zero on an HTTP error (e.g. 404) instead of silently
# saving the error page under the dataset's file name; --location follows redirects.
for name in frankenstein.txt el_quijote.txt characters.csv planets.csv; do
  curl --fail --location -O "${base_url}/${name}"
done

In [0]:
%sh
# List the contents of /databricks/driver, the directory the following cells read the datasets from.
ls /databricks/driver/

# RDD

---



## Example 1

In [0]:
# Read frankenstein.txt from the driver's local filesystem as an RDD of text lines.
textFile = spark.sparkContext.textFile('file:/databricks/driver/frankenstein.txt')

# first() is an action: it returns the first element (line) of the RDD.
first_line = textFile.first()
display(first_line)


## Creating parallelized collections
Parallelizing an existing Python collection is a quick way to create an RDD:

## Example 2

In [0]:
# Turn a local Python list into an RDD.
distData = spark.sparkContext.parallelize([25, 20, 15, 10, 5])

# reduce() combines the elements pairwise with the given function; here it sums them.
total = distData.reduce(lambda acc, value: acc + value)
display(total)

## Exercise 1
Count the number of lines in the `el_quijote.txt` file

---



In [0]:
# Read el_quijote.txt as an RDD of lines; this RDD is reused by the next exercise.
textfile_quijote = spark.sparkContext.textFile("file:/databricks/driver/el_quijote.txt")

# count() is an action that returns the number of elements (lines) in the RDD.
quijote_line_count = textfile_quijote.count()
print("Number of lines: " + str(quijote_line_count))

## Exercise 2
Print the first line of the file `el_quijote.txt`

---



In [0]:
# Show the first line of el_quijote.txt (uses the RDD created in Exercise 1).
quijote_first_line = textfile_quijote.first()
display(quijote_first_line)

## Transformations and Actions in RDDs 

### Actions

### Example 3

In [0]:
# Number of elements in the RDD
element_count = textFile.count()
print(element_count)

# First element of the RDD
first_element = textFile.first()
print(first_element)

### Transformations

### Example 4

In [0]:
# reduceByKey: count how many times each distinct line occurs in the text.
lines = spark.sparkContext.textFile("file:/databricks/driver/frankenstein.txt")

# Pair every line with an initial count of 1, then add up the counts per line.
pairs = lines.map(lambda line: (line, 1))
counts = pairs.reduceByKey(lambda left, right: left + right)

# Mark the result for caching; the count() action below then evaluates it,
# so later uses of `counts` can reuse the cached data.
counts.cache()
counts.count()

display(counts.collect())

In [0]:
# sortByKey: order the (line, count) pairs by their key, i.e. by the line text.
# The result is named `sorted_counts` rather than `sorted` so that the Python
# built-in sorted() is not shadowed for the rest of the notebook.
sorted_counts = counts.sortByKey()
display(sorted_counts.collect())

### Example 5

In [0]:
# filter: keep only the lines that contain the substring "the", then count them.
linesWithSpark = textFile.filter(lambda text: "the" in text)
matching_line_count = linesWithSpark.count()
display(matching_line_count)

### Exercise 3
Get the word count for the file `frankenstein.txt`

---

In [0]:
# Read the text as an RDD of lines; `words` is reused by the following exercises.
words = spark.sparkContext.textFile("file:/databricks/driver/frankenstein.txt")

# Word count, most frequent first.
# split() with no argument splits on any run of whitespace and never yields empty
# strings, whereas split(" ") produces "" for blank lines and consecutive spaces,
# which would then be counted as if it were a word.
(
    words.flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    # Swap to (count, word) so that sortByKey orders by frequency.
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(False)  # False = descending
    .collect()
)

### Exercise 4
Get the top 10 most frequent words that have more than 4 characters

---



In [0]:
# Top 10 words longer than 4 characters, variant 1: sort by count and take the first 10.
(
    words
    .flatMap(lambda text: text.split(" "))
    .filter(lambda token: len(token) > 4)
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    # Swap to (count, word) so that sortByKey orders by frequency.
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(False)  # False = descending
    .take(10)
)

In [0]:
# Top 10 words longer than 4 characters, variant 2: let top() pick the 10 pairs
# with the largest count, so no swap of the pairs and no sortByKey are needed.
(
    words
    .flatMap(lambda text: text.split(" "))
    .filter(lambda token: len(token) > 4)
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .top(10, key=lambda pair: pair[1])
)

## Key/Value Pair RDD

---



### Example 6


---



In [0]:
# Load the two Star Wars CSV files as RDDs of raw text lines.
planets_sw = spark.sparkContext.textFile("file:/databricks/driver/planets.csv")
charac_sw = spark.sparkContext.textFile("file:/databricks/driver/characters.csv")

# Preview the first ten lines of the characters file.
charac_sw.take(10)

In [0]:
# Preview the first ten lines of the planets file.
planets_sw.take(10)

In [0]:
from itertools import islice


def _skip_first_row_of_first_partition(idx, it):
    """Drop the first record of partition 0; pass every other partition through unchanged."""
    return islice(it, 1, None) if idx == 0 else it


# Remove the first line of each file (presumably the CSV header row — confirm against the data).
charac_sw_noheader = charac_sw.mapPartitionsWithIndex(_skip_first_row_of_first_partition)
planets_sw_noheader = planets_sw.mapPartitionsWithIndex(_skip_first_row_of_first_partition)

### Exercise 5
Get a list of the population of the planet each Star Wars character belongs to

---


In [0]:
# Key/value pairs built from the planets file (fields separated by ";").
# Presumably field 0 is the planet name and field 8 its population — confirm against planets.csv.
planets_sw_pair = (
    planets_sw_noheader
    .map(lambda row: row.split(";"))
    .map(lambda fields: (fields[0], fields[8]))
)

# Key/value pairs built from the characters file (fields separated by ",").
# Presumably field 8 is the character's home planet and field 0 the character's name —
# confirm against characters.csv.
characters_sw_pair = (
    charac_sw_noheader
    .map(lambda row: row.split(","))
    .map(lambda fields: (fields[8], fields[0]))
)

# Join both RDDs on their key and flatten each (key, (left, right)) result into a
# 3-tuple, drop duplicate rows, and show ten of them.
(
    characters_sw_pair
    .join(planets_sw_pair)
    .map(lambda joined: (joined[0], joined[1][0], joined[1][1]))
    .distinct()
    .take(10)
)