## RDD Creation

##### 1. Creating an RDD from an Object Collection

In [0]:
# Build an RDD from an in-memory Python list, then inspect the resulting type.
my_list = [
    "Spark is awesome",
    "Spark is cool",
]
listRDD = sc.parallelize(my_list)
type(listRDD)

##### 2. Creating an RDD from a File Data Source

In [0]:
# Read a text file into an RDD of its lines.
fileRDD = sc.textFile("dbfs:/databricks-datasets/README.md")
# collect() pulls every element back to the driver, so only print this way for small files.
print(fileRDD.collect())

# Alternative: the optional second argument requests a minimum number of partitions.
#fileRDD = sc.textFile("dbfs:/databricks-datasets/README.md", 5)
type(fileRDD)

## RDD Operations - Transformations & Actions

### RDD Transformations

* map(func)
* flatMap(func)
* filter(func)
* distinct()
* union(otherRDD)
* intersection(otherRDD)
* subtract(otherRDD)

### RDD Actions

* collect()
* count()
* first()
* reduce()
* take(n)
* takeOrdered()
* top()
* saveAsTextFile(path)

In [0]:
# Load the text file and derive an RDD holding the length of each line.
# Both steps are lazy transformations; nothing is read until an action runs.
shakespire = sc.textFile("dbfs:/FileStore/data/shakespeare.txt")
lineLengths = shakespire.map(len)

In [0]:
# reduce() is an action: it adds up all per-line lengths into one total.
totalLength = lineLengths.reduce(lambda total, length: total + length)
totalLength

##### map(func)

In [0]:
# map() produces exactly one output element per input line,
# so the result is a list of per-line word lists.
all_words = listRDD.map(lambda sentence: sentence.split(" ")).collect()
print(all_words)

##### flatMap(func)

In [0]:

# flatMap() flattens each line's word list into the output,
# so the result is a single flat list of words.
all_words_flat = listRDD.flatMap(lambda sentence: sentence.split(" ")).collect()
print(all_words_flat)

###### Compute the number of lines and words

In [0]:
# count() is an action returning the number of elements in the RDD
# (here: the number of lines, since the RDD was built with textFile).
lines_num = shakespire.count()
lines_num

In [0]:
# Count words by splitting each line on any run of whitespace.
# split() with no argument never yields empty strings, so blank lines and
# repeated spaces are not counted as words (split(" ") would return "" tokens
# for them and inflate the total).
shakespire.flatMap(lambda line: line.split()).count()

##### filter(func)

In [0]:
# Keep only the values above 5 from the numbers 1..9 and return them to the driver.
numbers = range(1, 10)
grater_than_5 = (
    sc.parallelize(numbers)
    .filter(lambda value: value > 5)
    .collect()
)
grater_than_5

##### collect()

In [0]:
# Distribute the local `numbers` sequence as an RDD (reused by the cells below).
numbersRDD = sc.parallelize(numbers)
# collect() returns every element of the RDD to the driver as a list.
numbersRDD.collect()


##### count()

In [0]:
# Number of elements in the RDD.
numbersRDD.count()

##### first()

In [0]:
# First element of the RDD.
numbersRDD.first()


##### take(n)

In [0]:
# First 7 elements of the RDD, returned to the driver as a list.
numbersRDD.take(7)

##### reduce(func)

In [0]:
# Combine all elements pairwise with addition, i.e. compute their sum.
numbersRDD.reduce(lambda acc, value: acc + value)

##### takeOrdered(n, [ordering])

In [0]:
# Ordering by the negated value reverses the sort,
# so this returns the 5 largest elements in descending order.
numbersRDD.takeOrdered(5, key=lambda value: -value)

##### top(n, [ordering])

In [0]:
# top(2) returns the 2 largest elements, in descending order.
another_numbers = [29, 11, 2, 8, 30, 23, 32]
sc.parallelize(another_numbers).top(2)


##### saveAsTextFile(path)

In [0]:
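# Uncomment to write the upper-cased lines out as text files.
# Note: the save fails if the output path already exists.
#fileRDD.map(lambda x : x.upper()).saveAsTextFile("/FileStore/data/sampleUpper")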

## Key-Value Pairs

### Transformations

* groupByKey()
* reduceByKey(func)
* sortByKey()

### Actions

* countByKey()
* collectAsMap()
* lookup(key)

##### groupByKey()

In [0]:
# Turn each word into a (length, word) pair so the word length acts as the key.
words = sc.parallelize(
    ["Spark", "is", "an", "amazing", "piece", "of", "technology"]
)
wordPairs = words.map(lambda word: (len(word), word))
wordPairs.collect()



In [0]:
# Group the words by their length.
# groupByKey() yields (key, ResultIterable) pairs whose values print as opaque
# objects; mapValues(list) materializes each group so the collected output
# shows the actual words for every length.
wordsByLen = wordPairs.groupByKey().mapValues(list).collect()
wordsByLen

##### reduceByKey()

In [0]:
# Classic word count: emit (word, 1) for every word, then sum the ones per word.
# Lines are tokenized with split() (any run of whitespace) rather than
# split(" "), so blank lines and repeated spaces do not produce an
# empty-string "word" in the counts.
pairs = shakespire.flatMap(lambda line: line.split()).map(lambda word: (word, 1))
counts = pairs.reduceByKey(lambda a, b: a + b)
# collect() returns every (word, count) pair to the driver.
counts.collect()

In [0]:
# lookup(key) returns the list of values stored under the given key.
counts.lookup("always")

##### countByKey()

In [0]:
# countByKey() is an action: it counts the elements per key and returns the
# result to the driver as a dictionary (not as an RDD).
counts2 = pairs.countByKey()
counts2