# (Py)Spark Exercises
@stravanni



In [None]:
from pyspark import SparkConf, SparkContext
# Default Spark configuration; settings are picked up from the environment
# (spark-submit / spark-defaults) rather than hard-coded here.
conf = SparkConf()
# getOrCreate() hands back the already-running context when this cell is
# executed a second time, instead of raising because only one SparkContext
# may be active at once.
sc = SparkContext.getOrCreate(conf=conf)

----
# Exercises
### Try to solve the following exercises using the API presented above

## A. Wordcount
1. Read the file "DivineComedy.txt", containing "THE DIVINE COMEDY"
2. Select the 10 most frequent words

In [None]:
# Directory prefix that is prepended to the name of every input file read in this notebook.
FILE_PATH = "data/"

In [None]:
# Write your solution here
# Ten most frequent words in "THE DIVINE COMEDY"
rdd = sc.textFile(FILE_PATH + "DivineComedy.txt")

res = (
    rdd
    # one element per whitespace-separated token of every line
    .flatMap(lambda line: line.split())
    # fold case and pair each token with an initial count of one
    .map(lambda token: (token.lower(), 1))
    # add up the ones belonging to the same token
    .reduceByKey(lambda left, right: left + right)
    # largest count first
    .sortBy(lambda pair: pair[1], ascending=False)
    # only the first ten (word, count) pairs are brought back
    .take(10)
)
res

## B. Estimating Pi
This code estimates π by "throwing darts" at a circle.

1. We pick random points in the unit square ((0, 0) to (1, 1)) and count how many fall inside the unit circle, i.e. satisfy x² + y² < 1.
2. The fraction of points inside should be π / 4, so multiplying it by 4 gives our estimate of π.

In [None]:
# SOLUTION
from random import random
# Number of random points to draw; also the denominator of the final estimate.
NUM_SAMPLES = 100000000

def sample(p):
    """Throw one dart at the unit square and report whether it hit.

    Two independent uniform values in [0, 1) are drawn as the dart's
    coordinates. The argument ``p`` is not used; it exists only so the
    function can be applied to each element of a collection.

    Returns 1 when the point satisfies x*x + y*y < 1, otherwise 0.
    """
    x = random()
    y = random()
    inside = x * x + y * y < 1
    return int(inside)


    

# One RDD element per dart; `sample` ignores the element's value, the range
# only fixes how many times it is called.
sample_nums = sc.parallelize(range(0, NUM_SAMPLES))

# Store the per-dart outcomes (1 = inside, 0 = outside) under their own name.
# Rebinding `sample` to this RDD would shadow the function of the same name.
hits = sample_nums.map(sample)


def summ(x, y):
    """Return x + y; used as the reducer that adds up the hit flags."""
    return x + y


# Total number of darts that landed inside the circle.
count = hits.reduce(summ)

# hits / samples approximates pi / 4, hence the factor of 4.
print("Pi is roughly %f" % (4.0 * count / NUM_SAMPLES))

## C. TMax

In [None]:
import re
import sys

In [None]:
def extractData(line):
    """Pull the (year, temp) pair out of one fixed-width text record.

    After stripping surrounding whitespace, the fields are read by position:
    characters 15-18 are the year, 87-91 the temperature and 92 a
    one-character flag ``q``. All values are kept as strings.
    NOTE(review): the column layout is taken as-is from the slices below —
    confirm against the input file's format specification.

    Returns a one-element list ``[(year, temp)]`` for an accepted record and
    an empty list otherwise, so the function can be used with ``flatMap`` to
    drop rejected lines. A record is rejected when the temperature is the
    placeholder "+9999" or when the flag is not one of 0, 1, 4, 5, 9.
    """
    record = line.strip()
    year = str(record[15:19])
    temp = str(record[87:92])
    flag = str(record[92:93])

    # Placeholder reading: nothing usable in this record.
    if temp == "+9999":
        return []
    # Only flags in the accepted set are kept (an empty flag never matches).
    if not re.match("[01459]", flag):
        return []
    return [(year, temp)]

In [None]:
#Create an RDD with one element per line of the input file
# NOTE(review): whether this relative path resolves to HDFS or to the local
# filesystem depends on the Spark/Hadoop configuration — confirm.
weatherData = sc.textFile(FILE_PATH + "1902.txt")

#Transform the data to extract/filter the (year, temp) pairs
# extractData returns a list with zero or one pair per line, so flatMap
# silently drops the rejected lines. The maximum is computed in a later cell.
temperature_per_year = weatherData.flatMap(extractData)

In [None]:
# Bring every (year, temp) pair back to the driver to inspect the parsed data.
temperature_per_year.collect()

In [None]:
# Convert each reading to int once, before reducing. Doing the conversion only
# inside the reducer leaves a key that has a single record with its original
# string value (the reducer is never invoked for it), so the result would mix
# str and int values. After mapValues(int), the builtin max is the reducer.
max_temperature_per_year = temperature_per_year.mapValues(int).reduceByKey(max)

In [None]:
# Bring the (year, maximum temperature) pairs back to the driver.
max_temperature_per_year.collect()

In [None]:
# Keep only the second component of every (key, value) pair.
# NOTE(review): despite the name `years`, the second component is the
# temperature value; the year is the key (index 0) — confirm which was intended.
years = max_temperature_per_year.values()
years.distinct().collect()

In [None]:
#Save the RDD as text files under the "out/output" directory
# saveAsTextFile does not overwrite: the call fails if the target path already
# exists, so remove the output of a previous run before re-executing this cell.
# NOTE(review): whether the path lands in HDFS or on the local filesystem
# depends on the Spark/Hadoop configuration — confirm.
max_temperature_per_year.saveAsTextFile("out/output")

#### Currently, pyspark doesn't support overwrite or append.

- The function `saveAsTextFile` is a wrapper around `saveAsHadoopFile`, and it is not possible to overwrite existing files.

#### in scala
It is however trivial to do this using HDFS directly from Scala:
```
val hadoopConf = new org.apache.hadoop.conf.Configuration()

val hdfs = org.apache.hadoop.fs.FileSystem.get(new java.net.URI("hdfs://localhost:9000"), hadoopConf)
```
#### in shell
- If you need to merge HDFS files, remember to use:
[hadoop getmerge](https://hadoop.apache.org/docs/r2.4.1/hadoop-project-dist/hadoop-common/FileSystemShell.html#getmerge)
- If you simply want to delete it:
```
hdfs dfs -rm -R "hdfs:///output"
```

In [None]:
# Read the previously saved result back in: one RDD element per text line
# found under "out/output".
weatherData_ = sc.textFile("out/output")

In [None]:
# Show the saved lines; each element is a plain string, not a tuple.
weatherData_.collect()