###  RATINGS COUNTER

In [None]:
#librería de configuración (ejecutar en pc o en un cluster) y SC :: esenciales
from pyspark import SparkConf, SparkContext
#collections para ordenar la salida del script
import collections

# Spark configuration: local master, application name "RatingsHistogram".
conf = SparkConf().setMaster("local").setAppName("RatingsHistogram")
# Reuse the context that is already running, if any. Constructing a second
# SparkContext in the same Python process (e.g. after running another cell of
# this notebook) raises "Cannot run multiple SparkContexts at once".
# NOTE: a reused context keeps the configuration it was originally created with.
sc = SparkContext.getOrCreate(conf=conf)

# Load the ratings file as an RDD with one element per text line.
lines = sc.textFile("C:/spark_data/ml-100k/u.data")

# Keep only the third whitespace-separated field of every line (the rating).
ratings = lines.map(lambda x: x.split()[2])
# Count how many times each distinct rating value occurs.
result = ratings.countByValue()

# Build the histogram in plain Python (no Spark): sort by rating and print
# one "<rating> <count>" line per value.
sortedResults = collections.OrderedDict(sorted(result.items()))
for key, value in sortedResults.items():
    print("%s %i" % (key, value))

### Friends-by-age

In [None]:
from pyspark import SparkConf, SparkContext

# Spark configuration: local master, application name "FriendsByAge".
conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
# Reuse the context that is already running, if any. Constructing a second
# SparkContext in the same Python process (e.g. after running another cell of
# this notebook) raises "Cannot run multiple SparkContexts at once".
# NOTE: a reused context keeps the configuration it was originally created with.
sc = SparkContext.getOrCreate(conf=conf)

def parseLine(line):
    """Turn one comma-separated record into an ``(age, numFriends)`` pair.

    The third and fourth comma-separated columns of ``line`` are converted
    to ``int`` and returned as a tuple, in that order.

    Raises:
        IndexError: if the line has fewer than four columns.
        ValueError: if either column is not a valid integer.
    """
    columns = line.split(',')
    return int(columns[2]), int(columns[3])

# Mapping: load the CSV and turn every line into an (age, numFriends) pair.
lines = sc.textFile("C:/spark_data/fakefriends.csv")
rdd = lines.map(parseLine)

# Counting & aggregating: each friend count becomes (count, 1) so that adding
# the pairs component-wise per age yields (total friends, number of rows).
withOnes = rdd.mapValues(lambda friends: (friends, 1))
totalsByAge = withOnes.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
# Divide the accumulated total by the row count to get the mean per age.
averagesByAge = totalsByAge.mapValues(lambda total: total[0] / total[1])

# Collect the averages on the driver and print one (age, average) tuple per line.
results = averagesByAge.collect()
for result in results:
    print(result)


### Filtering

In [None]:
from pyspark import SparkConf, SparkContext

# Spark configuration: local master, application name "MinTemperatures".
conf = SparkConf().setMaster("local").setAppName("MinTemperatures")
# Reuse the context that is already running, if any. Constructing a second
# SparkContext in the same Python process (e.g. after running another cell of
# this notebook) raises "Cannot run multiple SparkContexts at once".
# NOTE: a reused context keeps the configuration it was originally created with.
sc = SparkContext.getOrCreate(conf=conf)

def parseLine(line):
    """Parse one comma-separated record into ``(stationID, entryType, temperature)``.

    The first column is returned as the station id and the third as the
    entry type, both as strings. The fourth column is read as a float,
    scaled by 0.1 and then converted with ``* 9/5 + 32``.
    NOTE(review): this presumably means the raw value is tenths of a degree
    Celsius and the result is Fahrenheit; confirm against the data set.

    Raises:
        IndexError: if the line has fewer than four columns.
        ValueError: if the fourth column is not a valid number.
    """
    columns = line.split(',')
    stationID, entryType = columns[0], columns[2]
    scaled = float(columns[3]) * 0.1
    # Same left-to-right arithmetic as a single chained expression would use.
    temperature = scaled * (9.0 / 5.0) + 32.0
    return (stationID, entryType, temperature)

# Read the raw records and parse each into (stationID, entryType, temperature).
lines = sc.textFile("C:/spark_data/1800.csv")
parsedLines = lines.map(parseLine)

# Keep only the observations whose entry type contains "TMIN": the predicate
# returns a bool and filter() retains the True rows (use "TMAX" for maxima).
tminRecords = parsedLines.filter(lambda record: "TMIN" in record[1])
# Drop the entry type, leaving (stationID, temperature) pairs.
stationTemps = tminRecords.map(lambda record: (record[0], record[2]))
# Reduce by key: keep the smallest temperature seen for every station.
minTemps = stationTemps.reduceByKey(lambda a, b: min(a, b))

results = minTemps.collect()

# One line per station: "<stationID><TAB><temperature with two decimals>F".
for result in results:
    station, lowest = result
    print(station + "\t{:.2f}F".format(lowest))


### Counting words

In [None]:
import re
from pyspark import SparkConf, SparkContext

def normalizeWords(text):
    """Lower-case ``text`` and split it on runs of non-word characters.

    Punctuation and whitespace act as separators and are discarded. When the
    text starts or ends with a separator (or is empty), the returned list
    contains an empty string at that position.
    """
    lowered = text.lower()
    return re.split(r'\W+', lowered, flags=re.UNICODE)

# Spark configuration: local master, application name "WordCount".
conf = SparkConf().setMaster("local").setAppName("WordCount")
# Reuse the context that is already running, if any. Constructing a second
# SparkContext in the same Python process (e.g. after running another cell of
# this notebook) raises "Cannot run multiple SparkContexts at once".
# NOTE: a reused context keeps the configuration it was originally created with.
sc = SparkContext.getOrCreate(conf=conf)

# Named bookLines rather than `input` so the built-in input() is not shadowed
# for the rest of the session.
bookLines = sc.textFile("C:/spark_data/book.txt")
# flatMap: every line yields several words, all flattened into a single RDD.
words = bookLines.flatMap(normalizeWords)
# Manual counter: pair every word with 1 and add the ones up per word.
wordCounts = words.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
# Swap to (count, word) so sortByKey() orders the pairs by count.
wordCountsSorted = wordCounts.map(lambda x: (x[1], x[0])).sortByKey()
results = wordCountsSorted.collect()

for result in results:
    count = str(result[0])
    # Drop any non-ASCII characters; a word that becomes empty (or was the
    # empty token produced by the splitter) is skipped.
    word = result[1].encode('ascii', 'ignore')
    if (word):
        print(word.decode() + ":\t\t" + count)


### Total amount spent by customer

In [None]:
from pyspark import SparkConf, SparkContext

# Spark configuration: local master, application name "TotalSpent".
conf = SparkConf().setMaster("local").setAppName("TotalSpent")
# Reuse the context that is already running, if any. Constructing a second
# SparkContext in the same Python process (e.g. after running another cell of
# this notebook) raises "Cannot run multiple SparkContexts at once".
# NOTE: a reused context keeps the configuration it was originally created with.
sc = SparkContext.getOrCreate(conf=conf)

def parseLine(line):
    """Turn one comma-separated order record into ``(customerID, spent)``.

    The first column is converted to ``int`` and the third column to
    ``float``; the second column is ignored.

    Raises:
        IndexError: if the line has fewer than three columns.
        ValueError: if either column cannot be converted.
    """
    columns = line.split(',')
    return int(columns[0]), float(columns[2])

# Named orderLines rather than `input` so the built-in input() is not shadowed
# for the rest of the session.
orderLines = sc.textFile("C:/spark_data/customer-orders.csv")
# Parse every line into a (customerID, spent) pair.
rdd = orderLines.map(parseLine)

# Add up the amounts per customer.
spendByClient = rdd.reduceByKey(lambda x, y: (x + y))
# Swap to (total, customerID) so sortByKey() orders the pairs by total spent.
spendByClientSorted = spendByClient.map(lambda x: (x[1], x[0])).sortByKey()

# collect() brings the sorted pairs back to the driver as a plain Python list.
results = spendByClientSorted.collect()

for result in results:
    print(result)