## pySpark

In [1]:
from pyspark import  SparkContext
sc

<pyspark.context.SparkContext at 0x7fb12d547e50>

In [2]:
print(sc.master)

local[*]


In [3]:
sc._conf.getAll()

[(u'spark.driver.host', u'172.18.0.2'),
 (u'spark.driver.port', u'40277'),
 (u'spark.sql.catalogImplementation', u'hive'),
 (u'spark.rdd.compress', u'True'),
 (u'spark.serializer.objectStreamReset', u'100'),
 (u'spark.master', u'local[*]'),
 (u'spark.executor.id', u'driver'),
 (u'spark.submit.deployMode', u'client'),
 (u'spark.app.id', u'local-1497109223524'),
 (u'spark.app.name', u'PySparkShell'),
 (u'hive.metastore.warehouse.dir', u'file:/notebook/spark-warehouse')]

### Crear RDD

In [4]:
numbers = range(10)
numbers_rdd = sc.parallelize(numbers)

numbers_rdd

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:475

In [5]:
numbers_rdd.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [6]:
numbers_rdd.take(4)

[0, 1, 2, 3]

In [10]:
def cuadrado(x):
    return x**2

numbers_rdd.map(cuadrado).collect()

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [11]:
numbers_rdd.map(lambda x: x**2).collect()

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [18]:
###sumatorio de los elementos de un RDD
numbers_rdd.map(lambda x: x**2).reduce(lambda a,b: a+b)

285

In [13]:
##equivalente al anterior
numbers_rdd.map(lambda x: x**2).sum()

285

### Par/impar

In [15]:
def par_impar(x):
    return "par" if x%2==0 else "impar"
        
    
numbers_rdd.map(lambda x: (par_impar(x), x) ).collect()

[('par', 0),
 ('impar', 1),
 ('par', 2),
 ('impar', 3),
 ('par', 4),
 ('impar', 5),
 ('par', 6),
 ('impar', 7),
 ('par', 8),
 ('impar', 9)]

In [17]:
numbers_rdd.map(lambda x: (par_impar(x), x) ).reduceByKey(lambda a,b: a+b).collect()

[('par', 20), ('impar', 25)]

### Numeros primos

In [21]:
def isprime(n):
    """
    comprueba si es número primo
    """
    # make sure n is a positive integer
    n = abs(int(n))
    # 0 and 1 are not primes
    if n < 2:
        return False
    # 2 is the only even prime number
    if n == 2:
        return True
    # all other even numbers are not primes
    if not n & 1:
        return False
    # range starts with 3 and only needs to go up the square root of n
    # for all odd numbers
    for x in range(3, int(n**0.5)+1, 2):
        if n % x == 0:
            return False
    return True

In [22]:
# Crear un RDD qe representa una colección de números del 0 al 100
nums = sc.parallelize(xrange(100))

# Obtener de esa colección de números cuáles son primos
print(nums.filter(isprime).count())
filter(isprime, xrange(100))

25


[2,
 3,
 5,
 7,
 11,
 13,
 17,
 19,
 23,
 29,
 31,
 37,
 41,
 43,
 47,
 53,
 59,
 61,
 67,
 71,
 73,
 79,
 83,
 89,
 97]

### Contar catacteres,palabras y lineas de un fichero de texto

In [19]:
def file_count(line):
    return [("caracteres", len(line)), \
            ("palabras", len(line.split())), \
            ("lineas", 1)]

print (sc.textFile("about_spark.txt")
 .flatMap(file_count)
 .reduceByKey(lambda a,b: a+b)
 .collectAsMap())

{'palabras': 206, 'caracteres': 1356, 'lineas': 4}


In [20]:
import re
WORD_RE = re.compile(r"[\w']+")

print (sc.textFile("about_spark.txt")
 .flatMap(lambda line: WORD_RE.findall(line))
 .map(lambda word: (word.lower(), 1))
 .reduceByKey(lambda a,b: a+b)
 .map(lambda (k,v): (v,k))
 .takeOrdered(1, key = lambda x: -x[0]))

[(9, u'the')]


In [21]:
print (sc.textFile("about_spark.txt")
 .flatMap(lambda line: [(word.lower(), 1) for word in WORD_RE.findall(line)])
 .reduceByKey(lambda a,b: a+b)
 .takeOrdered(1, key = lambda x: -x[1]))

[(u'the', 9)]


### Palabras más frecuentes
#### Obtener la frecuencia de cada palabra en un fichero de texto

In [15]:
from operator import add

def tokenize(text):
    return text.split()

# Crear el RDD a partir del fichero de texto
text = sc.textFile("about_spark.txt") 

# Aplicar transformacones sobre el RDD
wc   = text.flatMap(tokenize)
wc   = wc.map(lambda x: (x,1)).reduceByKey(add)

#Obtener las 10 palabras más frecuentes
print(wc.take(10))

#guardar los resulados en un fichero de salida
wc.saveAsTextFile("output_file")

[(u'and', 8), (u'side,', 2), (u'closely', 1), (u'generality', 1), (u'sup\u2010', 1), (u'streaming.', 1), (u'is', 7), (u'built-in', 1), (u'as', 1), (u'including', 3)]
