In [0]:
from pyspark import *
from pyspark.sql import *

## Sesión

La sesión de Spark debe tener un nombre y es lo que une diferentes procesos de Spark

# RDD

Las primeras versiones de Spark utilizaban los RDDs directamente.
Usemos un ejemplo de contar palabras:

In [0]:
%fs ls /FileStore/tables/

path,name,size,modificationTime
dbfs:/FileStore/tables/PADRON_COMPLETO-1.csv,PADRON_COMPLETO-1.csv,437542254,1749404416000
dbfs:/FileStore/tables/PADRON_COMPLETO.csv,PADRON_COMPLETO.csv,437542254,1748737791000
dbfs:/FileStore/tables/SJ.csv,SJ.csv,140579880,1748917446000
dbfs:/FileStore/tables/distelec.csv,distelec.csv,175692,1749404350000
dbfs:/FileStore/tables/ejemplo.txt,ejemplo.txt,189,1748748188000
dbfs:/FileStore/tables/links.csv,links.csv,197979,1749404504000
dbfs:/FileStore/tables/movies.csv,movies.csv,494431,1749404505000
dbfs:/FileStore/tables/por_ciclo_2016_2018.csv,por_ciclo_2016_2018.csv,904931,1749404351000
dbfs:/FileStore/tables/ratings.csv,ratings.csv,2483723,1749404506000
dbfs:/FileStore/tables/ratings_noheader.csv,ratings_noheader.csv,2483690,1749404505000


In [0]:
Ejemplo_RDD = spark.sparkContext.textFile("dbfs:/FileStore/tables/ejemplo.txt").flatMap(lambda line: line.split(" "))

In [0]:
rdd_map =  Ejemplo_RDD.map(lambda word: (word, 1)) 

In [0]:
rdd_reduce = rdd_map.reduceByKey(lambda x, y: x + y)

In [0]:
print(rdd_reduce)

PythonRDD[6] at RDD at PythonRDD.scala:58


In [0]:
resultado = rdd_reduce.collect()
print(resultado)

[('Este', 3), ('ejemplo', 3), ('archivo', 3), ('que', 3), ('es', 3), ('un', 6), ('de', 3), ('tiene', 3), ('3', 3), ('oraciones', 3), ('iguales', 3)]


# Spark Dataframes

Framework introducido por Spark 2+ para facilitar el manejo de datos

In [0]:
df = spark.read.csv("dbfs:/FileStore/tables/PADRON_COMPLETO.csv")

In [0]:
df.show(10)

+---------+-------+-------+----------+-----+--------------------+--------------------+--------------------+
|      _c0|    _c1|    _c2|       _c3|  _c4|                 _c5|                 _c6|                 _c7|
+---------+-------+-------+----------+-----+--------------------+--------------------+--------------------+
|   CEDULA|CODELEC|RELLENO|FECHACADUC|JUNTA|              NOMBRE|          1_APELLIDO|          2_APELLIDO|
|101053316| 104015|       |  20280207|00000|LUCILA           ...|PORRAS           ...|AGUERO           ...|
|101086526| 101012|       |  20280207|00000|DINORA           ...|OBANDO           ...|GARCIA           ...|
|101141655| 103033|       |  20300204|00000|TRINIDAD         ...|VINDAS           ...|PEREZ            ...|
|101142031| 112007|       |  20300630|00000|INOCENCIA        ...|MEZA             ...|VEGA             ...|
|101164392| 119001|       |  20300306|00000|JOSE FRANCISCO   ...|DUARTE           ...|QUESADA          ...|
|101240037| 823001|       | 

In [0]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)



In [0]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType

padron_schema  = StructType(fields=[
    StructField("CEDULA",IntegerType(),True), 
    StructField("CODELEC",IntegerType(),True),
    StructField("RELLENO",StringType(),True),
    StructField("FECHACADUC",DateType(),True),
    StructField("JUNTA",IntegerType(),True),
    StructField("NOMBRE",StringType(),True),
    StructField("1_APELLIDO",StringType(),True),
    StructField("2_APELLIDO",StringType(),True),
])


In [0]:
df = spark.read\
    .option("header", True)\
    .option("dateFormat", "yyyyMMdd")\
    .schema(padron_schema)\
    .csv("dbfs:/FileStore/tables/PADRON_COMPLETO.csv")

In [0]:
df.show(5)

+---------+-------+-------+----------+-----+--------------------+--------------------+--------------------+
|   CEDULA|CODELEC|RELLENO|FECHACADUC|JUNTA|              NOMBRE|          1_APELLIDO|          2_APELLIDO|
+---------+-------+-------+----------+-----+--------------------+--------------------+--------------------+
|101053316| 104015|       |2028-02-07|    0|LUCILA           ...|PORRAS           ...|AGUERO           ...|
|101086526| 101012|       |2028-02-07|    0|DINORA           ...|OBANDO           ...|GARCIA           ...|
|101141655| 103033|       |2030-02-04|    0|TRINIDAD         ...|VINDAS           ...|PEREZ            ...|
|101142031| 112007|       |2030-06-30|    0|INOCENCIA        ...|MEZA             ...|VEGA             ...|
|101164392| 119001|       |2030-03-06|    0|JOSE FRANCISCO   ...|DUARTE           ...|QUESADA          ...|
+---------+-------+-------+----------+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [0]:
df.printSchema()

root
 |-- CEDULA: integer (nullable = true)
 |-- CODELEC: integer (nullable = true)
 |-- RELLENO: string (nullable = true)
 |-- FECHACADUC: date (nullable = true)
 |-- JUNTA: integer (nullable = true)
 |-- NOMBRE: string (nullable = true)
 |-- 1_APELLIDO: string (nullable = true)
 |-- 2_APELLIDO: string (nullable = true)



In [0]:
df.describe().show()

+-------+--------------------+------------------+-------+-------+--------------------+--------------------+--------------------+
|summary|              CEDULA|           CODELEC|RELLENO|  JUNTA|              NOMBRE|          1_APELLIDO|          2_APELLIDO|
+-------+--------------------+------------------+-------+-------+--------------------+--------------------+--------------------+
|  count|             3641757|           3641757|3641757|3641757|             3641757|             3641757|             3641757|
|   mean|3.1530629119036746E8| 317030.3891742365|   null|    0.0|                null|                 NaN|                 NaN|
| stddev|2.3074491508769274E8|210099.51534055025|   null|    0.0|                null|                null|                 NaN|
|    min|           101053316|            101001|       |      0|A HYUN           ...|AANDERUD         ...|AAGESEN          ...|
|    max|           901530523|            848001|       |      0|�ENYIRO          ...|�URINDA    

In [0]:
df .createOrReplaceTempView("padron")
results = spark.sql("SELECT * FROM padron WHERE FECHACADUC='2028-02-07'")


In [0]:
results.show(5)

+---------+-------+-------+----------+-----+--------------------+--------------------+--------------------+
|   CEDULA|CODELEC|RELLENO|FECHACADUC|JUNTA|              NOMBRE|          1_APELLIDO|          2_APELLIDO|
+---------+-------+-------+----------+-----+--------------------+--------------------+--------------------+
|101053316| 104015|       |2028-02-07|    0|LUCILA           ...|PORRAS           ...|AGUERO           ...|
|101086526| 101012|       |2028-02-07|    0|DINORA           ...|OBANDO           ...|GARCIA           ...|
|101370578| 103008|       |2028-02-07|    0|ODILI            ...|VILLALOBOS       ...|CHAVES           ...|
|101440643| 201005|       |2028-02-07|    0|DELMA            ...|CARMONA          ...|CORDERO          ...|
|101460525| 102001|       |2028-02-07|    0|APOLONIO         ...|PEREZ            ...|MENA             ...|
+---------+-------+-------+----------+-----+--------------------+--------------------+--------------------+
only showing top 5 rows

