In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

from pyspark.sql.types import *
from pyspark.sql import functions

spark= SparkSession.builder.appName("Trabajando con Spark SQL").enableHiveSupport().getOrCreate()

sc = spark.sparkContext
sqlCtx = SQLContext(sc)

version =spark.version
print('Version',sqlCtx)

estudiantes = spark.read.json("estudiantes.json")
estudiantes.registerTempTable("Estudiantes")
notables = spark.sql("""SELECT nombre, apellidos, nota FROM Estudiantes WHERE nota >= 8 ORDER BY apellidos ASC """)

print("Notables: ")
print(notables.show())

('Version', <pyspark.sql.context.SQLContext object at 0x7f28e207fc50>)
Notables: 
+--------+--------------+----+
|  nombre|     apellidos|nota|
+--------+--------------+----+
|   Irene|     Abad Abad|8.16|
|    Jose|  Abad Antunez| 8.7|
|   Elena|    Abad Aznar|8.76|
|   Mateo|  Abad Barcino|8.99|
|Santiago|  Abad Barcino|8.31|
| Valeria| Abad Belmonte|9.59|
|  Sandra|  Abad Bermejo| 8.6|
|   David|   Abad Bilbao|9.41|
| Claudia|   Abad Bilbao|8.26|
|    Jose|   Abad Bilbao|8.61|
|    Maia|   Abad Blasco|8.89|
|  Daniel|   Abad Blasco|8.38|
|Santiago|    Abad Comas|9.62|
|   Irene|Abad Corominas|8.14|
|   Elena|   Abad Crespo|8.65|
| Dolores|   Abad Cuenca|8.77|
|    Sara|   Abad Cuervo|8.14|
|   Maria|   Abad Cuesta|8.08|
|    Juan|   Abad Cuesta|9.39|
|   Pablo|   Abad Garcia|9.12|
+--------+--------------+----+
only showing top 20 rows

None


In [2]:
## Funciones definidas por el usuario
def notatxt(nota):
    if nota < 5:
        return "suspenso"
    if nota < 6.5:
        return "aprobado"
    if nota < 9:
        return "notable"
    if nota < 9.9:
        return "excelente"
    return "matrícula"

# Creates a temporary view using the DataFrame
estudiantes.createOrReplaceTempView("Estudiantes")

#registrar función definida por el usuario para luego aplicarla en la sentencia sql
spark.udf.register("notatxt",notatxt)

publicada = spark.sql("""SELECT apellidos,nombre,notatxt(nota) AS Expediente FROM Estudiantes ORDER BY apellidos """)
print("Notas txt: ")
print(publicada.show())

Notas txt: 
+-------------+--------+----------+
|    apellidos|  nombre|Expediente|
+-------------+--------+----------+
|    Abad Abad|   Irene|   notable|
|  Abad Abadia|    Lope|   notable|
|  Abad Abadia|  Alvaro|   notable|
|  Abad Abadia|  Romina|   notable|
| Abad Abascal|    Jose|   notable|
| Abad Antunez|    Jose|   notable|
|   Abad Aznar| Soledad|   notable|
|   Abad Aznar|    Luna|   notable|
|   Abad Aznar|   Elena|   notable|
|   Abad Aznar|   Maria|   notable|
|   Abad Aznar| Enrique|   notable|
| Abad Barcino|   Mateo|   notable|
| Abad Barcino|Santiago|   notable|
| Abad Barcino|   Elena|   notable|
|Abad Belmonte|    Jose|   notable|
|Abad Belmonte| Valeria| excelente|
| Abad Bermejo|    Maia|   notable|
| Abad Bermejo|   Oriol|  aprobado|
| Abad Bermejo|  Sandra|   notable|
|  Abad Bernal|    Sara|  aprobado|
+-------------+--------+----------+
only showing top 20 rows

None


In [3]:
estudiantes = spark.read.json("estudiantes.json",StructType([StructField("_id",StringType(),False),
                                                        StructField("nombre",StringType(),False),
                                                        StructField("apellidos",StringType(),False),
                                                        StructField("edad",ByteType(),False),
                                                        StructField("email",StringType(),False),
                                                        StructField("nota",DecimalType(),False)]))

estadistica = estudiantes.agg(functions.min(estudiantes.edad),functions.max(estudiantes.edad),functions.avg(estudiantes.nota))

## Estadística
print(estadistica.collect())

## GroupBy
print(estudiantes.groupBy('nombre').avg().show())

## Select
print( (estudiantes.select('apellidos','nombre').orderBy(estudiantes.apellidos.asc()).show()) )

[Row(min(edad)=18, max(edad)=37, avg(nota)=Decimal('7.9600'))]
+---------+------------------+---------+
|   nombre|         avg(edad)|avg(nota)|
+---------+------------------+---------+
|    Oriol|22.504504504504503|   7.9730|
|Valentina| 21.85483870967742|   8.1935|
|  Agustin|             21.68|   7.7600|
|  Antonio|21.663716814159294|   8.0177|
|     Luna|22.111940298507463|   7.8806|
|     Iker|21.949579831932773|   7.9496|
|  Enrique| 22.37735849056604|   8.0000|
|    Diego|22.057142857142857|   7.9810|
|    Mauro|22.276785714285715|   8.1250|
|Guadalupe| 21.92248062015504|   7.8915|
| Victoria| 21.81512605042017|   8.0420|
|    Tomas| 22.23148148148148|   7.8333|
|    Pablo|22.289405684754524|   7.9509|
|  Dolores|21.982905982905983|   7.9573|
|   Alonso|22.486238532110093|   8.0734|
|    Pedro|22.192307692307693|   7.9615|
|    Irene| 22.24774774774775|   7.9640|
|   Judith|22.684615384615384|   7.9846|
|     Jose| 22.17966903073286|   7.9693|
|   Sergio|22.100371747211895|   7.

In [4]:
print("Data Frame Columns: \n")
estudiantes.show()

# We can now select data columns/fields by field_name
print("Looking at the nombre column: \n")
estudiantes.select("nombre").show()

Data Frame Columns: 

+--------------------+--------+-----------------+----+--------------------+----+
|                 _id|  nombre|        apellidos|edad|               email|nota|
+--------------------+--------+-----------------+----+--------------------+----+
|33a624e7-e6f1-40b...| Valeria| Sebastian Garcia|  23|Valeria.Sebastian...|   8|
|2cd47675-43f3-415...|    Emma|  Sanchez Abascal|  23|Sanchez.Abascal@g...|   8|
|594ea4e7-75e3-456...| Agustin|    Sarabia Lopez|  20|Sarabia.Lopez@gma...|   8|
|3b521244-d2d4-40b...| Martina|Corominas Sarabia|  25|MartinaySebastian...|   8|
|e6f52130-362f-4a5...|   David|   Miranda Grande|  19|DavidyValeria@gma...|   7|
|cee04454-f6ea-48b...|    Laia|     Lopez Bernal|  20|Lopez.Bernal@outl...|   7|
|6e5b75cd-0d5f-41f...|  Marcos|     Garcia Aznar|  22|MarcosySantiago@h...|   7|
|47435195-80b1-473...|  Judith|      Garcia Cruz|  18|Judith.Garcia@gma...|   9|
|fbdf66dc-49da-467...|    Iker|    Seco Coronado|  21| IkerySara@gmail.com|   8|
|0df69

In [5]:
# Probar métodos groupBy(), count(),  orderBy()
print("Histograma para la variable edad: \n")
estudiantes.groupBy("edad").count().orderBy("edad").show()

Histograma para la variable edad: 

+----+-----+
|edad|count|
+----+-----+
|  18| 1256|
|  19| 1170|
|  20| 1193|
|  21| 1158|
|  22| 1093|
|  23| 1017|
|  24|  821|
|  25|  688|
|  26|  517|
|  27|  410|
|  28|  278|
|  29|  177|
|  30|   94|
|  31|   66|
|  32|   30|
|  33|   17|
|  34|    8|
|  35|    5|
|  36|    1|
|  37|    1|
+----+-----+

