In [1]:
import pandas as pd
import numpy as np
import os
import pyspark
from pyspark.sql import SparkSession

In [2]:
from pyspark.sql.types import * ## Librería para realizacion de cast
from pyspark.sql.functions import *

In [3]:
#import findspark
#findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, SparkSession, SQLContext

In [4]:
def getContext(procedure, source, data):
    strPar = "Proc:{0} Source:{1} Data:{2}".format(procedure, source, data)
    conf = SparkConf().setAppName(strPar)##Nombre de la aplicación
    conf.set("spark.hadoop.validateOutputSpecs", "false") ## Configuracion para HDFS
    sc = SparkContext.getOrCreate(conf=conf)##Creación de Spark Context
    sqlContext = SQLContext(sc) ##
    return sqlContext, sc

sqlContext, sc = getContext('TrainingSpark', 'local', 'local')

### Conexión a fuentes de información

In [5]:
#from impala.dbapi import connect
#conn = connect('<Impala Daemon>', port=21050)
#cursor = conn.cursor()
#cursor.execute('SHOW DATABASES')
#cursor.fetchall()

In [6]:
#from pyhive import hive
#conn = hive.connect('elbahidata00.risorse.enel', port=10000)
#cursor = conn.cursor()
#cursor.execute('SHOW DATABASES')
#cursor.fetchall()

In [7]:
sparkSession = (SparkSession.builder.appName('example-pyspark-read-from-hive').config("hive.metastore.uris","thrift://elbahidata00.risorse.enel:10000").enableHiveSupport().getOrCreate())
sparkSession.sql('show tables').show()

#config("hive.metastore.uris","thrift://elbahidata01.risorse.enel:10000")

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



### Creación de sesión en Spark

In [8]:
spark = SparkSession.builder.getOrCreate()
#print(spark)

In [9]:
spark

### Copia de información desde HDFS a filesystem

In [10]:
os.system('hdfs dfs -get /user/co_digital_hub/raw/forcebeat/tdc/ /carga_datos_co/jupyter_notebook/tmp/Data')

256

In [11]:
os.listdir('/carga_datos_co/jupyter_notebook/tmp/Data')

['tdc', 'circuitos_bogota']

### Lectura de parquet desde filesystem to spark dataframe

In [12]:
df = spark.read.parquet('/carga_datos_co/jupyter_notebook/tmp/Data/tdc')

In [13]:
df.head()

Row(CENTRO_OPERATIVO='INS_DELTEC_SUR_OCC', COD_CONTRATISTA='2000039364', DES_CONTRATISTA='DELTEC S.A.', COD_CONTRATO='8400140413', DES_CONTRATO='Control Perdidas Bogotá  Suroccidente', FECHA_CREACION=1571293628000, COD_INT_TDC='5465408', COD_SIST_EXT='CODSYN', COD_EXT_TDC='942326172', PROCESO='Gestión Perdidas', TIPO_TDC='Orden de Inspección ', CICLO_TRABAJO='Ciclo 36 - MANTENIMIENTO MACROMEDIDOR', ESTADO_TDC='Finalizado', CAUSAL_ANUL_SOSP=None, NOMBRE_TDC=None, CODIGO_CLIENTE='117707165', NOMBRE_CLIENTE='OSCAR CESAR LEÓN RIVERA', RUTA_LECTURA='910003043200075', TELEFONO_CLIENTE='3143337746', SUCURSAL='1000', ZONA='3', LONGITUD='-74.1386795', LATITUD='4.6116628', CENTRO_OPERATIVO_CLIENTE=None, TIPO_VIA='URBANA', NUM_VIA_DIRECCION='05', NUM_PLACA_DIRECCION='69', COD_UNIDAD_PREDIAL=None, NUM_UNIDAD_PREDIAL=None, COD_MOD_UNIDAD_PREDIAL='PISO', NUM_MOD_UNIDAD_PREDIAL='1', LOCALIZACCION_TERRENO='FACHADA', MUNICIPIO='BOGOTÁ', TIPO_AGRUPACION=None, CODIGO_POSTAL=None, BARRIO='ZARAGOZA', TEXTO

In [14]:
#df3 = spark.read.load("tabla_temporal")

### Lectura de CSV

In [15]:
os.system('hdfs dfs -get /user/co_digital_hub/raw/forcebeat/circuitos_bogota/ /carga_datos_co/jupyter_notebook/tmp/Data')
os.listdir('/carga_datos_co/jupyter_notebook/tmp/Data')

['tdc', 'circuitos_bogota']

In [16]:
#df = spark.read.format("csv").option("header", "true").load("circuitos_bogota.csv")
lines = sc.textFile("/carga_datos_co/jupyter_notebook/tmp/Data/circuitos_bogota/circuitos_bogota.csv")

In [17]:
lines.collect() ## Mostrar información RDD

['Sur~Suroccidente~3~8400140422~CAM COLOMBIA MULTISERVICIOS S. ~2_AVENIDAS~SUB~1~-7,415,768,394,390,870~4,646,809,504,846,840~8400131501',
 'Sur~Suroccidente~3~8400140422~CAM COLOMBIA MULTISERVICIOS S. ~2_AVENIDAS~SUB~1~-7,415,780,099,126,900~4,646,936,285,171,930~8400131501',
 'Sur~Suroccidente~3~8400140422~CAM COLOMBIA MULTISERVICIOS S. ~2_AVENIDAS~AER~2~-7,416,175,310,960,750~4,651,644,789,345,930~8400131501',
 'Sur~Suroccidente~3~8400140422~CAM COLOMBIA MULTISERVICIOS S. ~2_AVENIDAS~AER~2~-7,416,190,647,625,710~4,651,554,514,375,100~8400131501',
 'Sur~Suroccidente~3~8400140422~CAM COLOMBIA MULTISERVICIOS S. ~2_AVENIDAS~SUB~3~-7,415,854,926,426,680~4,646,928,133,196,410~8400131501',
 'Sur~Suroccidente~3~8400140422~CAM COLOMBIA MULTISERVICIOS S. ~2_AVENIDAS~SUB~3~-7,415,849,513,010,300~4,646,964,251,554,860~8400131501',
 'Sur~Suroccidente~3~8400140422~CAM COLOMBIA MULTISERVICIOS S. ~2_AVENIDAS~SUB~4~-7,415,849,525,883,550~4,646,855,702,732,230~8400131501',
 'Sur~Suroccidente~3~840014

In [18]:
lines.first()

'Sur~Suroccidente~3~8400140422~CAM COLOMBIA MULTISERVICIOS S. ~2_AVENIDAS~SUB~1~-7,415,768,394,390,870~4,646,809,504,846,840~8400131501'

In [19]:
parts = lines.map(lambda l: l.split("~"))

In [20]:
#rdd_distribuido = parts.map(lambda p: Row(name=p[0],age=int(p[1])))

In [21]:
#dd_distribuido.collect()

In [22]:
data_df = spark.createDataFrame(parts)

In [23]:
headers = [ 'zona',
            'sub_zona',
            'cuadrante',
            'cod_contrato',
            'contrato',
            'circuito',
            'tipo_ubicacion',
            'grupo',
            'longitud',
            'latitud',
            'cod_contrato_ant']

columns = data_df.columns
str_name = ['col("'+columns[i]+'").alias("'+headers[i]+'")' for i in range(0,len(headers)) ]
data_df = eval('data_df.select('+(',').join(str_name)+')')

In [24]:
data_df.head()

Row(zona='Sur', sub_zona='Suroccidente', cuadrante='3', cod_contrato='8400140422', contrato='CAM COLOMBIA MULTISERVICIOS S. ', circuito='2_AVENIDAS', tipo_ubicacion='SUB', grupo='1', longitud='-7,415,768,394,390,870', latitud='4,646,809,504,846,840', cod_contrato_ant='8400131501')

In [25]:
data_df.dtypes 

[('zona', 'string'),
 ('sub_zona', 'string'),
 ('cuadrante', 'string'),
 ('cod_contrato', 'string'),
 ('contrato', 'string'),
 ('circuito', 'string'),
 ('tipo_ubicacion', 'string'),
 ('grupo', 'string'),
 ('longitud', 'string'),
 ('latitud', 'string'),
 ('cod_contrato_ant', 'string')]

In [26]:
data_df.select(data_df.zona).describe().show()

+-------+------+
|summary|  zona|
+-------+------+
|  count|612632|
|   mean|  null|
| stddev|  null|
|    min|Centro|
|    max|   Sur|
+-------+------+



In [27]:
data_df.columns

['zona',
 'sub_zona',
 'cuadrante',
 'cod_contrato',
 'contrato',
 'circuito',
 'tipo_ubicacion',
 'grupo',
 'longitud',
 'latitud',
 'cod_contrato_ant']

In [28]:
 data_df.count() 

612632

In [29]:
data_df.select(data_df.zona).distinct().show()

+------+
|  zona|
+------+
|   Sur|
| Norte|
|Centro|
+------+



In [30]:
data_df.select(data_df.zona).distinct().count()

3

In [31]:
 data_df.printSchema()

root
 |-- zona: string (nullable = true)
 |-- sub_zona: string (nullable = true)
 |-- cuadrante: string (nullable = true)
 |-- cod_contrato: string (nullable = true)
 |-- contrato: string (nullable = true)
 |-- circuito: string (nullable = true)
 |-- tipo_ubicacion: string (nullable = true)
 |-- grupo: string (nullable = true)
 |-- longitud: string (nullable = true)
 |-- latitud: string (nullable = true)
 |-- cod_contrato_ant: string (nullable = true)



In [32]:
data_df.show()

+----+------------+---------+------------+--------------------+----------+--------------+-----+--------------------+--------------------+----------------+
|zona|    sub_zona|cuadrante|cod_contrato|            contrato|  circuito|tipo_ubicacion|grupo|            longitud|             latitud|cod_contrato_ant|
+----+------------+---------+------------+--------------------+----------+--------------+-----+--------------------+--------------------+----------------+
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2_AVENIDAS|           SUB|    1|-7,415,768,394,39...|4,646,809,504,846...|      8400131501|
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2_AVENIDAS|           SUB|    1|-7,415,780,099,12...|4,646,936,285,171...|      8400131501|
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2_AVENIDAS|           AER|    2|-7,416,175,310,96...|4,651,644,789,345...|      8400131501|
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2_AVENI

In [33]:
data_df.explain()

== Physical Plan ==
*Project [_1#108 AS zona#131, _2#109 AS sub_zona#132, _3#110 AS cuadrante#133, _4#111 AS cod_contrato#134, _5#112 AS contrato#135, _6#113 AS circuito#136, _7#114 AS tipo_ubicacion#137, _8#115 AS grupo#138, _9#116 AS longitud#139, _10#117 AS latitud#140, _11#118 AS cod_contrato_ant#141]
+- Scan ExistingRDD[_1#108,_2#109,_3#110,_4#111,_5#112,_6#113,_7#114,_8#115,_9#116,_10#117,_11#118]


In [34]:
data_df.head()

Row(zona='Sur', sub_zona='Suroccidente', cuadrante='3', cod_contrato='8400140422', contrato='CAM COLOMBIA MULTISERVICIOS S. ', circuito='2_AVENIDAS', tipo_ubicacion='SUB', grupo='1', longitud='-7,415,768,394,390,870', latitud='4,646,809,504,846,840', cod_contrato_ant='8400131501')

In [35]:
data_df.first()

Row(zona='Sur', sub_zona='Suroccidente', cuadrante='3', cod_contrato='8400140422', contrato='CAM COLOMBIA MULTISERVICIOS S. ', circuito='2_AVENIDAS', tipo_ubicacion='SUB', grupo='1', longitud='-7,415,768,394,390,870', latitud='4,646,809,504,846,840', cod_contrato_ant='8400131501')

In [36]:
data_df.take(5)

[Row(zona='Sur', sub_zona='Suroccidente', cuadrante='3', cod_contrato='8400140422', contrato='CAM COLOMBIA MULTISERVICIOS S. ', circuito='2_AVENIDAS', tipo_ubicacion='SUB', grupo='1', longitud='-7,415,768,394,390,870', latitud='4,646,809,504,846,840', cod_contrato_ant='8400131501'),
 Row(zona='Sur', sub_zona='Suroccidente', cuadrante='3', cod_contrato='8400140422', contrato='CAM COLOMBIA MULTISERVICIOS S. ', circuito='2_AVENIDAS', tipo_ubicacion='SUB', grupo='1', longitud='-7,415,780,099,126,900', latitud='4,646,936,285,171,930', cod_contrato_ant='8400131501'),
 Row(zona='Sur', sub_zona='Suroccidente', cuadrante='3', cod_contrato='8400140422', contrato='CAM COLOMBIA MULTISERVICIOS S. ', circuito='2_AVENIDAS', tipo_ubicacion='AER', grupo='2', longitud='-7,416,175,310,960,750', latitud='4,651,644,789,345,930', cod_contrato_ant='8400131501'),
 Row(zona='Sur', sub_zona='Suroccidente', cuadrante='3', cod_contrato='8400140422', contrato='CAM COLOMBIA MULTISERVICIOS S. ', circuito='2_AVENIDAS

In [37]:
#data_df = data_df.withColumn('grupo', col('grupo').cast('float'))
#data_df = data_df.withColumn('longitud', col('longitud').cast('float'))
#data_df = data_df.withColumn('latitud', col('latitud').cast(FloatType()))

In [38]:
data_df = data_df.withColumn('circuito', regexp_replace('circuito', '[_]', ' '))

In [39]:
data_df.head()

Row(zona='Sur', sub_zona='Suroccidente', cuadrante='3', cod_contrato='8400140422', contrato='CAM COLOMBIA MULTISERVICIOS S. ', circuito='2 AVENIDAS', tipo_ubicacion='SUB', grupo='1', longitud='-7,415,768,394,390,870', latitud='4,646,809,504,846,840', cod_contrato_ant='8400131501')

In [40]:
data_df.schema 

StructType(List(StructField(zona,StringType,true),StructField(sub_zona,StringType,true),StructField(cuadrante,StringType,true),StructField(cod_contrato,StringType,true),StructField(contrato,StringType,true),StructField(circuito,StringType,true),StructField(tipo_ubicacion,StringType,true),StructField(grupo,StringType,true),StructField(longitud,StringType,true),StructField(latitud,StringType,true),StructField(cod_contrato_ant,StringType,true)))

In [41]:
data_df.select('sub_zona').show()

+------------+
|    sub_zona|
+------------+
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
|Suroccidente|
+------------+
only showing top 20 rows



In [42]:
data_df.dtypes 

[('zona', 'string'),
 ('sub_zona', 'string'),
 ('cuadrante', 'string'),
 ('cod_contrato', 'string'),
 ('contrato', 'string'),
 ('circuito', 'string'),
 ('tipo_ubicacion', 'string'),
 ('grupo', 'string'),
 ('longitud', 'string'),
 ('latitud', 'string'),
 ('cod_contrato_ant', 'string')]

In [43]:
#df.select("circuito","zona",explode("cuadrante").alias("Cuadrante Alias")).select("zona","circuito","cuadrante").show()

### Select

In [44]:
data_df.select(data_df["zona"],data_df["longitud"]+ 1).filter(data_df["longitud"].isNotNull()).show()

+----+--------------+
|zona|(longitud + 1)|
+----+--------------+
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
| Sur|          null|
+----+--------------+
only showing top 20 rows



In [45]:
data_df.filter(data_df["longitud"].isNotNull()).show()

+----+------------+---------+------------+--------------------+----------+--------------+-----+--------------------+--------------------+----------------+
|zona|    sub_zona|cuadrante|cod_contrato|            contrato|  circuito|tipo_ubicacion|grupo|            longitud|             latitud|cod_contrato_ant|
+----+------------+---------+------------+--------------------+----------+--------------+-----+--------------------+--------------------+----------------+
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2 AVENIDAS|           SUB|    1|-7,415,768,394,39...|4,646,809,504,846...|      8400131501|
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2 AVENIDAS|           SUB|    1|-7,415,780,099,12...|4,646,936,285,171...|      8400131501|
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2 AVENIDAS|           AER|    2|-7,416,175,310,96...|4,651,644,789,345...|      8400131501|
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2 AVENI

In [46]:
data_df.select(data_df['grupo'] > 5).show()

+-----------+
|(grupo > 5)|
+-----------+
|      false|
|      false|
|      false|
|      false|
|      false|
|      false|
|      false|
|      false|
|      false|
|      false|
|       true|
|       true|
|       true|
|       true|
|       true|
|       true|
|       true|
|       true|
|       true|
|       true|
+-----------+
only showing top 20 rows



### When

In [47]:
from pyspark.sql import functions as F
when = data_df.select("contrato",F.when(data_df.cuadrante > 3, 1).otherwise(0)).show()

+--------------------+-------------------------------------------+
|            contrato|CASE WHEN (cuadrante > 3) THEN 1 ELSE 0 END|
+--------------------+-------------------------------------------+
|CAM COLOMBIA MULT...|                                          0|
|CAM COLOMBIA MULT...|                                          0|
|CAM COLOMBIA MULT...|                                          0|
|CAM COLOMBIA MULT...|                                          0|
|CAM COLOMBIA MULT...|                                          0|
|CAM COLOMBIA MULT...|                                          0|
|CAM COLOMBIA MULT...|                                          0|
|CAM COLOMBIA MULT...|                                          0|
|CAM COLOMBIA MULT...|                                          0|
|CAM COLOMBIA MULT...|                                          0|
|CAM COLOMBIA MULT...|                                          0|
|CAM COLOMBIA MULT...|                                        

### Like

In [48]:
data_df.select("contrato",data_df.contrato.like("%COLOMBIA%")).show()

+--------------------+------------------------+
|            contrato|contrato LIKE %COLOMBIA%|
+--------------------+------------------------+
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                    true|
|CAM COLOMBIA MULT...|                  

### Startswith - Endswith

In [49]:
data_df.select("contrato",data_df.contrato.startswith("CAM")).show()

+--------------------+-------------------------+
|            contrato|startswith(contrato, CAM)|
+--------------------+-------------------------+
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT...|                     true|
|CAM COLOMBIA MULT..

In [50]:
data_df.select("circuito","contrato",data_df.contrato.startswith("CAM")).show()

+----------+--------------------+-------------------------+
|  circuito|            contrato|startswith(contrato, CAM)|
+----------+--------------------+-------------------------+
|2 AVENIDAS|CAM COLOMBIA MULT...|                     true|
|2 AVENIDAS|CAM COLOMBIA MULT...|                     true|
|2 AVENIDAS|CAM COLOMBIA MULT...|                     true|
|2 AVENIDAS|CAM COLOMBIA MULT...|                     true|
|2 AVENIDAS|CAM COLOMBIA MULT...|                     true|
|2 AVENIDAS|CAM COLOMBIA MULT...|                     true|
|2 AVENIDAS|CAM COLOMBIA MULT...|                     true|
|2 AVENIDAS|CAM COLOMBIA MULT...|                     true|
|2 AVENIDAS|CAM COLOMBIA MULT...|                     true|
|2 AVENIDAS|CAM COLOMBIA MULT...|                     true|
|2 AVENIDAS|CAM COLOMBIA MULT...|                     true|
|2 AVENIDAS|CAM COLOMBIA MULT...|                     true|
|2 AVENIDAS|CAM COLOMBIA MULT...|                     true|
|2 AVENIDAS|CAM COLOMBIA MULT...|       

In [51]:
data_df.select("circuito",data_df.circuito.endswith("DAS")).show()

+----------+-----------------------+
|  circuito|endswith(circuito, DAS)|
+----------+-----------------------+
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
|2 AVENIDAS|                   true|
+----------+-----------------------+
only showing top 20 rows



### Substring

In [52]:
data_df.select(data_df.circuito.substr(3, 9).alias("Circuito 2")).show()

+----------+
|Circuito 2|
+----------+
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
|  AVENIDAS|
+----------+
only showing top 20 rows



### Between

In [53]:
 data_df.select("cuadrante",data_df.cuadrante.between(4, 8)).filter(data_df.cuadrante.between(4, 8) == True).show()

+---------+---------------------------------------+
|cuadrante|((cuadrante >= 4) AND (cuadrante <= 8))|
+---------+---------------------------------------+
|        4|                                   true|
|        4|                                   true|
|        4|                                   true|
|        4|                                   true|
|        4|                                   true|
|        4|                                   true|
|        4|                                   true|
|        4|                                   true|
|        4|                                   true|
|        4|                                   true|
|        4|                                   true|
|        4|                                   true|
|        4|                                   true|
|        4|                                   true|
|        4|                                   true|
|        4|                                   true|
|        4| 

### Renaming Columns

In [54]:
data_df = data_df.withColumnRenamed('cod_contrato_ant', 'codigo_contrato_ant')

In [56]:
## data_df = data_df.drop("address", "phoneNumber")
## data_df = data_df.drop(data_df.address).data_df(df.phoneNumber)

In [57]:
data_df.show()

+----+------------+---------+------------+--------------------+----------+--------------+-----+--------------------+--------------------+-------------------+
|zona|    sub_zona|cuadrante|cod_contrato|            contrato|  circuito|tipo_ubicacion|grupo|            longitud|             latitud|codigo_contrato_ant|
+----+------------+---------+------------+--------------------+----------+--------------+-----+--------------------+--------------------+-------------------+
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2 AVENIDAS|           SUB|    1|-7,415,768,394,39...|4,646,809,504,846...|         8400131501|
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2 AVENIDAS|           SUB|    1|-7,415,780,099,12...|4,646,936,285,171...|         8400131501|
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2 AVENIDAS|           AER|    2|-7,416,175,310,96...|4,651,644,789,345...|         8400131501|
| Sur|Suroccidente|        3|  8400140422|CAM COLOMB

In [58]:
data_df.groupBy("circuito").count().show()

+----------+-----+
|  circuito|count|
+----------+-----+
|BELLASUIZA| 2120|
|MISERICORD|  642|
| ACUEDUCTO|  406|
|  AYACUCHO|  628|
| COMUNEROS|  864|
| CONSTRUIR|  520|
|  STA INES|  790|
|  ENGATIVA|  852|
|HILANDERIA|  352|
|      INEM| 1190|
|  V ANDINO|  588|
|UM SER AUX|    2|
| BELEN ETB| 1396|
|    BOSCAN|  746|
| IMPRE NAL|  350|
|TOR OVIEDO|  316|
| LUNA PARK|  848|
|   SANTAFE|  222|
|CALLE22 BC|  468|
|  EMISORAS|  886|
+----------+-----+
only showing top 20 rows



### Sort

In [59]:
data_df.groupBy("circuito").count().sort(data_df.circuito.desc()).show()

+----------+-----+
|  circuito|count|
+----------+-----+
|ZONAFRANCA|  428|
| ZARZAMORA|  824|
|ZAPATA BOL|  274|
|     XEROS|  758|
|WORL TRADE|  616|
|    WILSON|  698|
| WEST ARCO| 1156|
|VOZ VICTOR| 2510|
|  VOTO NAL|  510|
|     VOGUE|  242|
|  VL SONIA|  532|
| VL SAUCES|  908|
|  VL PRADO| 1336|
|  VL MAYOR|  498|
|  VL MARIA|  638|
|VL MAGDALA|  808|
|  VL ANITA|  504|
|VL ALSACIA| 1196|
| VIVIENDAS|  458|
|    VIVERO|  912|
+----------+-----+
only showing top 20 rows



In [60]:
 data_df.sort("circuito", ascending=False).show()

+------+----------------+---------+------------+-------------------+----------+--------------+-----+--------------------+--------------------+-------------------+
|  zona|        sub_zona|cuadrante|cod_contrato|           contrato|  circuito|tipo_ubicacion|grupo|            longitud|             latitud|codigo_contrato_ant|
+------+----------------+---------+------------+-------------------+----------+--------------+-----+--------------------+--------------------+-------------------+
|Centro|Centro Occidente|        3|  8400140428|CONSORCIO SS BOG 2 |ZONAFRANCA|           SUB|    1|-7,413,548,523,05...|4,671,658,276,148...|         8400135559|
|Centro|Centro Occidente|        3|  8400140428|CONSORCIO SS BOG 2 |ZONAFRANCA|           SUB|   11|-741,524,773,971,814|4,673,370,574,593...|         8400135559|
|Centro|Centro Occidente|        3|  8400140428|CONSORCIO SS BOG 2 |ZONAFRANCA|           SUB|    1|-7,413,561,165,76...|4,671,486,563,956...|         8400135559|
|Centro|Centro Occiden

In [61]:
data_df.orderBy(["circuito","cuadrante"],ascending=[0,1]).show()

+------+----------------+---------+------------+-------------------+----------+--------------+-----+--------------------+--------------------+-------------------+
|  zona|        sub_zona|cuadrante|cod_contrato|           contrato|  circuito|tipo_ubicacion|grupo|            longitud|             latitud|codigo_contrato_ant|
+------+----------------+---------+------------+-------------------+----------+--------------+-----+--------------------+--------------------+-------------------+
|Centro|Centro Occidente|        3|  8400140428|CONSORCIO SS BOG 2 |ZONAFRANCA|           SUB|    1|-7,413,548,523,05...|4,671,658,276,148...|         8400135559|
|Centro|Centro Occidente|        3|  8400140428|CONSORCIO SS BOG 2 |ZONAFRANCA|           SUB|   11|-741,524,773,971,814|4,673,370,574,593...|         8400135559|
|Centro|Centro Occidente|        3|  8400140428|CONSORCIO SS BOG 2 |ZONAFRANCA|           SUB|    1|-7,413,561,165,76...|4,671,486,563,956...|         8400135559|
|Centro|Centro Occiden

In [62]:
data_df.na.fill(0).show()

+----+------------+---------+------------+--------------------+----------+--------------+-----+--------------------+--------------------+-------------------+
|zona|    sub_zona|cuadrante|cod_contrato|            contrato|  circuito|tipo_ubicacion|grupo|            longitud|             latitud|codigo_contrato_ant|
+----+------------+---------+------------+--------------------+----------+--------------+-----+--------------------+--------------------+-------------------+
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2 AVENIDAS|           SUB|    1|-7,415,768,394,39...|4,646,809,504,846...|         8400131501|
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2 AVENIDAS|           SUB|    1|-7,415,780,099,12...|4,646,936,285,171...|         8400131501|
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2 AVENIDAS|           AER|    2|-7,416,175,310,96...|4,651,644,789,345...|         8400131501|
| Sur|Suroccidente|        3|  8400140422|CAM COLOMB

In [63]:
data_df.na.drop().show()

+----+------------+---------+------------+--------------------+----------+--------------+-----+--------------------+--------------------+-------------------+
|zona|    sub_zona|cuadrante|cod_contrato|            contrato|  circuito|tipo_ubicacion|grupo|            longitud|             latitud|codigo_contrato_ant|
+----+------------+---------+------------+--------------------+----------+--------------+-----+--------------------+--------------------+-------------------+
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2 AVENIDAS|           SUB|    1|-7,415,768,394,39...|4,646,809,504,846...|         8400131501|
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2 AVENIDAS|           SUB|    1|-7,415,780,099,12...|4,646,936,285,171...|         8400131501|
| Sur|Suroccidente|        3|  8400140422|CAM COLOMBIA MULT...|2 AVENIDAS|           AER|    2|-7,416,175,310,96...|4,651,644,789,345...|         8400131501|
| Sur|Suroccidente|        3|  8400140422|CAM COLOMB

### Registering DataFrames as Views

In [77]:
data_df.createGlobalTempView("circuitos_global_temp_view")

In [79]:
data_df.createTempView("circuitos_temp_view")

AnalysisException: "Temporary table 'circuitos_temp_view' already exists;"

In [80]:
data_df.createOrReplaceTempView("circuitos_temp_view")

In [81]:
data_df.write.saveAsTable("circuitos.circuitos_table")

AnalysisException: "Database 'circuitos' not found;"

In [73]:
print(spark.catalog.listTables())

[Table(name='circuitos_table', database='default', description=None, tableType='MANAGED', isTemporary=False), Table(name='circuitos_temp_view', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]


### Query Views Spark Catalog

In [75]:
df_query = spark.sql("SELECT * FROM circuitos_table").show()

+------+----------------+---------+------------+-------------------+----------+--------------+-----+--------------------+--------------------+-------------------+
|  zona|        sub_zona|cuadrante|cod_contrato|           contrato|  circuito|tipo_ubicacion|grupo|            longitud|             latitud|codigo_contrato_ant|
+------+----------------+---------+------------+-------------------+----------+--------------+-----+--------------------+--------------------+-------------------+
|Centro|Centro Occidente|        3|  8400140428|CONSORCIO SS BOG 2 |FONTIBON C|           AER|  274|-7,415,180,576,25...|4,684,586,460,431...|         8400135559|
|Centro|Centro Occidente|        3|  8400140428|CONSORCIO SS BOG 2 |FONTIBON C|           AER|  274|-7,415,223,800,27...|4,685,012,133,699...|         8400135559|
|Centro|Centro Occidente|        3|  8400140428|CONSORCIO SS BOG 2 |FONTIBON C|           SUB|  275|-741,355,927,403,451|4,672,210,195,696...|         8400135559|
|Centro|Centro Occiden

In [None]:
df_query_temp = spark.sql("SELECT * FROM global_temp.people").show()

In [None]:
spark.stop()

### Temporal

In [None]:
hdfs_path = "/carga_datos_co/jupyter_notebook/tmp/Data"

In [None]:
pd_temp2 = pd.DataFrame(np.random.random(10))

In [None]:
# Create spark_temp from pd_temp
spark_temp = spark.createDataFrame(pd_temp2)

In [None]:
# Examine the tables in the catalog
print(spark.catalog.listTables())

In [None]:
# Add spark_temp to the catalog
spark_temp.createOrReplaceTempView("tabla_temporal2")

In [None]:
# Examine the tables in the catalog again
print(spark.catalog.listTables())

In [None]:
#spark = SparkSession.builder.appName('abc').getOrCreate()
#df = pd.read_csv('path')
lista=[1,2,3,4,5,6]
lista
#spark.conf
df=spark.read.csv('filename.csv',header=True)