# Tutorial: creación de ETLs con PySpark

In [1]:
import os 
from pyspark.sql import functions as f, SparkSession, types as t
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql.functions import udf, col, length, isnan, when, count, regexp_replace
from datetime import datetime

In [2]:
# Transactional database server configuration.
# Use Estudiante_i as the user and, as the password, the one assigned in the
# virtual-machine connection spreadsheet.
# Credentials can be supplied through the ETL_DB_USER / ETL_DB_PSSWD environment
# variables; the literals are only fallbacks that keep the tutorial runnable.
db_user = os.environ.get('ETL_DB_USER', 'Estudiante_36_202314')
db_psswd = os.environ.get('ETL_DB_PSSWD', 'aabb1122')
source_db_connection_string = 'jdbc:mysql://157.253.236.116:8080/ProyectoTransaccional'
dest_db_connection_string = 'jdbc:mysql://157.253.236.116:8080/Proyecto_G3_202314'
# Path to the MySQL JDBC connector jar.
# Raw string: the Windows backslashes must not be parsed as escape sequences
# (\P, \M and \m are invalid escapes in a regular string literal).
path_jar_driver = r'C:\Program Files (x86)\MySQL\Connector J 8.0\mysql-connector-java-8.0.28.jar'

In [None]:
# Spark session configuration.
# The JDBC connector jar is added to the driver classpath; the readers/writers
# in this notebook request the 'com.mysql.cj.jdbc.Driver' class from it.
conf = SparkConf().set('spark.driver.extraClassPath', path_jar_driver)

# getOrCreate reuses a context that is already running, so executing this cell
# again does not fail the way a second SparkContext(...) constructor call does.
# NOTE(review): when a context already exists, `conf` is presumably not
# re-applied - restart the kernel after changing the jar path.
spark_context = SparkContext.getOrCreate(conf=conf)
sql_context = SQLContext(spark_context)
spark = sql_context.sparkSession

In [4]:
def obterner_dataframe_desde_csv(_PATH, _sep):
    """Read a delimited text file into a Spark DataFrame.

    The file is read with the ``csv`` format, treating the first row as the
    header and asking Spark to infer the column types.

    Args:
        _PATH: Location of the file to read.
        _sep: Field separator used in the file.

    Returns:
        The DataFrame returned by the module-level ``spark`` session reader.
    """
    read_kwargs = {
        'format': "csv",
        'sep': _sep,
        'inferSchema': "true",
        'header': 'true',
    }
    return spark.read.load(_PATH, **read_kwargs)

def obtener_dataframe_de_bd(db_connection_string, sql, db_user, db_psswd):
    """Load a JDBC source into a Spark DataFrame.

    Args:
        db_connection_string: JDBC URL of the database to read from.
        sql: Value passed as the ``dbtable`` option (a table name or a
            parenthesised, aliased subquery).
        db_user: Database user name.
        db_psswd: Password for ``db_user``.

    Returns:
        The DataFrame loaded through the module-level ``spark`` session using
        the ``com.mysql.cj.jdbc.Driver`` driver class.
    """
    # Options are applied in this exact order, one `.option` call each.
    jdbc_options = (
        ('url', db_connection_string),
        ('dbtable', sql),
        ('user', db_user),
        ('password', db_psswd),
        ('driver', 'com.mysql.cj.jdbc.Driver'),
    )
    reader = spark.read.format('jdbc')
    for option_name, option_value in jdbc_options:
        reader = reader.option(option_name, option_value)
    return reader.load()

def guardar_db(db_connection_string, df, tabla, db_user, db_psswd, mode='append'):
    """Write a DataFrame to a database table over JDBC.

    Args:
        db_connection_string: JDBC URL of the destination database.
        df: DataFrame to persist; its ``write`` attribute is used directly
            (the previous ``select('*')`` projected every column and was a no-op).
        tabla: Destination table, passed as the ``dbtable`` option.
        db_user: Database user name.
        db_psswd: Password for ``db_user``.
        mode: Save mode handed to the writer. Defaults to ``'append'``, the
            mode this function always used; pass e.g. ``'overwrite'`` to make
            a re-run replace the table contents instead of adding rows again.

    Returns:
        None. The rows are written as a side effect.
    """
    jdbc_options = (
        ('url', db_connection_string),
        ('dbtable', tabla),
        ('user', db_user),
        ('password', db_psswd),
        ('driver', 'com.mysql.cj.jdbc.Driver'),
    )
    writer = df.write.format('jdbc').mode(mode)
    for option_name, option_value in jdbc_options:
        writer = writer.option(option_name, option_value)
    writer.save()

In [None]:
Consulta de fechas desde las tablas Vuelos y Aeropuertos

In [6]:
# Source query for the Fecha (date) data set. It is wrapped in parentheses and
# aliased as `Fecha` because it is handed to the JDBC reader as the `dbtable`
# option. Four result sets are combined with UNION:
#   1. vuelos: ano/mes, with the day fixed to 1.
#   2. aeropuertos: fecha_construccion.
#   3. aeropuertos: fecha_vigencia, skipping NULL and '' values.
#   4. aeropuertos: anio, with month and day fixed to 1.
# NOTE(review): only the first branch casts idFecha to UNSIGNED; the other
# branches build it with CONCAT. Confirm which column type reaches Spark.
sql_fechas =  '''
(
SELECT
    CAST(CONCAT(ano, LPAD(mes, 2, '0'), LPAD('1', 2, '0')) AS UNSIGNED) AS idFecha,
    STR_TO_DATE(CONCAT(ano, '-', LPAD(mes, 2, '0'), '-', LPAD('1', 2, '0')), '%Y-%m-%d') as descripcion,
    ano AS anio,
    mes AS mes,
    1 AS dia
FROM ProyectoTransaccional.vuelos
union
SELECT
    CONCAT(YEAR(fecha_construccion), LPAD(MONTH(fecha_construccion), 2, '0'), LPAD(DAY(fecha_construccion), 2, '0')) AS idFecha,
    fecha_construccion AS descripcion,
    YEAR(fecha_construccion) AS anio,
    MONTH(fecha_construccion) AS mes,
    DAY(fecha_construccion) AS dia
FROM
    ProyectoTransaccional.aeropuertos
union
SELECT
    CONCAT(YEAR(fecha_vigencia), LPAD(MONTH(fecha_vigencia), 2, '0'), LPAD(DAY(fecha_vigencia), 2, '0')) AS idFecha,
    fecha_vigencia AS descripcion,
    YEAR(fecha_vigencia) AS anio,
    MONTH(fecha_vigencia) AS mes,
    DAY(fecha_vigencia) AS dia
FROM
    ProyectoTransaccional.aeropuertos
WHERE
    fecha_vigencia IS NOT NULL AND fecha_vigencia <> ''
union
SELECT
    CONCAT(YEAR(STR_TO_DATE(CONCAT(anio, '-01-01'), '%Y-%m-%d')), '0101') AS idFecha,
    DATE_FORMAT(STR_TO_DATE(CONCAT(anio, '-01-01'), '%Y-%m-%d'), '%Y-%m-%d') AS descripcion,
    anio AS anio,
    1 AS mes,
    1 AS dia
FROM
    ProyectoTransaccional.aeropuertos
) as Fecha
'''
# Run the query against the transactional source database.
Fecha = obtener_dataframe_de_bd(source_db_connection_string, sql_fechas, db_user, db_psswd)
# Preview the first 10 rows of the result.
Fecha.show(10)

+--------+-----------+----+---+---+
| idFecha|descripcion|anio|mes|dia|
+--------+-----------+----+---+---+
|20120901| 2012-09-01|2012|  9|  1|
|20121101| 2012-11-01|2012| 11|  1|
|20121201| 2012-12-01|2012| 12|  1|
|20120101| 2012-01-01|2012|  1|  1|
|20120201| 2012-02-01|2012|  2|  1|
|20040101| 2004-01-01|2004|  1|  1|
|20120301| 2012-03-01|2012|  3|  1|
|20040201| 2004-02-01|2004|  2|  1|
|20120401| 2012-04-01|2012|  4|  1|
|20040301| 2004-03-01|2004|  3|  1|
+--------+-----------+----+---+---+
only showing top 10 rows



In [None]:
Transformaciones 


In [8]:
# Keep a single row per date key. The column is referenced by its exact name
# (idFecha) so the call does not depend on case-insensitive column resolution.
Fecha = Fecha.dropDuplicates(["idFecha"])
# Report how many distinct keys remain, then display rows without truncation.
print(f"Distinct of IdFecha : {Fecha.count()}")
Fecha.show(truncate=False)

Distinct of IdFecha : 405
+--------+-----------+----+---+---+
|idFecha |descripcion|anio|mes|dia|
+--------+-----------+----+---+---+
|19570130|1957-01-30 |1957|1  |30 |
|19570424|1957-04-24 |1957|4  |24 |
|19570723|1957-07-23 |1957|7  |23 |
|19580205|1958-02-05 |1958|2  |5  |
|19580219|1958-02-19 |1958|2  |19 |
|19580228|1958-02-28 |1958|2  |28 |
|19591204|1959-12-04 |1959|12 |4  |
|19600817|1960-08-17 |1960|8  |17 |
|19650902|1965-09-02 |1965|9  |2  |
|19670505|1967-05-05 |1967|5  |5  |
|19670605|1967-06-05 |1967|6  |5  |
|19670828|1967-08-28 |1967|8  |28 |
|19680109|1968-01-09 |1968|1  |9  |
|19680124|1968-01-24 |1968|1  |24 |
|19680308|1968-03-08 |1968|3  |8  |
|19680503|1968-05-03 |1968|5  |3  |
|19680517|1968-05-17 |1968|5  |17 |
|19680612|1968-06-12 |1968|6  |12 |
|19680703|1968-07-03 |1968|7  |3  |
|19680710|1968-07-10 |1968|7  |10 |
+--------+-----------+----+---+---+
only showing top 20 rows



In [None]:
# Sort first, then display: show() prints the rows and returns None, so it has
# to be the last call in the chain.
Fecha.orderBy(col('idFecha')).show()

In [10]:
# Replace idFecha with its integer cast and sort the rows by that key.
Fecha = (
    Fecha
    .withColumn('idFecha', col('idFecha').cast('int'))
    .orderBy(col('idFecha'))
)
# Display the first rows of the transformed data.
Fecha.show()

+--------+-----------+----+---+---+
| idFecha|descripcion|anio|mes|dia|
+--------+-----------+----+---+---+
|19570130| 1957-01-30|1957|  1| 30|
|19570424| 1957-04-24|1957|  4| 24|
|19570723| 1957-07-23|1957|  7| 23|
|19580205| 1958-02-05|1958|  2|  5|
|19580219| 1958-02-19|1958|  2| 19|
|19580228| 1958-02-28|1958|  2| 28|
|19591204| 1959-12-04|1959| 12|  4|
|19600817| 1960-08-17|1960|  8| 17|
|19650902| 1965-09-02|1965|  9|  2|
|19670505| 1967-05-05|1967|  5|  5|
|19670605| 1967-06-05|1967|  6|  5|
|19670828| 1967-08-28|1967|  8| 28|
|19680109| 1968-01-09|1968|  1|  9|
|19680124| 1968-01-24|1968|  1| 24|
|19680308| 1968-03-08|1968|  3|  8|
|19680503| 1968-05-03|1968|  5|  3|
|19680517| 1968-05-17|1968|  5| 17|
|19680612| 1968-06-12|1968|  6| 12|
|19680703| 1968-07-03|1968|  7|  3|
|19680710| 1968-07-10|1968|  7| 10|
+--------+-----------+----+---+---+
only showing top 20 rows



In [11]:
# LOAD
# Write the Fecha data into the destination schema. guardar_db writes in append
# mode, so executing this cell again appends the same rows once more.
guardar_db(
    db_connection_string=dest_db_connection_string,
    df=Fecha,
    tabla='Proyecto_G3_202314.Fecha',
    db_user=db_user,
    db_psswd=db_psswd,
)