In [1]:
from pyspark.sql import SparkSession

# Create (or reuse, if one already exists) a SparkSession named "sesion_2"
# that runs locally using all available cores.
spark = (
    SparkSession.builder
    .appName("sesion_2")
    .master("local[*]")
    .getOrCreate()
)

In [2]:
# Echo the session object so the notebook renders its representation.
spark

In [3]:
# Load the contracts CSV: the first line is a header, fields are separated by
# commas, and column types are inferred from the data.
contracts_df = (
    spark.read
    .options(header="true", delimiter=",", inferSchema="true")
    .csv("../../resources/data/csv/contracts.csv")
)

In [4]:
# Preview only the first two rows of the raw contracts data.
contracts_df.show(2)

+-------+-----------+------------+----------+------+
|cod_iuc|cod_titular|cod_producto|  fec_alta|activo|
+-------+-----------+------------+----------+------+
|  30000|          6|         100|2012-05-01|  true|
|  30001|          6|         200|2014-05-01|  true|
+-------+-----------+------------+----------+------+
only showing top 2 rows



In [5]:
# Print each column's name, inferred type and nullability as a tree.
contracts_df.printSchema()

root
 |-- cod_iuc: integer (nullable = true)
 |-- cod_titular: integer (nullable = true)
 |-- cod_producto: integer (nullable = true)
 |-- fec_alta: date (nullable = true)
 |-- activo: boolean (nullable = true)



#### - Clase Column
#### - Package Types
#### - Object functions
#### - Clase DataFrame

### Transformaciones

In [6]:
import pyspark.sql.functions as f
import pyspark.sql.types as t

# Derive a synthetic end date for every contract:
#   1. rename the start-date column,
#   2. stamp the current date,
#   3. draw a whole number of days between 0 and 10 (rand seeded with 0),
#   4. add those days to the start date,
#   5. measure the gap in days, then discard the helper column.
resulted_df = (
    contracts_df
    .withColumnRenamed("fec_alta", "fec_alta_ini")
    .withColumn("actual_date", f.current_date())
    .withColumn(
        "randm_num",
        f.round(f.rand(seed=0) * f.lit(10)).cast(t.IntegerType()),
    )
    .withColumn(
        "fec_alta_fin",
        f.date_add(f.col("fec_alta_ini"), f.col("randm_num")),
    )
    .withColumn(
        "diff",
        f.datediff(f.col("fec_alta_fin"), f.col("fec_alta_ini")),
    )
    .drop("randm_num")
)

### Acciones

In [7]:
# show() is an action: it runs the transformations defined above and prints
# the first two resulting rows.
resulted_df.show(2)

+-------+-----------+------------+------------+------+-----------+------------+----+
|cod_iuc|cod_titular|cod_producto|fec_alta_ini|activo|actual_date|fec_alta_fin|diff|
+-------+-----------+------------+------------+------+-----------+------------+----+
|  30000|          6|         100|  2012-05-01|  true| 2024-01-16|  2012-05-09|   8|
|  30001|          6|         200|  2014-05-01|  true| 2024-01-16|  2014-05-06|   5|
+-------+-----------+------------+------------+------+-----------+------------+----+
only showing top 2 rows



In [8]:
# count() is an action that returns the number of rows in the DataFrame.
resulted_df.count()

32

In [9]:
# collect() returns every row to the driver as a list of Row objects.
# Use it only on small results: the whole DataFrame must fit in driver memory.
resulted_df.collect() ## WARNING

[Row(cod_iuc=30000, cod_titular=6, cod_producto=100, fec_alta_ini=datetime.date(2012, 5, 1), activo=True, actual_date=datetime.date(2024, 1, 16), fec_alta_fin=datetime.date(2012, 5, 9), diff=8),
 Row(cod_iuc=30001, cod_titular=6, cod_producto=200, fec_alta_ini=datetime.date(2014, 5, 1), activo=True, actual_date=datetime.date(2024, 1, 16), fec_alta_fin=datetime.date(2014, 5, 6), diff=5),
 Row(cod_iuc=30002, cod_titular=6, cod_producto=300, fec_alta_ini=datetime.date(2006, 2, 1), activo=False, actual_date=datetime.date(2024, 1, 16), fec_alta_fin=datetime.date(2006, 2, 2), diff=1),
 Row(cod_iuc=30003, cod_titular=6, cod_producto=150, fec_alta_ini=datetime.date(2012, 5, 1), activo=True, actual_date=datetime.date(2024, 1, 16), fec_alta_fin=datetime.date(2012, 5, 4), diff=3),
 Row(cod_iuc=30002, cod_titular=5, cod_producto=300, fec_alta_ini=datetime.date(2012, 5, 1), activo=True, actual_date=datetime.date(2024, 1, 16), fec_alta_fin=datetime.date(2012, 5, 8), diff=7),
 Row(cod_iuc=30004, cod_

In [10]:
# isEmpty() returns True when the DataFrame has no rows, False otherwise.
resulted_df.isEmpty()

False

In [11]:
# first() -> first row of the DataFrame; equivalent to head()
# head(n) -> first n rows of the DataFrame; equivalent to take(n)
# take(n) -> first n rows of the DataFrame; equivalent to limit(n).collect()

resulted_df.take(4)

[Row(cod_iuc=30000, cod_titular=6, cod_producto=100, fec_alta_ini=datetime.date(2012, 5, 1), activo=True, actual_date=datetime.date(2024, 1, 16), fec_alta_fin=datetime.date(2012, 5, 9), diff=8),
 Row(cod_iuc=30001, cod_titular=6, cod_producto=200, fec_alta_ini=datetime.date(2014, 5, 1), activo=True, actual_date=datetime.date(2024, 1, 16), fec_alta_fin=datetime.date(2014, 5, 6), diff=5),
 Row(cod_iuc=30002, cod_titular=6, cod_producto=300, fec_alta_ini=datetime.date(2006, 2, 1), activo=False, actual_date=datetime.date(2024, 1, 16), fec_alta_fin=datetime.date(2006, 2, 2), diff=1),
 Row(cod_iuc=30003, cod_titular=6, cod_producto=150, fec_alta_ini=datetime.date(2012, 5, 1), activo=True, actual_date=datetime.date(2024, 1, 16), fec_alta_fin=datetime.date(2012, 5, 4), diff=3)]

In [12]:
# tail(n) returns the last n rows to the driver as a list of Row objects.
resulted_df.tail(4)

[Row(cod_iuc=30008, cod_titular=7, cod_producto=800, fec_alta_ini=datetime.date(2004, 9, 1), activo=True, actual_date=datetime.date(2024, 1, 16), fec_alta_fin=datetime.date(2004, 9, 10), diff=9),
 Row(cod_iuc=30008, cod_titular=2, cod_producto=800, fec_alta_ini=datetime.date(2014, 3, 21), activo=True, actual_date=datetime.date(2024, 1, 16), fec_alta_fin=datetime.date(2014, 3, 29), diff=8),
 Row(cod_iuc=30004, cod_titular=2, cod_producto=400, fec_alta_ini=datetime.date(2008, 2, 1), activo=False, actual_date=datetime.date(2024, 1, 16), fec_alta_fin=datetime.date(2008, 2, 10), diff=9),
 Row(cod_iuc=30010, cod_titular=7, cod_producto=1000, fec_alta_ini=datetime.date(2014, 8, 1), activo=False, actual_date=datetime.date(2024, 1, 16), fec_alta_fin=datetime.date(2014, 8, 8), diff=7)]

In [13]:
# describe() computes basic statistics (count, mean, stddev, min, max)
# for the listed columns and returns them as a new DataFrame.
resulted_df.describe("cod_iuc","cod_titular").show()

+-------+------------------+------------------+
|summary|           cod_iuc|       cod_titular|
+-------+------------------+------------------+
|  count|                32|                32|
|   mean|        30005.0625|            4.4375|
| stddev|3.1819805153393363|2.0150642349482912|
|    min|             30000|                 1|
|    max|             30011|                 7|
+-------+------------------+------------------+



In [14]:
# summary() computes only the statistics requested by name
# (count and stddev here) instead of the fixed set that describe() returns.
resulted_df.summary("count", "stddev").show()

+-------+------------------+------------------+-----------------+------------------+
|summary|           cod_iuc|       cod_titular|     cod_producto|              diff|
+-------+------------------+------------------+-----------------+------------------+
|  count|                32|                32|               32|                32|
| stddev|3.1819805153393363|2.0150642349482912|310.1402843355599|2.8239685527811345|
+-------+------------------+------------------+-----------------+------------------+



### Functions

In [15]:
# Column names as a plain Python list, in DataFrame order.
resulted_df.columns

['cod_iuc',
 'cod_titular',
 'cod_producto',
 'fec_alta_ini',
 'activo',
 'actual_date',
 'fec_alta_fin',
 'diff']

In [16]:
# Print the schema of the transformed DataFrame as a tree.
resulted_df.printSchema()

root
 |-- cod_iuc: integer (nullable = true)
 |-- cod_titular: integer (nullable = true)
 |-- cod_producto: integer (nullable = true)
 |-- fec_alta_ini: date (nullable = true)
 |-- activo: boolean (nullable = true)
 |-- actual_date: date (nullable = false)
 |-- fec_alta_fin: date (nullable = true)
 |-- diff: integer (nullable = true)



In [17]:
# The same schema as a StructType object (a list of StructField entries).
resulted_df.schema

StructType([StructField('cod_iuc', IntegerType(), True), StructField('cod_titular', IntegerType(), True), StructField('cod_producto', IntegerType(), True), StructField('fec_alta_ini', DateType(), True), StructField('activo', BooleanType(), True), StructField('actual_date', DateType(), False), StructField('fec_alta_fin', DateType(), True), StructField('diff', IntegerType(), True)])

In [18]:
# Column names paired with their type names, as (name, type) tuples.
resulted_df.dtypes

[('cod_iuc', 'int'),
 ('cod_titular', 'int'),
 ('cod_producto', 'int'),
 ('fec_alta_ini', 'date'),
 ('activo', 'boolean'),
 ('actual_date', 'date'),
 ('fec_alta_fin', 'date'),
 ('diff', 'int')]

In [19]:
# Apply print to every Row; DataFrame.foreach is shorthand for df.rdd.foreach.
# NOTE(review): the function runs in Spark's Python worker processes, which is
# presumably why no rows appear in the notebook output - confirm.
resulted_df.foreach(print)

In [20]:
# Print the logical plans (parsed, analyzed, optimized) as well as the
# physical plan; without extended=True only the physical plan is shown.
resulted_df.explain(extended=True)

== Parsed Logical Plan ==
Project [cod_iuc#17, cod_titular#18, cod_producto#19, fec_alta_ini#54, activo#21, actual_date#60, fec_alta_fin#75, diff#84]
+- Project [cod_iuc#17, cod_titular#18, cod_producto#19, fec_alta_ini#54, activo#21, actual_date#60, randm_num#67, fec_alta_fin#75, datediff(fec_alta_fin#75, fec_alta_ini#54) AS diff#84]
   +- Project [cod_iuc#17, cod_titular#18, cod_producto#19, fec_alta_ini#54, activo#21, actual_date#60, randm_num#67, date_add(fec_alta_ini#54, randm_num#67) AS fec_alta_fin#75]
      +- Project [cod_iuc#17, cod_titular#18, cod_producto#19, fec_alta_ini#54, activo#21, actual_date#60, cast(round((rand(0) * cast(10 as double)), 0) as int) AS randm_num#67]
         +- Project [cod_iuc#17, cod_titular#18, cod_producto#19, fec_alta_ini#54, activo#21, current_date(Some(GMT-06:00)) AS actual_date#60]
            +- Project [cod_iuc#17, cod_titular#18, cod_producto#19, fec_alta#20 AS fec_alta_ini#54, activo#21]
               +- Relation [cod_iuc#17,cod_titular#1