<a href="https://colab.research.google.com/github/iGhostlp/Albus/blob/severus/Proyecto_BBVA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Armado del entorno

In [90]:
# Download Spark
!wget -q https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz

In [91]:
# Unzip the file
!tar xf spark-3.3.2-bin-hadoop3.tgz

In [92]:
!readlink -f $(which java) | sed "s:bin/java::"

/usr/lib/jvm/java-11-openjdk-amd64/


In [93]:
# Set up the environment for Spark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64/"
os.environ["SPARK_HOME"] = '/content/spark-3.3.2-bin-hadoop3'

In [94]:
# Install library for finding Spark
!pip install -q findspark

# Import the libary
import findspark

# Initiate findspark
findspark.init()

In [95]:
# Import SparkSession
from pyspark.sql import SparkSession

# Create a Spark Session
spark = SparkSession.builder.master("local[*]").config('spark.sql.parquet.datetimeRebaseModeInRead','CORRECTED').getOrCreate()

# Check Spark Session Information
spark

# Importado de funciones

In [96]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, concat_ws, col, row_number, desc, collect_list, to_json, struct, year, current_date, datediff, floor, when, lit
from pyspark.sql.window import Window

In [97]:
#Creación de un SQL context
from pyspark.sql import SQLContext
sqlContext = SQLContext(spark)



# Carga de parquets

In [98]:
df_customer = spark.read.parquet('/content/Datasets/customer_basics.snappy.parquet')
df_phones = spark.read.parquet('/content/Datasets/phones.snappy.parquet')
df_address = spark.read.parquet('/content/Datasets/address.snappy.parquet')
df_emails = spark.read.parquet('/content/Datasets/emails.snappy.parquet')
df_marital_status = spark.read.parquet('/content/Datasets/marital_status_type.parquet')
df_segment_type = spark.read.parquet('/content/Datasets/segment_type.parquet')
df_customer_documents = spark.read.parquet('/content/Datasets/customer_documents.parquet')
df_address_type = spark.read.parquet('/content/Datasets/address_type.parquet')
df_gender = spark.read.parquet('/content/Datasets/gender.parquet')
df_nationality = spark.read.parquet('/content/Datasets/nationality.parquet')
df_personal_type = spark.read.parquet('/content/Datasets/personal_type.parquet')
df_phone_type = spark.read.parquet('/content/Datasets/phone_type.parquet')
df_province = spark.read.parquet('/content/Datasets/province.parquet')
df_customer_segment = spark.read.parquet('/content/Datasets/customer_segment.parquet')
df_customer_info_temp = spark.read.parquet('/content/Datasets/customer_info_temp.parquet')

### Normalización de datos

In [99]:
df_customer = df_customer.withColumnRenamed('last_change_date', 'l_c_d_customer')

In [100]:
df_customer = df_customer.withColumn("job_type_desc", 
                                             when(df_customer.job_type == 1, 'REL.DEPENDENCIA/FIJO')
                                             .when(df_customer.job_type == 2, 'TEMPORAL')
                                             .when(df_customer.job_type == 3, 'AUTONOMO/INDEPENDIENTE')
                                             .when(df_customer.job_type == 3, 'OTROS')
                                             .otherwise("-"))

In [101]:
df_marital_status_ok = df_marital_status.withColumn("marital_status_type", 
                                             when(df_marital_status.martial_status_short_desc == "NO INFORMA", 0)
                                             .when(df_marital_status.martial_status_short_desc == "CASADO/A  ", 1)
                                             .when(df_marital_status.martial_status_short_desc == "VIUDO/A   ", 2)
                                             .when(df_marital_status.martial_status_short_desc == "SEPARADO/A", 3)
                                             .when(df_marital_status.martial_status_short_desc == "DIVORCIADO", 4)
                                             .when(df_marital_status.martial_status_short_desc == "SOLTERO/A ", 5)
                                             .when(df_marital_status.martial_status_short_desc == "CONVIVIENT", 6)
                                             .when(df_marital_status.martial_status_short_desc == "OTROS     ", 7)
                                             .when(df_marital_status.martial_status_short_desc == "CONV. INSC", 8)
                                             .otherwise("-"))

# Extraccion de datos



### - 24 - Extraccion de datos desde parquet, clientes y teléfonos.
####Crear un DataFrame que contenga el JOIN de la tabla t_abtq_customer_basics y tabla t_abtq_customer_phones.

In [102]:
df_customer_phones = df_phones.join(df_customer, 'customer_id')

In [103]:
df_customer_phones.show()

+-----------+--------------+-------------------+-----------------+----------+----------------+---------------+-------------+-------------------+-----------------+-------------+------------+---------------+-----------------------+------------------+---------------------+-----------------+----------+-----------+--------------------------+---------------------+--------------------+---------------------+----------------+-------------------------+-------------------------+------------------+-------------------+-----------------+--------------------+---------------------+-----------------------+----------------------+--------------------+----------------------+------------------------------+----------------------------+-------------------+----------------+----------------+-------------------+--------------------+-----------------------+---------------------+-------------+-------------+--------------+---------------+--------------------+---------+----------------+-----------+-----------------

### - 25 - Extraccion de datos desde .parquet, clientes y direcciones
####Crear un DataFrame que contenga el JOIN de la tabla t_abtq_customer_basics y tabla t_abtq_customer_adress.

In [104]:
df_customer_address = df_address.join(df_customer, 'customer_id')

In [105]:
df_customer_address.show()

+-----------+-----------------------+-------------------+--------------+-----------+------------------+---------------------------+-----------------+-------------+---------------------+-----------------+-----------------------+----------+---------------+-----------+------------------+----------------------+-------------------------+------------------+---------------------+------------------------------+-----------------------+--------------------+---------------------+------------------+-------------------------+-------------------------+------------------+-----------------------------+--------------------+---------------------+-----------------------+----------------------+--------------------+----------------------+------------------------------+----------------------------+------------------------+-------------------+----------------+----------------+-------------------+--------------------+-----------------------+-----------------+---------------------+-------------+-----------+---

### - 26 - Extraccion de datos desde .parquet, clientes y correos electrónicos
#### Crear un DataFrame que contenga el JOIN de la tabla t_abtq_customer_basics y tabla t_abtq_customer_email.   

In [106]:
df_customer_emails = df_emails.join(df_customer, 'customer_id')

In [107]:
df_customer_emails.show()

+-----------+---------+-------------------+--------------+----------+------------------+----------+-----------------+--------------+-------------------+-------------+--------------------------+--------------+----------------+-------------------+--------------------+-----------------------+---------------------+--------------------------+-------------------+----------------+-------------+-----------+--------------+---------------+--------------------+---------+----------------+-----------+-----------------------+-----------------------+----------+-------------------+-----------------+--------------------------+---------+----------------+----------+-------------------+-----------+----------+----------------------+--------------------+---------------------+-----------+--------------------------+----------------------------+-------------------------------+------------------------+------------------------+--------------------------+------------------------+--------------------------+-------

# Filtrar datos, reducir volumen

####- 27 - Filtrar el DataFrame de contactos telefónicos de clientes y resguardar los 3 contactos más actuales por cliente.

In [108]:
df_customer_phones_sorted = df_customer_phones.orderBy([df_customer_phones.customer_id, desc('last_change_date')])

In [109]:
window = Window.partitionBy(df_customer_phones_sorted.customer_id).orderBy(desc(df_customer_phones_sorted.last_change_date))

In [110]:
df_phone_contact = df_customer_phones_sorted.withColumn('row_num', row_number().over(window))
df_phone_contact = df_phone_contact.filter(df_phone_contact.row_num <= 3)

In [111]:
df_phone_contact = df_phone_contact.withColumn("full_phone", concat_ws("-", 'prefix_phone_id', 'phone_area_id', 'phone_exchange_id', 'phone_line_id'))

In [112]:
df_pivot_pc = df_phone_contact.groupBy('customer_id').agg(collect_list('full_phone').alias('last_3_changes_list'))

In [113]:
df_pivot_phone = df_pivot_pc.selectExpr('customer_id', 'last_3_changes_list[0] as phone_1', 'last_3_changes_list[1] as phone_2', 'last_3_changes_list[2] as phone_3')

In [114]:
df_pivot_phone = df_pivot_phone.na.fill('---')
df_pivot_phone.show()

+-----------+---------------+---------------+-------+
|customer_id|        phone_1|        phone_2|phone_3|
+-----------+---------------+---------------+-------+
|   06987700|54-11-4451-4104|            ---|    ---|
|   19801715|54-11-4544-8655|            ---|    ---|
|   21162204|54-11-6286-0946|            ---|    ---|
|   23417903|54-11-4788-3665|            ---|    ---|
|   24401530|54-11-5656-3942|            ---|    ---|
|   24935788|54-3447-64-4382|            ---|    ---|
|   25633010|54-11-5667-3375|54-11-4342-8605|    ---|
|   26183611|54-11-4341-0084|54-11-4341-0084|    ---|
|   26293280|54-2323-67-7206|            ---|    ---|
|   26488506|54-2964-54-9154|            ---|    ---|
|   26923214|54-11-6705-6375|            ---|    ---|
|   27008407|54-2657-22-6503|54-2657-31-3231|    ---|
|   28445138|54-11-6161-5326|            ---|    ---|
|   28633515|54-351-716-8741|            ---|    ---|
|   28660498|54-223-451-2038|            ---|    ---|
+-----------+---------------

####- 28 - Filtrar el DataFrame de direcciones de clientes y resguardar los 3 contactos más actuales por cliente.

In [115]:
df_customer_address_sorted = df_customer_address.orderBy([df_customer_address.customer_id, desc('last_change_date')])

In [116]:
window = Window.partitionBy(df_customer_address_sorted.customer_id).orderBy(desc(df_customer_address_sorted.last_change_date))

In [117]:
df_address_contact = df_customer_address_sorted.withColumn('row_num', row_number().over(window))
df_address_contact = df_address_contact.filter(df_address_contact.row_num <= 3)

In [118]:
df_address_contact = df_address_contact.withColumn("full_address", concat('street_name', 'address_outdoor_id', 'address_indoor_id', 'indoor_number', 'address_department_name', 'province_id', 'zipcode_id'))

In [119]:
df_pivot_ad = df_address_contact.groupBy('customer_id').agg(collect_list('full_address').alias('last_3_changes_list'))

In [120]:
df_pivot_address = df_pivot_ad.selectExpr('customer_id', 'last_3_changes_list[0] as address_1', 'last_3_changes_list[1] as address_2', 'last_3_changes_list[2] as address_3')

In [121]:
df_pivot_address = df_pivot_address.na.fill('---')
df_pivot_address.toPandas()

Unnamed: 0,customer_id,address_1,address_2,address_3


####- 29 - Filtrar el DataFrame de correos electrónicos de clientes  y resguardar los 3 contactos más actuales por cliente. 

In [122]:
df_customer_emails_sorted = df_customer_emails.orderBy([df_customer_emails.customer_id, desc('last_change_date')])

In [123]:
window = Window.partitionBy(df_customer_emails_sorted.customer_id).orderBy(desc(df_customer_emails_sorted.last_change_date))

In [124]:
df_email_contact = df_customer_emails_sorted.withColumn('row_num', row_number().over(window))
df_email_contact = df_email_contact.filter(df_email_contact.row_num <= 3)

In [125]:
df_pivot_ec = df_email_contact.groupBy('customer_id').agg(collect_list('email_desc').alias('last_3_changes_list'))

In [126]:
df_pivot_email = df_pivot_ec.selectExpr('customer_id', 'last_3_changes_list[0] as email_1', 'last_3_changes_list[1] as email_2', 'last_3_changes_list[2] as email_3')

In [127]:
df_pivot_email = df_pivot_email.na.fill('---')
df_pivot_email.toPandas()

Unnamed: 0,customer_id,email_1,email_2,email_3


# Enrequecimiento de datos

####- 30 - Agregar una nueva columna a los DataFrame de contactos, indicando el contact_type según corresponda (address, email, phone)

In [128]:
df_phones_contact_col = df_phones.withColumn('contact_type_phones', lit('phone'))
df_phones_contact_col.select('customer_id','contact_type_phones').show()

+-----------+-------------------+
|customer_id|contact_type_phones|
+-----------+-------------------+
|   29377168|              phone|
|   29385404|              phone|
|   29385404|              phone|
|   29385454|              phone|
|   27626595|              phone|
|   24349744|              phone|
|   29389994|              phone|
|   29389999|              phone|
|   01192665|              phone|
|   29359207|              phone|
|   29354182|              phone|
|   22538341|              phone|
|   29387551|              phone|
|   29387551|              phone|
|   29353051|              phone|
|   29132899|              phone|
|   29390010|              phone|
|   27617687|              phone|
|   27617687|              phone|
|   29387672|              phone|
+-----------+-------------------+
only showing top 20 rows



In [129]:
df_emails_contact_col = df_emails.withColumn('contact_type_emails', lit('e-mail'))
df_emails_contact_col.select('customer_id','contact_type_emails').show()

+-----------+-------------------+
|customer_id|contact_type_emails|
+-----------+-------------------+
|   00000000|             e-mail|
|   00000000|             e-mail|
|   00027555|             e-mail|
|   00027568|             e-mail|
|   00027595|             e-mail|
|   00027609|             e-mail|
|   27514336|             e-mail|
|   00027726|             e-mail|
|   00027766|             e-mail|
|   00027780|             e-mail|
|   00027781|             e-mail|
|   00027867|             e-mail|
|   00027877|             e-mail|
|   00027942|             e-mail|
|   00027990|             e-mail|
|   00028043|             e-mail|
|   00028080|             e-mail|
|   00028091|             e-mail|
|   00028104|             e-mail|
|   00028284|             e-mail|
+-----------+-------------------+
only showing top 20 rows



In [130]:
df_address_contact_col = df_address.withColumn('contact_type_address', lit('address'))
df_address_contact_col.select('customer_id','contact_type_address').show()

+-----------+--------------------+
|customer_id|contact_type_address|
+-----------+--------------------+
|   00000003|             address|
|   00000009|             address|
|   00000009|             address|
|   00000011|             address|
|   00000027|             address|
|   00000027|             address|
|   00000027|             address|
|   00000028|             address|
|   00000035|             address|
|   00000035|             address|
|   00000035|             address|
|   00000035|             address|
|   00000050|             address|
|   00000050|             address|
|   00000052|             address|
|   00000052|             address|
|   00000052|             address|
|   00000052|             address|
|   25567249|             address|
|   01096851|             address|
+-----------+--------------------+
only showing top 20 rows



####- 31 - Agregar una nueva columna al DataFrame de contactos telefónicos de clientes, resguardando el contacto en formato json contenido en string, con los datos: Phone_type (mobile, landline ), Código país., Código de Área, Número teléfono.

In [131]:
df_phone_contact = df_phone_contact.withColumn("phone_contact", to_json(struct(df_phone_contact.phone_type, df_phone_contact.phone_country_id, df_phone_contact.prefix_phone_id, df_phone_contact.phone_area_id, df_phone_contact.cellphone_prefix_id, df_phone_contact.phone_exchange_id, df_phone_contact.phone_line_id)))

In [132]:
df_phone_contact.show(truncate=False)

+-----------+--------------+-------------------+-----------------+----------+----------------+---------------+-------------+-------------------+-----------------+-------------+------------+---------------+-----------------------+------------------+---------------------+-----------------+----------+-----------+--------------------------+---------------------+--------------------+---------------------+----------------+-------------------------+-------------------------+------------------+-------------------+-----------------+--------------------+---------------------+-----------------------+----------------------+--------------------+----------------------+------------------------------+----------------------------+-------------------+----------------+----------------+-------------------+--------------------+-----------------------+-----------------------+-------------+-------------+--------------+---------------+--------------------+---------+----------------+-----------+---------------

####- 32 - Agregar una nueva columna al DataFrame de direcciones de clientes, resguardando el contacto en formato json contenido en string, con los datos: Calle, Número, Piso, Depto, Localidad, Provincia, Código postal

In [133]:
df_address_contact = df_address_contact.withColumn("address_contact",to_json(struct(df_address_contact.full_address)))

In [134]:
df_address_contact.show(truncate=False)

+-----------+-----------------------+-------------------+--------------+-----------+------------------+---------------------------+-----------------+-------------+---------------------+-----------------+-----------------------+----------+---------------+-----------+------------------+----------------------+-------------------------+------------------+---------------------+------------------------------+-----------------------+--------------------+---------------------+------------------+-------------------------+-------------------------+------------------+-----------------------------+--------------------+---------------------+-----------------------+----------------------+--------------------+----------------------+------------------------------+----------------------------+------------------------+-------------------+----------------+----------------+-------------------+--------------------+-----------------------+-----------------+---------------------+-------------+-----------+---

In [135]:
df_address_contact.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- address_priority_number: integer (nullable = true)
 |-- address_sequence_id: integer (nullable = true)
 |-- residence_type: string (nullable = true)
 |-- street_name: string (nullable = true)
 |-- address_outdoor_id: string (nullable = true)
 |-- address_without_number_type: string (nullable = true)
 |-- address_indoor_id: string (nullable = true)
 |-- indoor_number: string (nullable = true)
 |-- address_district_name: string (nullable = true)
 |-- address_town_name: string (nullable = true)
 |-- address_department_name: string (nullable = true)
 |-- zipcode_id: string (nullable = true)
 |-- long_zipcode_id: string (nullable = true)
 |-- province_id: string (nullable = true)
 |-- address_country_id: string (nullable = true)
 |-- other_information_desc: string (nullable = true)
 |-- address_relationship_type: integer (nullable = true)
 |-- address_start_date: date (nullable = true)
 |-- address_verified_date: date (nullable = true)
 |

####- 33 - Combinar los DataFrame de contactos telefónicos de clientes, direcciones de clientes y email de clientes en uno solo.

In [136]:
df_contacts = df_pivot_phone.join(df_pivot_email, "customer_id", how='full').join(df_pivot_address, "customer_id", how='full')

In [137]:
df_contacts.toPandas()

Unnamed: 0,customer_id,phone_1,phone_2,phone_3,email_1,email_2,email_3,address_1,address_2,address_3
0,26293280,54-2323-67-7206,---,---,,,,,,
1,28445138,54-11-6161-5326,---,---,,,,,,
2,27008407,54-2657-22-6503,54-2657-31-3231,---,,,,,,
3,6987700,54-11-4451-4104,---,---,,,,,,
4,26488506,54-2964-54-9154,---,---,,,,,,
5,24401530,54-11-5656-3942,---,---,,,,,,
6,26923214,54-11-6705-6375,---,---,,,,,,
7,24935788,54-3447-64-4382,---,---,,,,,,
8,26183611,54-11-4341-0084,54-11-4341-0084,---,,,,,,
9,19801715,54-11-4544-8655,---,---,,,,,,


In [138]:
df_short_contacts = df_contacts.select('customer_id','Phone_1','Email_1','Address_1')
df_short_contacts.toPandas()

Unnamed: 0,customer_id,Phone_1,Email_1,Address_1
0,26293280,54-2323-67-7206,,
1,28445138,54-11-6161-5326,,
2,27008407,54-2657-22-6503,,
3,6987700,54-11-4451-4104,,
4,26488506,54-2964-54-9154,,
5,24401530,54-11-5656-3942,,
6,26923214,54-11-6705-6375,,
7,24935788,54-3447-64-4382,,
8,26183611,54-11-4341-0084,,
9,19801715,54-11-4544-8655,,


----

In [139]:
df_contact_types = df_phones_contact_col.join(df_address_contact_col, 'customer_id', how='full').join(df_emails_contact_col, 'customer_id', how='full')

In [140]:
df_contact_types.select('customer_id','contact_type_phones','contact_type_address','contact_type_emails').toPandas()

Unnamed: 0,customer_id,contact_type_phones,contact_type_address,contact_type_emails
0,00000000,,,e-mail
1,00000000,,,e-mail
2,00000007,phone,,
3,00000027,,address,
4,00000027,,address,
...,...,...,...,...
432743,30976146,phone,,
432744,30976148,phone,,
432745,30976151,phone,,
432746,30976154,phone,,


# Creacion de vistas temporales

####- 35 - Generar una vista temporal a partir del DataFrame de contactos.

In [141]:
df_short_contacts.createTempView('tw_contacts')

AnalysisException: ignored

####- 36 - Generar una vista temporal a partir del archivo t_abtq_customer_basics.

In [None]:
df_customer.createOrReplaceTempView('tw_customer_basics')

####- 37 - Generar una vista temporal a partir del archivo t_acog_marital_status_type.

In [None]:
df_marital_status_ok.createOrReplaceTempView('tw_marital_status')

####- 38 - Generar una vista temporal a partir del archivo t_acog_nationality.

In [None]:
df_nationality.createOrReplaceTempView("tw_nationality")

# Querys en spark SQL.

####- 39 - Generar un público objetivo (1) que cumpla los siguientes puntos:
* Cliente
* Antigüedad superior a 5 años
* Asalariado Fijo.
* Estado civil: Casado


In [None]:
publico_obj1 = sqlContext.sql('SELECT cb.customer_id, floor(datediff(current_date(), admission_date)/365) as years_diff, customer_condition_type, cb.job_type_desc, ms.martial_status_short_desc FROM tw_customer_basics as cb INNER JOIN tw_marital_status as ms ON cb.marital_status_type = ms.marital_status_type WHERE floor(datediff(current_date(), admission_date)/365) > 5 AND cb.marital_status_type = 01 AND cb.job_type = 001 AND cb.customer_condition_type = 1')

In [None]:
publico_obj1.toPandas()

####- 40 -  Generar un público objetivo (2) que cumpla los siguientes puntos:
* Potencial Cliente
* Sexo Femenino
* Entre 30 y 45 años de Edad.
* Nacionalidad No Argentina

In [None]:
publico_obj2 = sqlContext.sql('SELECT cb.customer_id, customer_condition_type, gender_type, floor(datediff(current_date(), birth_date)/365) as customer_age, n.country_name FROM tw_customer_basics AS cb INNER JOIN tw_nationality AS n ON cb.country_nationality_id == n.country_nationality_id WHERE floor(datediff(current_date(), birth_date)/365) > 30 AND floor(datediff(current_date(), birth_date)/365) < 45 AND gender_type = "F" AND customer_condition_type = 1 AND n.country_nationality_id <> 80')
publico_obj2.toPandas()

####- 41 - Generar un público objetivo (3) que cumpla los siguientes puntos:
* Potencial Cliente
* Sexo Masculino.
* Mayor a 25 años.
* Estado civil Soltero

In [None]:
publico_obj3 = sqlContext.sql('SELECT cb.customer_id, customer_condition_type, gender_type, floor(datediff(current_date(), birth_date)/365) as customer_age, ms.martial_status_short_desc FROM tw_customer_basics AS cb INNER JOIN tw_marital_status as ms ON cb.marital_status_type = ms.marital_status_type WHERE customer_condition_type = 1 AND floor(datediff(current_date(), birth_date)/365) > 25 AND gender_type = "M" AND cb.marital_status_type = 5')
publico_obj3.toPandas()

####- 42 - Generar un público objetivo (4) que cumpla los siguientes puntos:
* Cliente
* Edad superior a 52 años
* Nacionalidad Argentina
* Estado civil Viudo

In [None]:
publico_obj4 = sqlContext.sql('SELECT cb.customer_id, floor(datediff(current_date(), birth_date)/365) as customer_age, n.country_name, ms.martial_status_short_desc FROM tw_customer_basics as cb INNER JOIN tw_nationality AS n ON cb.country_nationality_id == n.country_nationality_id INNER JOIN tw_marital_status as ms ON cb.marital_status_type = ms.marital_status_type WHERE floor(datediff(current_date(), birth_date)/365) > 52 AND cb.country_nationality_id = 80 AND cb.marital_status_type = 2')
publico_obj4.toPandas()

# Carga de datos, generacion de archivos .csv
#### Los archivos deben cumplir los siguientes requerimientos: 
* Contener cabecera. 
* Separador “|”. 
* Máximo de 1000 registros por archivo (si el público objetivo tiene más de 1000 registros, se deberá crear más de un archivo)
* Ser guardados en HDFS.

#### - 43 -  Generar archivos .csv a partir de los público objetivo 1. 


#### - 44 - Generar archivos .csv a partir del público objetivo 2. 

####- 45 - Generar archivos .csv a partir del público objetivo 3. 

####- 46 - Generar archivos .csv a partir del público objetivo 4. 

# Extraccion de datos desde .csv, públicos objetivo

#### - 47 - Cargar los datos de los publicos generados en un DataFrame.

# Calculo de agregaciones.

####- 48 -  Contar al cantidad de personas contactada por publico objetivo.

####- 49 - Contar la cantidad de direcciones, mails y telefonos por publico objetivo.

####- 50 - Contar la cantidad de hombres y mujeres por publico objetivo.

#### - 51 - Calcular la edad promedio por publico objetivo.

# Carga de datos, generacion de archivos .parquet

####- 52 - Guardar las agregaciones generadas en un archivo parquet, particionado por mes de campaña y publico objetivo.

####- 53 - Guardar las agregaciones generadas en un archivo parquet, particionado por mes de campaña y publico objetivo.

####- 54 - Guardar las agregaciones generadas en un archivo parquet, particionado por mes de campaña y publico objetivo.

####- 55 - Guardar las agregaciones generadas en un archivo parquet, particionado por mes de campaña y publico objetivo.

# Borrador

In [None]:
df_customer.show() # admission_date marital_status_type gender_type birth_date job_type profession_id
#df_phones.toPandas()
#df_address.toPandas()
#df_emails.toPandas()
#df_marital_status.toPandas() # marital_status_type martial_status_short_desc
#df_segment_type.toPandas() 
#df_customer_documents.toPandas() 
#df_address_type.toPandas() # address_type address_type_short_desc ??
#df_gender.toPandas() # gender_type gender_desc
#df_nationality.toPandas() # country_nationality_id # country_nationality_id
#df_personal_type.toPandas()
#df_phone_type.toPandas() # phone_type	phone_type_desc	 ??
#df_province.toPandas()
#df_customer_segment.show() 
#df_customer_info_temp.show()

# VER DE NO IMPORTAR PARQUETS QUE NO USEMOS