<a href="https://colab.research.google.com/github/iGhostlp/Albus/blob/Sin-Dientes/Copia_de_Proyecto_BBVA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Armado del entorno

In [1]:
# Download Spark
!wget -q https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz

In [2]:
# Unzip the file
!tar xf spark-3.3.2-bin-hadoop3.tgz

In [3]:
!readlink -f $(which java) | sed "s:bin/java::"

/usr/lib/jvm/java-11-openjdk-amd64/


In [4]:
# Set up the environment for Spark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64/"
os.environ["SPARK_HOME"] = '/content/spark-3.3.2-bin-hadoop3'

In [5]:
# Install library for finding Spark
!pip install -q findspark

# Import the libary
import findspark

# Initiate findspark
findspark.init()

In [6]:
# Import SparkSession
from pyspark.sql import SparkSession

# Create a Spark Session
spark = SparkSession.builder.master("local[*]").config('spark.sql.parquet.datetimeRebaseModeInRead','CORRECTED').getOrCreate()

# Check Spark Session Information
spark

## Extraccion de datos desde parquet, clientes y teléfonos.

In [7]:
df_customer = spark.read.parquet('/content/Datasets/customer_basics.snappy.parquet')
df_phones = spark.read.parquet('/content/Datasets/phones.snappy.parquet')

In [8]:
from pyspark.sql.functions import col
from pyspark.sql import SparkSession

In [9]:
#df_customer_phones = df_phones.join(df_customer, 'customer_id')
df_customer_phones = df_phones.join(df_customer.select(col("customer_id"), col("last_change_date").alias("l_c_d_customer")), "customer_id")


In [10]:
df_customer_phones.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- phone_use_type: string (nullable = true)
 |-- address_sequence_id: integer (nullable = true)
 |-- phone_sequence_id: integer (nullable = true)
 |-- phone_type: string (nullable = true)
 |-- phone_country_id: string (nullable = true)
 |-- prefix_phone_id: string (nullable = true)
 |-- phone_area_id: string (nullable = true)
 |-- cellphone_prefix_id: string (nullable = true)
 |-- phone_exchange_id: string (nullable = true)
 |-- phone_line_id: string (nullable = true)
 |-- phone_id: string (nullable = true)
 |-- phone_intern_id: string (nullable = true)
 |-- aditional_info_txt_desc: string (nullable = true)
 |-- primary_phone_type: string (nullable = true)
 |-- address_sequence_type: string (nullable = true)
 |-- address_town_name: string (nullable = true)
 |-- zipcode_id: string (nullable = true)
 |-- province_id: string (nullable = true)
 |-- customer_phone_status_type: string (nullable = true)
 |-- phone_status_mod_date: date (nullab

### Filtrado phones

In [11]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat,col, row_number, desc, collect_list
from pyspark.sql.window import Window

In [55]:
df_phones_cut = df_customer_phones.drop('contact_channel_type','wrong_phone_type','registry_entry_date','register_user_id','last_change_user_id','l_c_d_customer','last_change_hms_date','last_change_terminal_id','phone_intern_id','phone_country_id', 'aditional_info_txt_desc', 'primary_phone_type','address_sequence_type','address_town_name','zipcode_id', 'province_id','sender_application_id','normalization_status_type','normalization_reason_name','validity_start_date','validity_end_date','dlvy_day_monday_type','dlvy_day_tuesday_type','dlvy_day_wednesday_type','dlvy_day_thursday_type','dlvy_day_friday_type','dlvy_day_friday_type','dlvy_day_saturday_type','delivery_contact_start_hm_date','delivery_contact_end_hm_date','operational_load_date','normalization_date')

In [56]:
df_phones_sorted = df_phones_cut.orderBy([df_phones_cut.customer_id, desc('last_change_date')])
df_phones_sorted.show()

+-----------+--------------+-------------------+-----------------+----------+---------------+-------------+-------------------+-----------------+-------------+-------------+--------------------------+---------------------+----------------+
|customer_id|phone_use_type|address_sequence_id|phone_sequence_id|phone_type|prefix_phone_id|phone_area_id|cellphone_prefix_id|phone_exchange_id|phone_line_id|     phone_id|customer_phone_status_type|phone_status_mod_date|last_change_date|
+-----------+--------------+-------------------+-----------------+----------+---------------+-------------+-------------------+-----------------+-------------+-------------+--------------------------+---------------------+----------------+
|   00000442|            00|                  0|                1|         3|             54|          297|                 15|              472|         9337| 384498667593|                 DECLARADO|           2022-07-20|      2022-07-20|
|   00000442|            02|            

In [14]:
#df_phones_sorted = df_phones_sorted.select(concat(df_phones_sorted.prefix_phone_id,df_phones_sorted.phone_area_id,df_phones_sorted.phone_exchange_id,df_phones_sorted.phone_line_id).alias('Full_Phone'),'customer_id','last_change_date')

In [57]:
from pyspark.sql.functions import concat_ws

df_phones_sorted = df_phones_sorted.select(concat_ws('-', df_phones_sorted.prefix_phone_id,df_phones_sorted.phone_area_id,df_phones_sorted.phone_exchange_id,df_phones_sorted.phone_line_id).alias('Full_Phone'),'customer_id','last_change_date')


In [58]:
df_phones_sorted.show()

+----------------+-----------+----------------+
|      Full_Phone|customer_id|last_change_date|
+----------------+-----------+----------------+
| 54-297-472-9337|   00000442|      2022-07-20|
| 54-297-529-6284|   00000442|      2022-07-20|
| 54-11-6679-3207|   00001419|      2022-11-01|
| 54-11-3870-0150|   00001939|      2022-09-21|
| 54-299-447-7116|   00002707|      2022-08-17|
| 54-342-466-2478|   00002790|      2022-11-18|
| 54-342-488-3620|   00002790|      2022-10-08|
|54-3541-5988-799|   00004287|      2022-08-29|
| 54-11-2408-4447|   00004724|      2022-11-04|
| 54-385-406-5887|   00007932|      2022-09-29|
| 54-3489-49-3578|   00011850|      2022-10-27|
| 54-11-2578-1080|   00011850|      2022-10-24|
| 54-3401-53-4381|   00012051|      2022-10-06|
| 54-11-4344-3165|   00012611|      2022-08-30|
| 54-11-6114-1855|   00012611|      2022-08-30|
| 54-221-620-8511|   00013498|      2022-08-22|
| 54-11-6505-0605|   00014664|      2022-10-06|
| 54-11-5105-2460|   00015497|      2022

In [59]:
window = Window.partitionBy(df_phones_sorted.customer_id).orderBy(desc(df_phones_sorted.last_change_date))

In [60]:
df_phone = df_phones_sorted.withColumn('row_num', row_number().over(window))

In [61]:
df_phone = df_phone.filter(df_phone.row_num <= 3)

In [62]:
df_phone.show()

+----------------+-----------+----------------+-------+
|      Full_Phone|customer_id|last_change_date|row_num|
+----------------+-----------+----------------+-------+
| 54-297-472-9337|   00000442|      2022-07-20|      1|
| 54-297-529-6284|   00000442|      2022-07-20|      2|
| 54-11-6679-3207|   00001419|      2022-11-01|      1|
| 54-11-3870-0150|   00001939|      2022-09-21|      1|
| 54-299-447-7116|   00002707|      2022-08-17|      1|
| 54-342-466-2478|   00002790|      2022-11-18|      1|
| 54-342-488-3620|   00002790|      2022-10-08|      2|
|54-3541-5988-799|   00004287|      2022-08-29|      1|
| 54-11-2408-4447|   00004724|      2022-11-04|      1|
| 54-385-406-5887|   00007932|      2022-09-29|      1|
| 54-3489-49-3578|   00011850|      2022-10-27|      1|
| 54-11-2578-1080|   00011850|      2022-10-24|      2|
| 54-3401-53-4381|   00012051|      2022-10-06|      1|
| 54-11-6114-1855|   00012611|      2022-08-30|      1|
| 54-11-4344-3165|   00012611|      2022-08-30| 

In [63]:
df_pivot_phone = df_phone.groupBy('customer_id').agg(collect_list('Full_Phone').alias('last_3_changes_list'))

In [64]:
df_pivot_phone = df_pivot_phone.selectExpr('customer_id', 'last_3_changes_list[0] as Phone_1', 'last_3_changes_list[1] as Phone_2', 'last_3_changes_list[2] as Phone_3')

In [65]:
df_pivot_phone.show()

+-----------+----------------+---------------+-------+
|customer_id|         Phone_1|        Phone_2|Phone_3|
+-----------+----------------+---------------+-------+
|   00000442| 54-297-472-9337|54-297-529-6284|   null|
|   00001419| 54-11-6679-3207|           null|   null|
|   00001939| 54-11-3870-0150|           null|   null|
|   00002707| 54-299-447-7116|           null|   null|
|   00002790| 54-342-466-2478|54-342-488-3620|   null|
|   00004287|54-3541-5988-799|           null|   null|
|   00004724| 54-11-2408-4447|           null|   null|
|   00007932| 54-385-406-5887|           null|   null|
|   00011850| 54-3489-49-3578|54-11-2578-1080|   null|
|   00012051| 54-3401-53-4381|           null|   null|
|   00012611| 54-11-6114-1855|54-11-4344-3165|   null|
|   00013498| 54-221-620-8511|           null|   null|
|   00014664| 54-11-6505-0605|           null|   null|
|   00015497| 54-11-5105-2460|           null|   null|
|   00051576| 54-11-2653-1080|           null|   null|
|   000539

#Tabla filtrada de phones

In [66]:
df_pivot_phone = df_pivot_phone.na.fill('---')
df_pivot_phone.show()

+-----------+----------------+---------------+-------+
|customer_id|         Phone_1|        Phone_2|Phone_3|
+-----------+----------------+---------------+-------+
|   00000442| 54-297-472-9337|54-297-529-6284|    ---|
|   00001419| 54-11-6679-3207|            ---|    ---|
|   00001939| 54-11-3870-0150|            ---|    ---|
|   00002707| 54-299-447-7116|            ---|    ---|
|   00002790| 54-342-466-2478|54-342-488-3620|    ---|
|   00004287|54-3541-5988-799|            ---|    ---|
|   00004724| 54-11-2408-4447|            ---|    ---|
|   00007932| 54-385-406-5887|            ---|    ---|
|   00011850| 54-3489-49-3578|54-11-2578-1080|    ---|
|   00012051| 54-3401-53-4381|            ---|    ---|
|   00012611| 54-11-6114-1855|54-11-4344-3165|    ---|
|   00013498| 54-221-620-8511|            ---|    ---|
|   00014664| 54-11-6505-0605|            ---|    ---|
|   00015497| 54-11-5105-2460|            ---|    ---|
|   00051576| 54-11-2653-1080|            ---|    ---|
|   000539

## Extraccion de datos desde parquet, clientes y emails.

In [24]:
df_emails = spark.read.parquet('/content/Datasets/emails.snappy.parquet')

In [38]:
sorted(df_emails.columns)

['address_sequence_id',
 'comments_desc',
 'customer_email_status_date',
 'customer_email_status_type',
 'customer_id',
 'email_app_type',
 'email_desc',
 'email_domain_type',
 'email_type',
 'encripted_type',
 'field_length_number',
 'last_change_date',
 'last_change_hms_date',
 'last_change_terminal_id',
 'last_change_user_id',
 'operational_load_date',
 'primary_email_type',
 'register_user_id',
 'registry_entry_date',
 'residence_type',
 'role_type']

In [40]:
#df_customer_emails = df_emails.join(df_customer, 'customer_id', how="right")
df_customer_emails = df_customer.join(df_emails.select(col("customer_id"),col("email_desc"), col("last_change_date").alias("last_change_date_e")), "customer_id")

In [41]:
sorted(df_customer_emails.columns)

['admission_date',
 'balance_closing_date',
 'bbva_cust_number',
 'bbva_family_lnk_weighted_number',
 'bbva_inhabilitation_type',
 'birth_date',
 'branch_id',
 'business_area_id',
 'campaing_id',
 'coholder_customer_id',
 'country_nationality_id',
 'credit_manager_id',
 'credit_recovery_manager_id',
 'cust_bcra_econ_activity_type',
 'customer_bcra_situation_id',
 'customer_block_bbva_date',
 'customer_block_bcra_date',
 'customer_condition_type',
 'customer_cuit_id',
 'customer_end_date',
 'customer_entry_type',
 'customer_first_block_date',
 'customer_id',
 'customer_level_access_type',
 'customer_linked_type',
 'customer_pending_notice_type',
 'customer_pep_type',
 'customer_position_vat_type',
 'customer_registration_date',
 'customer_seniority_number',
 'customer_situation_type',
 'customer_status_type',
 'customer_subject_type',
 'customer_type',
 'customer_unblock_bbva_date',
 'customer_unblock_bcra_date',
 'email_desc',
 'first_name',
 'gender_type',
 'internal_economic_activity

### Filtrado emails

In [42]:
#df_emails_cut = df_customer_emails.drop('role_type', 'email_type','address_sequence_id','residence_type','primary_email_type','email_domain_type','encripted_type','field_length_number','comments_desc','customer_email_status_type','email_app_type','register_user_id','last_change_user_id','last_change_hms_date','last_change_terminal_id','operational_load_date','customer_email_status_date','registry_entry_date')
df_emails_cut = df_customer_emails.select('customer_id','email_desc','last_change_date_e')

In [43]:
df_emails_sorted = df_emails_cut.orderBy([df_emails_cut.customer_id, desc('last_change_date_e')])

df_emails_sorted.toPandas()

Unnamed: 0,customer_id,email_desc,last_change_date_e
0,00001419,JU_LY1@HOTMAIL.COM,2019-05-11
1,00001419,EMILIA.RUBIANES@HOTMAIL.COM,2015-10-08
2,00002790,NOTIENE@HOIMAIL.COM,2019-07-29
3,00002790,DIGITALIZACION@EECC.COM,1900-01-01
4,00014664,alq@ciudad.com.ar,2009-06-27
...,...,...,...
369,28990339,ELSAMO@GMAIL.COM,2019-11-08
370,28993945,navyig@fibertel.com.ar,2009-06-27
371,29003190,ANLAU_08@LIVE.COM.AR,2020-06-01
372,29008648,ROBERTOWINY@GMAIL.COM,2013-01-23


In [44]:
window = Window.partitionBy(df_emails_sorted.customer_id).orderBy(desc(df_emails_sorted.last_change_date_e))

In [45]:
df_email = df_emails_sorted.withColumn('row_num', row_number().over(window))

In [46]:
df_email = df_email.filter(df_email.row_num <= 3)

In [47]:
df_email.toPandas()

Unnamed: 0,customer_id,email_desc,last_change_date_e,row_num
0,00001419,JU_LY1@HOTMAIL.COM,2019-05-11,1
1,00001419,EMILIA.RUBIANES@HOTMAIL.COM,2015-10-08,2
2,00002790,NOTIENE@HOIMAIL.COM,2019-07-29,1
3,00002790,DIGITALIZACION@EECC.COM,1900-01-01,2
4,00014664,alq@ciudad.com.ar,2009-06-27,1
...,...,...,...,...
369,28990339,ELSAMO@GMAIL.COM,2019-11-08,1
370,28993945,navyig@fibertel.com.ar,2009-06-27,1
371,29003190,ANLAU_08@LIVE.COM.AR,2020-06-01,1
372,29008648,ROBERTOWINY@GMAIL.COM,2013-01-23,1


In [48]:
df_pivot_email = df_email.groupBy('customer_id').agg(collect_list('email_desc').alias('last_3_changes_list'))

In [49]:
df_pivot_email = df_pivot_email.selectExpr('customer_id', 'last_3_changes_list[0] as Email_1', 'last_3_changes_list[1] as Email_2', 'last_3_changes_list[2] as Email_3')

In [50]:
df_pivot_email.toPandas()

Unnamed: 0,customer_id,Email_1,Email_2,Email_3
0,00001419,JU_LY1@HOTMAIL.COM,EMILIA.RUBIANES@HOTMAIL.COM,
1,00002790,NOTIENE@HOIMAIL.COM,DIGITALIZACION@EECC.COM,
2,00014664,alq@ciudad.com.ar,,
3,00056407,NELLY.S.GEREZ@GMAIL.COM,,
4,00058801,SOPLETESHLANDA@HOTMAIL.COM,SOPLETESHLANDA@HOTMAIL.COM,
...,...,...,...,...
317,28990339,ELSAMO@GMAIL.COM,,
318,28993945,navyig@fibertel.com.ar,,
319,29003190,ANLAU_08@LIVE.COM.AR,,
320,29008648,ROBERTOWINY@GMAIL.COM,,


In [51]:
df_pivot_email = df_pivot_email.na.fill('---')
df_pivot_email.toPandas()

Unnamed: 0,customer_id,Email_1,Email_2,Email_3
0,00001419,JU_LY1@HOTMAIL.COM,EMILIA.RUBIANES@HOTMAIL.COM,---
1,00002790,NOTIENE@HOIMAIL.COM,DIGITALIZACION@EECC.COM,---
2,00014664,alq@ciudad.com.ar,---,---
3,00056407,NELLY.S.GEREZ@GMAIL.COM,---,---
4,00058801,SOPLETESHLANDA@HOTMAIL.COM,SOPLETESHLANDA@HOTMAIL.COM,---
...,...,...,...,...
317,28990339,ELSAMO@GMAIL.COM,---,---
318,28993945,navyig@fibertel.com.ar,---,---
319,29003190,ANLAU_08@LIVE.COM.AR,---,---
320,29008648,ROBERTOWINY@GMAIL.COM,---,---


## Extraccion de datos desde .parquet, clientes y direcciones.

In [53]:
df_address = spark.read.parquet('/content/Datasets/address.snappy.parquet')

In [67]:
sorted(df_address.columns)

['address_country_id',
 'address_department_name',
 'address_district_name',
 'address_indoor_id',
 'address_outdoor_id',
 'address_priority_number',
 'address_relationship_type',
 'address_sequence_id',
 'address_start_date',
 'address_status_mod_date',
 'address_town_name',
 'address_verified_date',
 'address_without_number_type',
 'contact_channel_type',
 'customer_id',
 'customer_locator_verified_type',
 'delivery_contact_end_hm_date',
 'delivery_contact_start_hm_date',
 'dlvy_day_friday_type',
 'dlvy_day_monday_type',
 'dlvy_day_saturday_type',
 'dlvy_day_thursday_type',
 'dlvy_day_tuesday_type',
 'dlvy_day_wednesday_type',
 'indoor_number',
 'last_change_date',
 'last_change_hms_date',
 'last_change_terminal_id',
 'last_change_user_id',
 'long_zipcode_id',
 'normalization_date',
 'normalization_reason_name',
 'normalization_status_type',
 'normalized_level_match_number',
 'operational_load_date',
 'other_information_desc',
 'prev_address_sequence_id',
 'province_id',
 'register_u

In [70]:
df_address.show()

+-----------+-----------------------+-------------------+--------------+--------------------+------------------+---------------------------+-----------------+-------------+---------------------+--------------------+-----------------------+----------+---------------+-----------+------------------+----------------------+-------------------------+------------------+---------------------+------------------------------+-----------------------+--------------------+---------------------+------------------+-------------------------+-------------------------+------------------+-----------------------------+--------------------+---------------------+-----------------------+----------------------+--------------------+----------------------+------------------------------+----------------------------+------------------------+-------------------+----------------+----------------+-------------------+--------------------+-----------------------+-----------------+---------------------+
|customer_id|add

In [71]:
df_customer.show()

+-----------+-------------+-------------+--------------+---------------+--------------------+---------+----------------+-----------+-----------------------+-----------------------+----------+-------------------+-----------------+--------------------------+------------+----------------+-------------------+-------------------+-----------+----------+----------------------+--------------------+---------------------+-----------+--------------------------+----------------------------+-------------------------------+------------------------+------------------------+--------------------------+------------------------+--------------------------+----------------------------+------------------+------------------------+-------------------+-------------------+---------------------------+-----------------+-------------------+-----------------+-------------------------------+--------------------+---------------------+------------------+----------------+--------------------+--------------------------+

In [None]:
#df_customer_address = df_address.join(df_customer, 'customer_id')
#df_customer_address = df_address.join(df_customer.select(col("customer_id"), col("last_change_date").alias("l_c_d_customer")), "customer_id")

### Filtrado Address

In [None]:
#antes de hacer el join, voy a hacer un nuevo dataframe, con los campos que se solicitan en el Ticket 32: --- Calle, Número, Piso, Depto, Localidad, Provincia, Código postal ---

In [79]:
df_address_cut = df_address.select(col("customer_id"),col("street_name"),col("address_outdoor_id"),col("address_indoor_id"),col("indoor_number"),col("address_department_name"),col("province_id"),col("zipcode_id"),col("last_change_date").alias("last_change_date_a"))

In [80]:
df_address_cut.toPandas()

Unnamed: 0,customer_id,street_name,address_outdoor_id,address_indoor_id,indoor_number,address_department_name,province_id,zipcode_id,last_change_date_a
0,00000660,PASCUALA DEL JUNCAL,0000850,,,112233114455,02,01646,2011-07-01
1,07121078,SAN NICOLAS,0002478,,,FLORENCIO VARELA,02,01888,2016-09-29
2,22374047,J DE LA CRUZ CONTRERAS,0000408,,,FLORENCIO VARELA,02,01888,2011-07-01
3,00002450,BACACAY,0001466,,,ITUZAINGO,02,01714,2009-01-26
4,00003925,MANUELA PEDRAZA,0001715,5,B,CAPITAL FEDERAL,01,01429,2009-01-26
...,...,...,...,...,...,...,...,...,...
2495,00005200,PARAGUAY,0003091,,,CAPITAL FEDERAL,01,01425,2009-01-26
2496,08341355,AV CORRIENTES,0004923,2,F,CAPITAL FEDERAL,01,01414,2009-01-26
2497,06987700,ESTADOS UNIDOS,0002772,2,B,CAPITAL FEDERAL,01,01227,2009-01-26
2498,00015314,AV ALVAREZ THOMAS,0000195,13,A,CAPITAL FEDERAL,01,01427,2009-01-26


In [81]:
df_address_sorted = df_address_cut.orderBy([df_address_cut.customer_id, desc('last_change_date_a')])


In [82]:
#df_address_sorted = df_address_sorted.select('customer_id', concat(df_address_sorted.street_name,df_address_sorted.address_outdoor_id,df_address_sorted.address_indoor_id,df_address_sorted.indoor_number).alias('Full_Address'),'address_district_name','address_town_name','address_department_name','zipcode_id','long_zipcode_id','last_change_date')
df_address_sorted = df_address_sorted.select('customer_id', concat_ws('-', df_address_sorted.street_name,df_address_sorted.address_outdoor_id,df_address_sorted.address_indoor_id,df_address_sorted.indoor_number,df_address_sorted.address_department_name,df_address_sorted.province_id,df_address_sorted.zipcode_id).alias('Full_Address'),'last_change_date_a')

In [83]:
df_address_sorted.toPandas()

Unnamed: 0,customer_id,Full_Address,last_change_date_a
0,00000003,AV PRES BARTOLOME MITRE -0001500- - ...,2011-09-07
1,00000050,DR A ALSINA -0002849- - ...,2017-04-18
2,00000050,DR A ALSINA -0002849- - ...,2016-10-12
3,00000173,CALLE 150 -0003726- - ...,2016-10-12
4,00000173,CALLE 150 -0003726- - ...,2016-10-12
...,...,...,...
2495,29015563,IBANEZ TENIENTE 1 RO -0001355- - ...,2009-01-26
2496,29015902,GARIBALDI -0001554- - ...,2009-01-26
2497,29017100,DOMINGO MATHEU -0000972- - ...,2009-01-26
2498,29017191,VIRREY ARREDONDO -0002641-3 -B ...,2009-01-26


In [84]:
window = Window.partitionBy(df_address_sorted.customer_id).orderBy(desc(df_address_sorted.last_change_date_a))

In [85]:
df_address = df_address_sorted.withColumn('row_num', row_number().over(window))

In [86]:
df_address = df_address.filter(df_address.row_num <= 3)
df_address.toPandas()

Unnamed: 0,customer_id,Full_Address,last_change_date_a,row_num
0,00000003,AV PRES BARTOLOME MITRE -0001500- - ...,2011-09-07,1
1,00000050,DR A ALSINA -0002849- - ...,2017-04-18,1
2,00000050,DR A ALSINA -0002849- - ...,2016-10-12,2
3,00000173,CALLE 150 -0003726- - ...,2016-10-12,1
4,00000173,CALLE 150 -0003726- - ...,2016-10-12,2
...,...,...,...,...
2478,29015563,IBANEZ TENIENTE 1 RO -0001355- - ...,2009-01-26,1
2479,29015902,GARIBALDI -0001554- - ...,2009-01-26,1
2480,29017100,DOMINGO MATHEU -0000972- - ...,2009-01-26,1
2481,29017191,VIRREY ARREDONDO -0002641-3 -B ...,2009-01-26,1


In [88]:
df_pivot_address = df_address.groupBy('customer_id').agg(collect_list('Full_Address').alias('last_3_changes_list'))

In [90]:
df_pivot_address = df_pivot_address.selectExpr('customer_id', 'last_3_changes_list[0] as Address_1', 'last_3_changes_list[1] as Address_2', 'last_3_changes_list[2] as Address_3')


In [91]:
df_pivot_address = df_pivot_address.na.fill('---')
df_pivot_address.toPandas()

Unnamed: 0,customer_id,Address_1,Address_2,Address_3
0,00000003,AV PRES BARTOLOME MITRE -0001500- - ...,---,---
1,00000050,DR A ALSINA -0002849- - ...,DR A ALSINA -0002849- - ...,---
2,00000173,CALLE 150 -0003726- - ...,CALLE 150 -0003726- - ...,CALLE 150 -0003726- - ...
3,00000188,CALLE 156 -0004344- - ...,---,---
4,00000204,GRAL PINTO -0002441- - ...,---,---
...,...,...,...,...
2395,29015563,IBANEZ TENIENTE 1 RO -0001355- - ...,---,---
2396,29015902,GARIBALDI -0001554- - ...,---,---
2397,29017100,DOMINGO MATHEU -0000972- - ...,---,---
2398,29017191,VIRREY ARREDONDO -0002641-3 -B ...,---,---


#Join de las 3 tablas de contactos

In [93]:
df_contactos = df_pivot_phone.join(df_pivot_email, "customer_id") \
                   .join(df_pivot_address, "customer_id")

In [94]:
sorted(df_contactos.columns)

['Address_1',
 'Address_2',
 'Address_3',
 'Email_1',
 'Email_2',
 'Email_3',
 'Phone_1',
 'Phone_2',
 'Phone_3',
 'customer_id']

In [96]:
df_contactos.show()

+-----------+---------------+---------------+-------+--------------------+--------------------+-------+--------------------+---------+---------+
|customer_id|        Phone_1|        Phone_2|Phone_3|             Email_1|             Email_2|Email_3|           Address_1|Address_2|Address_3|
+-----------+---------------+---------------+-------+--------------------+--------------------+-------+--------------------+---------+---------+
|   00001419|54-11-6679-3207|            ---|    ---|  JU_LY1@HOTMAIL.COM|EMILIA.RUBIANES@H...|    ---|ALTE G BROWN     ...|      ---|      ---|
|   00002790|54-342-466-2478|54-342-488-3620|    ---| NOTIENE@HOIMAIL.COM|DIGITALIZACION@EE...|    ---|AV CORDOBA       ...|      ---|      ---|
|   00014664|54-11-6505-0605|            ---|    ---|   alq@ciudad.com.ar|                 ---|    ---|AV TRIUNVIRATO   ...|      ---|      ---|
|   00056407|54-336-457-7255|            ---|    ---|NELLY.S.GEREZ@GMA...|                 ---|    ---|AV DR J BAUTISTA ...|      