<a href="https://colab.research.google.com/github/iGhostlp/Albus/blob/Hermione/Proyecto_BBVA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Armado del entorno

In [None]:
# Download Spark
!wget -q https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz

In [None]:
# Unzip the file
!tar xf spark-3.3.2-bin-hadoop3.tgz

In [None]:
!readlink -f $(which java) | sed "s:bin/java::"

/usr/lib/jvm/java-11-openjdk-amd64/


In [None]:
# Export the JDK and Spark locations for the current process
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64/",
    "SPARK_HOME": '/content/spark-3.3.2-bin-hadoop3',
})

In [None]:
# Install library for finding Spark
!pip install -q findspark

# Import the libary
import findspark

# Initiate findspark
findspark.init()

In [None]:
# Import SparkSession
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session; the Parquet datetime rebase mode for
# reads is set to 'CORRECTED'
spark = (
    SparkSession.builder
    .master("local[*]")
    .config('spark.sql.parquet.datetimeRebaseModeInRead', 'CORRECTED')
    .getOrCreate()
)

# Display the session information
spark

## Extracción de datos desde parquet, clientes y teléfonos.

In [38]:
# Load the customer and phone datasets from their Parquet files
df_customer = spark.read.format('parquet').load('/content/Datasets/customer_basics.snappy.parquet')
df_phones = spark.read.format('parquet').load('/content/Datasets/phones.snappy.parquet')

In [39]:
# Inner join of phones with customers on customer_id
df_customer_phones = df_phones.join(df_customer, on='customer_id', how='inner')

### Filtrado phones

In [None]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat,col, row_number, desc, collect_list
from pyspark.sql.window import Window

In [None]:
# Phone-table columns discarded before building the contact summary.
# ('dlvy_day_friday_type' is listed twice, exactly as in the original call.)
phone_cols_to_drop = [
    'phone_intern_id',
    'phone_country_id',
    'aditional_info_txt_desc',
    'primary_phone_type',
    'address_sequence_type',
    'address_town_name',
    'zipcode_id',
    'province_id',
    'sender_application_id',
    'normalization_status_type',
    'normalization_reason_name',
    'validity_start_date',
    'validity_end_date',
    'dlvy_day_monday_type',
    'dlvy_day_tuesday_type',
    'dlvy_day_wednesday_type',
    'dlvy_day_thursday_type',
    'dlvy_day_friday_type',
    'dlvy_day_friday_type',
    'dlvy_day_saturday_type',
    'delivery_contact_start_hm_date',
    'delivery_contact_end_hm_date',
    'operational_load_date',
    'normalization_date',
]
df_phones_cut = df_phones.drop(*phone_cols_to_drop)

In [None]:
# Order by customer, newest change first, then preview the result
df_phones_sorted = df_phones_cut.orderBy(
    col('customer_id').asc(),
    col('last_change_date').desc(),
)
df_phones_sorted.show()

In [None]:
# Concatenate the four phone parts into a single Full_Phone column and keep
# only the customer key and the change date next to it.
# NOTE(review): concat yields null if any part is null - confirm these columns
# are always populated.
phone_parts = ['prefix_phone_id', 'phone_area_id', 'phone_exchange_id', 'phone_line_id']
df_phones_sorted = df_phones_sorted.select(
    concat(*phone_parts).alias('Full_Phone'),
    'customer_id',
    'last_change_date',
)

In [None]:
# Preview the frame after building Full_Phone
df_phones_sorted.show()

In [None]:
# Per-customer window ordered by last_change_date, newest first
window = Window.partitionBy('customer_id').orderBy(col('last_change_date').desc())

In [None]:
# Number each customer's phones by recency (row_num 1 = latest change)
phone_rank = row_number().over(window)
df_phone = df_phones_sorted.withColumn('row_num', phone_rank)

In [None]:
# Keep at most the three most recent phones per customer
df_phone = df_phone.where(col('row_num') <= 3)

In [None]:
# Preview the ranked and filtered phones
df_phone.show()

In [None]:
# Collapse each customer's ranked phones into one list, ordered by row_num.
# collect_list on its own gives no ordering guarantee after the groupBy
# shuffle, so each phone is collected together with its rank, the array is
# sorted by that rank, and only the phone field is kept afterwards.
# The when(...) wrapper preserves the previous null handling: a row whose
# Full_Phone is null produces a null struct, which collect_list skips.
ranked_phone = F.when(col('Full_Phone').isNotNull(), F.struct('row_num', 'Full_Phone'))
df_pivot_phone = (
    df_phone.groupBy('customer_id')
    .agg(F.sort_array(collect_list(ranked_phone)).alias('ranked_phones'))
    .select('customer_id', col('ranked_phones.Full_Phone').alias('last_3_changes_list'))
)

In [None]:
# Spread the collected list into three fixed columns, one per list position
phone_list = col('last_3_changes_list')
df_pivot_phone = df_pivot_phone.select(
    'customer_id',
    phone_list[0].alias('Phone_1'),
    phone_list[1].alias('Phone_2'),
    phone_list[2].alias('Phone_3'),
)

In [None]:
# Preview the three-column phone pivot
df_pivot_phone.show()

In [37]:
# Replace missing string values with the '---' placeholder and preview
df_pivot_phone = df_pivot_phone.fillna('---')
df_pivot_phone.show()

+-----------+------------+------------+-------+
|customer_id|     Phone_1|     Phone_2|Phone_3|
+-----------+------------+------------+-------+
|   00000007|542664697946|         ---|    ---|
|   00000381|543815909885|         ---|    ---|
|   00041884|541161577947|         ---|    ---|
|   00048225|541165187983|         ---|    ---|
|   00052103|541169951912|         ---|    ---|
|   00064339|542966425661|         ---|    ---|
|   00071569|543584112643|         ---|    ---|
|   00077558|541151522021|         ---|    ---|
|   00084439|542974779188|542214218843|    ---|
|   00091603|542901645082|         ---|    ---|
|   00099043|541130913482|         ---|    ---|
|   00101705|541155257944|         ---|    ---|
|   00107748|541168784699|         ---|    ---|
|   00111074|541147445559|         ---|    ---|
|   00114702|543704907067|         ---|    ---|
|   00122414|542994276786|         ---|    ---|
|   00125032|541140549658|         ---|    ---|
|   00126967|541135737613|         ---| 

## Extracción de datos desde parquet, clientes y emails.

In [102]:
# Load the email dataset from its Parquet file
df_emails = spark.read.format('parquet').load('/content/Datasets/email.snappy.parquet')

In [103]:
# Right join: every row of df_customer is kept, with email columns attached
# where a matching customer_id exists
df_customer_emails = df_emails.join(
    df_customer,
    on='customer_id',
    how="right",
)

### Filtrado emails

In [126]:
# Email-table columns discarded before building the contact summary
email_cols_to_drop = [
    'role_type',
    'email_type',
    'address_sequence_id',
    'residence_type',
    'primary_email_type',
    'email_domain_type',
    'encripted_type',
    'field_length_number',
    'comments_desc',
    'customer_email_status_type',
    'email_app_type',
    'register_user_id',
    'last_change_user_id',
    'last_change_hms_date',
    'last_change_terminal_id',
    'operational_load_date',
    'customer_email_status_date',
    'registry_entry_date',
]
df_emails_cut = df_emails.drop(*email_cols_to_drop)

In [127]:
# Order by customer, newest change first
df_emails_sorted = df_emails_cut.orderBy(
    col('customer_id').asc(),
    col('last_change_date').desc(),
)

# NOTE(review): toPandas() collects the whole frame to the driver, and the
# saved output contains customer e-mail addresses - consider clearing it
# before sharing.
df_emails_sorted.toPandas()

Unnamed: 0,customer_id,email_desc,last_change_date
0,00000000,ALVAROJOSE.DAGAND@BBVA.COM,2019-12-18
1,00000000,LIBRERIAURUGUAY@GMAIL.COM,2015-10-27
2,00027555,ALEFRID666@YAHOO.COM.AR,2012-07-08
3,00027568,SCATIVA@GMAIL.COM,2016-03-15
4,00027595,NO@GMAIL.COM,2018-08-07
...,...,...,...
495,27514366,NICOLAS.SZEBUN@HOTMAIL.COM,2021-01-25
496,27514374,TOMASBORBAS@HOTMAIL.COM,2021-01-25
497,27514379,FSANFRANCISCO@GMAIL.COM,2021-01-26
498,27514380,JUANCA.BIANCU@GMAIL.COM,2021-01-25


In [106]:
# Per-customer window ordered by last_change_date, newest first
# (rebinds the `window` name also used in the phones section)
window = Window.partitionBy('customer_id').orderBy(col('last_change_date').desc())

In [107]:
# Number each customer's emails by recency (row_num 1 = latest change)
email_rank = row_number().over(window)
df_email = df_emails_sorted.withColumn('row_num', email_rank)

In [108]:
# Keep at most the three most recent emails per customer
df_email = df_email.where(col('row_num') <= 3)

In [129]:
# Display the ranked and filtered emails as a pandas DataFrame
# NOTE(review): the saved output below contains customer e-mail addresses
df_email.toPandas()

Unnamed: 0,customer_id,email_desc,last_change_date,row_num
0,00000000,ALVAROJOSE.DAGAND@BBVA.COM,2019-12-18,1
1,00000000,LIBRERIAURUGUAY@GMAIL.COM,2015-10-27,2
2,00027555,ALEFRID666@YAHOO.COM.AR,2012-07-08,1
3,00027568,SCATIVA@GMAIL.COM,2016-03-15,1
4,00027595,NO@GMAIL.COM,2018-08-07,1
...,...,...,...,...
495,27514366,NICOLAS.SZEBUN@HOTMAIL.COM,2021-01-25,1
496,27514374,TOMASBORBAS@HOTMAIL.COM,2021-01-25,1
497,27514379,FSANFRANCISCO@GMAIL.COM,2021-01-26,1
498,27514380,JUANCA.BIANCU@GMAIL.COM,2021-01-25,1


In [110]:
# Collapse each customer's ranked emails into one list, ordered by row_num.
# collect_list on its own gives no ordering guarantee after the groupBy
# shuffle, so each email is collected together with its rank, the array is
# sorted by that rank, and only the email field is kept afterwards.
# The when(...) wrapper preserves the previous null handling: a row whose
# email_desc is null produces a null struct, which collect_list skips.
ranked_email = F.when(col('email_desc').isNotNull(), F.struct('row_num', 'email_desc'))
df_pivot_email = (
    df_email.groupBy('customer_id')
    .agg(F.sort_array(collect_list(ranked_email)).alias('ranked_emails'))
    .select('customer_id', col('ranked_emails.email_desc').alias('last_3_changes_list'))
)

In [111]:
# Spread the collected list into three fixed columns, one per list position
email_list = col('last_3_changes_list')
df_pivot_email = df_pivot_email.select(
    'customer_id',
    email_list[0].alias('Email_1'),
    email_list[1].alias('Email_2'),
    email_list[2].alias('Email_3'),
)

In [130]:
# Display the three-column email pivot as a pandas DataFrame
# NOTE(review): the saved output below contains customer e-mail addresses
df_pivot_email.toPandas()

Unnamed: 0,customer_id,Email_1,Email_2,Email_3
0,00000000,ALVAROJOSE.DAGAND@BBVA.COM,LIBRERIAURUGUAY@GMAIL.COM,---
1,00027555,ALEFRID666@YAHOO.COM.AR,---,---
2,00027568,SCATIVA@GMAIL.COM,---,---
3,00027595,NO@GMAIL.COM,---,---
4,00027609,lindocampo@yahoo.com,---,---
...,...,...,...,...
424,27514366,NICOLAS.SZEBUN@HOTMAIL.COM,---,---
425,27514374,TOMASBORBAS@HOTMAIL.COM,---,---
426,27514379,FSANFRANCISCO@GMAIL.COM,---,---
427,27514380,JUANCA.BIANCU@GMAIL.COM,---,---


In [131]:
# Replace missing string values with the '---' placeholder and display
df_pivot_email = df_pivot_email.fillna('---')
df_pivot_email.toPandas()

Unnamed: 0,customer_id,Email_1,Email_2,Email_3
0,00000000,ALVAROJOSE.DAGAND@BBVA.COM,LIBRERIAURUGUAY@GMAIL.COM,---
1,00027555,ALEFRID666@YAHOO.COM.AR,---,---
2,00027568,SCATIVA@GMAIL.COM,---,---
3,00027595,NO@GMAIL.COM,---,---
4,00027609,lindocampo@yahoo.com,---,---
...,...,...,...,...
424,27514366,NICOLAS.SZEBUN@HOTMAIL.COM,---,---
425,27514374,TOMASBORBAS@HOTMAIL.COM,---,---
426,27514379,FSANFRANCISCO@GMAIL.COM,---,---
427,27514380,JUANCA.BIANCU@GMAIL.COM,---,---


## Extracción de datos desde .parquet, clientes y direcciones.

In [74]:
# Load the address dataset from its Parquet file
df_address = spark.read.format('parquet').load('/content/Datasets/address.snappy.parquet')

In [75]:
# Inner join of addresses with customers on customer_id
df_customer_address = df_address.join(df_customer, on='customer_id', how='inner')

### Filtrado Address

In [93]:
# Address-table columns discarded before building the contact summary
address_cols_to_drop = [
    'address_priority_number',
    'address_sequence_id',
    'residence_type',
    'address_without_number_type',
    'province_id',
    'address_country_id',
    'other_information_desc',
    'address_relationship_type',
    'address_start_date',
    'address_verified_date',
    'customer_locator_verified_type',
    'address_status_mod_date',
    'contact_channel_type',
    'sender_application_id',
    'returned_mail_type',
    'normalization_status_type',
    'normalization_reason_name',
    'normalization_date',
    'normalized_level_match_number',
    'dlvy_day_monday_type',
    'dlvy_day_tuesday_type',
    'dlvy_day_wednesday_type',
    'dlvy_day_thursday_type',
    'dlvy_day_friday_type',
    'dlvy_day_saturday_type',
    'delivery_contact_start_hm_date',
    'delivery_contact_end_hm_date',
    'prev_address_sequence_id',
    'registry_entry_date',
    'register_user_id',
    'last_change_user_id',
    'last_change_hms_date',
    'last_change_terminal_id',
    'registration_type',
    'operational_load_date',
]
df_address_cut = df_address.drop(*address_cols_to_drop)

In [123]:
# Display the trimmed address frame as a pandas DataFrame
# NOTE(review): the saved output below already shows a row_num column, which
# this frame only has if df_address was rebound by a later cell before this
# one ran - re-check with Restart & Run All. The output also contains
# customer street addresses.
df_address_cut.toPandas()

Unnamed: 0,customer_id,street_name,address_outdoor_id,address_indoor_id,indoor_number,address_district_name,address_town_name,address_department_name,zipcode_id,long_zipcode_id,last_change_date,row_num
0,00000003,AV PRES BARTOLOME MITRE,0001500,,,,CRUCESITA,AVELLANEDA,01870,B1873AMN,2011-09-07,1
1,00000009,VENEZUELA 538,23,1,CAPITAL,MONSERRAT,,112233114455,11111,11111111,2017-04-18,1
2,00000009,CNEL AREVALO,0002391,,,,CIUDAD AUTONOMA BUENOS AIRES,CAPITAL FEDERAL,01425,C1425FZE,2014-07-22,2
3,00000011,AV FELIX DE OLAZABAL,0005392,1,C,,CIUDAD AUTONOMA BUENOS AIRES,CAPITAL FEDERAL,01431,C1431CGW,2007-11-02,1
4,00000027,AVENIDA DE MAYO,0001465,2,AA,CABA,CIUDAD AUTONOMA BUENOS AIRES,CAPITAL FEDERAL,01098,C1098ABC,2010-01-25,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2478,27282128,SAAVEDRA,0002548,,,,VICTORIA,SAN FERNANDO,01644,B1644GHB,2011-07-01,2
2479,27408820,ELOY FUENTES,0000041,,,CENTRO,RIO TERCERO,TERCERO ARRIBA,05850,X5850EQA,2021-12-14,1
2480,27564067,ZEBALLOS,0000900,,,,AVELLANEDA,AVELLANEDA,01870,,2014-02-17,1
2481,27568614,AVDA 25,0002500,,,SAN LORENZO AEROPARQUE,SAN LORENZO,LA PLATA,01900,B1912BLN,2020-11-25,1


In [122]:
# Order by customer, newest change first, then display
df_address_sorted = df_address_cut.orderBy(
    col('customer_id').asc(),
    col('last_change_date').desc(),
)
df_address_sorted.toPandas()

Unnamed: 0,customer_id,street_name,address_outdoor_id,address_indoor_id,indoor_number,address_district_name,address_town_name,address_department_name,zipcode_id,long_zipcode_id,last_change_date,row_num
0,00000003,AV PRES BARTOLOME MITRE,0001500,,,,CRUCESITA,AVELLANEDA,01870,B1873AMN,2011-09-07,1
1,00000009,VENEZUELA 538,23,1,CAPITAL,MONSERRAT,,112233114455,11111,11111111,2017-04-18,1
2,00000009,CNEL AREVALO,0002391,,,,CIUDAD AUTONOMA BUENOS AIRES,CAPITAL FEDERAL,01425,C1425FZE,2014-07-22,2
3,00000011,AV FELIX DE OLAZABAL,0005392,1,C,,CIUDAD AUTONOMA BUENOS AIRES,CAPITAL FEDERAL,01431,C1431CGW,2007-11-02,1
4,00000027,AVENIDA DE MAYO,0001465,2,AA,CABA,CIUDAD AUTONOMA BUENOS AIRES,CAPITAL FEDERAL,01098,C1098ABC,2010-01-25,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2478,27282128,SAAVEDRA,0002548,,,,VICTORIA,SAN FERNANDO,01644,B1644GHB,2011-07-01,2
2479,27408820,ELOY FUENTES,0000041,,,CENTRO,RIO TERCERO,TERCERO ARRIBA,05850,X5850EQA,2021-12-14,1
2480,27564067,ZEBALLOS,0000900,,,,AVELLANEDA,AVELLANEDA,01870,,2014-02-17,1
2481,27568614,AVDA 25,0002500,,,SAN LORENZO AEROPARQUE,SAN LORENZO,LA PLATA,01900,B1912BLN,2020-11-25,1


In [96]:
# Concatenate street, outdoor id, indoor id and indoor number into a single
# Full_Address column and keep the remaining location fields.
# NOTE(review): concat yields null if any part is null - confirm these columns
# are always populated.
address_parts = ['street_name', 'address_outdoor_id', 'address_indoor_id', 'indoor_number']
df_address_sorted = df_address_sorted.select(
    'customer_id',
    concat(*address_parts).alias('Full_Address'),
    'address_district_name',
    'address_town_name',
    'address_department_name',
    'zipcode_id',
    'long_zipcode_id',
    'last_change_date',
)

In [121]:
# Display the frame after building Full_Address
# NOTE(review): the saved output below contains customer street addresses
df_address_sorted.toPandas()

Unnamed: 0,customer_id,Full_Address,address_district_name,address_town_name,address_department_name,zipcode_id,long_zipcode_id,last_change_date
0,00000003,AV PRES BARTOLOME MITRE 0001500,,CRUCESITA,AVELLANEDA,01870,B1873AMN,2011-09-07
1,00000009,VENEZUELA 538 23 1 CAPITAL,MONSERRAT,,112233114455,11111,11111111,2017-04-18
2,00000009,CNEL AREVALO 0002391,,CIUDAD AUTONOMA BUENOS AIRES,CAPITAL FEDERAL,01425,C1425FZE,2014-07-22
3,00000011,AV FELIX DE OLAZABAL 00053921 C,,CIUDAD AUTONOMA BUENOS AIRES,CAPITAL FEDERAL,01431,C1431CGW,2007-11-02
4,00000027,AVENIDA DE MAYO 00014652 AA,CABA,CIUDAD AUTONOMA BUENOS AIRES,CAPITAL FEDERAL,01098,C1098ABC,2010-01-25
...,...,...,...,...,...,...,...,...
2478,27282128,SAAVEDRA 0002548,,VICTORIA,SAN FERNANDO,01644,B1644GHB,2011-07-01
2479,27408820,ELOY FUENTES 0000041,CENTRO,RIO TERCERO,TERCERO ARRIBA,05850,X5850EQA,2021-12-14
2480,27564067,ZEBALLOS 0000900,,AVELLANEDA,AVELLANEDA,01870,,2014-02-17
2481,27568614,AVDA 25 0002500,SAN LORENZO AEROPARQUE,SAN LORENZO,LA PLATA,01900,B1912BLN,2020-11-25


In [98]:
# Per-customer window ordered by last_change_date, newest first
# (rebinds the `window` name also used in the phones and emails sections)
window = Window.partitionBy('customer_id').orderBy(col('last_change_date').desc())

In [99]:
# Number each customer's addresses by recency (row_num 1 = latest change).
# NOTE(review): this rebinds df_address, which earlier held the raw Parquet
# table, so re-running the earlier address cells afterwards starts from this
# derived frame instead.
address_rank = row_number().over(window)
df_address = df_address_sorted.withColumn('row_num', address_rank)

In [120]:
# Keep at most the three most recent addresses per customer and display them
df_address = df_address.where(col('row_num') <= 3)
df_address.toPandas()

Unnamed: 0,customer_id,Full_Address,address_district_name,address_town_name,address_department_name,zipcode_id,long_zipcode_id,last_change_date,row_num
0,00000003,AV PRES BARTOLOME MITRE 0001500,,CRUCESITA,AVELLANEDA,01870,B1873AMN,2011-09-07,1
1,00000009,VENEZUELA 538 23 1 CAPITAL,MONSERRAT,,112233114455,11111,11111111,2017-04-18,1
2,00000009,CNEL AREVALO 0002391,,CIUDAD AUTONOMA BUENOS AIRES,CAPITAL FEDERAL,01425,C1425FZE,2014-07-22,2
3,00000011,AV FELIX DE OLAZABAL 00053921 C,,CIUDAD AUTONOMA BUENOS AIRES,CAPITAL FEDERAL,01431,C1431CGW,2007-11-02,1
4,00000027,AVENIDA DE MAYO 00014652 AA,CABA,CIUDAD AUTONOMA BUENOS AIRES,CAPITAL FEDERAL,01098,C1098ABC,2010-01-25,1
...,...,...,...,...,...,...,...,...,...
2478,27282128,SAAVEDRA 0002548,,VICTORIA,SAN FERNANDO,01644,B1644GHB,2011-07-01,2
2479,27408820,ELOY FUENTES 0000041,CENTRO,RIO TERCERO,TERCERO ARRIBA,05850,X5850EQA,2021-12-14,1
2480,27564067,ZEBALLOS 0000900,,AVELLANEDA,AVELLANEDA,01870,,2014-02-17,1
2481,27568614,AVDA 25 0002500,SAN LORENZO AEROPARQUE,SAN LORENZO,LA PLATA,01900,B1912BLN,2020-11-25,1


In [114]:
# Collapse each customer's ranked addresses into one list, ordered by row_num.
# collect_list on its own gives no ordering guarantee after the groupBy
# shuffle, so each address is collected together with its rank, the array is
# sorted by that rank, and only the address field is kept afterwards.
# The when(...) wrapper preserves the previous null handling: a row whose
# Full_Address is null produces a null struct, which collect_list skips.
ranked_address = F.when(col('Full_Address').isNotNull(), F.struct('row_num', 'Full_Address'))
df_pivot_address = (
    df_address.groupBy('customer_id')
    .agg(F.sort_array(collect_list(ranked_address)).alias('ranked_addresses'))
    .select('customer_id', col('ranked_addresses.Full_Address').alias('last_3_changes_list'))
)

In [115]:
# Spread the collected list into three fixed columns, one per list position
address_list = col('last_3_changes_list')
df_pivot_address = df_pivot_address.select(
    'customer_id',
    address_list[0].alias('Address_1'),
    address_list[1].alias('Address_2'),
    address_list[2].alias('Address_3'),
)

In [119]:
# Replace missing string values with the '---' placeholder and display
df_pivot_address = df_pivot_address.fillna('---')
df_pivot_address.toPandas()

Unnamed: 0,customer_id,Address_1,Address_2,Address_3
0,00000003,AV PRES BARTOLOME MITRE 0001500,---,---
1,00000009,VENEZUELA 538 23 1 CAPITAL,CNEL AREVALO 0002391,---
2,00000011,AV FELIX DE OLAZABAL 00053921 C,---,---
3,00000027,AVENIDA DE MAYO 00014652 AA,PJE JACHAL 0003547,HEROES DE MALVI 000194400
4,00000028,NECOCHEA 000465335 1,---,---
...,...,...,...,...
2395,27282128,SAAVEDRA 0002548,SAAVEDRA 0002548,---
2396,27408820,ELOY FUENTES 0000041,---,---
2397,27564067,ZEBALLOS 0000900,---,---
2398,27568614,AVDA 25 0002500,---,---
