In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Render floats without decimals in displayed tables (display only; stored values are unchanged).
# NOTE(review): this also rounds fractional columns on screen, e.g. the `prob_compra`
# values shown below appear as 0/1; presumably they are probabilities in [0, 1] - confirm.
pd.options.display.float_format = '{:.0f}'.format
# Show up to 100 columns when displaying wide DataFrames.
pd.set_option('display.max_columns', 100)

import warnings
# Silence every Python warning for the rest of the session; this also hides any
# pandas chained-assignment warnings that later cells may raise.
warnings.filterwarnings('ignore')

  from pandas.core import (


In [2]:
# Load the per-customer purchase probabilities produced by the machine-learning
# model in the notebook Recomendacion_part_1.ipynb (read from a local parquet file).
df_probab_compra = pd.read_parquet('df_proba_compra.parquet')
df_probab_compra.head()

Unnamed: 0_level_0,prob_compra
pk_cid,Unnamed: 1_level_1
15891,0
16063,1
16203,1
16502,1
17457,1


In [3]:
# Load the socio-demographic variables of each customer from the project's S3 bucket
# (fetched over HTTPS, so the cell needs network access).
df_socio = pd.read_parquet("https://easy-money-project-bucket.s3.eu-west-3.amazonaws.com/sociodemographic_df.parquet")
df_socio.head()

Unnamed: 0.1,Unnamed: 0,pk_cid,pk_partition,country_id,region_code,gender,age,deceased,salary
0,0,1375586,2018-01-28,ES,29,H,35,N,87218.0
1,1,1050611,2018-01-28,ES,13,V,23,N,35549.0
2,2,1050612,2018-01-28,ES,13,V,23,N,122179.0
3,3,1050613,2018-01-28,ES,50,H,22,N,119776.0
4,4,1050614,2018-01-28,ES,50,V,23,N,


In [4]:
# Load the product-ownership variables of each customer from the project's S3 bucket
# (fetched over HTTPS, so the cell needs network access).
df_prod = pd.read_parquet("https://easy-money-project-bucket.s3.eu-west-3.amazonaws.com/products_df.parquet")
df_prod.head()

Unnamed: 0.1,Unnamed: 0,pk_cid,pk_partition,short_term_deposit,loans,mortgage,funds,securities,long_term_deposit,em_account_pp,credit_card,payroll,pension_plan,payroll_account,emc_account,debit_card,em_account_p,em_acount
0,0,1375586,2018-01-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,1050611,2018-01-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,2,1050612,2018-01-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,3,1050613,2018-01-28,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,1050614,2018-01-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [5]:
# Merge the two previous datasets into one frame holding every variable.
# Left join on (pk_cid, pk_partition): every product row is kept and the matching
# socio-demographic columns are attached. Both inputs carry an "Unnamed: 0" column,
# which pandas suffixes as "Unnamed: 0_x" / "Unnamed: 0_y" in the result.
df_full = pd.merge(df_prod, df_socio, on=["pk_cid","pk_partition"], how="left")
df_full.head() 

Unnamed: 0,Unnamed: 0_x,pk_cid,pk_partition,short_term_deposit,loans,mortgage,funds,securities,long_term_deposit,em_account_pp,credit_card,payroll,pension_plan,payroll_account,emc_account,debit_card,em_account_p,em_acount,Unnamed: 0_y,country_id,region_code,gender,age,deceased,salary
0,0,1375586,2018-01-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,ES,29,H,35,N,87218.0
1,1,1050611,2018-01-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,ES,13,V,23,N,35549.0
2,2,1050612,2018-01-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,ES,13,V,23,N,122179.0
3,3,1050613,2018-01-28,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,ES,50,H,22,N,119776.0
4,4,1050614,2018-01-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4,ES,50,V,23,N,


Para este análisis utilizaremos la partición más reciente:

- La partición más reciente contiene los datos más actuales sobre el comportamiento, preferencias y transacciones de los clientes, por tanto las recomendaciones serán más relevantes y precisas.

- La probabilidad de que las recomendaciones sean bien recibidas y que los clientes estén receptivos a ellas sería mayor.


In [6]:
# Keep only the most recent partition. The date is derived from the data instead of
# being hard-coded, so the cell keeps selecting the latest snapshot when newer
# partitions are added.
# NOTE(review): assumes pk_partition is a datetime or an ISO-formatted (YYYY-MM-DD)
# string, so that max() is the chronologically latest value - confirm.
last_partition_date = df_full['pk_partition'].max()
df_last_partition = df_full[df_full['pk_partition'] == last_partition_date]
df_last_partition

Unnamed: 0,Unnamed: 0_x,pk_cid,pk_partition,short_term_deposit,loans,mortgage,funds,securities,long_term_deposit,em_account_pp,credit_card,payroll,pension_plan,payroll_account,emc_account,debit_card,em_account_p,em_acount,Unnamed: 0_y,country_id,region_code,gender,age,deceased,salary
5519929,12715896,657826,2019-05-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,12715896,ES,25,H,44,N,54493
5519930,12715899,657817,2019-05-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12715899,ES,8,V,32,N,
5519931,12715982,657986,2019-05-28,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,12715982,ES,41,H,39,N,100993
5519932,12716026,657905,2019-05-28,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,12716026,ES,28,H,85,N,154059
5519933,12716082,657336,2019-05-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,12716082,ES,28,V,38,N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5962919,13647304,1166765,2019-05-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,13647304,ES,50,V,22,N,43912
5962920,13647305,1166764,2019-05-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,13647305,ES,26,V,23,N,23335
5962921,13647306,1166763,2019-05-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,13647306,ES,50,H,47,N,
5962922,13647307,1166789,2019-05-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,13647307,ES,50,H,22,N,199593


Eliminamos todos los clientes que aparecen en deceased como S

In [7]:
# Drop deceased customers (deceased == "S"); 86 rows according to the original author's count.
# Note: `!=` evaluates to True for missing values, so rows with a missing
# `deceased` flag are kept.
df_last_partition = df_last_partition[df_last_partition["deceased"] != "S"]
df_last_partition

Unnamed: 0,Unnamed: 0_x,pk_cid,pk_partition,short_term_deposit,loans,mortgage,funds,securities,long_term_deposit,em_account_pp,credit_card,payroll,pension_plan,payroll_account,emc_account,debit_card,em_account_p,em_acount,Unnamed: 0_y,country_id,region_code,gender,age,deceased,salary
5519929,12715896,657826,2019-05-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,12715896,ES,25,H,44,N,54493
5519930,12715899,657817,2019-05-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12715899,ES,8,V,32,N,
5519931,12715982,657986,2019-05-28,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,12715982,ES,41,H,39,N,100993
5519932,12716026,657905,2019-05-28,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,12716026,ES,28,H,85,N,154059
5519933,12716082,657336,2019-05-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,12716082,ES,28,V,38,N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5962919,13647304,1166765,2019-05-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,13647304,ES,50,V,22,N,43912
5962920,13647305,1166764,2019-05-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,13647305,ES,26,V,23,N,23335
5962921,13647306,1166763,2019-05-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,13647306,ES,50,H,47,N,
5962922,13647307,1166789,2019-05-28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,13647307,ES,50,H,22,N,199593


Ahora nos quedamos con las columnas que nos interesan para el análisis

In [8]:
# Customer id plus the 15 product-ownership flags; the remaining columns are not
# needed for the ranking.
columnas_relev = ['pk_cid', 'short_term_deposit', 'loans', 'mortgage', 'funds', 'securities',
                  'long_term_deposit', "em_account_pp", "credit_card", "payroll", "pension_plan",
                  "payroll_account", "emc_account", "debit_card", "em_account_p", "em_acount"]

# .copy() yields an independent frame: a later cell adds a column to
# df_last_partition, and assigning into a slice of df_full would be a chained
# assignment (ambiguous view/copy semantics in pandas).
df_last_partition = df_last_partition[columnas_relev].copy()
df_last_partition

Unnamed: 0,pk_cid,short_term_deposit,loans,mortgage,funds,securities,long_term_deposit,em_account_pp,credit_card,payroll,pension_plan,payroll_account,emc_account,debit_card,em_account_p,em_acount
5519929,657826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5519930,657817,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5519931,657986,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0
5519932,657905,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
5519933,657336,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5962919,1166765,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5962920,1166764,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5962921,1166763,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5962922,1166789,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


# Enfoque Global

A continuación se crea un ranking de productos para recomendar, basándonos en los que el cliente no tiene y ordenándolos según la probabilidad de compra, aproximada aquí por la popularidad global de cada producto (número de clientes que lo tienen contratado en la última partición). Esta estrategia permite mejorar la eficiencia de la campaña de marketing y aumentar la satisfacción y fidelización de los clientes.


In [9]:
# Product columns are every column except the customer id.
product_cols = df_last_partition.columns.drop('pk_cid')

# Number of customers holding each product (column-wise sum of the ownership flags).
product_totals = df_last_partition[product_cols].sum()

# Turn the totals into a two-column table (Producto, Total) sorted from the most
# held product to the least held one.
product_ranking = (
    product_totals
    .rename_axis('Producto')
    .reset_index(name='Total')
    .sort_values('Total', ascending=False)
)
product_ranking

Unnamed: 0,Producto,Total
14,em_acount,296334
12,debit_card,43247
10,payroll_account,26521
11,emc_account,24733
9,pension_plan,17353
8,payroll,16333
5,long_term_deposit,6119
7,credit_card,4801
4,securities,1786
3,funds,1315


In [10]:
# Rank 1 = most held product. With pandas' default tie method ("average"),
# products with equal totals share the mean of their positions.
product_ranking["ranking"] = product_ranking["Total"].rank(ascending=False)
product_ranking

Unnamed: 0,Producto,Total,ranking
14,em_acount,296334,1
12,debit_card,43247,2
10,payroll_account,26521,3
11,emc_account,24733,4
9,pension_plan,17353,5
8,payroll,16333,6
5,long_term_deposit,6119,7
7,credit_card,4801,8
4,securities,1786,9
3,funds,1315,10


Se crea un DataFrame que contiene, para cada cliente (pk_cid), una lista de recomendaciones de productos basada en el ranking. Esto se realiza con la función get_top_n_recommendations, que identifica los productos mejor posicionados en el ranking que el cliente no tiene y los sugiere como recomendación. A continuación se describe específicamente qué hace la función:

1. Identifica los productos que el cliente no tiene.
2. Se crea un nuevo ranking de esos productos (que el cliente no tiene), basado en el Ranking principal
3. Selecciona los primeros productos de ese ranking y los retorna en una lista de n elementos (por defecto n=3; el parámetro se puede cambiar, por ejemplo con n=1 devuelve un solo producto).

In [11]:

def get_top_n_recommendations(row, n=3, ranking=None, product_columns=None):
    """Return the n best-ranked products that the customer does not own yet.

    Parameters
    ----------
    row : pandas.Series
        One customer row; each product column holds 0 (not owned) or a non-zero flag.
    n : int, default 3
        Maximum number of products to recommend.
    ranking : pandas.Series, optional
        Rank per product (index = product name, 1 = best). When omitted it is built
        from the notebook-level ``product_ranking`` table, as before. Passing a
        precomputed Series avoids rebuilding that lookup for every row and removes
        the dependency on the mutable global.
    product_columns : list-like, optional
        Product columns to inspect in ``row``. Defaults to the notebook-level
        ``product_cols``.

    Returns
    -------
    list of str or None
        Up to ``n`` product names ordered from best to worst rank, or ``None`` when
        the customer already owns every product.
    """
    if product_columns is None:
        product_columns = product_cols
    if ranking is None:
        ranking = product_ranking.set_index('Producto')['ranking']

    # Products whose ownership flag is 0 for this customer.
    client_products = row[product_columns]
    not_owned_products = client_products[client_products == 0].index.tolist()

    # Restrict the ranking to those products only.
    not_owned_ranking = ranking.loc[not_owned_products]
    if not_owned_ranking.empty:
        return None  # The customer owns every product.

    # The smallest rank values are the best-positioned products.
    return not_owned_ranking.nsmallest(n).index.tolist()

# Compute the top-3 recommendations for every customer (row-wise apply).
df_last_partition['recomendaciones'] = df_last_partition.apply(get_top_n_recommendations, axis=1, n=3)

# Final recommendations table. .copy() makes it an independent frame, so the columns
# added to it in later cells are not chained assignments on a slice of
# df_last_partition.
df_recommendations = df_last_partition[['pk_cid', 'recomendaciones']].copy()

# Show the recommendations table.
print(df_recommendations)

          pk_cid                             recomendaciones
5519929   657826  [debit_card, payroll_account, emc_account]
5519930   657817    [em_acount, debit_card, payroll_account]
5519931   657986  [em_acount, long_term_deposit, securities]
5519932   657905  [debit_card, payroll_account, emc_account]
5519933   657336  [debit_card, payroll_account, emc_account]
...          ...                                         ...
5962919  1166765  [debit_card, payroll_account, emc_account]
5962920  1166764  [debit_card, payroll_account, emc_account]
5962921  1166763  [debit_card, payroll_account, emc_account]
5962922  1166789  [debit_card, payroll_account, emc_account]
5962923  1550586  [debit_card, payroll_account, emc_account]

[442909 rows x 2 columns]


Para quedarnos con la primera recomendación

In [12]:
# Keep only the best-ranked recommendation of each list; a None or empty list
# (nothing left to recommend) yields None.
df_recommendations["recomend"] = df_recommendations["recomendaciones"].apply(lambda x: x[0] if x else None)
df_recommendations.head()

Unnamed: 0,pk_cid,recomendaciones,recomend
5519929,657826,"[debit_card, payroll_account, emc_account]",debit_card
5519930,657817,"[em_acount, debit_card, payroll_account]",em_acount
5519931,657986,"[em_acount, long_term_deposit, securities]",em_acount
5519932,657905,"[debit_card, payroll_account, emc_account]",debit_card
5519933,657336,"[debit_card, payroll_account, emc_account]",debit_card


Ahora se agrega el precio de cada producto

In [13]:
# Product families: bank accounts, savings/investment and financing.
# NOTE(review): these lists are not referenced by the price mapping in the next
# cell, which repeats the same grouping by hand - keep both in sync.
cuenta_bancaria_products = ['em_account_pp', 'payroll', 'payroll_account', 'emc_account', 'debit_card', 'em_account_p', 'em_acount']
inversion_products = ['short_term_deposit', 'funds', 'securities', 'long_term_deposit', 'pension_plan']
financiacion_products = ['loans', 'mortgage', 'credit_card']

In [14]:
# Price assigned to each product, grouped by family:
# accounts = 10, savings/investment = 40, financing = 60.
products_precios = {
    # accounts
    'em_account_pp': 10,
    'payroll': 10,
    'payroll_account': 10,
    'emc_account': 10,
    'debit_card': 10,
    'em_account_p': 10,
    'em_acount': 10,
    # savings / investment
    'short_term_deposit': 40,
    'funds': 40,
    'securities': 40,
    'long_term_deposit': 40,
    'pension_plan': 40,
    # financing
    'loans': 60,
    'mortgage': 60,
    'credit_card': 60,
}

# Look up the price of each customer's top recommendation; customers without a
# recommendation get None.
df_recommendations["precio"] = df_recommendations["recomend"].apply(
    lambda producto: None if not producto else products_precios[producto]
)
df_recommendations

Unnamed: 0,pk_cid,recomendaciones,recomend,precio
5519929,657826,"[debit_card, payroll_account, emc_account]",debit_card,10
5519930,657817,"[em_acount, debit_card, payroll_account]",em_acount,10
5519931,657986,"[em_acount, long_term_deposit, securities]",em_acount,10
5519932,657905,"[debit_card, payroll_account, emc_account]",debit_card,10
5519933,657336,"[debit_card, payroll_account, emc_account]",debit_card,10
...,...,...,...,...
5962919,1166765,"[debit_card, payroll_account, emc_account]",debit_card,10
5962920,1166764,"[debit_card, payroll_account, emc_account]",debit_card,10
5962921,1166763,"[debit_card, payroll_account, emc_account]",debit_card,10
5962922,1166789,"[debit_card, payroll_account, emc_account]",debit_card,10


In [15]:
# Number of customers per top recommended product.
df_recommendations.value_counts("recomend")

recomend
debit_card           268860
em_acount            146575
payroll_account       26597
emc_account             549
long_term_deposit       217
pension_plan             96
credit_card               6
payroll                   5
securities                4
Name: count, dtype: int64

In [16]:
# Number of customers per price of their top recommended product.
df_recommendations["precio"].value_counts()

precio
10    442586
40       317
60         6
Name: count, dtype: int64

Análisis de los Resultados:

- Concentración de Recomendaciones: La gran mayoría de las recomendaciones se concentran en los primeros tres productos del ranking global, que a su vez representan al grupo de productos Cuentas.

- Poca Diversidad en Recomendaciones: Los demás productos reciben muy pocas recomendaciones en comparación.

- Posible Falta de Relevancia Personalizada: Este enfoque global no considera las diferencias individuales entre los clientes.

Limitaciones del Enfoque Global:

- Falta de Personalización: Todos los clientes reciben recomendaciones basadas en el mismo ranking, sin considerar sus características individuales. Es probable que las necesidades y preferencias varíen significativamente entre diferentes grupos de clientes.
  
- Recomendaciones Poco Relevantes: Algunos clientes pueden no estar interesados en los productos más recomendados. Productos que podrían ser más relevantes para ciertos clientes están siendo ignorados debido al ranking global.

	
- Oportunidades Perdidas: Al no segmentar, se podría estar perdiendo oportunidades para ofrecer productos que tienen mayor potencial de aceptación en ciertos grupos.



Teniendo en cuenta todo lo anterior, se toma la decisión de hacer un análisis de ranking de productos por grupos, a partir de la segmentación obtenida en la Tarea 2:

- Se adaptan las recomendaciones según las características y necesidades específicas de cada grupo lo que aumenta la probabilidad de aceptación.

- Mayor y mejor diversidad en recomendaciones ya que productos que son menos recomendados en el enfoque global podrían ser más relevantes para ciertos grupos y, por tanto, recomendados con mayor frecuencia.
  
Por lo tanto, implementar un análisis de recomendaciones por grupos de clientes mejorará significativamente la relevancia de las recomendaciones y se aprovechará mejor el potencial del catálogo de productos.

---

# Enfoque por grupos de clientes

Cargando el dataset de la segmentación de la Tarea 2

In [17]:
# Load the customer segmentation (one row per pk_cid; `cluster_6` is the segment label).
# NOTE(review): unlike the earlier loaders, which use the bucket's public HTTPS URL,
# this one uses the s3:// scheme; presumably it needs an S3 filesystem backend and
# credentials in the environment - confirm.
df_clustering = pd.read_parquet("s3://easy-money-project-bucket/df_clustering.parquet")
df_clustering

Unnamed: 0_level_0,num_products_contracts,entry_date,active_customer,mes_partition,age,cuentas,ahorro_inversion,financiacion,entry_channel_group_Canal Secundario,salary_category_Ingreso Bajo,salary_category_Ingreso Medio,cluster_6
pk_cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
15891,0,0,0,3,2,-1,0,0,1,0,0,3
16063,-1,1,1,0,3,-2,0,0,1,0,0,5
16203,0,1,1,0,3,-0,0,0,1,0,0,4
16502,1,1,1,0,2,1,0,0,1,0,0,4
17457,2,0,1,0,2,1,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1553685,-1,1,0,0,2,-2,0,0,1,0,0,5
1553686,-1,1,0,0,0,-2,0,0,1,0,1,2
1553687,-1,1,0,0,-0,-2,0,0,1,0,0,2
1553688,-1,1,0,0,1,-2,0,0,1,0,0,5


In [18]:
# Attach each customer's segment label (cluster_6) to the product-ownership table.
# - The global-approach column 'recomendaciones' is no longer needed and is dropped.
# - Dropping with errors="ignore" (and removing any previous 'cluster_6') makes the
#   cell safe to re-run.
# - Left join: customers missing from df_clustering get NaN in cluster_6, so they
#   are not matched by the per-cluster `== k` filters applied afterwards.
df_last_partition = (
    df_last_partition
    .drop(columns=["recomendaciones", "cluster_6"], errors="ignore")
    .merge(df_clustering["cluster_6"], on="pk_cid", how="left")
)

In [19]:
# Inspect the product-ownership table with the cluster label attached.
df_last_partition

Unnamed: 0,pk_cid,short_term_deposit,loans,mortgage,funds,securities,long_term_deposit,em_account_pp,credit_card,payroll,pension_plan,payroll_account,emc_account,debit_card,em_account_p,em_acount,cluster_6
0,657826,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4
1,657817,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5
2,657986,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,1
3,657905,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,4
4,657336,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442904,1166765,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
442905,1166764,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
442906,1166763,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4
442907,1166789,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


Dividiendo el dataset en varios datasets, uno por cada grupo existente

In [20]:
# Split the customers into one DataFrame per segment (cluster_6 = 0..5).
# Each split is an explicit copy because a later cell adds a recommendations column
# to it; assigning into a plain slice of df_last_partition would be a chained
# assignment.
(df_cluster_0, df_cluster_1, df_cluster_2,
 df_cluster_3, df_cluster_4, df_cluster_5) = (
    df_last_partition[df_last_partition["cluster_6"] == cluster_id].copy()
    for cluster_id in range(6)
)


In [21]:
# Inspect the customers assigned to cluster 0.
df_cluster_0

Unnamed: 0,pk_cid,short_term_deposit,loans,mortgage,funds,securities,long_term_deposit,em_account_pp,credit_card,payroll,pension_plan,payroll_account,emc_account,debit_card,em_account_p,em_acount,cluster_6
25,659822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
28,659080,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
38,649838,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
41,651450,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
66,673857,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442902,1166767,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
442903,1166766,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
442904,1166765,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
442905,1166764,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [22]:
# Product-ownership flag columns (listed explicitly because df_last_partition now
# also contains 'cluster_6').
product_cols = ['short_term_deposit', 'loans', 'mortgage', 'funds', 'securities',
                'long_term_deposit', 'em_account_pp', 'credit_card', 'payroll',
                'pension_plan', 'payroll_account', 'emc_account', 'debit_card',
                'em_account_p', 'em_acount']

# Per-cluster customer frames, keyed by the label reused in the output column names.
dicc_cluster = {"cluster_0": df_cluster_0, "cluster_1": df_cluster_1,
                "cluster_2": df_cluster_2, "cluster_3": df_cluster_3,
                "cluster_4": df_cluster_4, "cluster_5": df_cluster_5}

# Build one (product, total) table per cluster, sorted from most to least held.
# Loop-local names are used on purpose: the notebook-level `product_ranking` and
# `product_totals` of the global approach are no longer overwritten, so the global
# recommendation cell can still be re-run afterwards.
cluster_rankings = []
for cluster_label, cluster_df in dicc_cluster.items():
    total_col = f'Total_{cluster_label}'
    cluster_ranking = (
        cluster_df[product_cols].sum()
        .rename_axis(f'Producto_{cluster_label}')
        .reset_index(name=total_col)
        .sort_values(by=total_col, ascending=False, ignore_index=True)
    )
    cluster_rankings.append(cluster_ranking)

# Place the tables side by side with a single concat: row k holds the k-th most
# held product of every cluster.
df_rankind_cluster = pd.concat(cluster_rankings, axis=1)
df_rankind_cluster

Unnamed: 0,Producto_cluster_0,Total_cluster_0,Producto_cluster_1,Total_cluster_1,Producto_cluster_2,Total_cluster_2,Producto_cluster_3,Total_cluster_3,Producto_cluster_4,Total_cluster_4,Producto_cluster_5,Total_cluster_5
0,em_acount,235182,payroll_account,24006,em_acount,349,short_term_deposit,0,em_acount,50045,em_acount,381
1,debit_card,8162,debit_card,19771,credit_card,39,loans,0,debit_card,15303,credit_card,220
2,emc_account,2511,pension_plan,16426,emc_account,27,mortgage,0,emc_account,10569,emc_account,115
3,payroll_account,684,payroll,15890,pension_plan,15,funds,0,long_term_deposit,3545,securities,18
4,long_term_deposit,156,emc_account,11511,securities,6,securities,0,payroll_account,1825,funds,13
5,pension_plan,154,em_acount,10377,debit_card,6,long_term_deposit,0,pension_plan,757,payroll_account,5
6,securities,88,credit_card,3973,loans,2,em_account_pp,0,securities,520,debit_card,5
7,credit_card,60,long_term_deposit,2418,payroll_account,1,credit_card,0,credit_card,509,loans,1
8,payroll,60,securities,1154,short_term_deposit,0,payroll,0,funds,403,payroll,1
9,funds,30,funds,869,mortgage,0,pension_plan,0,payroll,382,pension_plan,1


Se puede observar que el ranking de productos varía según el grupo y es más específico que el obtenido con el enfoque global.

Los productos más populares varían entre clusters. Por ejemplo:

- payroll_account es el más popular en Cluster 1 pero no lo es en los demás.
- credit_card es el segundo más popular en Cluster 2, pero no tiene la misma posición en otros clusters.
- En algunos clusters, ciertos productos tienen totales muy bajos o incluso cero, indicando poca o ninguna adopción, como se observa en el cluster_3

Los datos muestran claramente que los clientes en diferentes clusters tienen preferencias y comportamientos distintos. Esto fundamenta la necesidad de hacer el análisis por grupos.

## Recomendación para cada cluster

In [37]:
# For each cluster, take its (product, total) pair of columns and add a `ranking`
# column (1 = most held product in that cluster).
# .assign returns a new frame, so no column is written into a slice of
# df_rankind_cluster (the previous version was a chained assignment).
# Ties share the mean rank (pandas' default "average" method); in the recorded
# output above, every displayed total of cluster 3 is 0, so those products tie.
(ranking_cluster_0, ranking_cluster_1, ranking_cluster_2,
 ranking_cluster_3, ranking_cluster_4, ranking_cluster_5) = (
    df_rankind_cluster[[f"Producto_cluster_{cluster_id}", f"Total_cluster_{cluster_id}"]]
    .assign(ranking=df_rankind_cluster[f"Total_cluster_{cluster_id}"].rank(ascending=False))
    for cluster_id in range(6)
)


In [40]:
# Ranking table of each cluster, keyed by cluster id.
dicc_cluster_ranking = {0: ranking_cluster_0, 1: ranking_cluster_1,
                        2: ranking_cluster_2, 3: ranking_cluster_3,
                        4: ranking_cluster_4, 5: ranking_cluster_5}


def get_top_n_recommendations(row, n=3, ranking=None, product_columns=None):
    """Return the n best-ranked products that the customer does not own yet.

    Defined once, outside the loop, with the ranking passed in explicitly instead of
    being captured from the loop variables.

    Parameters
    ----------
    row : pandas.Series
        One customer row; each product column holds 0 (not owned) or a non-zero flag.
    n : int, default 3
        Maximum number of products to recommend.
    ranking : pandas.Series, optional
        Rank per product (index = product name, 1 = best). When omitted it is built
        from the notebook-level ``product_ranking`` table (global approach).
    product_columns : list-like, optional
        Product columns to inspect in ``row``. Defaults to the notebook-level
        ``product_cols``.

    Returns
    -------
    list of str or None
        Up to ``n`` product names ordered from best to worst rank, or ``None`` when
        the customer already owns every product.
    """
    if product_columns is None:
        product_columns = product_cols
    if ranking is None:
        ranking = product_ranking.set_index('Producto')['ranking']

    # Products whose ownership flag is 0 for this customer.
    client_products = row[product_columns]
    not_owned_products = client_products[client_products == 0].index.tolist()

    # Restrict the ranking to those products only.
    not_owned_ranking = ranking.loc[not_owned_products]
    if not_owned_ranking.empty:
        return None

    # The smallest rank values are the best-positioned products.
    return not_owned_ranking.nsmallest(n).index.tolist()


# Recommendation table of each cluster, keyed by cluster id.
df_cluster_recomend = {}

for i, j in dicc_cluster_ranking.items():
    # Build the product -> rank lookup once per cluster instead of once per row.
    cluster_rank_lookup = j.set_index(f'Producto_cluster_{i}')['ranking']

    df_cluster = dicc_cluster[f"cluster_{i}"]
    # Top-1 recommendation per customer, ranked within the customer's own cluster.
    df_cluster[f'recomendaciones_{i}'] = df_cluster.apply(
        get_top_n_recommendations, axis=1, n=1, ranking=cluster_rank_lookup
    )

    # Final two-column table (customer id, recommendation) for this cluster.
    df_cluster_recomend[i] = df_cluster[['pk_cid', f'recomendaciones_{i}']]
    print(df_cluster_recomend[i][f"recomendaciones_{i}"].value_counts())
    print(df_cluster_recomend[i].head())

# Keep the per-cluster variable names the original cell exposed.
df_cluster_recomend_0 = df_cluster_recomend[0]
df_cluster_recomend_1 = df_cluster_recomend[1]
df_cluster_recomend_2 = df_cluster_recomend[2]
df_cluster_recomend_3 = df_cluster_recomend[3]
df_cluster_recomend_4 = df_cluster_recomend[4]
df_cluster_recomend_5 = df_cluster_recomend[5]




recomendaciones_0
[debit_card]     227587
[em_acount]        8441
[emc_account]      7595
Name: count, dtype: int64
    pk_cid recomendaciones_0
25  659822      [debit_card]
28  659080      [debit_card]
38  649838      [debit_card]
41  651450      [debit_card]
66  673857      [debit_card]
recomendaciones_1
[payroll_account]      10991
[debit_card]            9498
[emc_account]           7318
[pension_plan]          4194
[em_acount]             2389
[payroll]                380
[credit_card]            163
[long_term_deposit]       60
[securities]               4
Name: count, dtype: int64
    pk_cid recomendaciones_1
2   657986       [em_acount]
6   658184       [em_acount]
16  656415     [emc_account]
27  659236     [emc_account]
35  650478         [payroll]
recomendaciones_2
[em_acount]      71797
[credit_card]      349
Name: count, dtype: int64
     pk_cid recomendaciones_2
26   659223       [em_acount]
79   671107       [em_acount]
207  620004       [em_acount]
295  724516       [em