In [1]:
# Librerías principales
import pandas as pd
import numpy as np
from datetime import date
import random
from dateutil.relativedelta import relativedelta

# Librerías Proyecto
from lib.RecSysClusters import ClusterRFM
from lib.RecSys import AprioriSys

import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

In [2]:
df_clusters = pd.read_csv('../data/data_rfm.csv', header=0)[['usuario_id_crp', 'year_month','freq_all_1', 'mon_all_1']]
df_recsys = pd.read_csv('../data/data_trx.csv')

In [3]:
df_test = pd.read_json('../data/df_dummy_leave_one.json')

## GRID SHEARCH: ARIORI CONTRA BENCHMARKS

In [4]:
random.seed(20220720)

list_results = []
top_n_clusters = []


# Definir DataFrames
df_cluster_train = df_clusters[df_clusters['year_month'] < '2022-01-01'].reset_index(drop=True)
df_cluster_test = df_clusters[df_clusters['year_month'] >= '2022-01-01'].reset_index(drop=True)
df_recsys_train = df_recsys[df_recsys['fecha'] < '2022-01-01'].reset_index(drop=True)
df_recsys_test = df_test[df_test['fecha'] >= '2022-01-01'].reset_index(drop=True)

#Definir Subcategorias
subcategorias = list(df_recsys_train['subcat_comercial'].unique())

# Definir Clusters de Entrenamiento
cluster = ClusterRFM(df=df_cluster_train, user_id='usuario_id_crp', frequency_column='freq_all_1', monetary_column='mon_all_1', n_clusters=3)
cluster_cliente_entrenamiento = cluster.cluster_customer()

# Predecir Clusters de Test
cluster_test = ClusterRFM(df=df_cluster_test, user_id='usuario_id_crp', frequency_column='freq_all_1', monetary_column='mon_all_1', n_clusters=3, trained_model=cluster.trained_model, trained_scaled_model=cluster.scaler)
cluster_cliente_test = cluster_test.cluster_customer()

# Asoignar cluster a cada cliente
df_recsys_train['cluster'] = df_recsys_train['usuario_id_crp'].map(cluster_cliente_entrenamiento)
df_recsys_test['cluster'] = df_recsys_test['usuarioidcrp'].map(cluster_cliente_test)

# Definir dataframes por cluster
set_df_recsys_train = {}
set_df_recsys_test = {}
set_top_n_train = {}
listas_top_n_train = {}
for n_cluster in range(cluster.n_clusters):
    set_df_recsys_train[f'cluster_{n_cluster}'] = df_recsys_train[df_recsys_train['cluster'] == n_cluster].drop(columns='cluster',axis=1).reset_index(drop=True)
    set_df_recsys_test[f'cluster_{n_cluster}'] = df_recsys_test[df_recsys_test['cluster'] == n_cluster].drop(columns='cluster',axis=1).set_index('ventaidcrp')

    # Obtener lista de productos populares (top) del cluster y almacenar
    df_top_n = set_df_recsys_train[f'cluster_{n_cluster}'].reset_index(drop=True)
    df_top_n = df_top_n.groupby('subcat_comercial').count().sort_values('venta_id_crp', ascending=False)
    top = list(df_top_n.index)
    listas_top_n_train[n_cluster] = top


# Ejecutar Ejercicio por Cluster
cluster_rules = {}
for n_cluster in range(cluster.n_clusters):

    # Ajustar algoritmo apriori
    apriori_sys = AprioriSys(df=set_df_recsys_train[f'cluster_{n_cluster}'], porduct_column='subcat_comercial', min_support=0.001)
    cluster_rules[f'reglas_cluster_{n_cluster}'] = apriori_sys.fit_rules()

    # Predecir
    resultados={
        'venta_id_crp':[],
        'cluster':[],
        'apriori_metric':[],
        'n_recommendations':[],
        'rec_apriori':[],
        'rec_random':[],
        'rec_top_subcat':[]
    }

    # Iterar por el dataframe del cluster
    for index,row in tqdm(set_df_recsys_test[f'cluster_{n_cluster}'].iterrows()):

        # Iterar por función a maximizar
        for metric in ['confidence', 'lift']:

            # Iterar por cantidad de productos recomendados
            for n_rec in [1,2,3]:

                # Obtener Recomendaciones de los modelos benchmark
                rec_random = random.sample(subcategorias,n_rec)
                top_n = listas_top_n_train[n_cluster][:n_rec]

                # Obtener Recomendaciones del modelo apriori
                recomendacion_apriori = apriori_sys.predict(basket=pd.DataFrame(row[2:]).T, metric=metric, n_recommendations=n_rec)

                # Almacenar Resultados Especificos
                resultados['venta_id_crp'].append(index)
                resultados['cluster'].append(n_cluster)
                resultados['apriori_metric'].append(metric)
                resultados['n_recommendations'].append(n_rec)
                resultados['rec_apriori'].append(recomendacion_apriori)
                resultados['rec_random'].append(rec_random)
                resultados['rec_top_subcat'].append(top_n)

    # Almacenar Resultados Generales
    list_results.append(resultados)


#Generar el dataframe de los ejecrcicios
#df = pd.DataFrame()
#for i in list_results:
#    df = df.append(pd.DataFrame(i), ignore_index=True)

#Exportar resultados en parquet
df.to_csv('./resultados/apriori_con_cluster.csv')

50368it [24:46, 33.89it/s]
6036it [02:09, 46.58it/s]
7338it [03:29, 34.96it/s]


NameError: name 'df' is not defined

In [5]:
df = pd.DataFrame()
for i in list_results:
    df = df.append(pd.DataFrame(i), ignore_index=True)

leave_one = pd.read_csv('../data/leave_one.csv')
df = df.merge(leave_one, on='venta_id_crp')

In [6]:
df['hit_apriori'] = df.apply(lambda x: 1 if x['leave_one'] in x['rec_apriori'] else 0, axis=1)
df['hit_random'] = df.apply(lambda x: 1 if x['leave_one'] in x['rec_random'] else 0, axis=1)
df['hit_top'] = df.apply(lambda x: 1 if x['leave_one'] in x['rec_top_subcat'] else 0, axis=1)

In [7]:
df.to_csv('./resultados/apriori_con_cluster.csv', index=False)

In [94]:
df.groupby(['apriori_metric', 'n_recommendations'])[['hit_apriori', 'hit_random', 'hit_top']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,hit_apriori,hit_random,hit_top
apriori_metric,n_recommendations,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
confidence,1,0.265555,0.014794,0.108374
confidence,2,0.391375,0.029353,0.253444
confidence,3,0.456904,0.044743,0.315412
lift,1,0.260174,0.014574,0.108374
lift,2,0.335226,0.030608,0.253444
lift,3,0.393838,0.044147,0.315412


In [4]:
random.seed(20220720)

list_results = []
top_n_clusters = []

for rolling in [False]:
    
    start_date = date(2020,3,1)
    max_date =  date(2022,6,1)
    minimum_windows_size = 12
    
    print(f'rolling {rolling}')
    
    start_train = start_date
    end_train = start_train + relativedelta(months=minimum_windows_size)
    start_test = start_train + relativedelta(months=minimum_windows_size+1)
    end_test = start_test + relativedelta(months=1)

    while start_test <= max_date:

        print(start_train, end_train, start_test, end_test)

        # Definir DataFrames
        df_cluster_train = df_clusters[(df_clusters['year_month'] >= start_train.strftime('%Y-%m-%d')) & (df_clusters['year_month'] <= end_train.strftime('%Y-%m-%d'))].reset_index(drop=True)
        df_cluster_test = df_clusters[(df_clusters['year_month'] >= (end_train + relativedelta(months=1)).strftime('%Y-%m-%d')) & (df_clusters['year_month'] < (end_train + relativedelta(months=2)).strftime('%Y-%m-%d'))].reset_index(drop=True)
        df_recsys_train = df_recsys[(df_recsys['fecha'] >= start_train.strftime('%Y-%m-%d')) & (df_recsys['fecha'] <= end_train.strftime('%Y-%m-%d'))].reset_index(drop=True)
        df_recsys_test = df_test[(df_test['fecha'] >= start_test.strftime('%Y-%m-%d')) & (df_test['fecha'] < end_test.strftime('%Y-%m-%d'))].reset_index(drop=True)
        
        #Definir Subcategorias
        subcategorias = list(df_recsys_train['subcat_comercial'].unique())
                
        # Definir Clusters de Entrenamiento
        cluster = ClusterRFM(df=df_cluster_train, user_id='usuario_id_crp', frequency_column='freq_all_1', monetary_column='mon_all_1', n_clusters=3)
        cluster_cliente_entrenamiento = cluster.cluster_customer()

        # Predecir Clusters de Test
        cluster_test = ClusterRFM(df=df_cluster_test, user_id='usuario_id_crp', frequency_column='freq_all_1', monetary_column='mon_all_1', n_clusters=3, trained_model=cluster.trained_model, trained_scaled_model=cluster.scaler)
        cluster_cliente_test = cluster_test.cluster_customer()

        # Asoignar cluster a cada cliente
        df_recsys_train['cluster'] = df_recsys_train['usuario_id_crp'].map(cluster_cliente_entrenamiento)
        df_recsys_test['cluster'] = df_recsys_test['usuarioidcrp'].map(cluster_cliente_test)
        
        # Definir dataframes por cluster
        set_df_recsys_train = {}
        set_df_recsys_test = {}
        set_top_n_train = {}
        listas_top_n_train = {}
        for n_cluster in range(cluster.n_clusters):
            set_df_recsys_train[f'cluster_{n_cluster}'] = df_recsys_train[df_recsys_train['cluster'] == n_cluster].drop(columns='cluster',axis=1).reset_index(drop=True)
            set_df_recsys_test[f'cluster_{n_cluster}'] = df_recsys_test[df_recsys_test['cluster'] == n_cluster].drop(columns='cluster',axis=1).set_index('ventaidcrp')
            
            # Obtener lista de productos populares (top) del cluster y almacenar
            df_top_n = set_df_recsys_train[f'cluster_{n_cluster}'].reset_index(drop=True)
            df_top_n = df_top_n.groupby('subcat_comercial').count().sort_values('venta_id_crp', ascending=False)
            top = list(df_top_n.index)
            listas_top_n_train[n_cluster] = top
            
            
        # Ejecutar Ejercicio por Cluster
        cluster_rules = {}
        for n_cluster in range(cluster.n_clusters):
            
            # Ajustar algoritmo apriori
            apriori_sys = AprioriSys(df=set_df_recsys_train[f'cluster_{n_cluster}'], porduct_column='subcat_comercial', min_support=0.001)
            cluster_rules[f'reglas_cluster_{n_cluster}'] = apriori_sys.fit_rules()
            
            # Predecir
            resultados={
                'rolling':[],
                'start_train':[],
                'end_train':[],
                'start_test':[],
                'end_test':[],
                'venta_id_crp':[],
                'cluster':[],
                'apriori_metric':[],
                'n_recommendations':[],
                'rec_apriori':[],
                'rec_random':[],
                'rec_top_subcat':[]
            }
            
            # Iterar por el dataframe del cluster
            for index,row in tqdm(set_df_recsys_test[f'cluster_{n_cluster}'].iterrows()):
                
                # Iterar por función a maximizar
                for metric in ['consequent support', 'confidence', 'lift', 'leverage', 'conviction']:
                    
                    # Iterar por cantidad de productos recomendados
                    for n_rec in [1,2,3]:
                        
                        # Obtener Recomendaciones de los modelos benchmark
                        rec_random = random.sample(subcategorias,n_rec)
                        top_n = listas_top_n_train[n_cluster][:n_rec]
                        
                        # Obtener Recomendaciones del modelo apriori
                        recomendacion_apriori = apriori_sys.predict(basket=pd.DataFrame(row[2:]).T, metric=metric, n_recommendations=n_rec)
                        
                        # Almacenar Resultados Especificos
                        resultados['rolling'].append(rolling)
                        resultados['start_train'].append(start_train)
                        resultados['end_train'].append(end_train)
                        resultados['start_test'].append(start_test)
                        resultados['end_test'].append(end_test)
                        resultados['venta_id_crp'].append(index)
                        resultados['cluster'].append(n_cluster)
                        resultados['apriori_metric'].append(metric)
                        resultados['n_recommendations'].append(n_rec)
                        resultados['rec_apriori'].append(recomendacion_apriori)
                        resultados['rec_random'].append(rec_random)
                        resultados['rec_top_subcat'].append(top_n)
            
            # Almacenar Resultados Generales
            list_results.append(resultados)

        # Actualizar Fechas
        if rolling == True:
            start_train += relativedelta(months=1)

        end_train += relativedelta(months=1)
        start_test += relativedelta(months=1)
        end_test += relativedelta(months=1)
        

#Generar el dataframe de los ejecrcicios
df = pd.DataFrame()
for i in list_results:
    df = df.append(pd.DataFrame(i), ignore_index=True)

#Exportar resultados en parquet
df.to_csv('./resultados/recursive_v2.csv')

rolling False
2020-03-01 2021-03-01 2021-04-01 2021-05-01


7016it [10:19, 11.32it/s]
3095it [04:53, 10.53it/s]
1589it [01:50, 14.34it/s]


2020-03-01 2021-04-01 2021-05-01 2021-06-01


2843it [04:35, 10.32it/s]
1485it [01:43, 14.28it/s]
6277it [09:11, 11.38it/s]


2020-03-01 2021-05-01 2021-06-01 2021-07-01


2801it [04:23, 10.65it/s]
1553it [01:47, 14.44it/s]
6243it [09:06, 11.43it/s]


2020-03-01 2021-06-01 2021-07-01 2021-08-01


5542it [08:13, 11.24it/s]
1182it [01:22, 14.40it/s]
2668it [04:00, 11.08it/s]


2020-03-01 2021-07-01 2021-08-01 2021-09-01


5364it [07:52, 11.35it/s]
1415it [01:37, 14.44it/s]
2772it [04:16, 10.79it/s]


2020-03-01 2021-08-01 2021-09-01 2021-10-01


1066it [01:12, 14.74it/s]
2840it [04:16, 11.05it/s]
5218it [07:36, 11.43it/s]


2020-03-01 2021-09-01 2021-10-01 2021-11-01


4175it [06:07, 11.36it/s]
2463it [03:38, 11.25it/s]
732it [00:49, 14.77it/s]


2020-03-01 2021-10-01 2021-11-01 2021-12-01


799it [00:54, 14.60it/s]
2347it [03:27, 11.30it/s]
3706it [05:15, 11.74it/s]


2020-03-01 2021-11-01 2021-12-01 2022-01-01


5028it [07:10, 11.68it/s]
1281it [01:27, 14.72it/s]
2749it [04:19, 10.59it/s]


2020-03-01 2021-12-01 2022-01-01 2022-02-01


2034it [02:58, 11.38it/s]
3725it [05:17, 11.72it/s]
1256it [01:27, 14.43it/s]


2020-03-01 2022-01-01 2022-02-01 2022-03-01


3939it [05:41, 11.55it/s]
2331it [03:23, 11.48it/s]
1421it [01:37, 14.62it/s]


2020-03-01 2022-02-01 2022-03-01 2022-04-01


2538it [03:49, 11.04it/s]
1959it [02:13, 14.62it/s]
4531it [06:24, 11.77it/s]


2020-03-01 2022-03-01 2022-04-01 2022-05-01


3308it [04:44, 11.63it/s]
1906it [02:10, 14.65it/s]
6023it [08:31, 11.77it/s]


2020-03-01 2022-04-01 2022-05-01 2022-06-01


7005it [09:54, 11.79it/s]
2018it [02:17, 14.63it/s]
2983it [04:20, 11.47it/s]


## Pruebas Random

In [35]:
comparacion = pd.read_json('../data/dataset_test.json')

In [36]:
comparacion_2 = comparacion.merge(resultados_2, how='inner', on='venta_id_crp')
comparacion_2

Unnamed: 0,fecha,venta_id_crp,usuario_id_crp,canasta_original,canasta_test,leave_one,recomendacion
0,2021-04-01,428056,1925806,"[POSTRE INDIVIDUAL, SANDWICH, JUGO]","[POSTRE INDIVIDUAL, SANDWICH]",JUGO,"[BARQUILLO, EMPANADA INDIVIDUAL, EMPANADITA, G..."
1,2021-04-01,428057,771075,"[EMPANADITA, BARQUILLO, PANADERIA, MAP, ALFAJOR]","[EMPANADITA, BARQUILLO, ALFAJOR, PANADERIA]",MAP,"[BOLLERIA, CALUGA, FRUTOS SECOS, MAP, POSTRE I..."
2,2021-04-01,428058,1853170,"[EMPANADITA, POSTRE FAMILIAR]",[POSTRE FAMILIAR],EMPANADITA,"[COCKTAIL CONGELADO, LASAÑA, MAP FAMILIAR, PAN..."
3,2021-04-01,428060,1916418,"[POSTRE INDIVIDUAL, PLATO LIVIANO, PANADERIA]","[POSTRE INDIVIDUAL, PLATO LIVIANO]",PANADERIA,"[EMPANADA INDIVIDUAL, EMPANADITA, ENSALADA, GA..."
4,2021-04-01,428068,526798,"[EMPANADITA, PIZZA, PANADERIA]","[EMPANADITA, PIZZA]",PANADERIA,"[BOLLERIA, EMPANADA INDIVIDUAL, PANADERIA, POS..."
...,...,...,...,...,...,...,...
7011,2021-04-30,457941,1396669,"[JUGO, QUESO LAMINADO, PANADERIA]","[QUESO LAMINADO, PANADERIA]",JUGO,"[ALFAJOR, BOLLERIA, DIP, FRUTOS SECOS, JAMON, ..."
7012,2021-04-30,457948,677920,"[GALLETA DULCE, ALFAJOR, EMPANADA INDIVIDUAL, ...","[ALFAJOR, EMPANADA INDIVIDUAL, FRUTOS SECOS]",GALLETA DULCE,"[BARQUILLO, BOLLERIA, CALUGA, GALLETA SALADA, ..."
7013,2021-04-30,457953,1528614,"[PANADERIA, SALAME, QUESO LAMINADO]","[SALAME, PANADERIA]",QUESO LAMINADO,"[BOLLERIA, FRUTOS SECOS, JAMON, QUESO LAMINADO]"
7014,2021-04-30,457961,1477438,"[PANADERIA, GALLETA SALADA, MANTEQUILLA]","[MANTEQUILLA, PANADERIA]",GALLETA SALADA,"[BOLLERIA, JAMON, POSTRE INDIVIDUAL, QUESO LAM..."


In [37]:
res = []
for index,row in tqdm(comparacion_2.iterrows()):
   
    res.append(row['leave_one'] in row['recomendacion'])

comparacion_2['hit'] = res

7016it [00:00, 23149.25it/s]


In [38]:
sum(res) / comparacion_2.shape[0]

0.4605188141391106

In [32]:
sum(res) / comparacion_2.shape[0]

0.4484036488027366

In [26]:
sum(res) / comparacion_2.shape[0]

0.3983751425313569

In [39]:
comparacion_2

Unnamed: 0,fecha,venta_id_crp,usuario_id_crp,canasta_original,canasta_test,leave_one,recomendacion,hit
0,2021-04-01,428056,1925806,"[POSTRE INDIVIDUAL, SANDWICH, JUGO]","[POSTRE INDIVIDUAL, SANDWICH]",JUGO,"[BARQUILLO, EMPANADA INDIVIDUAL, EMPANADITA, G...",True
1,2021-04-01,428057,771075,"[EMPANADITA, BARQUILLO, PANADERIA, MAP, ALFAJOR]","[EMPANADITA, BARQUILLO, ALFAJOR, PANADERIA]",MAP,"[BOLLERIA, CALUGA, FRUTOS SECOS, MAP, POSTRE I...",True
2,2021-04-01,428058,1853170,"[EMPANADITA, POSTRE FAMILIAR]",[POSTRE FAMILIAR],EMPANADITA,"[COCKTAIL CONGELADO, LASAÑA, MAP FAMILIAR, PAN...",False
3,2021-04-01,428060,1916418,"[POSTRE INDIVIDUAL, PLATO LIVIANO, PANADERIA]","[POSTRE INDIVIDUAL, PLATO LIVIANO]",PANADERIA,"[EMPANADA INDIVIDUAL, EMPANADITA, ENSALADA, GA...",True
4,2021-04-01,428068,526798,"[EMPANADITA, PIZZA, PANADERIA]","[EMPANADITA, PIZZA]",PANADERIA,"[BOLLERIA, EMPANADA INDIVIDUAL, PANADERIA, POS...",True
...,...,...,...,...,...,...,...,...
7011,2021-04-30,457941,1396669,"[JUGO, QUESO LAMINADO, PANADERIA]","[QUESO LAMINADO, PANADERIA]",JUGO,"[ALFAJOR, BOLLERIA, DIP, FRUTOS SECOS, JAMON, ...",True
7012,2021-04-30,457948,677920,"[GALLETA DULCE, ALFAJOR, EMPANADA INDIVIDUAL, ...","[ALFAJOR, EMPANADA INDIVIDUAL, FRUTOS SECOS]",GALLETA DULCE,"[BARQUILLO, BOLLERIA, CALUGA, GALLETA SALADA, ...",False
7013,2021-04-30,457953,1528614,"[PANADERIA, SALAME, QUESO LAMINADO]","[SALAME, PANADERIA]",QUESO LAMINADO,"[BOLLERIA, FRUTOS SECOS, JAMON, QUESO LAMINADO]",True
7014,2021-04-30,457961,1477438,"[PANADERIA, GALLETA SALADA, MANTEQUILLA]","[MANTEQUILLA, PANADERIA]",GALLETA SALADA,"[BOLLERIA, JAMON, POSTRE INDIVIDUAL, QUESO LAM...",False


In [54]:
np.unique(lista_b)

array(['BARQUILLO', 'EMPANADITA', 'HELADO', 'PANADERIA',
       'POSTRE INDIVIDUAL', 'SOPAS'], dtype='<U17')