# Pasos del TP
*   Explorar los datos
*   Plantear el problema a resolver
*   Preprocesar los datos a un formato adecuado
*   Elegir algoritmos
*   Fittear y validar
*   Decidir el algoritmo final, y testear

# Problemas que se buscan resolver:

Se ha demostrado que cada vez más empresas (especialmente empresas de comercio electrónico) tienen grandes dificultades en la conversión de propects (clientes potencialess) a clientes activos con la primer compra, además sostener a los clientes actuales en la dinámica e interacción en el tiempo con los productos y la empresa suele ser de difícil comprensión, generando pérdidas de facturación por cancelaciones hasta la pérdida del cliente

Las investigaciones se han centrado en el análisis del producto y el ciclo de valor del cliente en la empresa.

Las preguntas que ayudan a entender el problema:
¿Cuál es el rendimiento de los productos en las ventas?
¿Cómo se relacionan los clientes con los productos en el tiempo?
¿Cómo se agrupan los clientes según sus necesidades e intereses?
¿Cómo retenemos a clientes o mejoramos las tasas de conversión a clientes ? 

## OBJETIVO:
Este proyecto tiene como objetivo explorar diferentes herramientas de conversion, retención y rendimientos de clientes en las ventas a traves de metodologías de Machine Learning

Exploraremos y mediremos la efectividad de las siguientes herramientas:
A. Product Analytics.
B. Recomendación de Productos
C. CLV (Ciclo de vida del Valor cliente)
D. Segmentación de clientes.

Las técnicas algorítmicas en ML a utilizar y explorar:
* No supervisados
  * Clustering Knn
* Supervidados
  * Decision Tree, 
  * SVM, ANN, DNN



# INICIALIZACION DE TOOLS

In [1]:
%matplotlib inline

In [None]:
!pip install seaborn
!pip install kmodes
!pip install nltk
!pip install matplotlib
!pin install pydot

## TOOLS BASICS

In [2]:
import pandas as pd
from pandas import DataFrame
from datetime import datetime, timedelta, date
from pandas.plotting import autocorrelation_plot
from pandas import read_csv
from matplotlib import pyplot as plt


import warnings;
warnings.filterwarnings('ignore')



In [3]:
import seaborn as sns

## TOOLS DE NLP

In [None]:

# NLP
import re
import nltk 
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize

In [None]:
# ANALITICA DE TEXTO
from wordcloud import WordCloud
from wordcloud import STOPWORDS

## TOOLS ALGORITMO 1

In [95]:
#Statistical LTV
from lifetimes import BetaGeoFitter, GammaGammaFitter
from lifetimes.utils import calibration_and_holdout_data, summary_data_from_transaction_data

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVC
from sklearn import preprocessing

from kmodes.kprototypes import KPrototypes
from pprint import pprint
import numpy as np

In [None]:
import plotly.graph_objects as go

## TOOLS ALGORITMO 2

In [170]:
 
# ML approach to LTV
import tensorflow as tf 
#import tensorflow_probability as tfp
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling as tfmodel
import tensorflow_docs.plots

# Ploteo 
import matplotlib
matplotlib.use('TkAgg')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Evaluación
from sklearn.metrics import r2_score
from sklearn.metrics  import mean_absolute_error

In [None]:
print(f'TensorFlow Version: {tf.__version__}')
print(f'Keras version: {keras.__version__}')
print('Physical devices:')
tf.config.list_physical_devices()

# CARGA DE DATOS

Si estas en Google Colab

In [11]:
import sys
assert sys.version_info >= (3, 5)
import os

In [None]:
RETAIL_PATH = "https://github.com/hcgalvan/UNSAM-Machine-Learning-on-Economics/raw/main/data/"

In [None]:
if 'google.colab' in sys.modules:
  def load_datasets_h1(datasets_path=RETAIL_PATH):
    csv_path = os.path.join(datasets_path, "Year 2009-2010_train.csv")
    return pd.read_csv(csv_path, encoding= 'unicode_escape')

  retail_ol_h1 = load_datasets_h1()
  def load_datasets_h2(datasets_path=RETAIL_PATH):
      csv_path = os.path.join(datasets_path, "Year 2010-2011_train.csv")
      return pd.read_csv(csv_path, encoding= 'unicode_escape')

  retail_ol_h2 = load_datasets_h2()

Utilizar si estas en PC con Code y cualquier otro framework

In [14]:
def carga_inicial():
    retail_ol_h1 = pd.read_csv('./data/Year_2009-2010_train.csv',  encoding= 'unicode_escape')
    retail_ol_h2 = pd.read_csv('./data/Year_2010-2011_train.csv',  encoding= 'unicode_escape')
    return [retail_ol_h1, retail_ol_h2]

De Uso comun para PC y Google Colab

In [15]:
def cargar_archivo_total():
    frames = carga_inicial()
    results = pd.concat(frames)
    df = results.copy()
    return df

#### funcion lectura de un solo archivo 2010-2011

In [22]:
def carga_archivo():
    frames = pd.read_csv('./data/Year_2010-2011_train.csv',  encoding= 'unicode_escape')
    df = frames.copy()
    return df

# EXPLORACION INICIAL DE DATOS

#### Situación actual de los datos

In [79]:
# cargar solo un archivo para pruebas
df = carga_archivo()

In [17]:
# cargar los dos archivos utilizados en TP
df = cargar_archivo_total()

In [24]:
df.shape

(433528, 9)

In [25]:
df.isnull().sum()

Unnamed: 0          0
Invoice             0
StockCode           0
Description      1180
Quantity            0
InvoiceDate         0
Price               0
Customer ID    107935
Country             0
dtype: int64

In [26]:
# Chequeamos datos unicos y actuales en cada features (atributos).

for i in df.columns:
  print("Actual number of values",i,len(df[i]))
  print("Unique number of values",i,len(df[i].unique()))

Actual number of values Unnamed: 0 433528
Unique number of values Unnamed: 0 433528
Actual number of values Invoice 433528
Unique number of values Invoice 24673
Actual number of values StockCode 433528
Unique number of values StockCode 4019
Actual number of values Description 433528
Unique number of values Description 4156
Actual number of values Quantity 433528
Unique number of values Quantity 657
Actual number of values InvoiceDate 433528
Unique number of values InvoiceDate 22310
Actual number of values Price 433528
Unique number of values Price 1415
Actual number of values Customer ID 433528
Unique number of values Customer ID 4348
Actual number of values Country 433528
Unique number of values Country 38


In [27]:
# Chequeamos valores nulos en los features
df.isnull().sum()

Unnamed: 0          0
Invoice             0
StockCode           0
Description      1180
Quantity            0
InvoiceDate         0
Price               0
Customer ID    107935
Country             0
dtype: int64

In [28]:
# Chequeamos datos duplicados
df.duplicated().sum()

0

In [29]:
# meses incompletos
print('Rango de Fecha: %s ~ %s' % (df['InvoiceDate'].min(), df['InvoiceDate'].max()))
df.loc[df['InvoiceDate'] >= '2011-12-01'].shape
df.loc[df['InvoiceDate'] < '2009-12-02' ].shape

Rango de Fecha: 2010-12-01 08:26:00 ~ 2011-12-09 12:50:00


(0, 9)

# PRE-PROCESAMIENTO DATOS

## Preparación de Datos
1. Eliminación de pedidos cancelados.
2. Eliminando registros sin Customer ID, sin descripción de productos y features sin títulos
3. Excluimos meses incompletos.
4. Calcular las ventas totales de los features Cantidad y Precio unitario.
5. Datos por cliente : para analizar segmentos de clientes, necesitamos transformar nuestros datos, de modo que cada registro represente el historial de compras de clientes individuales.

#### 1. Limpieza de Datos
Hay registros con valores negativos en la columna Cantidad, que representan pedidos cancelados. Ignoremos y eliminemos estos registros.

In [30]:
def limpieza_datos(df):
    # Observamos las cantidades negativas
    df.loc[df['Quantity'] <= 0].shape
    df = df.loc[df['Quantity'] > 0]
    #Quitamos la 1er columna vacía
    df.drop(['Unnamed: 0'], axis =1, inplace=True)
    # Quitamos valores nulos en features Customer ID y la Descripcion porque no son imputaciones.
    df.dropna(inplace=True)
    # Quitamos valores duplicados
    df.drop_duplicates(inplace=True)
    # Quitamos fechas incompletas
    df = df.loc[df['InvoiceDate'] < '2011-12-01']
    df = df.loc[df['InvoiceDate'] > '2009-12-01']
    return df

#### 2. Agregados de features

#### NLP - PRE-PROCESADO PARA ANALIZAR CATEGORIAS DE PRODUCTOS

In [31]:
# Esta función busca categorizar a los productos que se ofrecen
def agrega_color(df):
    colours = ['red','orange', 'yellow','green', 'blue', 'indigo', 'violet', 'purple', 'pink', 'silver', 'gold', 'beige', 'brown', 'grey', 'gray', 'black', 'white', 'cream']

    stop_words = set(stopwords.words('english'))
    Product_type = []
    Colour_type = []
    dataset= df
    # dataset= len(df)
    for row in dataset.iloc[:,2]:
        s=" "
        description = re.sub('[^a-zA-Z]'," ", str(row).lower()) #cleaning of text data
        wordsList = nltk.word_tokenize(description) #tokenization
        wordsList = [nltk.stem.WordNetLemmatizer().lemmatize(w, 'n') for w in wordsList if not w in stop_words] # lemmitization
        flag=False
        for w in wordsList:
            if w in colours:
                Colour_type.append(w)
                flag=True
            break
        if flag==False:
            Colour_type.append("no_color") #taking out colours from description

        tagged = nltk.pos_tag(wordsList)

        for tag in tagged:
            if tag[1]=='NN' :
                s+=tag[0] +  " "
        Product_type.append(s)
    
    return Product_type, Colour_type

In [32]:
def borrar_desc_invoice(df):
    # Quitar columnas "InvoiceDate" y "Description"
    X = df.drop(["Description", "InvoiceDate"], axis=1)
    return X

In [33]:
def cambiar_tipo_datos(df):
    """
    Transformar todas las variables en categoricas y en flotantes 
    Columna 2 es Quantity, 3 es Price y 8 Revenue
    """
    X = df.astype('category')
    X.iloc[:, 2] = X.iloc[:, 2].astype(float)
    X.iloc[:, 3] = X.iloc[:, 3].astype(float)
    X.iloc[:, 8] = X.iloc[:, 8].astype(float)
    return X

#### Funciones para DNN

In [34]:
#  Ingeniería de features
def get_features(data, feature_start, feature_end, target_start, target_end):
    """
    Function that outputs the features and targets on the user-level.
    Inputs:
        * data - a dataframe with raw data
        * feature_start - a string start date of feature period
        * feature_end - a  string end date of feature period
        * target_start - a  string start date of target period
        * target_end - a  string end date of target period
    """
    features_data = data.loc[(data.date >= feature_start) & (data.date <= feature_end), :]
    print(f'Using data from {(pd.to_datetime(feature_end) - pd.to_datetime(feature_start)).days} days')
    print(f'To predict {(pd.to_datetime(target_end) - pd.to_datetime(target_start)).days} days')
    
    # Transacciones/Facturaciones
    total_rev = features_data.groupby('Customer ID')['Revenue'].sum().rename('total_revenue')
    recency = (features_data.groupby('Customer ID')['date'].max() - features_data.groupby('Customer ID')['date'].min()).apply(lambda x: x.days).rename('recency')
    frequency = features_data.groupby('Customer ID')['InvoiceDate'].count().rename('frequency')
    # la "t" es la fecha en mita del año
    t = features_data.groupby('Customer ID')['date'].min().apply(lambda x: (datetime(2011, 6, 11) - x).days).rename('t')
    time_between = (t / frequency).rename('time_between')
    
    avg_basket_value = (total_rev / frequency).rename('avg_basket_value')
    avg_basket_size = (features_data.groupby('Customer ID')['Quantity'].sum() / frequency).rename('avg_basket_Size')
    
    returns = features_data.loc[features_data['Revenue'] < 0, :].groupby('Customer ID')['InvoiceDate'].count().rename('num_returns')
    hour = features_data.groupby('Customer ID')['hour'].median().rename('purchase_hour_med')
    dow = features_data.groupby('Customer ID')['dayofweek'].median().rename('purchase_dow_med')
    weekend =  features_data.groupby('Customer ID')['weekend'].mean().rename('purchase_weekend_prop')
    
    #Datos para entrenamiento
    train_data = pd.DataFrame(index = rfm_train_test.index)
    train_data = train_data.join([total_rev, recency, frequency, t, time_between, avg_basket_value, avg_basket_size, returns, hour, dow, weekend])
    train_data = train_data.fillna(0)
    
    #Target Data
    target_data = data.loc[(data.date >= target_start) & (data.date <= target_end), :]
    target_quant = target_data.groupby(['Customer ID'])['date'].nunique()
    target_rev = target_data.groupby(['Customer ID'])['Revenue'].sum().rename('target_rev')
    # Cambios realizados, joint con target_quant HUG
    train_data = train_data.join(target_rev, target_quant).fillna(0) 
    
    return train_data.iloc[:, :-1], train_data.iloc[:, -1]

In [197]:
#DNN - Red Neuronal Profunda -> Regularizador haciendo con apagado de capas
def build_model():
    
    model = keras.Sequential(layers.ImputLayer(imput_shape=(1,), name='Input'),layers=[
    layers.Dense(256, activation='relu', input_shape=[len(X_train.columns), ]),
    layers.Dropout(rate=0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(rate=0.3),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
    ])


    optimizer = tf.keras.optimizers.Adam(lr=0.001)
   
    model.compile(loss='mse',
            optimizer=optimizer,
            metrics=['mae', 'mse'])
    
    return model
    

In [36]:
# Evaluación de predicción
def evaluate(actual, sales_prediction):
    print(f"Total Ventas Actual: {np.round(actual.sum())}")
    print(f"Total Ventas Predictivo: {np.round(sales_prediction.sum())}")
    print(f"Individual R2 score: {r2_score(actual, sales_prediction)} ")
    print(f"Individual Mean Absolute Error: {mean_absolute_error(actual, sales_prediction)}")
    plt.scatter(sales_prediction, actual)
    plt.xlabel('Prediction')
    plt.ylabel('Actual')      
    plt.show()
    

#### FUNCIONES UTILIZADAS EN GENERAL

In [37]:
# Damos formato de fecha a InvoiceDate para realizar tratamientos posteriores
def tipos_dataset(df):    
    df['InvoiceDate']  = pd.to_datetime(df.InvoiceDate, format = '%Y/%m/%d %H:%M')
    df['Quantity'] = pd.to_numeric(df['Quantity'], errors='coerce')
    df['Price']=df['Price'].astype(str)
    df['Price']=df['Price'].astype(float)
    df['Month'] = pd.DatetimeIndex(pd.to_datetime(df['InvoiceDate'])).month
    return df

In [38]:
# Agregamos features para Analizar diferentes casos
# UTILIZAR UNA VEZ SETEADO LOS TIPOS DE DATOS def tipos_dataset(df)
def agregados_features(df):
    # Agregamos features Sales
    df['Sales'] = df['Quantity'] * df['Price']
    # Datos cliente por pedido
    df['date'] = pd.to_datetime(df.InvoiceDate.dt.date, errors='coerce')
    df['time'] = df.InvoiceDate.dt.time
    df['hour'] = df['time'].apply(lambda x: x.hour)

    df['weekend'] = df['date'].apply(lambda x: x.weekday() in [5, 6])
    df['dayofweek'] = df['date'].apply(lambda x: x.dayofweek)

    df['Product Type'] = agrega_color(df)[0]
    df['Colour_type']= agrega_color(df)[1]

    return df

In [89]:
def agregar_fecha_log2(df):
    #Get revenue column
    df['Revenue'] = df['Quantity'] * df['Price']
    #Datetime transformation
    df['date'] = pd.to_datetime(df.InvoiceDate.dt.date)
    df['time'] = df.InvoiceDate.dt.time
    df['hour'] = df['time'].apply(lambda x: x.hour)
    df['weekend'] = df['date'].apply(lambda x: x.weekday() in [5, 6])
    df['dayofweek'] = df['date'].apply(lambda x: x.dayofweek)
    return df

In [78]:
def agregar_fechas(df):

    df['Day'] = df['time'].apply(lambda x: x.day)
    df['Month'] = df['time'].apply(lambda x: x.month)
    df['Year'] = df['time'].apply(lambda x: x.year)
    df['DayOfWeek'] = df['date'].apply(lambda x: x.dayofweek)
    return df

#### FUNCIONES SOBRE DESPCRIPCION DE PRODUCTOS

In [None]:
# Ploteo de letras
def ploteo_word(df):
    stopwords=set(STOPWORDS)
    wordcloud=WordCloud(background_color='White').generate(str(df['Description']))
    print(wordcloud)
    plt.rcParams['figure.figsize']=(12,12)
    plt.axis('off')
    plt.imshow(wordcloud)
    plt.title('Lista de Cantidad veces utilizada una letra descripción productos',fontsize=20,color='red')
    plt.show()
    return

#### FUNCIONES SOBRE PRECIOS

In [None]:
# Ploteo Cantidad y precio
def ploteo_precio_cantidad(df):
    df = df[['Quantity','Price']]
    df['PriceBins'] = pd.cut(df['Price'].tolist(), bins=8)
    sns.barplot(data=df,x="PrecioBins", y="Cantidad")
    return

In [None]:
#Cuántos clientes compraron algo cada mes durante el último año
def precio_cliente_pais(df):
    
    df = df.groupby(['Country' , 'Month']).agg({'Price':'sum' , 'Customer ID' :'count'})
    df.columns = ['PriceSum','CustomerIDCount']
    df = df.reset_index()
    cm = sns.light_palette("blue", as_cmap=True)
    pvd = pd.pivot_table(df, values='CustomerIDCount', index=['Country'],
                    columns=['Month'],
                    aggfunc=np.sum).fillna(0)
    return pvd.style.background_gradient(cmap=cm)

In [None]:
# ¿En qué rangos los precios son más comunes? - Grafica
def rango_precios(df):
    prices = pd.DataFrame([df['Price'].value_counts()
                         .sort_values(ascending=False).to_dict()]).T
    prices = pd.DataFrame(prices['Price'].value_counts())
    prices = prices.reset_index()
    prices.columns = ['Price','CountPrice']
    prices['PriceBins'] = prices.cut(df['Price'].tolist(), bins=8)
    sns.barplot(data=prices, x='PriceBins', y='CountPrice')
    return

In [None]:
# Rango de cambio de precio durante el tiempo - EXPLORACION DE DATOS
def AED(df):
     customer_avg_spending= df[['Price','Customer ID', 'InvoiceDate' , 'Country']]
     avg_selling_of_products = df[['Price','Quantity','InvoiceDate']]
     return customer_avg_spending, avg_selling_of_products 

In [None]:
def customer_avg_spending_insights(df):
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate']).dt.strftime('%Y-%m-%d')
    df = df.groupby(['InvoiceDate']).agg({'Price':'sum'}).reset_index()
    df = df.reset_index(drop=True)
    df.columns = ['Date','PriceSum']
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df['Date'], y=df['PriceSum'], name="Price Sum",
                         line_color='deepskyblue'))
    fig.update_layout(title_text='Sum range of all prices among time',
                  xaxis_rangeslider_visible=True)
    fig.show()
    return

In [None]:
# Esta función agrupa por continente a paises y suma la cantidad vendida por pais
def cantidad_mensual_pais(df):
    df = df.groupby(['Country' , 'Month']).agg({'Quantity':'sum'})
    df = df.reset_index()
    df = df.sort_values(by=['Month'])
    Europe = ['United Kingdom','France', 'Belgium','EIRE',
              'Germany','Portugal', 'Denmark', 'Netherlands', 'Poland',
             'Spain', 'Cyprus', 'Greece', 'Norway', 'Austria', 'Sweden', 
              'Finland','Italy', 'Switzerland', 'Malta', 'Israel', 
              'Lithuania','Iceland']
    Asia = [ 'Japan','United Arab Emirates','Singapore','Hong Kong',
       'Thailand','West Indies', 'Korea','Lebanon',]
    America = ['Channel Islands','USA','Brazil', 'Canada']
    Australia = ['Australia',]
    df['Continent'] = df['Country'].map(lambda x: 'Europe' if x in Europe else(
                                        'Asia' if x in Asia else
                                        'America' if x in America else
                                        'Australia' if x in Australia else 'None' ))
    fig = px.scatter_geo(df, locations="Country",color="Continent",
                         hover_name="Country", size="Quantity",
                         animation_frame="Month",
                         projection="natural earth")
    fig.show()
    return

In [None]:
def create_prediction(df):
    topic = df['Topic']
    df = df[['InvoiceDate','Price']]
    df.columns = ['ds','y']
    m = Prophet()
    m.fit(df)
    future = m.make_future_dataframe(periods=90)
    forecast = m.predict(future)
    df_cv = cross_validation(m, horizon='90 days')
    df_p = performance_metrics(df_cv)
    fig3 = plot_cross_validation_metric(df_cv, metric='mape')
    return m.plot(forecast) ,  m.plot_components(forecast) , df_p , fig3 


#### FUNCIONES QUE HACEN AGREGACIONES Y MATRICES

In [None]:
def matriz_cliente_item(df):
    # Matriz cliente-item
    # Podemos sumar todas las cantidades compradas para cada artículo, utilizando la función aggfunc.
    customer_item_matrix = df.pivot_table(
        index='Customer ID', 
        columns='StockCode', 
        values='Quantity',
        aggfunc='sum'
    )
    #Convertimos esta matriz y lo codificamos en 0 - 1 a los datos, por lo que el valor de 1 determinando un producto fue comprado por el cliente dado, y el valor 0 determinado por producto que nunca fue comprado por el cliente dado.
    # La función Lambda que estamos usando en este código simplemente codifica todos los elementos cuyos valores son mayores que 0 con 1, y el resto con 0.
    customer_item_matrix = customer_item_matrix.applymap(lambda x: 1 if x > 0 else 0)
    return customer_item_matrix

In [None]:
# Cantidad de pedidos por cliente
def agrupa_ordenesxcliente(df):
    orders_df = df.groupby(['Customer ID', 'Invoice']).agg({
    'Sales': sum,
    'InvoiceDate': max
    })
    return orders_df

In [None]:
# Ventas por cliente
def agrupa_ventasxcliente(df):
    customer_df = df.groupby('Customer ID').agg({
        'Sales': sum,
        'Invoice': lambda x: x.nunique()
    })

    customer_df.columns = ['TotalSales', 'OrderCount']
    customer_df['AvgOrderValue'] = customer_df['TotalSales']/customer_df['OrderCount']
    return customer_df

In [None]:
# Cantidad de Artículos por Orden/Factura
def agrupa_artxventas(df):
    group_by_invoice=pd.DataFrame(b.groupby('InvoiceNo')['StockCode'].nunique())
    group_by_invoice.columns=['No.of Items per Order']
    group_by_invoice.head()
    return 

#### 5. Datos por cliente

In [None]:
# Instantanea de la matriz
customer_item_matrix

In [105]:
rank_df = customer_df.rank(method='first')

In [None]:
customer_df.head(15)

# ANÁLISIS: CARACTERIZACIÓN DE DATOS

Necesitamos convertir InvoiceDate en tipo Date.

In [107]:
df.describe(include='object')

Unnamed: 0,Invoice,StockCode,Description,InvoiceDate,Country
count,613951,613951,613951,613951,613951
unique,35540,4605,5243,33316,41
top,576339,85123A,WHITE HANGING HEART T-LIGHT HOLDER,2011-11-14 15:27:00,United Kingdom
freq,434,4051,4051,434,551712


### Descriptiva de Clientes

In [108]:
# Analizamos clientes
df['Customer ID'].describe()

count    613951.000000
mean      15322.031318
std        1695.361076
min       12346.000000
25%       13971.000000
50%       15251.000000
75%       16794.000000
max       18287.000000
Name: Customer ID, dtype: float64

In [109]:
df['Quantity'].describe()

count    613951.000000
mean         13.385628
std         119.300741
min           1.000000
25%           2.000000
50%           6.000000
75%          12.000000
max       74215.000000
Name: Quantity, dtype: float64

In [110]:
customer_df.describe()

Unnamed: 0,TotalSales,OrderCount,AvgOrderValue
count,5828.0,5828.0,5828.0
mean,2327.814351,6.098147,302.795982
std,11403.148921,12.491479,474.321604
min,0.0,1.0,0.0
25%,276.365,1.0,143.1
50%,684.745,3.0,227.0975
75%,1810.44,7.0,340.546941
max,460080.01,379.0,19633.5


In [111]:
rank_df.describe()

Unnamed: 0,TotalSales,OrderCount,AvgOrderValue
count,5828.0,5828.0,5828.0
mean,2914.5,2914.5,2914.5
std,1682.543016,1682.543016,1682.543016
min,1.0,1.0,1.0
25%,1457.75,1457.75,1457.75
50%,2914.5,2914.5,2914.5
75%,4371.25,4371.25,4371.25
max,5828.0,5828.0,5828.0


### Normalización de datos

In [112]:
normalized_df = (rank_df - rank_df.mean()) / rank_df.std()

In [113]:
normalized_df.head(15)

Unnamed: 0_level_0,TotalSales,OrderCount,AvgOrderValue
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346.0,1.723284,1.244842,1.729228
12347.0,1.327455,0.859711,1.405313
12348.0,0.814541,0.491221,0.844852
12349.0,1.310219,0.203561,1.633539
12350.0,-0.960154,-1.731605,0.246948
12351.0,-0.897748,-1.731011,0.392561
12352.0,1.111116,1.17471,0.254674
12353.0,-0.667145,-0.747381,-0.585126
12354.0,0.085882,-1.730416,1.572322
12355.0,0.122731,-0.746786,1.081399


In [114]:
normalized_df.describe()

Unnamed: 0,TotalSales,OrderCount,AvgOrderValue
count,5828.0,5828.0,5828.0
mean,0.0,0.0,-9.753504000000001e-18
std,1.0,1.0,1.0
min,-1.731605,-1.731605,-1.731605
25%,-0.865803,-0.865803,-0.8658025
50%,0.0,0.0,0.0
75%,0.865803,0.865803,0.8658025
max,1.731605,1.731605,1.731605


### Analisis más Detallado de productos

In [115]:
# ¿Cuál es el produto más vendido?
df.StockCode.mode()

0    85123A
dtype: object

In [None]:
product = df[df.StockCode.str.contains("85123A")]
product.head()

In [117]:
df.Description.mode()

0    WHITE HANGING HEART T-LIGHT HOLDER
dtype: object

### Análisis Agrupados

In [None]:
# ¿Cual es la facturación por cliente?
vtaxcliente = agrupa_ventasxcliente(df)
vtaxcliente

#Caracterización de estos datos
vtaxcliente.describe()

In [None]:
#¿Cuántas Ordenes/Facturas se realizaron por cliente?
ordenes = agrupa_ordenesxcliente(df)
ordenes

# Caracterización de estos datos
ordenes.describe()

In [None]:
# ¿Cuántos artículos se incluyen en cada Orden/Factura?
artxcliente = agrupa_artxventas(df)
artxcliente

# Caracterización de estos datos
artxcliente.describe()

In [None]:
# ¿Cuál es la media por artículos por cada cliente?

# Caracterización de estos datos

### Análisis de Precios y Cantidades

In [118]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 613951 entries, 0 to 433525
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      613951 non-null  object 
 1   StockCode    613951 non-null  object 
 2   Description  613951 non-null  object 
 3   Quantity     613951 non-null  int64  
 4   InvoiceDate  613951 non-null  object 
 5   Price        613951 non-null  float64
 6   Customer ID  613951 non-null  float64
 7   Country      613951 non-null  object 
 8   Sales        613951 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 46.8+ MB


In [121]:
df1 = tipos_dataset(df)

In [None]:
price_quantity_plot(df1)

# ALGORITMOS ANALIZADOS

## ALGORITMO 1 - CLASIFICACION Y PREDICCIÓN DE CANTIDAD

In [None]:
# Limpiar datos y establecer tipos
df = limpieza_datos(df)

# Agregar columnas en Dataframe
df['Product Type'] = agrega_color(df)[0]
df['Colour_type'] = agrega_color(df)[1]

# Borrar Descripcion e Invoice Date
df1 = borrar_desc_invoice(df)

# Agregar Ingresos por Ventas
df1['Revenue'] = df1['Price'] * df1['Quantity']

In [None]:
# Label encoding of categorical features
label_encoder = preprocessing.LabelEncoder()

for col in ["Invoice", "StockCode", "Customer ID","Country", "Product Type","Colour_type"]:
  df1[col] = label_encoder.fit_transform(df1[col])

In [None]:
# Definir los tipos de datos en categoricos y Float
df1 = cambiar_tipo_datos(df1)

# Splitear en dataframe en "Train" y "test"
train, test = train_test_split(df1, train_size = 0.8, random_state = 0)

#### Cluster de items similares para el nuevo feature cluster, utiliza en este caso K-prototype clustering

In [None]:
# Chequeo del valor optimo de 'K' // demanda varios minutos de ejecucion para encontrar el optimo

cost = []
for num_clusters in list(range(2,15)):
    kproto = KPrototypes(n_clusters = num_clusters, init='Cao')
    kproto.fit_predict(train, categorical=[0, 1, 4, 5, 6, 7])
    cost.append(kproto.cost_)
    labels=kproto.labels_
plt.plot(cost)

In [None]:
# Genera un nuevo numero de cluster del atributo

kproto = KPrototypes(n_clusters = 3, init = 'Cao')
kproto.fit_predict(train, categorical=[0, 1, 4, 5, 6])
print(kproto.cost_)
labels=kproto.labels_

In [None]:
# Agrego nuevo atributo
train["Cluster number"]=labels


In [None]:
# Ahora agrego "InvoiceDate" en dataframe

df2 = train.merge(pd.DataFrame(df["InvoiceDate"]), left_index=True, right_index=True)


#### Ingeniería de feature

In [None]:
# Agrego las fechas en diferentes rangos: horas, dias, meses, años
df2 = agregar_fechas( df2 )

In [None]:
# Guardo el proceso en archivo csv
df2.to_csv('./data/df2.csv')

In [None]:
# Capturo el archivo, quito columna vacía y muestro
df2 = pd.read_csv('./data/df2.csv')
df2.drop(['Unnamed: 0'], axis =1, inplace=True)
df2.head()

### Classification of test data into number of clusters

- Cluster numbers were treated as a target variable as the objective
was to match the records from the validation and testing sets with the clusters from the training set.
- El numero de Clusters fueron tratados como variable target, con el objetivo de converger los registros del set de validación y testeo con el cluster del set de entrenamiento.

In [None]:
# Corte del dataframe entre entrenamiento y validación
train_, val_= train_test_split(df2, train_size = 0.8, random_state = 0)

In [None]:
train_y=train_["Cluster number"]
train_x=train_.drop(['Cluster number'],axis=1,inplace=False)

val_y=val_["Cluster number"]
val_x=val_.drop(['Cluster number'],axis=1,inplace=False)

#### Para Clasificar
* Utilizamos SVC, que brinda el mejor resultado sobre otros algoritmos

In [None]:
model1 = LinearSVC()
model1.fit(train_x, train_y)

In [None]:
# Validación del data test
pred_y = model1.predict(val_x)

In [None]:
# Evaluación de Performance
accuracy_score(val_y,pred_y)

In [None]:
# Adding "InvoiceDate" in test data

test_Df = test.merge(pd.DataFrame(dataset["InvoiceDate"]), left_index=True, right_index=True)
test_Df

### PREDICCION DE FEATURE "QUANTITY" PARA DEMANDAS DE PRODUCTOS

In [None]:
# Codificación de Label para features categoricos

from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

for col in ["Invoice", "StockCode", "Customer ID","Country", "Product Type","Colour_type"]:
  df[col] = label_encoder.fit_transform(df[col])

In [None]:
train_y=train_["Quantity"].astype('int')
train_x=train_.drop(['Quantity'], axis=1,inplace=False)

test_Df_y=test_Df["Quantity"].astype('int')
test_Df_x=test_Df.drop(['Quantity'],axis=1,inplace=False)

#### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(bootstrap=True,ccp_alpha=0.0,
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                                                   n_jobs=None,)
clf.fit(train_x, train_y)

In [None]:
prediction_test = clf.predict(test_Df_x)
prediction_test

In [None]:
from sklearn.metrics import f1_score
f1_score(test_Df_y, prediction_test, average='micro')

In [None]:
accuracy_score(test_Df_y, prediction_test)

In [None]:
# Ajuste de Hiperparámetros para Algoritmo de Random forest

# Numero de arboles en random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Numero de features a considerar en cada split
max_features = ['auto', 'sqrt']

# Numero Maximo de niveles en el arbol
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Numero Minimo de samples requeridos para splitear un nodo
min_samples_split = [2, 5, 10]

# Numero Minimo de samples requerido por cada hoja nodo
min_samples_leaf = [1, 2, 4]

# Método de selección de samples para entrenamiento en cada arbol
bootstrap = [True, False]

# Crea la cuadrícula (random grid)
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)



# Uso del random grid para buscar los mejores hiperparámetros
# Primero crea el modelo de base para ajustar
rf = RandomForestRegressor()

# Busqueda aleatoria de parametros, usando 3 fold cross validation, 
# search across 100 diferentes combinaciones, y usa todos los nucleos disponibles (available score)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fitea el modelo random search model
rf_random.fit(train_x, train_y)

In [None]:
# KNN

from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train_x, train_y)
knn=neigh.predict(test_Df_x)
print(accuracy_score(test_Df_y, knn))

In [None]:
# SVC with kernel 

from sklearn import svm
from sklearn.svm import SVC

rbf_svc = svm.SVC(kernel='rbf')
rbf_svc.fit(train_x, train_y)

rbf=rbf_svc.predict(test_Df_x)
accuracy_score(test_Df_y, rbf)


In [None]:
# AdaBoost

from sklearn.ensemble import AdaBoostClassifier

ad = AdaBoostClassifier(n_estimators=100, random_state=0)
ad.fit(train_x, train_y)
adb=ad.predict(test_Df_x)
print(accuracy_score(test_Df_y, adb))

In [None]:
# logistic

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(multi_class='ovr')
lr.fit(train_x, train_y)
lrc=lr.predict(test_Df_x)
print(accuracy_score(test_Df_y, lrc))

In [None]:
# Naive base Classifier

from sklearn.naive_bayes import GaussianNB

lr = GaussianNB()
lr.fit(train_x, train_y)
lrc=lr.predict(test_Df_x)
print(accuracy_score(test_Df_y, lrc))

In [None]:
# Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier 

dtree_model = DecisionTreeClassifier().fit(train_x, train_y)
dtree_predictions = dtree_model.predict(test_Df_x)
accuracy_score(test_Df_y, dtree_predictions)

In [None]:
# GradientBoostingClassifier

from sklearn.ensemble import GradientBoostingClassifier

gb=GradientBoostingClassifier()
gb.fit(train_x, train_y)
gbc=lr.predict(test_Df_x)
print(accuracy_score(test_Df_y, gbc))

## ALGORITMO 2 - CLASIFICACION Y PREDICCIÓN DE CLIENTES


In [80]:
# Limpiar datos y establecer tipos
df = limpieza_datos(df)

In [82]:
df = tipos_dataset(df)
df = agregar_fecha_log2(df)


In [92]:
#Plots a timeseries of total sales
df.groupby('date')['Quantity'].sum().plot()
#Prints the total number of days between start and end
print(df['date'].max() - df['date'].min())

364 days 00:00:00


In [93]:
#Dataset info
print(f'Total Number of Purchases: {df.shape[0]}')
print(f'Total Number of transactions: {df.Invoice.nunique()}')
print(f'Total Unique Days: {df.date.nunique()}')
print(f"Total Unique Customers: {df['Customer ID'].nunique()}")
print(f"We are predicting {(df['date'].max() - datetime(2011, 9, 11)).days} days")

Total Number of Purchases: 301401
Total Number of transactions: 17451
Total Unique Days: 297
Total Unique Customers: 4278
We are predicting 80 days


### Preparado de Datos

In [96]:
#Context data for the revenue (date & customerID)
id_lookup = df[['Customer ID', 'Invoice', 'date']].drop_duplicates()
id_lookup.index = id_lookup['Invoice']
id_lookup = id_lookup.drop('Invoice', axis=1)

transactions_data = pd.DataFrame(df.groupby('Invoice')['Revenue'].sum()).join(id_lookup)

In [97]:
#Split into train - test
rfm_train_test = calibration_and_holdout_data(transactions_data, 'Customer ID', 'date',
                                        calibration_period_end='2011-09-10',
                                        monetary_value_col = 'Revenue')   

#Selecting only customers with positive value in the calibration period (otherwise Gamma-Gamma model doesn't work)
rfm_train_test = rfm_train_test.loc[rfm_train_test['monetary_value_cal'] > 0, :]

### ML APPROACH
1. Ingeniería de feature para entrenar y testear períodos
2. Modelado
3. Evaluación

In [134]:
X_train, y_train = get_features(df, '2011-01-01', '2011-06-11', '2011-06-12', '2011-09-09')
X_test, y_test = get_features(df, '2011-04-02', '2011-09-10', '2011-09-11', '2011-12-09')

Using data from 161 days
To predict 89 days
Using data from 161 days
To predict 89 days


In [148]:
X_train, y_train = get_features(df, '2010-01-01', '2010-12-30', '2010-12-31', '2011-11-30')
X_train_, X_valid, y_train_, y_valid = train_test_split(X_train, y_train, random_state=42)


Using data from 363 days
To predict 334 days


In [149]:
y_train_

Customer ID
12395.0    1870.01
13122.0     779.19
18204.0    1716.61
14708.0     162.72
15144.0    4680.84
            ...   
15796.0    1802.53
16309.0     627.58
14915.0     239.42
16818.0    2659.34
15777.0     330.71
Name: target_rev, Length: 1454, dtype: float64

In [189]:
scaler = StandardScaler()
X_train_ = scaler.fit_transform(X_train_)
X_valid = scaler.transform(X_valid)


In [190]:
X_test, y_test = get_features(df, '2010-01-01', '2010-12-30', '2010-12-31', '2011-11-30')

Using data from 363 days
To predict 334 days


In [191]:
X_test = scaler.transform(X_test)

In [192]:
X_test

array([[ 6.58990000e+02,  9.28494634e-17,  2.80000000e+01, ...,
         1.40000000e+01,  1.00000000e+00,  8.06324287e-17],
       [ 8.66400000e+02,  9.28494634e-17,  1.50000000e+01, ...,
         1.90000000e+01,  3.00000000e+00,  8.06324287e-17],
       [ 3.78728074e-17,  9.28494634e-17, -3.90945109e-17, ...,
        -2.93208832e-17, -1.58821451e-17,  8.06324287e-17],
       ...,
       [ 3.78728074e-17,  9.28494634e-17, -3.90945109e-17, ...,
        -2.93208832e-17, -1.58821451e-17,  8.06324287e-17],
       [ 3.78728074e-17,  9.28494634e-17, -3.90945109e-17, ...,
        -2.93208832e-17, -1.58821451e-17,  8.06324287e-17],
       [ 3.78728074e-17,  9.28494634e-17, -3.90945109e-17, ...,
        -2.93208832e-17, -1.58821451e-17,  8.06324287e-17]])

In [155]:
X_train_

array([[ 0.27897921,  3.82453451,  0.53788397, ...,  1.8281611 ,
         1.89850422, -0.22380263],
       [-0.2392185 , -0.29572082, -0.36840165, ..., -0.72123736,
        -0.57195121, -0.22380263],
       [-0.2392185 , -0.29572082, -0.36840165, ..., -0.72123736,
        -0.57195121, -0.22380263],
       ...,
       [-0.2392185 , -0.29572082, -0.36840165, ..., -0.72123736,
        -0.57195121, -0.22380263],
       [-0.2392185 , -0.29572082, -0.36840165, ..., -0.72123736,
        -0.57195121, -0.22380263],
       [ 0.08401475, -0.29572082, -0.20362244, ...,  1.8281611 ,
        -0.57195121, -0.22380263]])

#### Modelado

In [101]:
y_train

Customer ID
12347.0    2726.27
12348.0     612.68
12352.0    2301.24
12356.0    2206.69
12359.0    4509.48
            ...   
18260.0    1923.32
18263.0     950.80
18272.0    2132.63
18273.0     153.00
18283.0    1465.41
Name: target_rev, Length: 1939, dtype: float64

In [102]:
X_train

Unnamed: 0_level_0,total_revenue,recency,frequency,t,time_between,avg_basket_value,avg_basket_Size,num_returns,purchase_hour_med,purchase_dow_med,purchase_weekend_prop
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
12347.0,658.99,0.0,28.0,186.0,6.642857,23.535357,10.321429,0.0,14.0,1.0,0.0
12348.0,866.40,0.0,15.0,177.0,11.800000,57.760000,80.400000,0.0,19.0,3.0,0.0
12352.0,0.00,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
12356.0,0.00,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
12359.0,0.00,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
18260.0,230.70,0.0,11.0,177.0,16.090909,20.972727,5.272727,0.0,18.0,3.0,0.0
18263.0,0.00,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
18272.0,0.00,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
18273.0,0.00,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0


In [199]:
# The patience parameter is the amount of epochs to check for improvement
#early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=50)
early_stop = keras.callbacks.EarlyStopping(monitor='val_mse', patience=50)

In [194]:
model = build_model()

In [200]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 256)               3072      
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 64)                16448     
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 33        
Total params: 21,633
Trainable params: 21,633
Non-trainable params: 0
__________________________________________________

In [122]:
keras.utils.plot_model(model, "wide_and_deep_model.png", show_shapes=True)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [196]:
hidden1 = model.layers[2]
print(hidden1.name)
weights, biases = hidden1.get_weights()
print(weights.shape)
weights[:,0].shape

dense_9
(256, 64)


(256,)

In [201]:
# probemos epochs a partir de 100 en adelante
early_history = model.fit( X_train_, y_train_, 
                    epochs=1000, validation_data=(X_valid, y_valid), verbose=1,
                    callbacks=[early_stop, tfmodel.EpochDots()])

Epoch 1/1000

Epoch: 0, loss:58844044.0000,  mae:2161.0393,  val_loss:23248696.0000,  val_mae:1958.7972,  
.Epoch 2/1000
.Epoch 3/1000
.Epoch 4/1000
.Epoch 5/1000
.Epoch 6/1000
.Epoch 7/1000
.Epoch 8/1000
.Epoch 9/1000
.Epoch 10/1000
.Epoch 11/1000
.Epoch 12/1000
.Epoch 13/1000
.Epoch 14/1000
.Epoch 15/1000
.Epoch 16/1000
.Epoch 17/1000
.Epoch 18/1000
.Epoch 19/1000
.Epoch 20/1000
.Epoch 21/1000
.Epoch 22/1000
.Epoch 23/1000
.Epoch 24/1000
.Epoch 25/1000
.Epoch 26/1000
.Epoch 27/1000
.Epoch 28/1000
.Epoch 29/1000
.Epoch 30/1000
.Epoch 31/1000
.Epoch 32/1000
.Epoch 33/1000
.Epoch 34/1000
.Epoch 35/1000
.Epoch 36/1000
.Epoch 37/1000
.Epoch 38/1000
.Epoch 39/1000
.Epoch 40/1000
.Epoch 41/1000
.Epoch 42/1000
.Epoch 43/1000
.Epoch 44/1000
.Epoch 45/1000
.Epoch 46/1000
.Epoch 47/1000
.Epoch 48/1000
.Epoch 49/1000
.Epoch 50/1000
.Epoch 51/1000
.Epoch 52/1000
.Epoch 53/1000
.Epoch 54/1000
.Epoch 55/1000
.Epoch 56/1000
.Epoch 57/1000
.Epoch 58/1000
.Epoch 59/1000
.Epoch 60/1000
.Epoch 61/1000
.

#### Evaluación
Veamos qué tan bien el modelo puede predecir los 3 próximos 3 meses que antes no había visto. Usaremos datos del período más reciente (X_test) para asegurarnos de que nuestro pronóstico sea lo más preciso posible.

In [202]:
#Predicción
dnn_preds = model.predict(X_test).ravel()

#### Explicación de datos

In [203]:
#Dataset info
print(f'Total Número de Ventas: {df.shape[0]}')
print(f'Total Numero de transacciones: {df.InvoiceDate.nunique()}')
print(f'Total Días Unicos: {df.date.nunique()}')
print(f"Total clientes únicos: {df['Customer ID'].nunique()}")
print(f"Nuestra predicción {(df['date'].max() - datetime(2011, 9, 11)).days} days")

Total Número de Ventas: 301401
Total Numero de transacciones: 16318
Total Días Unicos: 297
Total clientes únicos: 4278
Nuestra predicción 80 days


In [204]:
#Predicting
dnn_preds = model.predict(X_test).ravel()


In [205]:
evaluate(y_test, dnn_preds)

Total Ventas Actual: 5142746.0
Total Ventas Predictivo: 3891386368.0
Individual R2 score: -351790.6943101874 
Individual Mean Absolute Error: 2005791.3451546184


## ALGORITMO 3 - PROPHET - PREDICCION SERIES DE TIEMPO

Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects.

### cargar solo un archivo para pruebas
df = carga_archivo()

In [None]:
# Limpiar datos y establecer tipos
df = limpieza_datos(df)

In [None]:
# Cuántos clientes compraron algo cada mes durante el último año
cantidad_mensual_pais(df)

In [None]:
# Rango de Precios comunes
rango_precios(df)

In [None]:
# Rango de cambio de precio durante el tiempo, enviamos los resultados de EDA 
customer_avg_spending, avg_selling_of_products = AED(df)

In [None]:

customer_avg_spending_insights(df)

### NLP

In [None]:
class CleanColumn(BaseEstimator , TransformerMixin):
    def __init__(self):
        pass
    def fit(self ,X , y=None):
        return self
    def transform(self , X):
            description = [re.sub("\'" , "" , sent) for sent in X]
            description = [re.sub("\"" , "" , sent) for sent in description]
            description = [re.sub("\&" , "" , sent) for sent in description]
            description = [re.sub("\+" , "" , sent) for sent in description]
            description = [re.sub("\/" , "" , sent) for sent in description]
            description = [re.sub("\-" , "" , sent) for sent in description]
            description_clean =[re.sub("\d+" , "" , sent) for sent in description]
            return description_clean

In [None]:
class Debug(BaseEstimator , TransformerMixin):
    def fit(self , X , y=None , **fit_params):
        return self
    def transform(self , X):
        print(X.shape)
        return X  

In [None]:
nlp_pipeline = Pipeline([
    ('cleaning', CleanColumn()),
    ('vect', CountVectorizer()),
    ('debug', Debug()),
])

In [None]:
nlp_transformed = nlp_pipeline.fit_transform(df['Description'].astype(str))

### LDA - Crear target "y" desde feature "Description"

In [None]:
LDA = LatentDirichletAllocation(n_components=5, random_state=42,verbose=True)
LDA.fit(nlp_transformed)

In [None]:
topic_values = LDA.transform(nlp_transformed)

### MODELADO

#### PROPHET

In [None]:
class PrepareDataFrame(BaseException , TransformerMixin):
    def __init__(self , df):
        self.df = df
    def fit(self , df):
        return self
    def transform(self , df):
        df = df[['InvoiceDate', 'Topic', 'Price']]
        df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate']).dt.strftime('%Y-%m-%d')
        
        df = df.groupby(['InvoiceDate', 'Topic']).agg({'Price':'mean'})
        df = df_modeling.reset_index()
        df_topic_1 = df[df['Topic'] == 1]
        df_topic_2 = df[df['Topic'] == 2]
        df_topic_3 = df[df['Topic'] == 3]
        df_topic_4 = df[df['Topic'] == 4]
        return df_topic_1 , df_topic_2,df_topic_3,df_topic_4

In [None]:
num_pipeline = Pipeline([
    ('preparedf' , PrepareDataFrame(df))
])

In [None]:
df_topic_1 , df_topic_2,df_topic_3,df_topic_4= num_pipeline.fit_transform(df)