<a href="https://colab.research.google.com/github/jmarrietar/mineria-de-datos/blob/main/%5BProyecto%5D%20Clasificacion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

* Original dataset from UCI ML http://archive.ics.uci.edu/ml/datasets/Online+Retail

In [None]:
!pip install --quiet lifetimes

[?25l[K     |▋                               | 10kB 15.1MB/s eta 0:00:01[K     |█▏                              | 20kB 13.9MB/s eta 0:00:01[K     |█▊                              | 30kB 10.7MB/s eta 0:00:01[K     |██▎                             | 40kB 8.5MB/s eta 0:00:01[K     |██▉                             | 51kB 5.3MB/s eta 0:00:01[K     |███▍                            | 61kB 5.5MB/s eta 0:00:01[K     |████                            | 71kB 5.9MB/s eta 0:00:01[K     |████▌                           | 81kB 6.2MB/s eta 0:00:01[K     |█████                           | 92kB 6.4MB/s eta 0:00:01[K     |█████▋                          | 102kB 6.5MB/s eta 0:00:01[K     |██████▏                         | 112kB 6.5MB/s eta 0:00:01[K     |██████▊                         | 122kB 6.5MB/s eta 0:00:01[K     |███████▎                        | 133kB 6.5MB/s eta 0:00:01[K     |███████▉                        | 143kB 6.5MB/s eta 0:00:01[K     |████████▍               

In [None]:
import gdown
import pandas as pd
import numpy as np
from dateutil import parser
import datetime
from dateutil import relativedelta
from lifetimes.utils import summary_data_from_transaction_data
from sklearn.metrics import mean_squared_error

In [None]:
!gdown https://drive.google.com/uc?id=1_AJd_FiQ6LSWSK4e__ccfLzhu_sWWNUS

Downloading...
From: https://drive.google.com/uc?id=1_AJd_FiQ6LSWSK4e__ccfLzhu_sWWNUS
To: /content/OnlineRetail.csv
45.6MB [00:00, 98.0MB/s]


In [None]:
online_retail = pd.read_csv(
    "OnlineRetail.csv",
    sep=",",
    dtype={"CustomerID": "object"},
    encoding="unicode_escape",
)

In [None]:
online_retail["InvoiceDate"] = pd.to_datetime(online_retail["InvoiceDate"])

# Create an additional column for date as year and month
online_retail["date"] = online_retail["InvoiceDate"].dt.strftime("%Y-%m")

# Create a new column for the total expenditure of that product in the purchase.
online_retail["total_sales_amount"] = (
    online_retail["UnitPrice"] * online_retail["Quantity"]
)

In [None]:
# Add column for cancelations 
online_retail['cancelation'] = online_retail['InvoiceNo'].apply(lambda x: x.startswith("C"))
online_retail['cancelation'] = online_retail['total_sales_amount'] <0

In [None]:
# Only positive purchases (No cancelations)
online_retail_purchases = online_retail[online_retail['cancelation'] == False] 

In [None]:
transactional_purchases = (
    online_retail_purchases.groupby(["InvoiceNo", "CustomerID", "InvoiceDate"])[
        ["Quantity", "total_sales_amount"]
    ]
    .agg("sum")
    .reset_index()
)

# Clasificacion

#### ML approach (Scikit-Learn)  

Debido a que este problema se relaciona con ** Time Aware Modeling**, donde el conjunto de validación se compone de observaciones de una ventana de tiempo fuera de (y más reciente que) la ventana de tiempo utilizada para el entrenamiento del modelo, la prueba se realiza en  un enfoque de validacion Out-of-time validation (OTV). 

![](https://drive.google.com/uc?export=view&id=12ncYfgu1s77BScvVHu_PKliSwioC8glr)    


In [None]:
def create_features_split(transactions, split_date, period_length, datetime_col, total_sales_col):
    """
    Taket historic transactional level data and returns train and test dataset in 
    custumer level useful to be used by machine learning models. 
    
    Arguments:
        transactions - Dataframe at transaction level with list of purchases.
        split_date - Date to be used to end train date
        period_length - The length of period in Months.
        datetime_col - Column of date time
        
    Returns:
        train - Dataframe at customer level to be used for training
        test - Dataframe at customer level to be used for testing 
        
    """

    train = transactions[transactions[datetime_col] < split_date]

    date = parser.parse(split_date)
    end_test_date = date + relativedelta.relativedelta(months=period_length)
    end_test_date = end_test_date.strftime("%Y-%m-%d")

    train_transactions = transactions[transactions[datetime_col] < split_date]
    test_transactions = transactions[transactions[datetime_col] < end_test_date]

    print("Creating Train ...")
    train = _transactions_to_dataset(
        train_transactions,
        split_date,
        period_length,
        "InvoiceDate",
        "CustomerID",
        total_sales_col,
    )

    print("Creating Test ...")
    test = _transactions_to_dataset(
        test_transactions,
        end_test_date,
        period_length,
        "InvoiceDate",
        "CustomerID",
        total_sales_col,
    )

    return train, test


def _transactions_to_dataset(
    transactions,
    end_date,
    period_length,
    datetime_col,
    customer_id_col,
    total_sales_col,
):
    """
    Take historic transactions and create a dataset with basics staticts features,
    number of purchases from past, current and next period and amount spent from 
    past, current and next period.
    
    Begining dataset: t0
    Past period: t1 - t2
    Current period: t2 - t3
    Target period: t3 - t4
    
    Arguments: 
        transactions - Dataframe at transaction level with war list of purchases.
        end_date - Last date to use to create dataset
        period_length - The length of period in Months.
        customer_id_col - Name of column with the ids of costumers
        total_sales_col - Name of column of the total amount spent in purchase
    
    Returns: 
        dataset - Data for customer level with number of transactions and total 
                    amount spent in the last, current and next period
    
    """

    t4 = end_date
    t3 = (
        parser.parse(t4) - relativedelta.relativedelta(months=period_length)
    ).strftime("%Y-%m-%d")
    t2 = (
        parser.parse(t3) - relativedelta.relativedelta(months=period_length)
    ).strftime("%Y-%m-%d")
    t1 = (
        parser.parse(t2) - relativedelta.relativedelta(months=period_length)
    ).strftime("%Y-%m-%d")
    t0 = transactions[datetime_col].min().strftime("%Y-%m-%d")

    # Define time periods
    transactions_dev = transactions[transactions[datetime_col] < t3]

    current_period = transactions_dev[
        (transactions_dev[datetime_col] >= t2) & (transactions_dev[datetime_col] < t3)
    ]

    past_period = transactions_dev[
        (transactions_dev[datetime_col] >= t1) & (transactions_dev[datetime_col] < t2)
    ]

    target_period = transactions[
        (transactions[datetime_col] >= t3) & (transactions[datetime_col] < t4)
    ]

    # Basic Features (Frequency, Recency and T) since t0
    features_train = summary_data_from_transaction_data(
        transactions_dev,
        customer_id_col=customer_id_col,
        datetime_col=datetime_col,
        monetary_value_col=total_sales_col,
        freq="D",
    )
    features_train.reset_index(level=0, inplace=True)

    # Purchases by customers current period
    purchases_current_period = (
        current_period[customer_id_col]
        .value_counts()
        .rename_axis(customer_id_col)
        .to_frame("purchases_current_period")
    )
    purchases_current_period.reset_index(level=0, inplace=True)

    # Purchases by customer past period
    purchases_past_period = (
        past_period[customer_id_col]
        .value_counts()
        .rename_axis(customer_id_col)
        .to_frame("purchases_past_period")
    )

    purchases_past_period.reset_index(level=0, inplace=True)

    # Amount spent by customer current period
    amount_spent_current_period = (
        current_period.groupby([customer_id_col])[[total_sales_col]]
        .agg("sum")
        .reset_index()
        .rename(columns={"total_sales_amount": "amount_spent_current_period"})
    )

    # Amount spent by customer last period
    amount_spent_past_period = (
        past_period.groupby([customer_id_col])[[total_sales_col]]
        .agg("sum")
        .reset_index()
        .rename(columns={"total_sales_amount": "amount_spent_past_period"})
    )

    # Create Targets
    purchases_target = (
        target_period[customer_id_col]
        .value_counts()
        .rename_axis(customer_id_col)
        .to_frame("purchases_next_period")
    )

    amount_spent_target = (
        target_period.groupby([customer_id_col])[[total_sales_col]]
        .agg("sum")
        .reset_index()
        .rename(columns={total_sales_col: "amount_spent_next_period"})
    )

    # Join the Datasets
    dataset = pd.merge(
        features_train, purchases_past_period, on=customer_id_col, how="left"
    )
    dataset = pd.merge(
        dataset, purchases_current_period, on=customer_id_col, how="left"
    )
    dataset = pd.merge(dataset, purchases_target, on=customer_id_col, how="left")
    dataset = pd.merge(
        dataset, amount_spent_past_period, on=customer_id_col, how="left"
    )
    dataset = pd.merge(
        dataset, amount_spent_current_period, on=customer_id_col, how="left"
    )
    dataset = pd.merge(dataset, amount_spent_target, on=customer_id_col, how="left")

    # Fill NA (No sales) with 0
    dataset["purchases_past_period"].fillna(0, inplace=True)
    dataset["purchases_current_period"].fillna(0, inplace=True)
    dataset["purchases_next_period"].fillna(0, inplace=True)
    dataset["amount_spent_past_period"].fillna(0, inplace=True)
    dataset["amount_spent_current_period"].fillna(0, inplace=True)
    dataset["amount_spent_next_period"].fillna(0, inplace=True)

    print("Data statistics starts from {}".format(t0))
    print("Past period from [{} to {})".format(t1, t2))
    print("Current period from [{} to {})".format(t2, t3))
    print("Next period from [{} to {})".format(t3, t4))

    return dataset

In [None]:
# Configurable experimental variables 
SPLIT_DATE = "2011-05-01" # [CHANGE Accordingly] Date to be used to end train date 
PERIOD_LENGTH = 2 # Months

In [None]:
train, test = create_features_split(
    transactional_purchases,
    split_date=SPLIT_DATE,
    period_length=PERIOD_LENGTH,
    datetime_col="InvoiceDate",
    total_sales_col="total_sales_amount"
)

Creating Train ...
Data statistics starts from 2010-12-01
Past period from [2010-11-01 to 2011-01-01)
Current period from [2011-01-01 to 2011-03-01)
Next period from [2011-03-01 to 2011-05-01)
Creating Test ...
Data statistics starts from 2010-12-01
Past period from [2011-01-01 to 2011-03-01)
Current period from [2011-03-01 to 2011-05-01)
Next period from [2011-05-01 to 2011-07-01)


* **Frequency:** representa el número de compras repetidas que ha realizado el cliente. 

* **Recency:** representa la edad del cliente cuando realizó sus compras más recientes. ** Esto es igual a la duración entre la primera compra de un cliente y su última compra. ** (Por lo tanto, si solo ha realizado una compra, la antigüedad es 0).

* **T:** representa la edad del cliente en las unidades de tiempo elegidas. ** Esto es igual a la duración entre la primera compra de un cliente y el final del período en estudio. **

* **monetary_value** representa el valor medio de las compras de un cliente determinado. Esto es igual a la suma de todas las compras de un cliente dividida por el número total de compras.

In [5]:
train.columns

NameError: ignored

In [4]:
train

NameError: ignored

In [None]:
"""
TO DO: 
    - Meterle tipo de usuario?
"""

#### Predict future total amount spent for individual customers (next period)

In [1]:
"""
feature_cols = ['frequency', 'recency', 'T', 'monetary_value',
       			'purchases_past_period', 'purchases_current_period',
       			'amount_spent_past_period','amount_spent_current_period']
"""

In [2]:
#y_col = ['amount_spent_next_period']

In [3]:
"""
X_train = train[feature_cols]
y_train = train[y_col]

X_test = test[feature_cols]
y_test = test[y_col]
"""

NameError: ignored

In [None]:
"""
# XGBOOST
import xgboost

# fit model
model = xgboost.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42) 
model.fit(X_train, y_train)

# make predictions for test data
predictions = model.predict(X_test)

print(mean_squared_error(predictions, y_test, squared=False))
"""

In [None]:
"""
TO DO: 
 - Predecir el log de esa variable, es posible que `1592` sea alto.
"""

#### Predict number of future purchases for individual customers (next period)

In [None]:
"""
feature_cols = ['frequency', 'recency', 'T', 'monetary_value',
       			'purchases_past_period', 'purchases_current_period',
       			'amount_spent_past_period','amount_spent_current_period']

y_col = ['purchases_next_period']
"""

In [None]:
"""
X_train = train[feature_cols]
y_train = train[y_col]

X_test = test[feature_cols]
y_test = test[y_col]
"""

In [None]:
"""
# XGBOOST
import xgboost

# fit model
model = xgboost.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.07,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=10000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42) 
model.fit(X_train, y_train)

# make predictions for test data
predictions = model.predict(X_test)

print(mean_squared_error(predictions, y_test, squared=False))
"""

1.4765242908616307


#### Predict if customer will buy in the next period

In [None]:
"""
TO DO: 
 - Sacar el Precision y tmb el Recall , Accuracy 
"""

"""
TO DO: 
 - Tratar de mejorar este AUC
 - Hacer Hyper Parameters Search 
"""

"""
TO DO: 
    - Aplicar de Algoritmos: 
        * Árboles de decisión
        * Bayes
        * Redes Neuronales
"""


In [None]:
# create new column based on `amount_spent_next_period`
train['buy_next_period'] = np.where(train['amount_spent_next_period'] > 0, 1, 0)
test['buy_next_period'] = np.where(test['amount_spent_next_period'] > 0, 1, 0)

In [None]:
y_col = ['buy_next_period']

In [None]:
X_train = train[feature_cols]
y_train = train[y_col]

X_test = test[feature_cols]
y_test = test[y_col]

In [None]:
# XGBOOST
import xgboost

# fit model
model = xgboost.XGBClassifier(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.08,
                 max_depth=7,
                 min_child_weight=1.5,
                 n_estimators=10,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42) 

In [None]:
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, gamma=0,
              learning_rate=0.08, max_delta_step=0, max_depth=7,
              min_child_weight=1.5, missing=None, n_estimators=10, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0.75, reg_lambda=0.45, scale_pos_weight=1, seed=42,
              silent=None, subsample=0.6, verbosity=1)

In [None]:
# make predictions for test data
predictions = model.predict(X_test)

In [None]:
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions, pos_label=1)
metrics.auc(fpr, tpr)

0.6785307782478929

In [None]:
from sklearn.metrics import f1_score

In [None]:
f1_score(y_test, predictions, average='weighted')

0.6834016028272576