# <font color='red'>DSP_002 - High Value Customer Identification (Insiders)

# Basic setups

## Imports

In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import umap.umap_ as umap
#!pip install umap-learn 
from IPython.core.display     import HTML
from matplotlib import pyplot as plt
#from pandas_profiling import ProfileReport 
from sklearn import cluster       as c
from sklearn import metrics       as m
from sklearn import ensemble      as en
from sklearn import preprocessing as pp
from sklearn import decomposition as dd
from sklearn import manifold      as mn
from sklearn import mixture       as mx
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from scipy.cluster import hierarchy as hc
from plotly import express as px

from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

## Helper Functions

In [10]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    sns.set()
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


In [11]:
# Supress Scientific Notation
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', '{:.2f}'.format)

# Data Exploring

## Data Load

In [12]:
df_raw = pd.read_csv(r'../data/raw/Ecommerce.csv')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa3 in position 117795: invalid start byte

## Data Overview

In [13]:
df_raw.head()

NameError: name 'df_raw' is not defined

In [None]:
print('The original dataset has {} transations registers with {} attributes.'.format(df_raw.shape[0], df_raw.shape[1]) )

In [None]:
df_raw.columns

## Check NaN

In [None]:
df_raw.isna().sum() / len(df_raw) *100

There are a considerable number of transations without customers. Identify the customers is the main objective of this project, so this is a big problem. Through this cycle, we have a lot of problems with non identified customers, so we decided to drop then in this 
first cycle of the project. Maybe in the next cycle something different can be done. After all, we discovery that dropping all the clientes without id, we drop all the "descriptions" NaN as well.

## Data types

In [None]:
df_raw.dtypes

## Check duplicated

In [None]:
df_raw['CustomerID'].duplicated().sum()

In [None]:
df_raw[df_raw.duplicated()].shape

From a business perspective, none of the repeated attributes will be a problem unless there are more than one record with all inputs duplicated.

## Statistics data overview

In [None]:
df_raw.describe().T

Tasks:

    01) Change the columns names to snacke case;
    02) Drop "Unnamed: 8" column;
    03) Check NaN ("Description" and "CustomerID");
    04) Change types (InvoiceNo, InvoiceDate and CustomerID;
    05) Solve duplicated registers
    06) Check the invoices with letters;
    07) Check the strange description names;
    08) Check negative unit prices and quantities

# <font color='green'> Data Processing

In [None]:
df3 = df_raw.copy()

## Rename columns

In [None]:
df3.columns

In [None]:
cols_new = ['invoice_no', 'stock_code', 'description','quantity', 'invoice_date','unit_price', 'customer_id', 'country', 'unnamed']

In [None]:
df3.columns = cols_new #change to snake case

## Check "Unnamed: 8" attribute

All the Unnamed are NaN. There is no relevante information in this feature. It will be dropped.

In [None]:
df3 = df3.drop(columns=['unnamed'], axis=1) 

## Solving NaN

In [None]:
df3.isna().sum()

In [None]:
df3 = df3.dropna(subset = ['customer_id'], axis=0)

In [None]:
df3.isna().sum()

The "description" NaN were dropped together with the "customer_id" NaN

## Change dtypes

### Attribute 'invoice_number'

In [None]:
df3['invoice_no'].unique()

In [None]:
#df3['invoice_no'] = df3['invoice_no'].astype(int64)
print('According with the code error, there are invoices with number with letters. For now it will be keep as a object')

### Attribute 'invoice_date'

In [None]:
df3['invoice_date'] = pd.to_datetime(df3['invoice_date'], format = '%d-%b-%y')
# use %b because the month is in a string in the df

In [None]:
df3.dtypes

### Attribute 'customer_id'

In [None]:
df3['customer_id'] = df3['customer_id'].astype(int64)

In [None]:
df3.dtypes

## Solve Duplicates

In [None]:
#next cycle

## Negative values

From here, we have to take a direction. The negative values problably are from returns. We can just exclude them, exclude the in an out or we can create features for returns. From this point, we decided to sum all the values for customer, and those who ended up with a negative or zero amount will be dropped. We can divided the dataframe in two, one with the purchases, one just with the returns, but it will be done in the next cycle.

In [None]:
df3['subtotal'] = df3['unit_price'] * df3['quantity'] 

In [None]:
df_customers = df3[['customer_id','subtotal']].groupby('customer_id').sum().reset_index()

In [None]:
bad_customers = df_customers[df_customers['subtotal'] <= 0.0]

In [None]:
print('There are {} customers with negative or equal zero total purchases.'.format(bad_customers.shape[0]))

In [None]:
bad_list = bad_customers['customer_id'].to_list()

In [None]:
df3 = df3[~df3['customer_id'].isin(bad_list)] # throw out the bad customers

In [None]:
df3.describe().T

## Attribute invoice_no

First, we have to identify and try to discover what the mean of the invoices with letters.

In [None]:
df_letter_invoices = df3.loc[df3['invoice_no'].apply (lambda x: bool(re.search ( '[^0-9]+', x))), :] 
df_letter_invoices.sample(10)

In [None]:
df_letter_invoices.describe().T

All those letter invoices are from negative quantities. Lets check the invoice just with letters:

In [None]:
df3.loc[df3['stock_code'].apply (lambda x:bool (re.search('^[a-zA-Z]+$', x ) ) ), 'stock_code' ].value_counts()

Stock codes to be dropped: 'POST', 'D', 'M', 'PADS', 'DOT', 'CRUK'

In [None]:
df3 = df3[~df3['stock_code'].isin (['POST', 'D', 'DOT', 'M', 'PADS', 'CRUK'])]

## Description filter

Throughout the work, several strange names were observed in the description of the items. It was decided to exclude them after all other cleaning operations because it seems that many of them were returns or discounts, so they would be eliminated throughout the process. However, a final cleaning will be done before the generation of new features to ensure the integrity of the results of the machine learning models.

In [None]:
df3['description'].unique()

# <font color='orange'> Feature Engineering

In [None]:
df4 = df3.copy()

In [None]:
df4.head()

## Average Ticket

In [None]:
avg_ticket_df = df4[['customer_id','invoice_no','subtotal']].groupby(['customer_id','invoice_no']).sum().reset_index()

In [None]:
avg_ticket_df.rename(columns = {'subtotal':'avg_ticket'}, inplace = True)            

In [None]:
avg_ticket_df = avg_ticket_df[['customer_id','avg_ticket']].groupby('customer_id').mean().reset_index()

In [None]:
avg_ticket_df.head()

In [None]:
df4 = pd.merge(df4,avg_ticket_df, on ='customer_id', how = 'left')

In [None]:
df4.head()

In [None]:
df4['customer_id'].unique().size

## recency

In [None]:
# day from the last purchase until the last date of the dataset
df_recency = df4[['customer_id','invoice_date']].groupby('customer_id').max().reset_index()

df_recency['recency'] = (df4['invoice_date'].max() - df_recency['invoice_date']).dt.days


In [None]:
# one day shif to avoid zero division
df_recency['recency'] = df_recency['recency'].apply(lambda x: x+1)

In [None]:
df_recency['recency'] = df_recency['recency'].apply(lambda x: 100/x)

In [None]:
df4=pd.merge(df4, df_recency[['customer_id','recency']], on = 'customer_id', how='left')

In [None]:
df4.sample(10)

## gross_revenue

In [None]:
df_revenue = df4[['subtotal', 'customer_id']].copy()

In [None]:
df_revenue = df_revenue.groupby('customer_id').sum().reset_index() #total per customer
df_revenue.rename(columns = {'subtotal':'gross_revenue'}, inplace = True)
df_revenue.head()

In [None]:
df4 = pd.merge(df4, df_revenue, on='customer_id', how='left')

In [None]:
df4.head()

In [None]:
# Exclude gross revenue == 0

## Frequency

In [None]:
df_frequency = df4[['customer_id', 'invoice_no']]

In [None]:
df_frequency = df_frequency.drop_duplicates(subset = 'invoice_no')
df_frequency = df_frequency.groupby('customer_id').count().reset_index() # number of purchases for customer within the dataset

In [None]:
df_frequency.rename(columns={'invoice_no':'frequency'}, inplace = True)

In [None]:
df4 = pd.merge(df4, df_frequency, on='customer_id', how = 'left')

In [None]:
df4.head()

# Data Filtering

In [None]:
df5 = df4.copy()

In [None]:
df5.dtypes

In [None]:
df5.drop(columns = ['description','quantity','unit_price' ,'subtotal','invoice_date','invoice_no'], axis = 1, inplace = True) # we don't need this features

In [None]:
df5.head()

In [None]:
df5[df5['avg_ticket'] ==0].shape

# Exploratory data analysis (EDA)

In [None]:
df6 = df5.copy()

In [None]:
profile = ProfileReport(df6)
profile.to_file(r'../reports/profile_report.html')

In [None]:
df6.describe().T

## Descriptive Statistics

In [None]:
num_attributes = df6.drop(columns = ['country', 'customer_id','stock_code'], axis = 1)

In [None]:
#central tendency - mean, median
ct1 = pd.DataFrame ( num_attributes.apply (np.mean) ).T
ct2 = pd.DataFrame ( num_attributes.apply (np.median)).T
                    
# dispersion - desvio padrão, mínimo, máximo, range, skew, kurtosis
d1 = pd.DataFrame ( num_attributes.apply (np.std)).T
d2 = pd.DataFrame ( num_attributes.apply (np.min)).T
d3 = pd.DataFrame ( num_attributes.apply (np.max)).T
d4 = pd.DataFrame ( num_attributes.apply (lambda x: x.max() - x.min())).T
d5 = pd.DataFrame ( num_attributes.apply (lambda x: x.skew() )).T
d6 = pd.DataFrame ( num_attributes.apply (lambda x: x.kurtosis() ) ).T

#concatenate

df_metrics = pd.concat ([d2,d3,d4,ct1,ct2,d1,d5,d6]).T.reset_index()
df_metrics.columns = ['attributes', 'min.','max','range', 'mean', 'median', 'std', 'skew', 'kurtosis']

In [None]:
df_metrics

# <font color='red'>Data Preparation

In [None]:
df7 = df6.copy()

# <font color='red'>Feature Selection

In [None]:
df8 = df7.copy()

In [None]:
df8.head()

In [None]:
df8 = df8.drop(columns = ['stock_code','country'], axis=1)

# <font color='red'>Hyperparameter Fine-Tunning

In [None]:
df9 = df8.copy()

O número de clusters é nosso principal hiperparâmetro, pois não temos essa resposta.  Antes de treinar os modelos precisamos saber quantos clusters vamos considerar. Utilizamos o Kmeans para efetuar a clusterização em um range de 2 a 7 clusters, e utilizando duas métricas, a WSS e a Silhouette Score, escolhemos o número adequado de clusters. A partir daí temos um ponto de partida para treinar os diversos modelos.

In [None]:
X = df9.drop(columns =  ['customer_id'],  axis=1) 
clusters = [2,3,4,5,6,7] # cluster range

## Within-Cluster Sum of Square (WSS)

In [None]:
wss = [] #lista para guardar as métricas para cada iteração do modelo. Cada número de cluster irá gerar um valor

for k in clusters:
    #model definition
    kmeans = c.KMeans (init = 'random' , n_clusters=k, n_init=10, max_iter=300)
    #model training
    kmeans.fit ( X )
    #validation
    wss.append (kmeans.inertia_) #inertia é o próprio valor wss

In [None]:
#plot elbow method
kmeans = KElbowVisualizer (c.KMeans(), k=clusters, timing=False)
kmeans.fit(X)
kmeans.show()

## Silhouette Score

In [None]:
kmeans = KElbowVisualizer ( c.KMeans(), k=clusters, metric='silhouette', timing=False)
kmeans.fit(X)
kmeans.show()

## Silhouette Analysis

In [None]:
fig, ax = plt.subplots (3,2, figsize=(25,18))

for k in clusters:
    km = c.KMeans (n_clusters=k, init='random', n_init=10, max_iter=100, random_state=42)
    q,mod = divmod(k,2) #truque para distribuição dos plots
    visualizer = SilhouetteVisualizer (km, colors='yellowbrick', ax=ax[q-1][mod])
    visualizer.fit(X)
    visualizer.finalize()

# Model training

## K-Means

In [None]:
# model definition
k = 4
kmeans = c.KMeans (init='random', n_clusters=k, n_init=10, max_iter=300, random_state = 42)
# model training
kmeans.fit (X)
#clustering
labels = kmeans.labels_

## Cluster Validation

In [None]:
# WSS
print('WSS value: {}'.format (kmeans.inertia_))
# SS
print ('SS value: {}'.format(m.silhouette_score(X, labels, metric='euclidean')))

# Cluster visual inspection

In [None]:
reducer = umap.UMAP (n_neighbors = 20, random_state = 42) # instancia o objeto

In [None]:
embedding = reducer.fit_transform (X)

In [None]:
# embedding
X['embedding_x'] = embedding[:,0] # embedding é o espaço projetado
X['embedding_y'] = embedding[:,1]

In [None]:
# plot UMAP
sns.scatterplot( x='embedding_x', y='embedding_y', hue='cluster', pallete = sns.collor_pallete ('hls', n_colors = len(df_viz['cluster'].unique())), data = df_viz)

# <font color='red'>Cluster Analysis

In [None]:
df11 = df9.copy()
df11['cluster'] = labels

## Cluster Profile

In [None]:
# clusters size
df_cluster = df11[['customer_id', 'cluster']].groupby('cluster').count().reset_index()
# percentage of each cluster
df_cluster['perc_customer'] = 100*(df_cluster['customer_id'] /df_cluster['customer_id'].sum())
#avg gross revenue
df_avg_gross_revenue = df11[['gross_revenue', 'cluster']].groupby('cluster').mean().reset_index()
df_cluster = pd.merge(df_cluster, df_avg_gross_revenue, how='inner', on='cluster')
#frequency
df_frequency = df11[['frequency','cluster']].groupby('cluster').mean().reset_index()
df_cluster = pd.merge(df_cluster, df_frequency, how='inner', on='cluster')
#recency
df_recency = df11[['recency','cluster']].groupby('cluster').mean().reset_index()
df_cluster = pd.merge(df_cluster, df_recency, how='inner', on='cluster')
df_cluster

# <font color='red'>Deploy to production