In [None]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)


In [None]:
import warnings
# Installs a blanket "ignore" filter: every warning category is silenced from
# this point on, which also hides any deprecation notices emitted by later cells.
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv("data.csv", encoding='latin')
df.shape

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.columns

In [None]:
df.Quantity.value_counts()

In [None]:
# Pass the column by keyword so the plotted variable is explicit rather than
# relying on how seaborn interprets a bare positional argument.
sns.violinplot(x=df.Quantity);

**A sold quantity cannot be negative, so rows with a negative `Quantity` are removed.**

In [None]:
# Keep only rows with a non-negative quantity. The explicit copy makes `df` an
# independent frame, so the column assignments in later cells ('date', 'Total')
# are not made on a filtered selection of the original data.
df = df[df.Quantity >= 0].copy()

In [None]:
df.head()

In [None]:
df.CustomerID.isna().sum()

In [None]:
df.shape

In [None]:
# Remove rows whose CustomerID is missing. Rebinding `df` (rather than mutating
# it with inplace=True) avoids in-place modification of a filtered frame.
df = df.dropna(subset=['CustomerID'], how='all')

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df.info()

# Recency
- how recently the customer has visited the store

**`InvoiceDate` is stored as an object column, so we need to convert it to a datetime type.**

In [None]:
df.InvoiceDate.max()

In [None]:
# Parse InvoiceDate into a new 'date' column; the recency cells below use it.
# NOTE(review): no explicit `format=` is passed, so pandas infers the layout —
# confirm the day/month order against the raw data.
df['date'] = pd.to_datetime(df['InvoiceDate'])

In [None]:
df.head()

In [None]:
df.Country.value_counts()

In [None]:
a = df.date.max() - df.date.min()
a

In [None]:
a.days

- Use the most recent purchase date in the dataset as the reference point for measuring each customer's recency.

In [None]:
recent_purchase_date = df.date.max()
recent_purchase_date

- Group the transactions by `CustomerID`: the same customer may have visited the store more than once, so only his/her last visit is considered the most recent visit.

In [None]:
# Latest purchase timestamp per customer, with the aggregated column renamed
# to RecencyDate.
recency_df = (
    df.groupby(['CustomerID'], as_index=False)['date']
    .max()
    .rename(columns={'date': 'RecencyDate'})
)

In [None]:
recency_df.head()

- Compute the gap, in days, between each customer's last purchase and the most recent purchase date.

In [None]:
# Whole days between the reference date and each customer's last purchase.
# Subtracting the column in one vectorised operation replaces the per-row
# lambda and yields the same integer day counts.
recency_df['recency'] = (recent_purchase_date - recency_df['RecencyDate']).dt.days

recency_df.head()

In [None]:
# Discard the RecencyDate column, leaving CustomerID and recency.
recency_df = recency_df.drop(columns=['RecencyDate'])

In [None]:
recency_df.head()

In [None]:
recency_df.shape

# Frequency
- how frequently the customer has visited the store

In [None]:
# Keep the first row of every (CustomerID, InvoiceNo) pair so that each
# invoice appears once per customer.
freq_df = df.drop_duplicates(subset=['CustomerID', 'InvoiceNo'], keep='first')
freq_df.shape

In [None]:
# Number of invoice rows per customer, exposed as the Frequency column.
freq_df = (
    freq_df
    .groupby('CustomerID', as_index=False)['InvoiceNo']
    .count()
    .rename(columns={'InvoiceNo': 'Frequency'})
)
freq_df.head()


In [None]:
freq_df[freq_df.CustomerID == 12748.0]

In [None]:
freq_df.shape

# Monetary
- how much money the customer has spent in the store

In [None]:
df['Total'] = df['UnitPrice'] * df['Quantity']

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# Sum of the Total column per customer, exposed as the Monetary column.
monetary_df = (
    df.groupby(['CustomerID'], as_index=False)['Total']
    .sum()
    .rename(columns={'Total': 'Monetary'})
)
monetary_df.head()

In [None]:
monetary_df.shape

# Combining RFM together

In [None]:
# Inner join of the recency and frequency tables on the shared customer key.
rf = pd.merge(recency_df, freq_df, on='CustomerID')
rf.shape


In [None]:
rf.head()

In [None]:
# Attach the monetary totals to the recency/frequency table (inner join on
# the shared customer key).
rfm = pd.merge(rf, monetary_df, on='CustomerID')
rfm.shape

In [None]:
rfm.head()

In [None]:
rfm.info()

In [None]:
rfm.describe()

In [None]:
# Distribution of each RFM metric, one panel per metric.
# sns.distplot is deprecated; sns.histplot with a KDE overlay on a density
# scale is its replacement and is the function the later score plots already use.
plt.figure(figsize=(10, 10))
for panel, (metric, colour) in enumerate(
    [('recency', 'r'), ('Frequency', 'b'), ('Monetary', 'y')],
    start=1,
):
    plt.subplot(2, 2, panel)
    sns.histplot(rfm[metric], color=colour, kde=True, stat='density')
plt.show()

In [None]:
rfm.recency.describe()

In [None]:
# One box plot per RFM metric, laid out on a 2x2 grid (fourth slot unused).
plt.figure(figsize=(15, 10))
for panel, metric in enumerate(('recency', 'Frequency', 'Monetary'), start=1):
    plt.subplot(2, 2, panel)
    plt.boxplot(rfm[metric])
plt.show()

In [None]:
# Quartile cut-offs for the three RFM metrics only. Selecting the columns
# explicitly (instead of taking quantiles of the whole frame and dropping
# CustomerID afterwards) keeps this cell working when it is re-run after the
# string-valued columns (rfm_quantile, Segmentation, target) have been added
# to `rfm` in later cells.
quantiles = rfm[['recency', 'Frequency', 'Monetary']].quantile([0.25, 0.5, 0.75, 1])

In [None]:
quantiles.head()

In [None]:
quantiles.recency[0.50]

In [None]:
def label_customers(value, feature, quantile_df):
    """Score ``value`` from 1 to 4 according to the quartile it falls in.

    Parameters
    ----------
    value : number
        The metric value to score.
    feature : str
        Name of the metric; selects the column of ``quantile_df`` to compare
        against. For ``'recency'`` the scale is reversed: the lowest quartile
        scores 4 and the highest scores 1.
    quantile_df : mapping
        Indexable as ``quantile_df[feature][q]`` for ``q`` in 0.25, 0.5, 0.75.

    Returns
    -------
    int
        Quartile score between 1 and 4.
    """
    cutoffs = quantile_df[feature]
    scores = (4, 3, 2, 1) if feature == 'recency' else (1, 2, 3, 4)

    # Walk the cut-offs in ascending order; the first one the value does not
    # exceed decides the score.
    for level, score in zip((0.25, 0.5, 0.75), scores):
        if value <= cutoffs[level]:
            return score
    return scores[-1]

    

In [None]:
# Quartile score for recency.
rfm['r_quantile'] = rfm['recency'].apply(
    lambda days: label_customers(days, 'recency', quantiles)
)
rfm.head()

In [None]:
# Quartile score for frequency.
rfm['f_quantile'] = rfm['Frequency'].apply(
    lambda visits: label_customers(visits, 'Frequency', quantiles)
)
rfm.head()

In [None]:
# Quartile score for monetary value.
rfm['m_quantile'] = rfm['Monetary'].apply(
    lambda spend: label_customers(spend, 'Monetary', quantiles)
)
rfm.head()

In [None]:
# Concatenate the three scores, in R-F-M order, into one three-character code.
score_text = rfm[['r_quantile', 'f_quantile', 'm_quantile']].astype(str)
rfm['rfm_quantile'] = (
    score_text['r_quantile'] + score_text['f_quantile'] + score_text['m_quantile']
)
rfm.head()

In [None]:
# Histogram of each quartile score, one panel per score column.
plt.figure(figsize=(10, 10))
for panel, (score_column, colour) in enumerate(
    [('r_quantile', 'r'), ('f_quantile', 'b'), ('m_quantile', 'y')],
    start=1,
):
    plt.subplot(2, 2, panel)
    sns.histplot(rfm[score_column], color=colour)
plt.show()

In [None]:
plt.figure(figsize=(24, 18))
sns.histplot(rfm['rfm_quantile'], color='pink')
plt.xticks(rotation=90);

In [None]:
def customer_segments(value):
    """Map a three-character RFM code to a customer segment name.

    ``value`` is indexed as recency score, frequency score, monetary score.
    The rules are tried in order and the first match wins; a code that
    matches none of them is labelled "New Customer".
    """
    rules = (
        (lambda code: code == '444', "Best Customer"),
        (lambda code: code[1] == '4', "Loyal Customer"),
        (lambda code: code[2] == '4', "Big Spender"),
        (lambda code: code == "111", "DeadBeats"),
        (
            lambda code: code[0] == '1' and code[2] == '1' and code[1] != '1',
            "Lost Customer",
        ),
    )
    for matches, segment in rules:
        if matches(value):
            return segment
    return "New Customer"
    

In [None]:
rfm['Segmentation'] = rfm.rfm_quantile.apply(customer_segments)
rfm.head()

In [None]:
rfm.Segmentation.value_counts()

In [None]:
def pie(column, labels, y_label, title, ax=None):
    """Draw a pie chart of the percentage share of each value in ``rfm[column]``.

    Parameters
    ----------
    column : str
        Column of the module-level ``rfm`` frame to summarise.
    labels : list
        Wedge labels, in the same order as ``rfm[column].value_counts()``.
    y_label : str
        Text for the y-axis label.
    title : str
        Chart title.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When omitted, the axes that pandas draws on is used
        for the formatting calls as well.
    """
    shares = rfm[column].value_counts() * 100.0 / len(rfm)
    # Continue with the axes returned by pandas: with the default ax=None the
    # formatting calls below previously failed with AttributeError.
    ax = shares.plot.pie(autopct='%.1f%%', labels=labels, ax=ax, fontsize=12)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter())
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(title, fontsize=12)

In [None]:
# Create the figure and its axes in a single call so the requested size is
# applied to the figure that is drawn on. A separate plt.figure(figsize=...)
# call only produced an extra, empty figure.
fig, axs = plt.subplots(1, 1, figsize=(15, 20))

pie('Segmentation', list((rfm['Segmentation'].value_counts()).index), "", '% of Customer Segmentation', axs)

In [None]:
rfm[rfm.Segmentation == "DeadBeats"].rfm_quantile.value_counts()

In [None]:
def label_target(value):
    """Translate a segment name into a churn label.

    "DeadBeats" becomes "No Use", "Lost Customer" becomes "Churn", and every
    other value becomes "Non Churn".
    """
    if value == "DeadBeats":
        label = "No Use"
    elif value == "Lost Customer":
        label = "Churn"
    else:
        label = "Non Churn"
    return label

In [None]:
def label_target_numeric(value):
    """Translate a segment name into a numeric churn class.

    Returns 0 for "DeadBeats", 1 for "Lost Customer" and 2 for anything else.
    """
    return 0 if value == "DeadBeats" else (1 if value == "Lost Customer" else 2)

In [None]:
rfm['target'] = rfm.Segmentation.apply(label_target)
rfm['target_variable'] = rfm.Segmentation.apply(label_target_numeric)
rfm.head()

In [None]:
rfm.target.value_counts()

In [None]:
rfm.target_variable.value_counts()

In [None]:
# Create the figure and its axes in a single call so the requested size is
# applied to the figure that is drawn on. A separate plt.figure(figsize=...)
# call only produced an extra, empty figure.
fig, axs = plt.subplots(1, 1, figsize=(15, 20))

pie('target', list((rfm['target'].value_counts()).index), "", '% of Churn, No Churn and No Use', axs)

In [None]:
# Count each target class once and reuse the result for labels and values.
target_counts = rfm['target'].value_counts()
labels = list(target_counts.index)
values = target_counts

# Donut chart of the target distribution. The hole size is set once here;
# previously it was given as .3 and then overridden to .2 in update_traces.
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.2)])
title = "<b>Churn Distribution</b>"

fig.update_layout(
    title=dict(text=title, x=0.5, xanchor='center'),
    showlegend=True,
)
fig.update_traces(hoverinfo="all", textfont_size=16)
fig.show()

In [None]:
rfm.to_csv("rfm_analysis.csv", index=False)

In [None]:
rfm.head()

In [None]:
rfm.shape

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# Attach every customer's RFM columns to each of their transaction rows
# (inner join on the shared customer key).
finalized_rfm_df = pd.merge(df, rfm, on='CustomerID')
finalized_rfm_df.shape

In [None]:
finalized_rfm_df.head()

In [None]:
finalized_rfm_df.to_csv("rfm_ecommerce_data.csv", index=False)