# Customer Lifetime Value Prediction with BG-NBD and Gamma-Gamma

# 1. Understanding & Preparing Data

In [None]:
!pip install lifetimes
import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
from lifetimes import BetaGeoFitter
from lifetimes import GammaGammaFitter
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.float_format', lambda x: '%.4f' % x)
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the raw customer data; `df_` keeps the untouched original while all
# later steps work on the copy `df`.
df_ = pd.read_csv("/kaggle/input/flo-customer-data/flo_data_20k.csv")
df = df_.copy()
# Preview the first rows.
df.head()

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Summary statistics, transposed so that each variable is one row.
df.describe().T

In [None]:
# Number of rows per order channel.
df["order_channel"].value_counts()

In [None]:
# Count of missing values in each column.
df.isnull().sum()

## Preprocessing Data

In [None]:
# Define threshold functions for outliers
def outlier_thresholds(dataframe, variable, q_low=0.01, q_high=0.99, multiplier=1.5):
    """Return the ``(low_limit, up_limit)`` outlier bounds for one column.

    The bounds are built from two quantiles of the column and the distance
    between them::

        up_limit  = Q(q_high) + multiplier * (Q(q_high) - Q(q_low))
        low_limit = Q(q_low)  - multiplier * (Q(q_high) - Q(q_low))

    With the defaults (1st and 99th percentile, multiplier 1.5) this is a
    deliberately wide band, so only extreme values fall outside it.

    Args:
        dataframe: DataFrame holding the column.
        variable: Name of the numeric column to analyse.
        q_low: Lower quantile in ``[0, 1]``.
        q_high: Upper quantile in ``[0, 1]``; must be >= ``q_low``.
        multiplier: How many inter-quantile ranges to extend beyond each
            quantile.

    Returns:
        Tuple ``(low_limit, up_limit)``.

    Raises:
        ValueError: If the quantiles are not ordered within ``[0, 1]``.
    """
    if not 0 <= q_low <= q_high <= 1:
        raise ValueError(
            f"quantiles must satisfy 0 <= q_low <= q_high <= 1, got {q_low} and {q_high}"
        )

    lower_quantile = dataframe[variable].quantile(q_low)
    upper_quantile = dataframe[variable].quantile(q_high)
    interquantile_range = upper_quantile - lower_quantile

    up_limit = upper_quantile + interquantile_range * multiplier
    low_limit = lower_quantile - interquantile_range * multiplier

    return low_limit, up_limit


def replace_with_thresholds(dataframe, variable):
    """Cap the outliers of one column in place.

    Values below the lower bound from ``outlier_thresholds`` are overwritten
    with the rounded lower bound, and values above the upper bound with the
    rounded upper bound. ``dataframe`` is modified directly; nothing is
    returned.
    """
    lower_bound, upper_bound = outlier_thresholds(dataframe, variable)

    too_small = dataframe[variable] < lower_bound
    dataframe.loc[too_small, variable] = round(lower_bound)

    # Evaluated after the lower cap has been written, as in the original order.
    too_large = dataframe[variable] > upper_bound
    dataframe.loc[too_large, variable] = round(upper_bound)

In [None]:
# Cap extreme values in the order-count and spend columns of both channels.
columns = [
    "order_num_total_ever_online",
    "order_num_total_ever_offline",
    "customer_value_total_ever_offline",
    "customer_value_total_ever_online",
]
for col in columns:
    replace_with_thresholds(df, col)

In [None]:
# Re-check the summary statistics now that the outliers have been capped.
df.describe().T

In [None]:
# Combine the online and offline channels into per-customer totals:
# total number of orders and total amount spent.
df["total_order_num"] = df["order_num_total_ever_online"].add(
    df["order_num_total_ever_offline"]
)
df["total_customer_value"] = df["customer_value_total_ever_offline"].add(
    df["customer_value_total_ever_online"]
)
df[["total_order_num", "total_customer_value"]].head()

In [None]:
# Parse every column whose name contains "date" from object into datetime.
date_columns = df.filter(like="date").columns
df[date_columns] = df[date_columns].apply(pd.to_datetime)

# 2. Creating CLTV DataFrame


In [None]:
# Print the most recent order date so the analysis date below can be checked
# against it (a bare expression in the middle of a cell is never displayed).
print("Most recent order date:", df["last_order_date"].max())
# Fixed analysis date: the data set ends in 2021, so "today" is pinned to 2021.
today_date = dt.datetime(2021, 6, 1)

cltv_df = pd.DataFrame()
cltv_df["customer_id"] = df["master_id"]
# `.dt.days` yields whole days on every pandas version;
# `.astype("timedelta64[D]")` raises on pandas >= 2.0.
# Recency: weeks between a customer's first and last order.
cltv_df["recency_cltv_weekly"] = (df["last_order_date"] - df["first_order_date"]).dt.days / 7
# T: customer age in weeks, from the first order up to the analysis date.
cltv_df["T_weekly"] = (today_date - df["first_order_date"]).dt.days / 7
# NOTE(review): the total order count is used as frequency; lifetimes usually
# expects the number of *repeat* purchases - confirm this is intended.
cltv_df["frequency"] = df["total_order_num"].astype(int)
# Average spend per order.
cltv_df["monetary_cltv_avg"] = df["total_customer_value"] / df["total_order_num"]

cltv_df.head()

# 3. Creating BG-NBD Model

In [None]:
# Fit the BG-NBD purchase-count model on frequency, recency and customer age
# (recency and T are both expressed in weeks).
bgf = BetaGeoFitter(penalizer_coef=0.001)

bgf.fit(
    frequency=cltv_df["frequency"],
    recency=cltv_df["recency_cltv_weekly"],
    T=cltv_df["T_weekly"],
)

In [None]:
# Expected number of transactions within 3 months (3 months * 4 weeks = 12 weeks).
cltv_df["exp_sales_3_month"] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t=12,
    frequency=cltv_df["frequency"],
    recency=cltv_df["recency_cltv_weekly"],
    T=cltv_df["T_weekly"],
)
# Expected number of transactions within 6 months (24 weeks).
cltv_df["exp_sales_6_month"] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t=24,
    frequency=cltv_df["frequency"],
    recency=cltv_df["recency_cltv_weekly"],
    T=cltv_df["T_weekly"],
)

cltv_df.head()

In [None]:
# Top 10 customers by expected number of purchases within the next 3 and
# 6 months. Only the last expression of a cell is rendered, so the 3-month
# ranking is printed explicitly instead of being silently discarded.
print(cltv_df["exp_sales_3_month"].sort_values(ascending=False).head(10))
cltv_df["exp_sales_6_month"].sort_values(ascending=False).head(10)

# 4. Creating Gamma-Gamma Model

In [None]:
# Fit the Gamma-Gamma spend model on order counts and average order value.
ggf = GammaGammaFitter(penalizer_coef=0.01)

ggf.fit(
    frequency=cltv_df["frequency"],
    monetary_value=cltv_df["monetary_cltv_avg"],
)

In [None]:
# Expected average order value for each customer.
cltv_df["exp_average_value"] = ggf.conditional_expected_average_profit(
    frequency=cltv_df["frequency"],
    monetary_value=cltv_df["monetary_cltv_avg"],
)
cltv_df["exp_average_value"]

# 5. Predicting CLTV for 6 months

In [None]:
# Combine the purchase-count model (bgf) and the spend model (ggf) into a
# 6-month customer lifetime value per customer.
cltv_df["cltv"] = ggf.customer_lifetime_value(
    transaction_prediction_model=bgf,
    frequency=cltv_df["frequency"],
    recency=cltv_df["recency_cltv_weekly"],
    T=cltv_df["T_weekly"],
    monetary_value=cltv_df["monetary_cltv_avg"],
    time=6,  # prediction horizon in months
    freq="W",  # recency and T are measured in weeks
    discount_rate=0.01,
)

cltv_df

In [None]:
# The 20 customers with the highest predicted CLTV.
cltv_df.sort_values(by="cltv", ascending=False).iloc[:20]

# 6. Creating Segments by CLTV

In [None]:
# Split the 6-month CLTV values into four quantile-based segments
# (D = lowest quarter, A = highest quarter) and summarise each segment.
cltv_df["cltv_segment"] = pd.qcut(
    x=cltv_df["cltv"],
    q=4,
    labels=["D", "C", "B", "A"],
)
cltv_df.groupby("cltv_segment").agg(
    {"cltv": ["count", "mean", "std", "median"]}
)

In [None]:
# Final table: model inputs, predictions and the CLTV segment per customer.
cltv_df