In [1]:
##############################################################
# CLTV PREDICTIONS with  BG-NBD and Gamma-Gamma MODELS
##############################################################

In [None]:
###############################################################
# Business Problem
###############################################################
# THE FLO RETAIL CHAIN IS PLANNING TO DEFINE A ROADMAP FOR ITS SALES AND MARKETING STRATEGIES.
# THE COMPANY AIMS TO PREDICT THE LIFETIME VALUE OF ITS CUSTOMERS IN ORDER TO SET AN APPROPRIATE STRATEGY.


In [2]:
###############################################################
# Dataset
###############################################################

# This dataset consists of customer purchase history data from 2020-2021
# master_id: Unique customer id
# order_channel : Purchase channel (Android, ios, Desktop, Mobile, Offline)
# last_order_channel : Channel of the customer's most recent purchase
# first_order_date : Date of the customer's first purchase
# last_order_date : Date of the customer's most recent purchase
# last_order_date_online : Date of the customer's most recent online purchase
# last_order_date_offline : Date of the customer's most recent offline purchase
# order_num_total_ever_online : Total number of the customer's online purchases
# order_num_total_ever_offline : Total number of the customer's offline purchases
# customer_value_total_ever_offline : Total amount the customer spent on offline purchases
# customer_value_total_ever_online : Total amount the customer spent on online purchases
# interested_in_categories_12 : List of categories the customer purchased from in the last 12 months

In [4]:
###############################################################
# TASK 1: DATA PREPROCESSING
###############################################################
import pandas as pd
import datetime as dt
from lifetimes import BetaGeoFitter
from lifetimes import GammaGammaFitter
from sklearn.preprocessing import MinMaxScaler
# Display settings: show every column and every row, and print floats with two decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None

# Read the data; df_ keeps the raw frame and all later steps work on the copy df.
df_ = pd.read_csv("flo_data_20K.csv")
df = df_.copy()
df.head()

Unnamed: 0,master_id,order_channel,last_order_channel,first_order_date,last_order_date,last_order_date_online,last_order_date_offline,order_num_total_ever_online,order_num_total_ever_offline,customer_value_total_ever_offline,customer_value_total_ever_online,interested_in_categories_12
0,cc294636-19f0-11eb-8d74-000d3a38a36f,Android App,Offline,2020-10-30,2021-02-26,2021-02-21,2021-02-26,4.0,1.0,139.99,799.38,[KADIN]
1,f431bd5a-ab7b-11e9-a2fc-000d3a38a36f,Android App,Mobile,2017-02-08,2021-02-16,2021-02-16,2020-01-10,19.0,2.0,159.97,1853.58,"[ERKEK, COCUK, KADIN, AKTIFSPOR]"
2,69b69676-1a40-11ea-941b-000d3a38a36f,Android App,Android App,2019-11-27,2020-11-27,2020-11-27,2019-12-01,3.0,2.0,189.97,395.35,"[ERKEK, KADIN]"
3,1854e56c-491f-11eb-806e-000d3a38a36f,Android App,Android App,2021-01-06,2021-01-17,2021-01-17,2021-01-06,1.0,1.0,39.99,81.98,"[AKTIFCOCUK, COCUK]"
4,d6ea1074-f1f5-11e9-9346-000d3a38a36f,Desktop,Desktop,2019-08-03,2021-03-07,2021-03-07,2019-08-03,1.0,1.0,49.99,159.99,[AKTIFSPOR]


In [5]:
# Detect outliers and replace them with the threshold values.
# CLTV frequency values must be integers, so the lower and upper limits are rounded when they are applied.
def outlier_thresholds(dataframe, variable, q1=0.01, q3=0.99, whisker=1.5):
    """Compute lower/upper outlier limits for one column with a quantile-range rule.

    The limits are ``lower_q - whisker * spread`` and ``upper_q + whisker * spread``
    where ``lower_q``/``upper_q`` are the ``q1``/``q3`` quantiles of the column and
    ``spread = upper_q - lower_q``. The defaults (1st and 99th percentiles, 1.5x)
    reproduce the values that used to be hard-coded, so existing two-argument
    calls return exactly what they did before.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column.
    variable : str
        Name of the numeric column to inspect.
    q1, q3 : float, optional
        Lower and upper quantiles in ``[0, 1]`` with ``q1 <= q3``.
    whisker : float, optional
        Multiplier applied to the quantile spread.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.

    Raises
    ------
    ValueError
        If the quantiles are outside ``[0, 1]`` or ``q1 > q3``.
    """
    if not 0 <= q1 <= q3 <= 1:
        raise ValueError("q1 and q3 must satisfy 0 <= q1 <= q3 <= 1")

    lower_quantile = dataframe[variable].quantile(q1)
    upper_quantile = dataframe[variable].quantile(q3)
    quantile_spread = upper_quantile - lower_quantile
    up_limit = upper_quantile + whisker * quantile_spread
    low_limit = lower_quantile - whisker * quantile_spread
    return low_limit, up_limit

def replace_with_thresholds(dataframe, variable):
    """Cap one column of ``dataframe`` in place at its outlier limits.

    Values below the lower limit are overwritten with the lower limit rounded
    to zero decimals; values above the upper limit are overwritten with the
    rounded upper limit. Nothing is returned: the frame itself is modified.
    """
    lower, upper = outlier_thresholds(dataframe, variable)

    # Low side first; the high-side mask is evaluated after that assignment.
    below_lower = dataframe[variable] < lower
    dataframe.loc[below_lower, variable] = round(lower, 0)

    above_upper = dataframe[variable] > upper
    dataframe.loc[above_upper, variable] = round(upper, 0)

In [6]:
# Outliers in "order_num_total_ever_online", "order_num_total_ever_offline",
# "customer_value_total_ever_offline" and "customer_value_total_ever_online" are replaced with the thresholds.
# If there are outliers, suppress (cap) them.
columns = ["order_num_total_ever_online", "order_num_total_ever_offline", "customer_value_total_ever_offline","customer_value_total_ever_online"]
for col in columns:
    replace_with_thresholds(df, col)

In [7]:
# Add the online and offline figures together to get each customer's total
# number of orders and total spend, and store them as new columns.
df["order_num_total"] = df["order_num_total_ever_online"] + df["order_num_total_ever_offline"]
df["customer_value_total"] = df["customer_value_total_ever_offline"] + df["customer_value_total_ever_online"]

In [8]:
# Convert the object-typed date columns to datetime.
# Every column whose name contains "date" is selected and converted.
date_columns = df.columns[df.columns.str.contains("date")]
df[date_columns] = df[date_columns].apply(pd.to_datetime)

In [9]:
###############################################################
# TASK 2: SETTING UP THE DATASET FOR CLTV DATA STRUCTURE
###############################################################

In [10]:
# Before the analysis, an analysis date must be set: 2 days after the latest order date in the dataset.
# Look up that latest order date first.
df["last_order_date"].max()

Timestamp('2021-05-30 00:00:00')

In [11]:
# Two days after the latest order date found above (2021-05-30).
analysis_date = dt.datetime(2021,6,1)

In [12]:
# Build the CLTV input frame `cltv_df` with the columns
# "recency_cltv_weekly", "T_weekly", "frequency" and "monetary_cltv_avg".
cltv_df = pd.DataFrame()
cltv_df["customer_id"] = df["master_id"]
# `.dt.days` returns whole days on every pandas version, whereas
# `.astype('timedelta64[D]')` raises on pandas >= 2.0 (only s/ms/us/ns are supported).
# Dividing by 7 expresses both durations in weeks.
cltv_df["recency_cltv_weekly"] = (df["last_order_date"] - df["first_order_date"]).dt.days / 7
cltv_df["T_weekly"] = (analysis_date - df["first_order_date"]).dt.days / 7
cltv_df["frequency"] = df["order_num_total"]
# Average spend per order.
cltv_df["monetary_cltv_avg"] = df["customer_value_total"] / df["order_num_total"]

cltv_df.head()

Unnamed: 0,customer_id,recency_cltv_weekly,T_weekly,frequency,monetary_cltv_avg
0,cc294636-19f0-11eb-8d74-000d3a38a36f,17.0,30.57,5.0,187.87
1,f431bd5a-ab7b-11e9-a2fc-000d3a38a36f,209.86,224.86,21.0,95.88
2,69b69676-1a40-11ea-941b-000d3a38a36f,52.29,78.86,5.0,117.06
3,1854e56c-491f-11eb-806e-000d3a38a36f,1.57,20.86,2.0,60.98
4,d6ea1074-f1f5-11e9-9346-000d3a38a36f,83.14,95.43,2.0,104.99


In [13]:
###############################################################
# TASK 3: BUILDING BG/NBD AND Gamma-Gamma MODELS AND CALCULATING CLTV FOR 6 MONTHS
###############################################################

# BUILDING BG/NBD MODEL
# Fitted on each customer's frequency, recency and T; recency and T are the weekly columns built above.
bgf = BetaGeoFitter(penalizer_coef=0.001)
bgf.fit(cltv_df['frequency'],
        cltv_df['recency_cltv_weekly'],
        cltv_df['T_weekly'])

# NOTE(review): unfinished note in the original, translated: "within 3 months, from the customers ..."

  result = getattr(ufunc, method)(*inputs, **kwargs)


<lifetimes.BetaGeoFitter: fitted with 19945 subjects, a: 0.00, alpha: 76.17, b: 0.00, r: 3.66>

In [15]:
# PREDICT THE EXPECTED NUMBER OF TRANSACTIONS FOR 3 AND 6 MONTHS OF PERIOD AND SAVE IN DATAFRAME

# PREDICTIONS FOR 3 MONTHS OF PERIOD
# The horizon is given in weeks, like recency and T: 4 weeks per month * 3 months.
cltv_df["exp_sales_3_month"] = bgf.predict(4*3,
                                           cltv_df['frequency'],
                                           cltv_df['recency_cltv_weekly'],
                                           cltv_df['T_weekly'])

In [16]:
# PREDICTIONS FOR 6 MONTHS OF PERIOD
# The horizon is given in weeks, like recency and T: 4 weeks per month * 6 months.
cltv_df["exp_sales_6_month"] = bgf.predict(4*6,
                                           cltv_df['frequency'],
                                           cltv_df['recency_cltv_weekly'],
                                           cltv_df['T_weekly'])

In [None]:
# COMPARE TOP 5 PREDICTIONS FOR 3 AND 6 MONTHS OF PERIODS

In [18]:
# Five customers with the highest expected number of purchases in the next 3 months.
cltv_df.sort_values("exp_sales_3_month", ascending=False).head(5)


Unnamed: 0,customer_id,recency_cltv_weekly,T_weekly,frequency,monetary_cltv_avg,exp_sales_6_month,exp_sales_3_month
7330,a4d534a2-5b1b-11eb-8dbd-000d3a38a36f,62.71,67.29,52.0,166.22,9.31,4.66
15611,4a7e875e-e6ce-11ea-8f44-000d3a38a36f,39.71,40.0,29.0,165.3,6.75,3.37
8328,1902bf80-0035-11eb-8341-000d3a38a36f,28.86,33.29,25.0,97.44,6.28,3.14
19538,55d54d9e-8ac7-11ea-8ec0-000d3a38a36f,52.57,58.71,31.0,228.53,6.17,3.08
14373,f00ad516-c4f4-11ea-98f7-000d3a38a36f,38.0,46.43,27.0,141.35,6.0,3.0


In [19]:
# Five customers with the highest expected number of purchases in the next 6 months.
cltv_df.sort_values("exp_sales_6_month", ascending=False).head(5)

Unnamed: 0,customer_id,recency_cltv_weekly,T_weekly,frequency,monetary_cltv_avg,exp_sales_6_month,exp_sales_3_month
7330,a4d534a2-5b1b-11eb-8dbd-000d3a38a36f,62.71,67.29,52.0,166.22,9.31,4.66
15611,4a7e875e-e6ce-11ea-8f44-000d3a38a36f,39.71,40.0,29.0,165.3,6.75,3.37
8328,1902bf80-0035-11eb-8341-000d3a38a36f,28.86,33.29,25.0,97.44,6.28,3.14
19538,55d54d9e-8ac7-11ea-8ec0-000d3a38a36f,52.57,58.71,31.0,228.53,6.17,3.08
14373,f00ad516-c4f4-11ea-98f7-000d3a38a36f,38.0,46.43,27.0,141.35,6.0,3.0


In [20]:
# BUILD THE GAMMA-GAMMA MODEL AND SAVE EXPECTED AVERAGE PROFITS IN CLTV_DF DATAFRAME AS "exp_average_value"
# The model is fitted on frequency and average order value, then scored on the same customers.
ggf = GammaGammaFitter(penalizer_coef=0.01)
ggf.fit(cltv_df['frequency'], cltv_df['monetary_cltv_avg'])
cltv_df["exp_average_value"] = ggf.conditional_expected_average_profit(cltv_df['frequency'],
                                                                       cltv_df['monetary_cltv_avg'])
cltv_df.head()

Unnamed: 0,customer_id,recency_cltv_weekly,T_weekly,frequency,monetary_cltv_avg,exp_sales_6_month,exp_sales_3_month,exp_average_value
0,cc294636-19f0-11eb-8d74-000d3a38a36f,17.0,30.57,5.0,187.87,1.95,0.97,193.63
1,f431bd5a-ab7b-11e9-a2fc-000d3a38a36f,209.86,224.86,21.0,95.88,1.97,0.98,96.67
2,69b69676-1a40-11ea-941b-000d3a38a36f,52.29,78.86,5.0,117.06,1.34,0.67,120.97
3,1854e56c-491f-11eb-806e-000d3a38a36f,1.57,20.86,2.0,60.98,1.4,0.7,67.32
4,d6ea1074-f1f5-11e9-9346-000d3a38a36f,83.14,95.43,2.0,104.99,0.79,0.4,114.33


In [21]:
# CALCULATE THE CLTV FOR 6 MONTHS AND SAVE IN DATAFRAME AS cltv
# Combines the fitted BG/NBD model (bgf) with the Gamma-Gamma model (ggf).
cltv = ggf.customer_lifetime_value(bgf,
                                   cltv_df['frequency'],
                                   cltv_df['recency_cltv_weekly'],
                                   cltv_df['T_weekly'],
                                   cltv_df['monetary_cltv_avg'],
                                   time=6,  # 6-month horizon, as stated in the heading above
                                   freq="W",  # recency and T were built in weeks
                                   discount_rate=0.01)
cltv_df["cltv"] = cltv

In [23]:
# Show the five customers with the highest CLTV values.
cltv_df.sort_values("cltv", ascending=False).head(5)

Unnamed: 0,customer_id,recency_cltv_weekly,T_weekly,frequency,monetary_cltv_avg,exp_sales_6_month,exp_sales_3_month,exp_average_value,cltv
9055,47a642fe-975b-11eb-8c2a-000d3a38a36f,2.86,7.86,4.0,1401.8,2.19,1.09,1449.06,3327.78
13880,7137a5c0-7aad-11ea-8f20-000d3a38a36f,6.14,13.14,11.0,758.09,3.94,1.97,767.36,3172.39
17323,f59053e2-a503-11e9-a2fc-000d3a38a36f,51.71,101.0,7.0,1106.47,1.44,0.72,1127.61,1708.98
12438,625f40a2-5bd2-11ea-98b0-000d3a38a36f,74.29,74.57,16.0,501.87,3.13,1.57,506.17,1662.61
7330,a4d534a2-5b1b-11eb-8dbd-000d3a38a36f,62.71,67.29,52.0,166.22,9.31,4.66,166.71,1628.89


In [24]:
###############################################################
# TASK 4: SEGMENTATION ACCORDING TO CLTV
###############################################################

# CREATE 4 LEVELS OF SEGMENTS FROM A TO D
# qcut splits the customers into four equal-sized groups by cltv;
# the labels are assigned from the lowest-cltv group (D) to the highest (A).
cltv_df["cltv_segment"] = pd.qcut(cltv_df["cltv"], 4, labels=["D", "C", "B", "A"])
cltv_df.head()

Unnamed: 0,customer_id,recency_cltv_weekly,T_weekly,frequency,monetary_cltv_avg,exp_sales_6_month,exp_sales_3_month,exp_average_value,cltv,cltv_segment
0,cc294636-19f0-11eb-8d74-000d3a38a36f,17.0,30.57,5.0,187.87,1.95,0.97,193.63,395.73,A
1,f431bd5a-ab7b-11e9-a2fc-000d3a38a36f,209.86,224.86,21.0,95.88,1.97,0.98,96.67,199.43,B
2,69b69676-1a40-11ea-941b-000d3a38a36f,52.29,78.86,5.0,117.06,1.34,0.67,120.97,170.22,B
3,1854e56c-491f-11eb-806e-000d3a38a36f,1.57,20.86,2.0,60.98,1.4,0.7,67.32,98.95,D
4,d6ea1074-f1f5-11e9-9346-000d3a38a36f,83.14,95.43,2.0,104.99,0.79,0.4,114.33,95.01,D


In [None]:
# ANALYZE CLTV_DF GROUPED BY CLTV_SEGMENTS

In [25]:
# Columns to summarise per segment: every column except the customer identifier.
coll2 = [col for col in cltv_df.columns if col != "customer_id"]

In [28]:
coll2

['recency_cltv_weekly',
 'T_weekly',
 'frequency',
 'monetary_cltv_avg',
 'exp_sales_6_month',
 'exp_sales_3_month',
 'exp_average_value',
 'cltv',
 'cltv_segment']

In [29]:
# Summarise every metric per CLTV segment.
# coll2 also contains the grouping column "cltv_segment" itself; it is categorical, so
# "mean"/"median" cannot be computed on it and current pandas raises instead of
# silently dropping it. Leave it out of the selection.
cltv_df.groupby("cltv_segment")[[c for c in coll2 if c != "cltv_segment"]].agg(["count", "mean", "median"])

Unnamed: 0_level_0,recency_cltv_weekly,recency_cltv_weekly,recency_cltv_weekly,T_weekly,T_weekly,T_weekly,frequency,frequency,frequency,monetary_cltv_avg,monetary_cltv_avg,monetary_cltv_avg,exp_sales_6_month,exp_sales_6_month,exp_sales_6_month,exp_sales_3_month,exp_sales_3_month,exp_sales_3_month,exp_average_value,exp_average_value,exp_average_value,cltv,cltv,cltv
Unnamed: 0_level_1,count,mean,median,count,mean,median,count,mean,median,count,mean,median,count,mean,median,count,mean,median,count,mean,median,count,mean,median
cltv_segment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
D,4987,139.0,100.57,4987,162.18,118.14,4987,3.77,3.0,4987,93.15,89.5,4987,0.82,0.82,4987,0.41,0.41,4987,98.69,94.56,4987,80.34,83.72
C,4986,92.63,77.86,4986,112.82,95.43,4986,4.4,4.0,4986,125.79,121.99,4986,1.05,1.02,4986,0.53,0.51,4986,132.25,128.41,4986,138.31,138.03
B,4986,81.99,72.57,4986,100.33,89.5,4986,5.09,4.0,4986,160.64,156.87,4986,1.2,1.15,4986,0.6,0.58,4986,168.0,163.28,4986,199.53,198.1
A,4986,67.43,62.43,4986,82.55,79.14,4986,6.65,5.0,4986,228.83,209.96,4986,1.55,1.42,4986,0.77,0.71,4986,238.02,218.27,4986,362.32,312.93


In [30]:
# Wrap all of the CLTV steps in a single function.
def create_cltv_df(dataframe, analysis_date=None):
    """Run the whole CLTV pipeline on a raw customer frame.

    Steps: cap outliers, build total order count / spend, drop customers with
    no orders or no spend, convert date columns, build the weekly
    recency/T/frequency/monetary frame, fit BG/NBD and Gamma-Gamma models,
    predict 3- and 6-month purchases, compute the 6-month CLTV and split it
    into four segments (D = lowest quartile ... A = highest quartile).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Customer data with ``master_id``, ``first_order_date``,
        ``last_order_date`` and the four ``order_num_total_ever_*`` /
        ``customer_value_total_ever_*`` columns. The frame is copied first, so
        the caller's object is left unchanged.
    analysis_date : datetime.datetime, optional
        Reference date used to compute customer age (T). Defaults to
        2021-06-01, i.e. two days after the latest order date of the original
        dataset (2021-05-30). Pass another date when using different data.

    Returns
    -------
    pandas.DataFrame
        One row per customer with more than one order, holding
        ``customer_id``, ``recency_cltv_weekly``, ``T_weekly``, ``frequency``,
        ``monetary_cltv_avg``, ``exp_sales_3_month``, ``exp_sales_6_month``,
        ``exp_average_value``, ``cltv`` and ``cltv_segment``.
    """
    if analysis_date is None:
        analysis_date = dt.datetime(2021, 6, 1)

    # Work on a copy: the capping and the new columns below would otherwise
    # leak into the frame owned by the caller.
    dataframe = dataframe.copy()

    # Data preparation
    columns = ["order_num_total_ever_online", "order_num_total_ever_offline", "customer_value_total_ever_offline","customer_value_total_ever_online"]
    for col in columns:
        replace_with_thresholds(dataframe, col)

    dataframe["order_num_total"] = dataframe["order_num_total_ever_online"] + dataframe["order_num_total_ever_offline"]
    dataframe["customer_value_total"] = dataframe["customer_value_total_ever_offline"] + dataframe["customer_value_total_ever_online"]

    # Drop customers with zero spend OR zero orders. The negation has to cover
    # the whole condition; negating only the first term kept zero-order rows
    # and led to a division by zero in the average below.
    no_activity = (dataframe["customer_value_total"] == 0) | (dataframe["order_num_total"] == 0)
    dataframe = dataframe[~no_activity].copy()

    date_columns = dataframe.columns[dataframe.columns.str.contains("date")]
    dataframe[date_columns] = dataframe[date_columns].apply(pd.to_datetime)

    # Building the CLTV data structure
    cltv_df = pd.DataFrame()
    cltv_df["customer_id"] = dataframe["master_id"]
    # `.dt.days` works on every pandas version; `.astype('timedelta64[D]')`
    # raises on pandas >= 2.0. Dividing by 7 converts days to weeks.
    cltv_df["recency_cltv_weekly"] = (dataframe["last_order_date"] - dataframe["first_order_date"]).dt.days / 7
    cltv_df["T_weekly"] = (analysis_date - dataframe["first_order_date"]).dt.days / 7
    cltv_df["frequency"] = dataframe["order_num_total"]
    cltv_df["monetary_cltv_avg"] = dataframe["customer_value_total"] / dataframe["order_num_total"]
    # Keep customers with more than one order; copy so the column assignments
    # below write to an independent frame rather than a slice.
    cltv_df = cltv_df[(cltv_df['frequency'] > 1)].copy()

    # Fitting the BG-NBD model
    bgf = BetaGeoFitter(penalizer_coef=0.001)
    bgf.fit(cltv_df['frequency'],
            cltv_df['recency_cltv_weekly'],
            cltv_df['T_weekly'])
    # Horizons are in weeks: 4 weeks per month * number of months.
    cltv_df["exp_sales_3_month"] = bgf.predict(4 * 3,
                                               cltv_df['frequency'],
                                               cltv_df['recency_cltv_weekly'],
                                               cltv_df['T_weekly'])
    cltv_df["exp_sales_6_month"] = bgf.predict(4 * 6,
                                               cltv_df['frequency'],
                                               cltv_df['recency_cltv_weekly'],
                                               cltv_df['T_weekly'])

    # Fitting the Gamma-Gamma model
    ggf = GammaGammaFitter(penalizer_coef=0.01)
    ggf.fit(cltv_df['frequency'], cltv_df['monetary_cltv_avg'])
    cltv_df["exp_average_value"] = ggf.conditional_expected_average_profit(cltv_df['frequency'],
                                                                           cltv_df['monetary_cltv_avg'])

    # CLTV prediction (6-month horizon; recency and T are in weeks)
    cltv = ggf.customer_lifetime_value(bgf,
                                       cltv_df['frequency'],
                                       cltv_df['recency_cltv_weekly'],
                                       cltv_df['T_weekly'],
                                       cltv_df['monetary_cltv_avg'],
                                       time=6,
                                       freq="W",
                                       discount_rate=0.01)
    cltv_df["cltv"] = cltv

    # CLTV segmentation
    cltv_df["cltv_segment"] = pd.qcut(cltv_df["cltv"], 4, labels=["D", "C", "B", "A"])

    return cltv_df

# Run the whole pipeline in one call and preview the result.
# NOTE(review): df has already been through outlier capping and date conversion in the
# cells above, and create_cltv_df applies those steps again.
cltv_df = create_cltv_df(df)
cltv_df.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,customer_id,recency_cltv_weekly,T_weekly,frequency,monetary_cltv_avg,exp_sales_3_month,exp_sales_6_month,exp_average_value,cltv,cltv_segment
0,cc294636-19f0-11eb-8d74-000d3a38a36f,17.0,30.57,5.0,187.87,0.97,1.95,193.63,395.73,A
1,f431bd5a-ab7b-11e9-a2fc-000d3a38a36f,209.86,224.86,21.0,95.88,0.98,1.97,96.67,199.43,B
2,69b69676-1a40-11ea-941b-000d3a38a36f,52.29,78.86,5.0,117.06,0.67,1.34,120.97,170.22,B
3,1854e56c-491f-11eb-806e-000d3a38a36f,1.57,20.86,2.0,60.98,0.7,1.4,67.32,98.95,D
4,d6ea1074-f1f5-11e9-9346-000d3a38a36f,83.14,95.43,2.0,104.99,0.4,0.79,114.33,95.01,D
