# **Predicting Customer Lifetime Value (CLV)**

In [0]:
import os
import warnings

import numpy as np
import pandas as pd

# Report the directory the kernel is running in.
current_working_directory = os.getcwd()
print(current_working_directory)


def _format_float(x):
    """Render a float with exactly three decimal places."""
    return '%.3f' % x


# pandas display settings, applied in a single pass.
_DISPLAY_OPTIONS = {
    'display.max_columns': None,   # None = show every column
    'display.max_rows': 80,
    'display.float_format': _format_float,
    'display.width': 500,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

# Suppress all warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# pip install lifetimes

import datetime as dt
import matplotlib.pyplot as plt

from lifetimes import BetaGeoFitter
from lifetimes import GammaGammaFitter
from lifetimes.plotting import plot_period_transactions

from sqlalchemy import create_engine

import matplotlib.pyplot as plt
import seaborn as sns



In [0]:

import json
from urllib.parse import quote_plus

# Connection settings live in a JSON file outside the notebook so that no
# secret is stored in the notebook itself.
with open('/Workspace/Credentials/db_data.json', 'r') as fp:
    data = json.load(fp)

redshift_settings = data['redshift']
host = redshift_settings['host']
user = redshift_settings['user']
passwd = redshift_settings['passwd']
database = redshift_settings['database']

# User name and password are URL-encoded before being placed in the connection
# URL: characters such as '@', ':' or '/' in a raw credential would otherwise
# be parsed as URL delimiters and the connection would fail or hit the wrong host.
# NOTE(review): assumes the JSON file stores the credentials un-encoded — confirm.
conn = create_engine(
    f"postgresql+psycopg2://{quote_plus(user)}:{quote_plus(passwd)}@{host}:5439/{database}"
)

In [0]:

from datetime import datetime, timedelta

# Read the clock once. Every value below is derived from this single snapshot,
# so the strings stay mutually consistent even if the cell runs across a
# second or midnight boundary.
run_timestamp = datetime.today()
one_day_ago = run_timestamp - timedelta(days=1)

today = run_timestamp.strftime('%Y-%m-%d')
yesterday = one_day_ago.strftime('%Y-%m-%d')
print(today)
print(yesterday)


last_2_wks = (run_timestamp - timedelta(days=14)).strftime('%Y-%m-%d')
print('------------------------------------')
print(last_2_wks)

print('\n')
now = run_timestamp.strftime('%Y-%m-%d %H:%M:%S')

# NOTE(review): despite their names, these two values are "one day ago", not
# "30 minutes ago". The offset is kept as it was; confirm which one is intended.
last_30_mins = one_day_ago.strftime('%Y-%m-%d %H:%M:%S')
trunc_last_30_mins = one_day_ago.strftime('%Y-%m-%d %H:%M')
print(last_30_mins, 'to', now)

In [0]:

# Per-client transaction summary over the trailing 12 months.
#
# Output columns (the later cells read them in lower case):
#   frequency - number of matching transactions
#   monetary  - summed transaction amount
#   recency   - days since the client's most recent matching transaction
#   t         - days since the client's (earliest) activation date
#   rundate   - date the query was executed
#
# Changes versus the previous version of this query:
#   * The category filter compares UPPER(client_category) with an upper-case
#     literal. The old mixed-case literal 'Unclassified' could never equal an
#     upper-cased value, so nothing was being excluded.
#   * The join to dwh_all_accounts was removed. None of its columns were used,
#     and a client with several accounts had every transaction row repeated
#     once per account, inflating COUNT and SUM.
#   * The clients join is written as an inner join; the WHERE clause already
#     discarded rows without a matching client, so the result is the same.
query = '''
SELECT
    dac.client_id,
    dac.client_category,
    COUNT(dat.transaction_id) AS Frequency,
    SUM(dat.amount) AS Monetary,
    DATEDIFF(DAYS, MAX(dat.transaction_date), CURRENT_DATE) AS Recency,
    DATEDIFF(DAYS, MIN(dac.activation_date), CURRENT_DATE) AS T,
    CURRENT_DATE AS rundate
FROM
    dwh_all_transactions dat
JOIN
    dwh_all_clients dac ON dat.client_id = dac.client_id
WHERE
    dat.transaction_type_enum IN (1, 2)
    AND UPPER(dac.client_category) NOT IN ('UNCLASSIFIED')
    AND dac.client_status = 'Active'
    AND dat.transaction_date >= DATEADD(MONTH, -12, CURRENT_DATE)
GROUP BY
    dac.client_id,
    dac.client_category
HAVING COUNT(dat.transaction_id) > 0
'''

# Run the query through the SQLAlchemy engine and load the result into a DataFrame.
query_df = pd.read_sql_query(query, conn)



In [0]:
# Work on a copy so that later column assignments on `df` do not silently
# modify the raw query result held in `query_df`.
df = query_df.copy()

In [0]:
# Peek at the first three rows of the query result.
df.head(n=3)

In [0]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line to the output.
df.info()
# Store the run date as a proper datetime column.
df['rundate'] = df['rundate'].astype('datetime64[ns]')
# Second report, to confirm the dtype change.
df.info()

In [0]:
# Build the per-customer inputs for the lifetimes models, expressed in weeks.
clv_df = pd.DataFrame()
clv_df["customer_id"] = df["client_id"]
clv_df["frequency"] = df["frequency"]
clv_df["T_weekly"] = df["t"] / 7

# The query's `recency` column is "days since the most recent transaction".
# lifetimes defines recency the other way round: the customer's age at the
# time of their most recent purchase, which can never exceed T. Feeding the
# raw value in makes recently active customers look like early churners, so
# it is converted here as (age - days since last transaction).
clv_df["recency_clv_weekly"] = (df["t"] - df["recency"]) / 7

# Average value per transaction.
clv_df["monetary_clv_avg"] = df["monetary"] / df["frequency"]

# NOTE(review): `frequency` counts all transactions in the 12-month query
# window, whereas lifetimes expects the number of *repeat* purchases, and `T`
# is measured from the activation date rather than the first purchase.
# Confirm these approximations are acceptable.

# Keep rows where every model input is at least 1 (same thresholds as before).
# A single mask plus .copy() gives an independent frame, so later cells can
# add columns to it without chained-assignment problems.
valid_rows = (
    (clv_df["T_weekly"] >= 1)
    & (clv_df["recency_clv_weekly"] >= 1)
    & (clv_df["frequency"] >= 1)
    & (clv_df["monetary_clv_avg"] >= 1)
)
clv_df = clv_df.loc[valid_rows].copy()

clv_df.describe().T

In [0]:
# First five rows of the model-input table.
clv_df.head(n=5)

In [0]:
# Disabled draft: the text below is a bare string literal, so none of it is
# executed (the notebook only echoes the string as this cell's output).
# NOTE(review): it refers to `predicted_purchases_3_months` /
# `predicted_purchases_6_months`, which no visible cell creates, and it
# multiplies by `frequency` rather than a monetary value — confirm the intent
# before re-enabling.
'''
clv_df["predicted_spend_value_3_months"] = clv_df["frequency"] * clv_df["predicted_purchases_3_months"]
clv_df["predicted_spend_value_6_months"] = clv_df["frequency"] * clv_df["predicted_purchases_6_months"]

'''

In [0]:
# Re-display the first five rows of the model-input table.
clv_df.head(n=5)

In [0]:
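# Disabled diagnostic plot. `bgf` is only created and fitted in the next cell, so run this after that cell if needed:
# plot_period_transactions(bgf)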

In [0]:
# Purchase-count model (BetaGeoFitter) with a small penalizer coefficient.
bgf = BetaGeoFitter(penalizer_coef=0.001)

# Fit on the weekly-scaled inputs; the fitted model is the cell's output.
bgf.fit(
    frequency=clv_df["frequency"],
    recency=clv_df["recency_clv_weekly"],
    T=clv_df["T_weekly"],
)

In [0]:
# Expected number of transactions per customer over the next 3 months.
# The horizon is expressed in weeks, counting 4 weeks per month.
weeks_ahead_3_months = 4 * 3
clv_df["exp_sales_3_month"] = bgf.predict(
    weeks_ahead_3_months,
    clv_df["frequency"],
    clv_df["recency_clv_weekly"],
    clv_df["T_weekly"],
)

# Ten customers with the highest 3-month expectation.
clv_df.sort_values(by="exp_sales_3_month", ascending=False).head(n=10)

In [0]:
# Expected number of transactions per customer over the next 6 months.
# The horizon is expressed in weeks, counting 4 weeks per month.
weeks_ahead_6_months = 4 * 6
clv_df["exp_sales_6_month"] = bgf.predict(
    weeks_ahead_6_months,
    clv_df["frequency"],
    clv_df["recency_clv_weekly"],
    clv_df["T_weekly"],
)

# Ten customers with the highest 6-month expectation.
clv_df.sort_values(by="exp_sales_6_month", ascending=False).head(n=10)

In [0]:
# Monetary-value model (GammaGammaFitter), fitted on transaction count and
# average transaction value.
ggf = GammaGammaFitter(penalizer_coef=0.01)

purchase_counts = clv_df['frequency']
average_values = clv_df['monetary_clv_avg']

ggf.fit(purchase_counts, average_values)

# Model-based expected average value per transaction for each customer.
clv_df['predicted_average_profit'] = ggf.conditional_expected_average_profit(
    purchase_counts,
    average_values,
)

In [0]:
# Combine the purchase-count model (bgf) and the monetary model (ggf) into a
# customer lifetime value per customer.
clv = ggf.customer_lifetime_value(
    bgf,
    clv_df["frequency"],
    clv_df["recency_clv_weekly"],
    clv_df["T_weekly"],
    clv_df["monetary_clv_avg"],
    time=6,              # horizon: 6 months
    freq='W',            # recency / T are expressed in weeks
    discount_rate=0.01,
)

# Keep the result next to the inputs, then show it as the cell output.
clv_df["clv"] = clv
clv

In [0]:
# Twenty customers with the highest CLV.
clv_df.sort_values(by="clv", ascending=False).head(n=20)

In [0]:
# Split customers into CLV quartiles: "D" is the lowest quartile, "A" the highest.
clv_df["segment"] = pd.qcut(clv_df["clv"], 4, labels=["D", "C", "B", "A"])

# Per-segment mean and sum of every numeric column.
# - The aggregations are passed as a list (previously a set), so the
#   mean/sum column order is the same on every run.
# - Only numeric columns are aggregated; taking the mean of a text column
#   raises in recent pandas versions.
numeric_columns = list(clv_df.select_dtypes(include="number").columns)
clv_df.groupby("segment")[numeric_columns].agg(["mean", "sum"])

In [0]:
# Column dtypes and non-null counts of the CLV table at this point in the notebook.
clv_df.info()

In [0]:
# Summary statistics, one row per column.
clv_df.describe().transpose()

In [0]:
# CLV statistics per segment.
clv_statistics = ["mean", "sum", "count", "max", "min", "std"]
clv_df.groupby("segment").agg({"clv": clv_statistics})



In [0]:
# (rows, columns) of the CLV table.
clv_df.shape


In [0]:
# Twenty customers with the highest CLV (now including the segment label).
clv_df.sort_values(by="clv", ascending=False).head(n=20)


In [0]:
# Last twenty rows when sorted by CLV in descending order.
clv_df.sort_values(by="clv", ascending=False).tail(n=20)

In [0]:
# Stamp every row with the run date string computed earlier in the notebook.
clv_df["run_date"] = today
from pyspark.sql import SparkSession

# Reuse the active Spark session, or start one if none exists.
spark = SparkSession.builder.getOrCreate()

# pandas -> Spark conversion.
spark_df = spark.createDataFrame(clv_df)

# Persist as a table; "overwrite" mode replaces the table's existing contents.
(
    spark_df.write
    .mode("overwrite")
    .saveAsTable("vfd_databricks.default.clv_prediction")
)


In [0]:

%%time

# Write DataFrame to Redshift
# Assuming the table name should be 'dwh_rfm_clusters'
table_name = 'dwh_customer_lifetime_value'

# Write the DataFrame to the Redshift table
clv_df.to_sql(name=table_name, con=conn, if_exists='replace', index=False, chunksize = 10000, method = 'multi')

print(f"Data successfully written to at: ", now)

