# Benodigde libraries importeren

In [75]:
import pandas as pd
from datetime import datetime
import numpy as np
import pandas.api.types as ptypes


# Bestand inladen en data filteren

In [76]:
# Load the Maxilia Excel export and keep only the columns used in the CLV analysis.
# NOTE(review): this is an absolute, machine-specific path; point DATA_PATH at your own copy.
DATA_PATH = r'/Users/LisannePeeters/Documents/GitHub/python/Sample dataset.xlsx'
COLUMNS = ['email', 'Orderdate', 'OrderID', 'TurnoverLead', 'BuyinLead', 'LeadPhase', 'LeadStatus']

data = pd.read_excel(DATA_PATH)
# Select by label so a missing or renamed column raises a KeyError here instead of
# silently becoming an all-NaN column (which the later dropna would turn into an empty frame).
df = data[COLUMNS].copy()
df.head()

Unnamed: 0,email,Orderdate,OrderID,TurnoverLead,BuyinLead,LeadPhase,LeadStatus
0,janneke.van.wingerden@hotmail.com,2014-04-22,27654,840.0,0.0,Order,Gefactureerd
1,info@ferox-design.com,2014-07-30,32291,235.0,0.0,Order,Gefactureerd
2,info@ferox-design.com,2014-07-30,32291,235.0,0.0,Order,Gefactureerd
3,tim@schoonderbeek-cv.nl,2014-09-30,38145,219.0,0.0,Order,Gefactureerd
4,jm@luxuria-import.com,2014-10-16,39348,635.0,0.0,Order,Gefactureerd


In [77]:
# Show column dtypes and non-null counts of the freshly loaded data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 7 columns):
email           27 non-null object
Orderdate       27 non-null datetime64[ns]
OrderID         27 non-null int64
TurnoverLead    27 non-null float64
BuyinLead       27 non-null float64
LeadPhase       27 non-null object
LeadStatus      27 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 1.6+ KB


In [78]:
# Drop rows that have a missing value (NaN/NaT) in any column,
# then keep only rows whose email contains an '@'.
df = df.dropna(axis=0, how='any')
# A plain boolean mask replaces the original `~mask == False` double negation.
# regex=False matches the literal character; na=False treats non-string values as "no match".
has_at_sign = df['email'].str.contains('@', regex=False, na=False)
df = df[has_at_sign]
df.head()

Unnamed: 0,email,Orderdate,OrderID,TurnoverLead,BuyinLead,LeadPhase,LeadStatus
0,janneke.van.wingerden@hotmail.com,2014-04-22,27654,840.0,0.0,Order,Gefactureerd
1,info@ferox-design.com,2014-07-30,32291,235.0,0.0,Order,Gefactureerd
2,info@ferox-design.com,2014-07-30,32291,235.0,0.0,Order,Gefactureerd
3,tim@schoonderbeek-cv.nl,2014-09-30,38145,219.0,0.0,Order,Gefactureerd
4,jm@luxuria-import.com,2014-10-16,39348,635.0,0.0,Order,Gefactureerd


In [79]:
# Make sure Orderdate is a datetime column (day-month-year when the values are still strings).
# assign() returns a new frame, so nothing is written into the filtered df
# (avoids pandas' SettingWithCopyWarning).
df = df.assign(Orderdate=pd.to_datetime(df['Orderdate'], format='%d-%m-%Y'))


In [80]:
# Re-check dtypes and non-null counts after dropping rows and converting Orderdate.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27 entries, 0 to 26
Data columns (total 7 columns):
email           27 non-null object
Orderdate       27 non-null datetime64[ns]
OrderID         27 non-null int64
TurnoverLead    27 non-null float64
BuyinLead       27 non-null float64
LeadPhase       27 non-null object
LeadStatus      27 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 1.7+ KB


In [81]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,OrderID,TurnoverLead,BuyinLead
count,27.0,27.0,27.0
mean,38899.962963,555.72963,0.0
std,3132.80892,667.923824,0.0
min,27654.0,0.0,0.0
25%,39645.0,227.0,0.0
50%,39895.0,385.18,0.0
75%,39895.0,605.0,0.0
max,42331.0,2625.0,0.0


In [82]:
# Keep only invoiced orders: LeadPhase must equal 'Order' and LeadStatus must equal 'Gefactureerd'.
# Exact comparison replaces the original substring match, which would also keep values that
# merely contain these words and would then fail the equality asserts in the validation cell.
is_order = df['LeadPhase'].eq('Order')
is_invoiced = df['LeadStatus'].eq('Gefactureerd')
df = df[is_order & is_invoiced]
df.head()

Unnamed: 0,email,Orderdate,OrderID,TurnoverLead,BuyinLead,LeadPhase,LeadStatus
0,janneke.van.wingerden@hotmail.com,2014-04-22,27654,840.0,0.0,Order,Gefactureerd
1,info@ferox-design.com,2014-07-30,32291,235.0,0.0,Order,Gefactureerd
2,info@ferox-design.com,2014-07-30,32291,235.0,0.0,Order,Gefactureerd
3,tim@schoonderbeek-cv.nl,2014-09-30,38145,219.0,0.0,Order,Gefactureerd
4,jm@luxuria-import.com,2014-10-16,39348,635.0,0.0,Order,Gefactureerd


In [83]:
# Confirm row count and dtypes after the phase/status filter.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27 entries, 0 to 26
Data columns (total 7 columns):
email           27 non-null object
Orderdate       27 non-null datetime64[ns]
OrderID         27 non-null int64
TurnoverLead    27 non-null float64
BuyinLead       27 non-null float64
LeadPhase       27 non-null object
LeadStatus      27 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 1.7+ KB


In [84]:
# Sanity checks: stop here if the cleaned data violates any of these conditions.
orderdate_is_datetime = ptypes.is_datetime64_any_dtype(df['Orderdate'])
every_email_has_at = df['email'].str.contains('@').all()
only_orders = (df['LeadPhase'] == 'Order').all()
only_invoiced = (df['LeadStatus'] == 'Gefactureerd').all()

assert orderdate_is_datetime
assert every_email_has_at
assert only_orders
assert only_invoiced


# Data groeperen per klant

In [85]:
# Aggregate per customer (keyed on email):
#   Orderdate    -> number of days between the first and the last order
#   OrderID      -> number of order rows
#   TurnoverLead -> total turnover
# Built-in aggregation names ('size', 'sum') replace the equivalent lambdas.
# NOTE(review): 'size' counts rows, exactly like the original len(). The sample output shows
# repeated rows for the same OrderID, so orders and turnover may be counted more than once;
# confirm whether rows should be de-duplicated first.
df_clv = df.groupby('email').agg({
    'Orderdate': lambda dates: (dates.max() - dates.min()).days,
    'OrderID': 'size',
    'TurnoverLead': 'sum',
})
df_clv.head()

Unnamed: 0_level_0,Orderdate,OrderID,TurnoverLead
email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
annelammertink@live.nl,0,1,0.0
c.huigen@denf.nl,0,1,1057.0
d.engelkes@welzijnbergen.nl,0,3,0.0
e.kroon@kbkbouwgroep.nl,0,1,177.4
info@airportmultiservice.com,0,1,1027.52


In [None]:
# Give the aggregated columns descriptive names.
# An explicit mapping (instead of assigning a positional list to .columns) cannot mislabel a
# column and keeps this cell re-runnable after later cells have added columns to df_clv.
df_clv = df_clv.rename(columns={
    'Orderdate': 'days_customer',
    'OrderID': 'total_transactions',
    'TurnoverLead': 'total_spent',
})
df_clv.head()

In [None]:
# Summary statistics of the per-customer aggregates.
df_clv.describe()

# Berekenen van de totale CLV

In [None]:
# Average order value: total turnover divided by the total number of transactions.
# Series.sum() is the vectorised replacement for the builtin sum() over a Series.
avg_order_value = df_clv['total_spent'].sum() / df_clv['total_transactions'].sum()
print(avg_order_value)

In [None]:
# Purchase frequency: average number of transactions per customer.
# Series.sum() is the vectorised replacement for the builtin sum() over a Series.
purchase_frequency = df_clv['total_transactions'].sum() / len(df_clv)
print(purchase_frequency)

In [None]:
# Repeat rate: share of customers with more than one transaction.

repeat_customers = df_clv.loc[df_clv['total_transactions'] > 1]
repeat_rate = len(repeat_customers) / len(df_clv)
print(repeat_rate)

In [None]:
# Churn rate: share of customers who place no further order after their first purchase,
# computed as the complement of the repeat rate.

churn_rate = 1 - repeat_rate
print(churn_rate)

In [None]:
# Average margin: (turnover - buy-in cost) as a fraction of turnover.
# Series.sum() replaces the builtin sum(); the turnover total is computed once and reused.
turnover_total = df['TurnoverLead'].sum()
profit = turnover_total - df['BuyinLead'].sum()
margin = profit / turnover_total

print(margin)

In [None]:
# Overall CLV, and the CLV after applying the margin.
# `margin` is the value computed in the previous cell; a fixed number can be used in its place.

expected_customer_revenue = avg_order_value * purchase_frequency
total_clv = expected_customer_revenue / churn_rate
total_clv_margin = margin * total_clv

print(total_clv)  # first number: overall CLV
print(total_clv_margin)  # second number: CLV with the computed margin applied

In [None]:
# Put the figures above into a single-row table (one column per metric) for use in Data Studio.
set_total_clv = avg_order_value, purchase_frequency, churn_rate, total_clv, total_clv_margin
df_total_clv = pd.DataFrame(
    set_total_clv,
    columns=['Waarde'],
    index=['Gemiddelde orderwaarde', 'Frequentie aankoop', 'Churn rate', 'CLV', 'CLV marge'],
)
# Transpose to one 'Waarde' row and sort the metric columns alphabetically.
# This replaces the original groupby/agg/stack/apply/unstack/droplevel chain, which is
# intended to produce the same table but relied on agg() returning lists.
df_clv_values = df_total_clv.T.sort_index(axis=1)
df_clv_values.head()

In [None]:
# Export the summary figures to an Excel file (relative path).
df_clv_values.to_excel('CLV_waardes.xlsx')

# Transacties en churn rate per maand uiteenzetten

In [None]:
# Count orders per customer per calendar month (customers as rows, months as columns).
df_monthly = (
    df.set_index('Orderdate')
      .pivot_table(
          index=['email'],
          columns=pd.Grouper(freq='M'),
          values='OrderID',
          aggfunc='count',
          fill_value=0,
      )
)

# Replace the timestamp column headers with readable labels such as 'Apr 2014'.
df_monthly.columns = df_monthly.columns.strftime('%b %Y').tolist()
df_monthly.head()

In [None]:
# Per month: share of active customers (at least one order) who ordered more than once.
# Stored under its own name so the overall scalar `repeat_rate` computed earlier is not
# overwritten by a Series (which would corrupt `churn_rate` if that cell were re-run).
repeat_rate_monthly = (df_monthly > 1).sum() / (df_monthly > 0).sum()
df_churn = repeat_rate_monthly.to_frame(name='repeat_rate')

In [None]:
# Monthly churn rate is the complement of the monthly repeat rate.
df_churn = df_churn.assign(churn_rate=1 - df_churn['repeat_rate'])
df_churn.head()


# CLV per klant uitrekenen

In [None]:
# CLV per customer: the customer's own average order value combined with the
# overall purchase frequency, churn rate and margin computed earlier.

order_value = df_clv['total_spent'] / df_clv['total_transactions']
customer_clv = (order_value * purchase_frequency) / churn_rate
df_clv = df_clv.assign(
    customer_order_value=order_value,
    customer_clv=customer_clv,
    customer_clv_margin=customer_clv * margin,
)

df_clv.head()

In [None]:
# Summary statistics of the per-customer table.
df_clv.describe()

In [None]:
# Export the per-customer CLV table to an Excel file (relative path).
df_clv.to_excel('CLV_per_klant.xlsx')

In [None]:
# Add a 'Mon YYYY' label to every order so spending can be broken down per month.
# .dt.strftime is the vectorised form of the original per-row apply; assign() returns a new
# frame instead of writing into the filtered df (avoids pandas' SettingWithCopyWarning).
df = df.assign(month_yr=df['Orderdate'].dt.strftime('%b %Y'))
df.head()


In [None]:
# Per month: number of order rows, total turnover and number of distinct customers.
# Built-in aggregation names replace the equivalent lambdas / unbound Series method.
df_transacties = df.groupby('month_yr').agg({
    'OrderID': 'size',
    'TurnoverLead': 'sum',
    'email': 'nunique',
})
df_transacties.head()

In [None]:
# Inspect dtypes and non-null counts of the monthly overview.
df_transacties.info()

In [None]:
# Give the monthly aggregates descriptive names.
# An explicit mapping (instead of assigning a positional list to .columns) cannot mislabel a
# column and keeps this cell re-runnable after later cells have added columns.
df_transacties = df_transacties.rename(columns={
    'OrderID': 'total_transactions',
    'TurnoverLead': 'total_spent',
    'email': 'total_unique_customers',
})
df_transacties.head()

In [None]:
# Summary statistics of the monthly overview.
df_transacties.describe()

In [None]:
# Export the monthly overview to an Excel file (relative path).
df_transacties.to_excel('Overzicht_per_maand.xlsx')

# CLV per maand berekenen

In [None]:
# Monthly CLV: average order value times purchase frequency, divided by that month's churn rate.
monthly_frequency = df_transacties['total_transactions'] / df_transacties['total_unique_customers']
monthly_order_value = df_transacties['total_spent'] / df_transacties['total_transactions']

df_transacties['purchase_frequency_month'] = monthly_frequency
df_transacties['avg_order_value_month'] = monthly_order_value
# pandas aligns this division on the index labels of df_transacties and df_churn.
df_transacties['CLV'] = (monthly_order_value * monthly_frequency) / df_churn['churn_rate']
df_transacties['CLV_margin'] = df_transacties['CLV'] * margin
df_transacties.head()

In [None]:
# Rolling 4-month average of the monthly CLV.
# 'CLV' is selected by name; the original positional iloc[:, -2] depended on column order.
df_clv_cum = df_transacties[['CLV']]
# The groupby on month_yr sorts labels like 'Apr 2014' alphabetically, so put the rows in
# calendar order first; otherwise the rolling window averages non-adjacent months.
month_order = pd.to_datetime(df_clv_cum.index, format='%b %Y').argsort()
df_clv_cum = df_clv_cum.iloc[month_order].copy()
# NOTE: despite the column name, this is a 4-period rolling mean, not a cumulative figure.
df_clv_cum['CLV_cumulatief'] = df_clv_cum['CLV'].rolling(window=4).mean()
df_clv_cum.head()

# Statistieken CLV per klant

In [None]:
# Descriptive statistics of the per-customer CLV.

import statistics

clv_per_customer = df_clv['customer_clv']
df_clv_list = clv_per_customer.values

x1 = statistics.mean(df_clv_list)  # mean
x2 = statistics.median(df_clv_list)  # median
# statistics.mode() raises StatisticsError on ties before Python 3.8.
# Series.mode() returns every most-frequent value in sorted order; take the smallest.
x3 = clv_per_customer.mode().iloc[0]  # mode
x4 = statistics.stdev(df_clv_list)  # sample standard deviation

print(x1)
print(x2)
print(x3)
print(x4)

In [None]:
# Box plot: visual summary of the CLV distribution
# (minimum, maximum, median and the quartile ranges).

import matplotlib.pyplot as plt
df_clv.plot.box(y='customer_clv')
plt.show()

In [None]:
# Histogram: frequency distribution of the CLV,
# i.e. how often each range of CLV values occurs.

import matplotlib.pyplot as plt
df_clv.plot.hist(y='customer_clv')
plt.show()