# Benodigde libraries importeren

In [469]:
import pandas as pd
from datetime import datetime
import numpy as np
import pandas.api.types as ptypes


# Bestand inladen en data filteren

In [470]:
# excel bestand maxilia geimporteerd en kolommen gefilterd 

data = pd.read_csv('expdata 20190912 1002.csv', sep = None, delimiter = None, usecols = ['email','Orderdate','OrderID','TurnoverLead', 'BuyinLead', 'LeadPhase','LeadStatus'], engine = 'python') 
df.head()

Unnamed: 0_level_0,email,OrderID,TurnoverLead,BuyinLead,LeadPhase,LeadStatus
Orderdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-09-30,tim@schoonderbeek-cv.nl,38145,219.0,0.0,Order,Gefactureerd
2014-10-16,jm@luxuria-import.com,39348,635.0,0.0,Order,Gefactureerd
2014-09-28,d.engelkes@welzijnbergen.nl,39718,435.0,0.0,Order,Gefactureerd
2014-10-20,d.engelkes@welzijnbergen.nl,39715,689.0,0.0,Order,Gefactureerd
2014-10-28,d.engelkes@welzijnbergen.nl,39714,200.0,0.0,Order,Gefactureerd


In [471]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 13 entries, 2014-09-30 to 2014-11-28
Data columns (total 6 columns):
email           13 non-null object
OrderID         13 non-null int64
TurnoverLead    13 non-null float64
BuyinLead       13 non-null float64
LeadPhase       13 non-null object
LeadStatus      13 non-null object
dtypes: float64(2), int64(1), object(3)
memory usage: 728.0+ bytes


In [472]:
# lege rijen 'NaT' eruit gefilterd 
# Email bevat @ controle
# Duplicaten verwijderen

df = df.dropna(axis=0, how = 'any', inplace=False)
df = df[~df['email'].str.contains('@')==False]
df = df.drop_duplicates('OrderID', keep='first')
df

Unnamed: 0_level_0,email,OrderID,TurnoverLead,BuyinLead,LeadPhase,LeadStatus
Orderdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-09-30,tim@schoonderbeek-cv.nl,38145,219.0,0.0,Order,Gefactureerd
2014-10-16,jm@luxuria-import.com,39348,635.0,0.0,Order,Gefactureerd
2014-09-28,d.engelkes@welzijnbergen.nl,39718,435.0,0.0,Order,Gefactureerd
2014-10-20,d.engelkes@welzijnbergen.nl,39715,689.0,0.0,Order,Gefactureerd
2014-10-28,d.engelkes@welzijnbergen.nl,39714,200.0,0.0,Order,Gefactureerd
2014-11-30,c.huigen@denf.nl,37999,1057.0,0.0,Order,Gefactureerd
2014-11-04,info@montagebedrijfhorlings.nl,39895,385.18,0.0,Order,Gefactureerd
2014-11-24,info@montagebedrijfhorlings.nl,39896,385.18,0.0,Order,Gefactureerd
2014-11-07,info@montagebedrijfhorlings.nl,39893,385.18,0.0,Order,Gefactureerd
2014-09-20,tim@schoonderbeek-cv.nl,38146,219.0,0.0,Order,Gefactureerd


In [473]:
df['Orderdate']=pd.to_datetime(df['Orderdate'], format='%Y-%m-%d')


KeyError: 'Orderdate'

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Leadphase = order
# Leadstatus = gefactureerd
df = df[~df['LeadPhase'].str.contains('Order')==False]
df = df[~df['LeadStatus'].str.contains('Gefactureerd')==False]
df.head()

In [None]:
df.info()

In [None]:
# Check data bevat voldoet aan bepaalde condities
assert ptypes.is_datetime64_any_dtype(df['Orderdate'])
assert df['email'].str.contains('@').all()
assert df['LeadPhase'].eq('Order').all()
assert df['LeadStatus'].eq('Gefactureerd').all()


# Data grouperen per klant

In [None]:
# group by customer ID
# hoeveel orders zijn er gedaan per customer
# totale uitgaven per customer 

df_clv = df.groupby('email').agg({
                                                        'Orderdate': lambda date: (date.max() - date.min()).days,
                                                        'OrderID': lambda num: len(num), 
                                                        'TurnoverLead': lambda price: price.sum()
                                                    }) 
df_clv.head()

In [None]:
# rename kolommen 

df_clv.columns = ['days_customer', 'total_transactions', 'total_spent']
df_clv.head()

In [None]:
df_clv.describe()

# Berekenen van de totale CLV

In [None]:
# gemiddelde order waarde 

avg_order_value = sum(df_clv['total_spent']) / sum(df_clv['total_transactions'])
print(avg_order_value)

In [None]:
# hoe vaak koopt iemand gemiddeld iets

purchase_frequency = sum(df_clv['total_transactions']) / df_clv.shape[0]
print(purchase_frequency)

In [None]:
# hoe vaak komt iemand terug

repeat_rate = df_clv[df_clv.total_transactions > 1].shape[0] / df_clv.shape[0] 
print(repeat_rate)

In [None]:
# churnrate: hoeveel % doet na 1 aankoop geen bestelling meer

churn_rate = 1 - repeat_rate
print(churn_rate)

In [None]:
# gemiddelde marge  

profit = sum(df['TurnoverLead']) - sum(df['BuyinLead'])
margin = profit / sum(df['TurnoverLead'])
    
print(margin)

In [None]:
# totale clv & clv marge 
# in het blok hierboven is de marge berekend ('margin'), maar je kan ook een getal noteren ipv 'margin'

total_clv = (avg_order_value * purchase_frequency) / churn_rate 
total_clv_margin = total_clv * margin

print(total_clv) # het eerste getal is de totale clv
print(total_clv_margin) # het tweede getal is de clv waarbij rekening is gehouden met de berekende marge

In [None]:
# Bovenstaande cijfers in kolom neerzetten zodat de data in datastudio kan worden gebruikt
set_total_clv = avg_order_value, purchase_frequency, churn_rate, total_clv, total_clv_margin 
df_total_clv = pd.DataFrame(set_total_clv, columns= ['Waarde'], index = ['Gemiddelde orderwaarde' , 'Frequentie aankoop', 'Churn rate' , 'CLV', 'CLV marge'])
df_clv_values=df_total_clv.T.groupby(level=0).agg(lambda x : x.values.tolist()).stack().apply(pd.Series).unstack().sort_index(level=1,axis=1)
df_clv_values.columns=df_clv_values.columns.droplevel(level=0)
df_clv_values.head()

In [None]:
df_clv_values.to_excel('CLV_waardes.xlsx')

   # Transacties en churn rate per maand uiteenzetten

In [None]:
# transacties per klant per maand uiteenzetten
df_monthly = df.set_index('Orderdate')
df_monthly = df_monthly.pivot_table(
    index=['email'],
    columns=pd.Grouper(freq='M'),
    values='OrderID', 
    aggfunc='count',
    fill_value=0
)

# Format column headers to human readable format
#df_monthly.columns = [x.strftime('%b %Y') for x in df_monthly.columns]
df_monthly

In [None]:
repeat_rate = df_monthly[df_monthly > 1].count() / df_monthly[df_monthly > 0].count()
df_churn = repeat_rate.to_frame(name='repeat_rate')

In [None]:
df_churn['churn_rate'] = 1 - df_churn['repeat_rate']
df_churn


# CLV per klant uitrekenen

In [None]:
# clv per customer

df_clv['customer_order_value'] = df_clv['total_spent'] / df_clv['total_transactions']
df_clv['customer_clv'] = (df_clv['customer_order_value'] * purchase_frequency) / churn_rate
df_clv['customer_clv_margin'] = df_clv['customer_clv'] * margin

df_clv.head()

In [None]:
df_clv.describe()

In [None]:
df_clv.to_excel('CLV_per_klant.xlsx')

In [None]:
# set orderdate als index
df.set_index('Orderdate', inplace=True)

In [None]:
#totale transacties / opbrengst per maand

df_transacties = df.groupby(pd.Grouper(freq='M')).agg(
    {
        'OrderID': len, 
        'TurnoverLead': sum,
        'email': pd.Series.nunique
    }
)

df_transacties.head()

In [None]:
# Uiteenzetting uitgaves klanten per maand
# df['month_yr'] = df['Orderdate'].apply(lambda x: x.strftime('%b %Y'))
# df.head()


In [None]:
# totale transacties / opbrengst per maand

# df_transacties = df.groupby('month_yr').aggregate({
#                                                        'OrderID': lambda num: len(num), 
#                                                        'TurnoverLead': lambda price: price.sum(),
#                                                        'email': pd.Series.nunique
#                                                    }) 
# df_transacties.head()

In [None]:
df_transacties.info()

In [None]:
df_transacties.columns = ['total_transactions', 'total_spent', 'total_unique_customers']
df_transacties.head(5)

In [None]:
df_transacties.describe()

In [None]:
df_transacties.to_excel('Overzicht_per_maand.xlsx')

# CLV per maand berekenen

In [None]:
# CLV per maand berekenen
df_transacties['purchase_frequency_month'] = df_transacties['total_transactions'] / df_transacties['total_unique_customers']
df_transacties['avg_order_value_month'] = df_transacties['total_spent'] / df_transacties['total_transactions'] 
df_transacties['CLV'] = (df_transacties['avg_order_value_month'] * df_transacties['purchase_frequency_month']) / df_churn['churn_rate']
df_transacties['CLV_margin'] = df_transacties['CLV'] * margin
df_transacties.head()

In [None]:
# New dataframe to calculate CLV averages. Needs "index" & "number of months" helper columns
# Even if some rows ("months") are missing, "number of months" is still correct
# Because it looks at the dates, not the index locations. 
df_clv_cum = df_transacties[['CLV']].copy()
df_clv_cum.dropna(inplace=True)
df_clv_cum['Index'] = np.arange(1, len(df_clv_cum) + 1)
df_clv_cum['CLV_AVG'] = df_clv_cum['CLV'].cumsum().div(df_clv_cum['Index'])
df_clv_cum

In [None]:
# df_clv_cum = pd.DataFrame(df_transacties.iloc[:,-2])
# df_clv_cum['CLV_cumulatief'] = df_clv_cum.rolling(window=4).mean()
# df_clv_cum.head()