# Choose file to process

In [None]:
# Enter the name of the file after executing this cell (leave empty for test data).
# Surrounding whitespace is stripped so that a stray space or a whitespace-only
# answer still counts as "empty" instead of being treated as a file name.
filename = input('Filename of file in /incoming/: ').strip()

# Benodigde libraries importeren

In [None]:
from datetime import datetime
from pathlib import Path
import helpers  # custom helper functions
import numpy as np
import pandas as pd
import pandas.api.types as ptypes

# Bestand inladen en data filteren

In [None]:
# Base folder: the parent of the current working directory
# (each '.parent' goes one level up - vary as required).
path = Path().absolute().parent

# Input file: the named file in /incoming/, or the sample dataset when no name was entered.
file = f'{path}/incoming/{filename}' if filename else f'{path}/samples/sample-dataset.csv'

# Folder for outgoing files; later cells append the file name directly to this string.
path_outgoing = f'{path}/outgoing/'

print('Processing:', file)

In [None]:
# Import the Maxilia export and keep only the columns used in this notebook.
# sep=None together with the python engine lets pandas detect the delimiter itself.
# TODO: add instructions about the delimiter and about how to load an Excel file.
df = pd.read_csv(
    file,
    sep=None,
    engine='python',
    usecols=[
        'email',
        'Orderdate',
        'OrderID',
        'TurnoverLead',
        'BuyinLead',
        'LeadPhase',
        'LeadStatus',
    ],
)
df.head()

In [None]:
# Column overview of the loaded data (dtypes and non-null counts)
df.info()

In [None]:
# Drop rows that have a missing value (NaN/NaT) in any column.
# Keep only rows whose email contains an '@'.
# Remove duplicate OrderIDs, keeping the first occurrence.

df = df.dropna(axis=0, how='any')
# '@' is matched literally (regex=False). na=False makes non-text values count as
# "no match", so they are filtered out instead of breaking the boolean mask.
df = df[df['email'].str.contains('@', regex=False, na=False)]
df = df.drop_duplicates('OrderID', keep='first')
df.head()

In [None]:
# Take the first non-missing order date as a sample value
order_dates = df['Orderdate']
date = order_dates.loc[order_dates.first_valid_index()]

In [None]:
# Let the helper function work out the date format from the sample date
date_format = helpers.date_format(date)

# Convert Orderdate to datetime; make sure the correct date format is used
df['Orderdate'] = pd.to_datetime(
    df['Orderdate'],
    format=date_format,
)

In [None]:
# Column overview again, e.g. to check the dtype of Orderdate after the conversion
df.info()

In [None]:
# Summary statistics of the data
df.describe()

In [None]:
# Filter the data (specific to Maxilia):
# keep rows whose LeadPhase contains 'Order'
# and whose LeadStatus contains 'Gefactureerd' (invoiced).
# The text is matched literally (regex=False); na=False makes non-text values count
# as "no match" so they are filtered out instead of breaking the boolean mask.
df = df[df['LeadPhase'].str.contains('Order', regex=False, na=False)]
df = df[df['LeadStatus'].str.contains('Gefactureerd', regex=False, na=False)]
df.head()

In [None]:
# Column overview of the filtered data
df.info()

In [None]:
# Check that the data meets the expected conditions.
# Each assertion carries a message so a failure states which expectation was violated.
assert ptypes.is_datetime64_any_dtype(df['Orderdate']), "'Orderdate' is not a datetime column"
assert df['email'].str.contains('@').all(), "not every 'email' value contains '@'"
assert df['LeadPhase'].eq('Order').all(), "not every 'LeadPhase' value equals 'Order'"
assert df['LeadStatus'].eq('Gefactureerd').all(), "not every 'LeadStatus' value equals 'Gefactureerd'"


# Data grouperen per klant

In [None]:
# One row per customer (email) with:
#   Orderdate    -> number of days between the first and the last order
#   OrderID      -> number of orders
#   TurnoverLead -> total amount spent

per_customer = df.groupby('email')
df_clv = per_customer.agg({
    'Orderdate': lambda dates: (dates.max() - dates.min()).days,
    'OrderID': lambda order_ids: len(order_ids),
    'TurnoverLead': lambda turnover: turnover.sum(),
})
df_clv.head()

In [None]:
# Rename the columns (positional: the order must match the columns of df_clv)

df_clv.columns = [
    'days_customer',
    'total_transactions',
    'total_spent',
]
df_clv.head()

In [None]:
# Summary statistics of the per-customer table
df_clv.describe()

In [None]:
# Write the per-customer overview to an Excel file in the outgoing folder
df_clv.to_excel(path_outgoing + 'Overzicht_per_klant.xlsx')

# Berekenen van de totale CLV

In [None]:
# Average order value: total amount spent divided by the total number of orders.
# Series.sum() is vectorised; the builtin sum() would loop over the values in Python.

avg_order_value = df_clv['total_spent'].sum() / df_clv['total_transactions'].sum()
print(avg_order_value)

In [None]:
# Purchase frequency: average number of orders per customer.
# Series.sum() is vectorised; the builtin sum() would loop over the values in Python.

purchase_frequency = df_clv['total_transactions'].sum() / df_clv.shape[0]
print(purchase_frequency)

In [None]:
# Repeat rate: share of customers with more than one order

returning_customers = df_clv[df_clv['total_transactions'] > 1]
repeat_rate = len(returning_customers) / len(df_clv)
print(repeat_rate)

In [None]:
# Churn rate: share of customers that place no further order after their first purchase
# (the complement of the repeat rate)

churn_rate = 1 - repeat_rate
print(churn_rate)

In [None]:
# Average margin over the whole dataset.
# Series.sum() is vectorised; the builtin sum() would loop over the values in Python.
# Turnover is summed once and reused for both profit and margin.

turnover = df['TurnoverLead'].sum()
profit = turnover - df['BuyinLead'].sum()
margin = profit / turnover
absolut_margin = profit * margin

print(profit)
print(margin)
print(absolut_margin)

In [None]:
# Total CLV and CLV after margin.
# 'margin' comes from the margin calculation; a fixed number can be used instead of 'margin'.

value_per_customer = avg_order_value * purchase_frequency
total_clv = value_per_customer / churn_rate
total_clv_margin = total_clv * margin

# first number: the total CLV
print(total_clv)
# second number: the CLV taking the calculated margin into account
print(total_clv_margin)

In [None]:
# Put the figures above into a table so the data can be used in Data Studio
set_total_clv = avg_order_value, purchase_frequency, churn_rate, total_clv, total_clv_margin 
# One column 'Waarde' with one labelled row per figure
df_total_clv = pd.DataFrame(set_total_clv, columns= ['Waarde'], index = ['Gemiddelde orderwaarde' , 'Frequentie aankoop', 'Churn rate' , 'CLV', 'CLV marge'])
# Reshape into a wide table with the figure labels as (sorted) columns.
# NOTE(review): this chain looks equivalent to transposing df_total_clv and sorting its columns - confirm before simplifying
df_clv_values=df_total_clv.T.groupby(level=0).agg(lambda x : x.values.tolist()).stack().apply(pd.Series).unstack().sort_index(level=1,axis=1)
# Drop the outer column level that the reshape introduced
df_clv_values.columns=df_clv_values.columns.droplevel(level=0)
df_clv_values.head()

In [None]:
# Write the CLV figures to an Excel file in the outgoing folder
df_clv_values.to_excel(path_outgoing + 'CLV_waardes.xlsx')

# Churn rate per maand uiteenzetten

In [None]:
# Number of orders per customer per month (0 where a customer has no order in a month)
df_month = df.set_index('Orderdate')
month_bins = pd.Grouper(freq='M')
df_monthly = df_month.pivot_table(
    values='OrderID',
    index=['email'],
    columns=month_bins,
    aggfunc='count',
    fill_value=0,
)

df_monthly.head()

In [None]:
# Per month: customers with more than one order divided by customers with at least one order
repeat_customers = (df_monthly > 1).sum()
active_customers = (df_monthly > 0).sum()
repeat_rate = repeat_customers / active_customers
df_churn = repeat_rate.to_frame(name='repeat_rate')

In [None]:
# Churn rate per month is the complement of the repeat rate
df_churn['churn_rate'] = 1 - df_churn['repeat_rate']
df_churn.head()


# Churn per 6 maanden en per jaar

In [None]:
# Churn per 6 months: number of orders per customer per 6-month bin (0 where there is none)
half_year_bins = pd.Grouper(freq='6M')
df_6_months = df_month.pivot_table(
    values='OrderID',
    index=['email'],
    columns=half_year_bins,
    aggfunc='count',
    fill_value=0,
)

df_6_months.head()

In [None]:
# Per 6-month bin: customers with more than one order divided by customers with at least one order
repeat_customers = (df_6_months > 1).sum()
active_customers = (df_6_months > 0).sum()
repeat_rate = repeat_customers / active_customers
df_churn_6_months = repeat_rate.to_frame(name='repeat_rate_6_months')

In [None]:
# Churn rate per 6-month bin is the complement of the repeat rate
df_churn_6_months['churn_rate_6_months'] = 1 - df_churn_6_months['repeat_rate_6_months']
df_churn_6_months

In [None]:
# Churn per year: number of orders per customer per year (0 where there is none)
year_bins = pd.Grouper(freq='Y')
df_year = df_month.pivot_table(
    values='OrderID',
    index=['email'],
    columns=year_bins,
    aggfunc='count',
    fill_value=0,
)

df_year.head()

In [None]:
# Per year: customers with more than one order divided by customers with at least one order
repeat_customers = (df_year > 1).sum()
active_customers = (df_year > 0).sum()
repeat_rate = repeat_customers / active_customers
df_churn_year = repeat_rate.to_frame(name='repeat_rate_year')

In [None]:
# Churn rate per year is the complement of the repeat rate
df_churn_year['churn_rate_year'] = 1 - df_churn_year['repeat_rate_year']
df_churn_year

# Transacties, uitgaves en unieke klanten per maand

In [None]:
# Use Orderdate as the index.
# The check makes the cell safe to re-run: once Orderdate is the index it is no longer
# a column, and setting it again would raise a KeyError.
if 'Orderdate' in df.columns:
    df = df.set_index('Orderdate')

In [None]:
# Per month: total number of orders, total revenue and number of unique customers.
# The aggregations are passed by name so pandas uses its own implementations; recent
# pandas versions warn when the builtin `sum` is passed as a callable.
# 'count' skips missing OrderIDs, so it equals the row count as long as OrderID has no
# missing values.

df_transacties = df.groupby(pd.Grouper(freq='M')).agg(
    {
        'OrderID': 'count',
        'TurnoverLead': 'sum',
        'email': 'nunique',
    }
)

df_transacties.head()

In [None]:
# Column overview of the monthly table
df_transacties.info()

In [None]:
# Rename the columns (positional: the order must match the columns of df_transacties)
df_transacties.columns = [
    'total_transactions',
    'total_spent',
    'total_unique_customers',
]
df_transacties.head(5)

In [None]:
# Summary statistics of the monthly table
df_transacties.describe()

In [None]:
# Write the monthly overview to an Excel file in the outgoing folder
df_transacties.to_excel(path_outgoing + 'Overzicht_per_maand.xlsx')

# CLV per maand berekenen met churn per maand

In [None]:
# CLV per month.
# 'margin' is calculated over the whole dataset; it could also be calculated per month.
orders = df_transacties['total_transactions']
df_transacties['purchase_frequency_month'] = orders / df_transacties['total_unique_customers']
df_transacties['avg_order_value_month'] = df_transacties['total_spent'] / orders
value_per_customer = df_transacties['avg_order_value_month'] * df_transacties['purchase_frequency_month']
# The division aligns the monthly figures with the monthly churn rate on the index
df_transacties['CLV'] = value_per_customer / df_churn['churn_rate']
df_transacties['CLV_margin'] = df_transacties['CLV'] * margin
df_transacties.head()

In [None]:
# Separate table used to calculate the running average CLV:
#   - rows with missing values are dropped
#   - 'Index' numbers the remaining rows 1..n and serves as the divisor of the running sums
df_clv_cum = df_transacties[['CLV', 'CLV_margin']].copy()
df_clv_cum.dropna(inplace=True)
df_clv_cum['Index'] = np.arange(len(df_clv_cum)) + 1
row_number = df_clv_cum['Index']
df_clv_cum['CLV_AVG'] = df_clv_cum['CLV'].cumsum() / row_number
df_clv_cum['CLV_margin_AVG'] = df_clv_cum['CLV_margin'].cumsum() / row_number
df_clv_cum.head()

# CLV per maand berekenen met churn per 6 maanden

In [None]:
# Helpers that look up the churn rate of the 6-month / yearly period a given month falls in.
# Without them the 6-month churn rate only lines up with the last month of each period
# and the yearly churn rate only with December; the helpers fill in the months before.
def find_churn_rate_6_months(date):
    """
    Return the 6-month churn rate that applies to ``date``.

    Walks through ``df_churn_6_months['churn_rate_6_months']`` in index order and
    returns the churn rate of the first entry whose index date is on or after
    ``date``. E.g. July <= October, so October's churn rate is returned.

    Returns None when ``date`` lies after every index date.
    """
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    for period_end, rate in df_churn_6_months['churn_rate_6_months'].items():
        if date <= period_end:
            return rate
    return None
        
def find_churn_rate_1_year(date):
    """
    Return the yearly churn rate that applies to ``date``.

    Walks through ``df_churn_year['churn_rate_year']`` in index order and returns
    the churn rate of the first entry whose index date is on or after ``date``.

    Returns None when ``date`` lies after every index date.
    """
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    for year_end, rate in df_churn_year['churn_rate_year'].items():
        if date <= year_end:
            return rate
    return None

In [None]:
# CLV with the churn rate calculated per 6 months.
# First look up, for every month in the index, the 6-month churn rate that applies to it.
months = df_transacties.index.to_series()
df_transacties['churn_rate_6_months'] = months.apply(find_churn_rate_6_months)
value_per_customer = df_transacties['avg_order_value_month'] * df_transacties['purchase_frequency_month']
df_transacties['CLV_churn_6M'] = value_per_customer / df_transacties['churn_rate_6_months']
df_transacties['CLV_margin_churn_6M'] = df_transacties['CLV_churn_6M'] * margin
df_transacties.head(10)

# CLV per maand berekenen met churn per jaar

In [None]:
# CLV per month with the churn rate calculated over a year.
# First look up, for every month in the index, the yearly churn rate that applies to it.
months = df_transacties.index.to_series()
df_transacties['churn_rate_1_year'] = months.apply(find_churn_rate_1_year)
value_per_customer = df_transacties['avg_order_value_month'] * df_transacties['purchase_frequency_month']
df_transacties['CLV_churn_Y'] = value_per_customer / df_transacties['churn_rate_1_year']
df_transacties['CLV_margin_churn_Y'] = df_transacties['CLV_churn_Y'] * margin
df_transacties.head(10)

# CLV gemiddelde toevoegen met churn per 6 maanden en jaar

In [None]:
# Add the running average CLV based on the churn rate per 6 months
df_clv_cum_6M = df_transacties[['CLV_churn_6M', 'CLV_margin_churn_6M']].copy()
df_clv_cum_6M.dropna(inplace=True)
# NOTE(review): the divisor numbers the rows of df_clv_cum, not of df_clv_cum_6M -
# this assumes both tables dropped the same rows; confirm.
row_number = df_clv_cum['Index']
df_clv_cum['CLV_AVG_6M'] = df_clv_cum_6M['CLV_churn_6M'].cumsum() / row_number
df_clv_cum['CLV_margin_AVG_6M'] = df_clv_cum_6M['CLV_margin_churn_6M'].cumsum() / row_number
df_clv_cum.head()

In [None]:
# Add the running average CLV based on the churn rate per year
df_clv_cum_Y = df_transacties[['CLV_churn_Y', 'CLV_margin_churn_Y']].copy()
df_clv_cum_Y.dropna(inplace=True)
# NOTE(review): the divisor numbers the rows of df_clv_cum, not of df_clv_cum_Y -
# this assumes both tables dropped the same rows; confirm.
row_number = df_clv_cum['Index']
df_clv_cum['CLV_AVG_Y'] = df_clv_cum_Y['CLV_churn_Y'].cumsum() / row_number
df_clv_cum['CLV_margin_AVG_Y'] = df_clv_cum_Y['CLV_margin_churn_Y'].cumsum() / row_number
df_clv_cum.head()

In [None]:
# Keep a table with only the running averages.
# CLV_AVG uses the churn per month, *_6M the churn over 6 months, *_Y the churn over a year.
# The margin is calculated over the whole dataset and the same margin is used in every
# calculation (it could also be calculated per month, per 6 months, per year, etc.).
average_columns = [
    'CLV_AVG',
    'CLV_margin_AVG',
    'CLV_AVG_6M',
    'CLV_margin_AVG_6M',
    'CLV_AVG_Y',
    'CLV_margin_AVG_Y',
]
df_clv_avg = df_clv_cum[average_columns].copy()
df_clv_avg.head(10)

In [None]:
# Write the running averages to an Excel file in the outgoing folder
df_clv_avg.to_excel(path_outgoing + 'Gemiddeldes CLV per maand.xlsx')