# Preparation

In [None]:
#Git Repo Hammam Tugas Akhir
import csv
import regex
import datetime as dt
from datetime import timedelta
import locale
locale.setlocale(locale.LC_TIME, 'id-ID.UTF-8')

import pandas as pd
import numpy as np
from scipy import stats
from scipy.spatial.distance import cdist

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

import warnings
warnings.filterwarnings("ignore")
#from feature_engine.outlier_removers import Winsorizer

## Import dataset

In [None]:
# Load the raw transaction-detail export used for the whole analysis.
df = pd.read_csv("Coret-Coret Marketing_Transaction Details (TA Hammam)_2006-2105.csv")
# Alternative extracts (presumably other date ranges, judging by the file names):
# df = pd.read_csv("Coret-Coret Marketing_Transaction Details (TA Hammam)_2003-2105.csv")
# df = pd.read_csv("Coret-Coret Marketing_Transaction Details (TA Hammam)_1904-2104.csv")

# DataFrame.info() prints its report and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output.
df.info()

# Keep a snapshot of the data exactly as loaded, then preview the first rows.
df.to_csv("Hasil/Data_TA.csv")
df.head(10)

In [None]:
# Size of the raw data: shape, distinct customers (phone numbers), distinct orders.
print("Dataset dimension :", df.shape)
print('Jumlah Customer :', len(pd.unique(df['telp'])))
print('Jumlah Transaksi :', len(pd.unique(df['no_order'])))

# Cleaning

In [None]:
# Remove attributes the analysis does not use.
# DataFrame.drop returns a new frame; without assigning it back the columns
# were never actually removed.
# 'qty' is deliberately kept: the market-basket section selects it when the
# per-order baskets are built.
# NOTE(review): with these columns gone, the dropna() in the next cell no
# longer discards rows whose only missing values were in them — confirm the
# resulting row counts are the intended ones.
unused_columns = ['transaction_type', 'ID CL', 'nama_komunitas', 'username', 'useremail', 'prod_id', 'harga']
df = df.drop(columns=unused_columns)

## Clean Telp

In [None]:
# Drop rows that have a missing value in any column.
df = df.dropna()

# Standardise phone numbers: strip separators, then turn the local "08..."
# prefix into the international "628..." form.
# regex=False states the literal intent explicitly: '+' is a regex quantifier
# and the default of `regex` differs between pandas versions.
df['telp'] = df['telp'].str.replace('-', '', regex=False)
df['telp'] = df['telp'].str.replace(' ', '', regex=False)
df['telp'] = df['telp'].str.replace('+', '', regex=False)
# Anchor the rewrite at the start of the string. Replacing the first "08"
# found anywhere corrupts numbers that already start with 62 but contain
# "08" further along.
df['telp'] = df['telp'].str.replace(r'^08', '628', regex=True)
df = df[df['telp'].str.startswith('628')]

# Remove the admin phone number and known placeholder values.
df = df[~df['telp'].isin(['6281381306699', '0'])]
df = df[~df['telp'].str.contains('1234567')]

# Drop numbers containing anything other than digits.
df = df[~df['telp'].str.contains(r'[^0-9]')]

# Keep only plausible lengths (11 to 14 characters).
telp_length = df['telp'].str.len()
df = df[(telp_length > 10) & (telp_length < 15)]

## Clean Name, Channel, Product

In [None]:
# Remove stock adjustments, test orders and internal-use entries (matched on customer name).
testing = ['test', 'testing', 'COBAAA', 'tes123', 'TEXTING', 'penyesuaian', 'penyelamatan', 'stok', 'stock', 'tamu', 'b2b']
internal_pattern = '|'.join(testing)
df = df.loc[~df['name'].str.contains(internal_pattern)]

# Remove legacy B2B customers (matched on customer name).
B2B_cust = ['hotel', 'resto', 'restoran', 'cafe', 'coffee', 'Toko', 'toko', 'Patata', 'Geprek', 'Laziza', 'ayam', 'nasi', 'sego', 'Baksoe', 'Bakso', 'Rismart', 'Nu Mart', 'Warung', 'Rice box','E-Warung', 'ewarung', 'Kedai', 'geprek', 'nelongso', 'pkk', 'pkh']
b2b_customer_pattern = '|'.join(B2B_cust)
df = df.loc[~df['name'].str.contains(b2b_customer_pattern)]

# Remove B2B products (matched on product name).
B2Bproduct = ['B2B']
b2b_product_pattern = '|'.join(B2Bproduct)
df = df.loc[~df['produk'].str.contains(b2b_product_pattern)]

# Keep only the sales channels under study.
channel = ['CL', 'End User', 'Shopee']
df = df.loc[df['Channel'].isin(channel)]

# Keep only orders that were actually delivered.
df = df.loc[df['delivery_status'] == 'delivered']

# Remove outliers: donation / staple-food packages and wholesale orders.
# TODO: replace this keyword list with a data-driven technique.
donasi = ['Donasi','donasi','Pahlawan', 'Pangan', 'garda', 'sembako', 'psbb ']
donation_pattern = '|'.join(donasi)
df = df.loc[~df['name'].str.contains(donation_pattern)]
df = df.loc[df['subtotal'].gt(0)]

In [None]:
# Size of the data after cleaning: shape, distinct customers, distinct orders.
print("Dataset dimension :", df.shape)
print('Jumlah Customer :', len(pd.unique(df['telp'])))
print('Jumlah Transaksi :', len(pd.unique(df['no_order'])))

In [None]:
# Parse delivery_date strings ("day abbreviated-month year") into datetimes.
# %b is locale-dependent; LC_TIME is set in the import cell.
parsed_delivery_dates = df['delivery_date'].apply(
    lambda raw_date: dt.datetime.strptime(raw_date, '%d %b %Y')
)
df_clean = df.assign(delivery_date=parsed_delivery_dates)

df_clean.to_csv("Hasil/Data_TA Clean.csv")
df_clean.info()

# Generate LRFM

In [None]:
# Collapse line items into one row per order (invoice), summing the subtotal.
order_keys = ['no_order', 'telp', 'delivery_date']
df_group = df_clean.groupby(order_keys, as_index=False)['subtotal'].sum()
df_group

In [None]:
# Aggregate orders per customer (phone number) into the LRFM variables.
# The snapshot date is one day after the latest delivery in the data.
snapshot_date = df_clean['delivery_date'].max() + timedelta(days=1)

orders_by_customer = df_group.groupby('telp')
lrfm = orders_by_customer.agg(
    # Length: days from the customer's first order to the snapshot date
    Length=pd.NamedAgg(column='delivery_date',
                       aggfunc=lambda dates: (snapshot_date - dates.min()).days),
    # Recency: days from the customer's last order to the snapshot date
    Recency=pd.NamedAgg(column='delivery_date',
                        aggfunc=lambda dates: (snapshot_date - dates.max()).days),
    # Frequency: number of orders
    Frequency=pd.NamedAgg(column='no_order', aggfunc='count'),
    # Monetary: total spend
    Monetary=pd.NamedAgg(column='subtotal', aggfunc='sum'),
)
lrfm.to_csv("Hasil/Data_TA LRFM.csv")
display(lrfm)
lrfm.describe()

In [None]:
#Check Outlier on Frequency
# display(sns.boxplot(lrfm['Frequency']))

# #delete outlier
# lrfm = lrfm[~(lrfm['Frequency'] > 57)]
# sns.boxplot(x=lrfm['Frequency'])

In [None]:
def check_skew(df_skew, column):
    """Plot the histogram of one column and print its skewness statistics.

    The histogram is drawn on the current matplotlib axes, so callers can
    position it with ``plt.subplot`` beforehand.

    Parameters
    ----------
    df_skew : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to inspect.

    Returns
    -------
    None
    """
    values = df_skew[column]
    skew = stats.skew(values)
    skewtest = stats.skewtest(values)

    plt.title('Distribution of ' + column)
    # sns.distplot is deprecated in favour of histplot; use the replacement
    # when the installed seaborn provides it and fall back otherwise.
    # (Default bin selection may differ slightly between the two.)
    if hasattr(sns, 'histplot'):
        sns.histplot(values, kde=False)
    else:
        sns.distplot(values, kde=False)

    print("{}'s: Skew: {}, : {}".format(column, skew, skewtest))
    return

def norm_minmax(df):
    """Min-max scale every column of ``df`` to the range [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns, where each column is
        ``(x - min) / (max - min)``. A constant column (max == min) maps to
        0 instead of the NaN that a 0/0 division would produce.
    """
    col_min = df.min()
    col_range = df.max() - col_min
    # Guard against division by zero for constant columns: dividing the
    # all-zero numerator by 1 yields 0 for every row.
    safe_range = col_range.where(col_range != 0, 1)
    return (df - col_min) / safe_range

In [None]:
# Skewness of the four raw LRFM variables, one histogram per row.
plt.figure(figsize=(9, 9))

for panel_no, metric in enumerate(['Length', 'Recency', 'Frequency', 'Monetary'], start=1):
    plt.subplot(4, 1, panel_no)
    check_skew(lrfm, metric)

plt.tight_layout()
plt.savefig('Grafik/before_transform.png', format='png', dpi=1000)

In [None]:
# The skewness of Frequency and Monetary is implausibly large

## Transformation & Normalization

In [None]:
# First transformation pass, applied to a copy of the raw LRFM table:
# square root for Recency, log10(x + 1) for Frequency and Monetary.
lrfm_trans = lrfm.copy()

lrfm_trans['Recency'] = np.sqrt(lrfm_trans['Recency'])
for metric in ('Frequency', 'Monetary'):
    lrfm_trans[metric] = np.log10(lrfm_trans[metric] + 1)

# Re-check the skewness of all four variables.
plt.figure(figsize=(10, 10))
for panel_no, metric in enumerate(['Length', 'Recency', 'Frequency', 'Monetary'], start=1):
    plt.subplot(4, 1, panel_no)
    check_skew(lrfm_trans, metric)

plt.tight_layout()
plt.savefig('Grafik/after_transform-1.png', format='png', dpi=1000)

In [None]:
# Second pass: an additional square root on Frequency and Monetary.
# Note: this cell transforms lrfm_trans in place, so run it exactly once.
for metric in ('Frequency', 'Monetary'):
    lrfm_trans[metric] = np.sqrt(lrfm_trans[metric])

# Re-check the skewness of all four variables.
plt.figure(figsize=(10, 10))
for panel_no, metric in enumerate(['Length', 'Recency', 'Frequency', 'Monetary'], start=1):
    plt.subplot(4, 1, panel_no)
    check_skew(lrfm_trans, metric)

plt.tight_layout()
plt.savefig('Grafik/after_transform-2.png', format='png', dpi=1000)

In [None]:
# Third pass: one more square root on Frequency only.
# Note: this cell transforms lrfm_trans in place, so run it exactly once.
frequency_so_far = lrfm_trans['Frequency']
lrfm_trans['Frequency'] = np.sqrt(frequency_so_far)

# Re-check the skewness of all four variables.
plt.figure(figsize=(10, 10))
for panel_no, metric in enumerate(['Length', 'Recency', 'Frequency', 'Monetary'], start=1):
    plt.subplot(4, 1, panel_no)
    check_skew(lrfm_trans, metric)

plt.tight_layout()
plt.savefig('Grafik/after_transform-3.png', format='png', dpi=1000)

In [None]:
# Fourth pass: yet another square root on Frequency only.
# Note: this cell transforms lrfm_trans in place, so run it exactly once.
frequency_so_far = lrfm_trans['Frequency']
lrfm_trans['Frequency'] = np.sqrt(frequency_so_far)

# Re-check the skewness of all four variables.
plt.figure(figsize=(10, 10))
for panel_no, metric in enumerate(['Length', 'Recency', 'Frequency', 'Monetary'], start=1):
    plt.subplot(4, 1, panel_no)
    check_skew(lrfm_trans, metric)

plt.tight_layout()
plt.savefig('Grafik/after_transform-4.png', format='png', dpi=1000)

In [None]:
# Min-max normalisation of the transformed LRFM table.
norm_lrfm = norm_minmax(lrfm_trans)

# Recency is the only variable where a smaller raw value is better, so it is
# flipped to 1 - R to point in the same direction as the other variables.
norm_lrfm['Recency'] = norm_lrfm['Recency'].rsub(1)

norm_lrfm.describe()

# K-Means Clustering

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

from sklearn.cluster import KMeans

from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score

from yellowbrick.cluster import SilhouetteVisualizer
from kneed import KneeLocator

## Find K Optimum

In [None]:
# Elbow method: fit K-Means for K = 1..9 on the normalised LRFM table and
# record two curves — inertia (SSE) and distortion (mean distance of each
# point to its nearest centroid).
K_elbow = range(1,10)
inertias = []
distortions = []

for k in K_elbow:
    # Build and fit the model
    kmeanModel = KMeans(n_clusters=k, n_init=10, max_iter=100, random_state=123)
    kmeanModel.fit(norm_lrfm)

    inertias.append(kmeanModel.inertia_)
    # Distances must be measured on the data the model was fitted on
    # (norm_lrfm). Using the raw lrfm table compares unscaled values with
    # centroids that live in the normalised space.
    nearest_centroid_dist = cdist(norm_lrfm, kmeanModel.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_centroid_dist.sum() / norm_lrfm.shape[0])

# One plot per curve: (values, y-axis label, title, output file).
elbow_plots = [
    (inertias, 'SSE', 'The Elbow Method using SSE',
     'Grafik/Elbow Method using Inertia.png'),
    (distortions, 'Distortion', 'The Elbow Method using Distortion',
     'Grafik/Elbow Method using distortion.png'),
]

for scores, y_label, plot_title, out_path in elbow_plots:
    kn = KneeLocator(K_elbow, scores, curve='convex', direction='decreasing')
    print("Elbow at K =", kn.knee)

    plt.plot(K_elbow, scores, 'bx-')
    plt.xlabel('Values of K')
    plt.ylabel(y_label)
    plt.title(plot_title)
    # KneeLocator reports None when no knee is detected; skip the marker then.
    if kn.knee is not None:
        plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')
    plt.savefig(out_path, format='png', dpi=1000)
    plt.show()

In [None]:
# Internal validity indices for K = 2..9 on the normalised LRFM table.
K = range(2,10)
# fig, ax = plt.subplots(4, 2, figsize=(15,15))

euclidean = []   # silhouette score (euclidean metric)
ch_index = []    # Calinski-Harabasz index
db_index = []    # Davies-Bouldin index

for k in K:
    # Build and fit the model, then score its labelling with each index
    kmeanModel = KMeans(n_clusters=k, n_init=10, max_iter=100, random_state=123)
    labels_k = kmeanModel.fit(norm_lrfm).labels_

    euclidean.append(silhouette_score(norm_lrfm, labels_k, metric='euclidean'))

    # Optional per-K silhouette diagram (disabled):
    # q, mod = divmod(k, 2)
    # visualizer = SilhouetteVisualizer(kmeanModel, colors='yellowbrick', ax=ax[q-1][mod])
    # visualizer.fit(norm_lrfm)

    ch_index.append(calinski_harabasz_score(norm_lrfm, labels_k))
    db_index.append(davies_bouldin_score(norm_lrfm, labels_k))


In [None]:
# Silhouette score per K; the dashed line marks the K with the highest score.
xpos, ymax = max(enumerate(euclidean), key=lambda pair: pair[1])
xmax = K[xpos]

print("Highest Euclidean Value = %s at K=%s" % (ymax, xmax))

plt.plot(K, euclidean, 'bo-')
plt.xlabel('Values of K')
plt.ylabel('Euclidean')
plt.title('The Silhouette Method using Euclidean Distance')
y_low, y_high = plt.ylim()
plt.vlines(xmax, y_low, y_high, linestyles='dashed')
plt.savefig('Grafik/Silhouette Method using Euclidean Distance.png', format='png', dpi=1000)
plt.show()

In [None]:
# Calinski-Harabasz index per K; the dashed line marks the K with the highest value.
xpos, ymax = max(enumerate(ch_index), key=lambda pair: pair[1])
xmax = K[xpos]

print("Highest CH-Index Value = %s at K=%s" % (ymax, xmax))

plt.plot(K, ch_index, 'bo-')
plt.xlabel('Values of K')
plt.ylabel('CH Index')
plt.title('CH Index for k=2 to k=9')
y_low, y_high = plt.ylim()
plt.vlines(xmax, y_low, y_high, linestyles='dashed')
plt.savefig('Grafik/CH Index.png', format='png', dpi=1000)
plt.show()

In [None]:
# Davies-Bouldin index per K; lower is better, so the dashed line marks the
# K with the lowest value.
ymin = min(db_index)
xpos = db_index.index(ymin)
xmin = K[xpos]

# Report the DB minimum itself (previously this printed `ymax`, a value left
# over from the CH-index cell).
print("Lowest DB Index Value = %s at K=%s" % (ymin, xmin,))
plt.plot(K, db_index, 'bo-')
plt.xlabel('Values of K')
plt.ylabel('DB Index')
plt.title('Davis Bouldin Index for k=2 to k=9')
plt.vlines(xmin, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')
plt.show()

## Clustering!!

In [None]:
#Set K Value
K_fix = 3

# Fit the final K-Means model on the normalised LRFM table and attach the
# 1-based cluster label to the original (untransformed) LRFM table.
# This is done inline because the kmeans() helper is only defined in a later
# cell, so calling it here raises NameError on a fresh "Restart & Run All".
final_model = KMeans(n_clusters=K_fix, random_state=123)
final_model.fit(norm_lrfm)
df_cluster_fix = lrfm.assign(Cluster=final_model.labels_ + 1)

In [None]:
# Put the real LRFM values (plus cluster label) and the normalised values
# side by side; overlapping column names get the _real / _norm suffixes.
df_lrfm_all = df_cluster_fix.merge(
    norm_lrfm,
    on='telp',
    suffixes=('_real', '_norm'),
)

df_lrfm_all.to_csv("Hasil/Data_TA Clustered_LRFM.csv")
df_lrfm_all

In [None]:
def kmeans(normalised_df_lrfm, clusters_number, original_df_lrfm):
    """Cluster the normalised LRFM table and label the original one.

    Parameters
    ----------
    normalised_df_lrfm : pandas.DataFrame
        Data the K-Means model is fitted on.
    clusters_number : int
        Number of clusters.
    original_df_lrfm : pandas.DataFrame
        Frame that receives the labels; it must have the same row order as
        ``normalised_df_lrfm``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``original_df_lrfm`` with an extra ``Cluster`` column holding
        1-based cluster labels.
    """
    model = KMeans(n_clusters=clusters_number, random_state=123)
    model.fit(normalised_df_lrfm)

    # scikit-learn labels start at 0; shift them so clusters are numbered from 1.
    one_based_labels = model.labels_ + 1
    return original_df_lrfm.assign(Cluster=one_based_labels)

def snake_plot(normalised_df_lrfm, df_lrfm_kmeans, df_lrfm_original):
    """Draw a snake plot of the normalised LRFM values per cluster.

    Parameters
    ----------
    normalised_df_lrfm : pandas.DataFrame
        Normalised LRFM values.
    df_lrfm_kmeans : pandas.DataFrame
        Frame providing the ``Cluster`` column.
    df_lrfm_original : pandas.DataFrame
        Frame whose index and columns define the layout of the plotted data.

    Returns
    -------
    None
        The plot is drawn on the current matplotlib axes.
    """
    plot_frame = pd.DataFrame(normalised_df_lrfm,
                              index=df_lrfm_original.index,
                              columns=df_lrfm_original.columns)
    plot_frame['Cluster'] = df_lrfm_kmeans['Cluster']

    # Long format: one row per (customer, metric) pair.
    long_frame = plot_frame.reset_index().melt(
        id_vars=['telp', 'Cluster'],
        value_vars=['Length', 'Recency', 'Frequency', 'Monetary'],
        var_name='Metric',
        value_name='Value',
    )

    plt.xlabel('Metric')
    plt.ylabel('Value')
    sns.pointplot(data=long_frame, x='Metric', y='Value', hue='Cluster')

def threeD_plot(df_merge, ax=None):
    """Scatter the clusters in 3D: Recency x Frequency x Monetary (normalised).

    Marker size encodes normalised Length and colour encodes the cluster.

    Parameters
    ----------
    df_merge : pandas.DataFrame
        Must contain ``Recency_norm``, ``Frequency_norm``, ``Monetary_norm``,
        ``Length_norm`` and ``Cluster``.
    ax : matplotlib 3D axes, optional
        Axes to draw on. When omitted, the current axes are used if they are
        3D; otherwise a 3D subplot is added to the current figure. This
        removes the previous reliance on a global variable named ``ax``.

    Returns
    -------
    None
    """
    # Imported for its side effect: some older matplotlib releases register
    # the '3d' projection only once mplot3d has been imported.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    if ax is None:
        fig = plt.gcf()
        ax = fig.gca() if fig.axes else None
        if getattr(ax, 'name', None) != '3d':
            ax = fig.add_subplot(111, projection='3d')

    x = np.array(df_merge['Recency_norm'])
    y = np.array(df_merge['Frequency_norm'])
    z = np.array(df_merge['Monetary_norm'])

    ax.set_xlabel('Recency')
    ax.set_ylabel('Frequency')
    # pyplot has no zlabel(); the z label has to be set on the 3D axes.
    ax.set_zlabel('Monetary')

    ax.scatter(x, y, z, s=(df_merge['Length_norm']*40), marker="s", c=df_merge["Cluster"], cmap="rainbow")

    return


In [None]:
# Snake plot of the normalised LRFM values for the chosen K.
plt.figure(figsize=(16, 8))
snake_title = "Snake Plot of K = {}".format(K_fix)
plt.title(snake_title)
snake_plot(norm_lrfm, df_cluster_fix, lrfm)
plt.savefig('Grafik/Cluster_is_3-snakeplot.png', format='png', dpi=300)

In [None]:
# 3D scatter of the clusters for the chosen K.
plt.figure(figsize=(16, 9))

# threeD_plot draws on these 3D axes.
ax = plt.axes(projection='3d')
plot_title_3d = "3D Plot of K = {}".format(K_fix)
plt.title(plot_title_3d)
threeD_plot(df_lrfm_all)
plt.savefig('Grafik/Cluster_is_3-3dplot.png', format='png', dpi=300)

In [None]:
# Box plots of the real (untransformed) LRFM values per cluster.
# Each entry: (column, x extent of the overall-mean line, whisker length,
#              log-scaled y axis?, title, output file)
real_boxplots = [
    ('Length_real', [-0.5, 2.5], 10, False,
     'Boxplot for Length each cluster', 'Grafik/boxplot L.png'),
    ('Recency_real', [0, 2], 10, False,
     'Boxplot for Recency each cluster', 'Grafik/boxplot R.png'),
    ('Frequency_real', [0, 2], 100, True,
     'Boxplot for Frequency each cluster', 'Grafik/boxplot F.png'),
    ('Monetary_real', [0, 2], 100, True,
     'Boxplot for Monetary each cluster', 'Grafik/boxplot M.png'),
]

for column, mean_line_x, whisker, use_log_y, box_title, out_path in real_boxplots:
    fig = plt.figure(figsize=(10, 5))
    if use_log_y:
        plt.yscale("log")
    ax = sns.boxplot(x="Cluster", y=column, data=df_lrfm_all, width=0.3, whis=whisker)
    # Dashed purple line: mean of the variable over all customers.
    overall_mean = df_lrfm_all[column].mean()
    plt.plot(mean_line_x, [overall_mean, overall_mean], 'k-', lw=2, dashes=[5, 2], color='purple')
    plt.title(box_title)
    plt.savefig(out_path, format='png', dpi=300)
    plt.show()

In [None]:
# Box plots of the normalised LRFM values per cluster.
# Each entry: (column, title, output file)
norm_boxplots = [
    ('Length_norm', 'Boxplot for Length each cluster',
     'Grafik/Cluster_is_3-L_boxplot_norm.png'),
    ('Recency_norm', 'Boxplot for Recency each cluster',
     'Grafik/Cluster_is_3-R_boxplot_norm.png'),
    ('Frequency_norm', 'Boxplot for Frequency each cluster',
     'Grafik/Cluster_is_3-F_boxplot_norm.png'),
    ('Monetary_norm', 'Boxplot for Monetary each cluster',
     'Grafik/Cluster_is_3-M_boxplot_norm.png'),
]

for column, box_title, out_path in norm_boxplots:
    fig = plt.figure(figsize=(10, 5))
    ax = sns.boxplot(x="Cluster", y=column, data=df_lrfm_all)
    plt.title(box_title)
    plt.savefig(out_path, format='png', dpi=300)
    plt.show()

In [None]:
# Violin plots of the real LRFM values per cluster.
# All four panels are drawn in a single figure that is saved BEFORE
# plt.show(). Previously savefig ran after the last show(), when there was no
# longer a figure holding the plots to write.
# Each entry: (column, title, log-scaled y axis?)
violin_specs = [
    ('Length_real', 'Violin Plot of Length each cluster', False),
    ('Recency_real', 'Violin Plot of Recency each cluster', False),
    ('Frequency_real', 'Violin Plot of Frequency each cluster (log scale)', True),
    ('Monetary_real', 'Violin Plot of Monetary each cluster (log scale)', True),
]

violin_fig, violin_axes = plt.subplots(2, 2, figsize=(16, 8))
for panel, (column, violin_title, use_log_y) in zip(violin_axes.flat, violin_specs):
    sns.violinplot(x='Cluster', y=column, data=df_lrfm_all, scale='width', inner='quartile', ax=panel)
    if use_log_y:
        panel.set_yscale("log")
    panel.set_title(violin_title, fontsize=14)

violin_fig.tight_layout()
violin_fig.savefig('Grafik/Cluster_is_3-Violin.png', format='png', dpi=300)
plt.show()

In [None]:
# Violin plots of the normalised LRFM values per cluster.
# All four panels are drawn in a single figure that is saved BEFORE
# plt.show(). Previously savefig ran after the last show(), when there was no
# longer a figure holding the plots to write.
# Each entry: (column, title)
violin_norm_specs = [
    ('Length_norm', 'Violin Plot of Normalized Length each cluster'),
    ('Recency_norm', 'Violin Plot of Normalized Recency each cluster'),
    ('Frequency_norm', 'Violin Plot of Normalized Frequency each cluster'),
    ('Monetary_norm', 'Violin Plot of Normalized Monetary each cluster'),
]

violin_fig, violin_axes = plt.subplots(2, 2, figsize=(16, 8))
for panel, (column, violin_title) in zip(violin_axes.flat, violin_norm_specs):
    sns.violinplot(x='Cluster', y=column, data=df_lrfm_all, scale='width', inner='quartile', ax=panel)
    panel.set_title(violin_title, fontsize=14)

violin_fig.tight_layout()
violin_fig.savefig('Grafik/Cluster_is_3-Violin_norm.png', format='png', dpi=300)
plt.show()

# Cluster Analysis

In [None]:
# Descriptive statistics of every column, per cluster; the transposed table is exported.
customers_with_cluster = df_cluster_fix.reset_index()
cluster_char = customers_with_cluster.groupby('Cluster').describe()
cluster_char.T.to_csv("Hasil/Data_TA Cluster_characteristics.csv")
cluster_char

In [None]:
# Mean of each real LRFM variable per cluster, rounded to 3 decimals.
df_cluster_fix.groupby('Cluster').mean().round(3)

In [None]:
# Column-wise mean over all customers (real and normalised values), rounded to 3 decimals.
overall_means = df_lrfm_all.mean()
overall_means.round(3)

## Menghitung CLV

In [None]:
# AHP weight of each LRFM variable; the values come from the Excel workbook.
AHP = dict(
    Length=0.052,
    Recency=0.099,
    Frequency=0.362,
    Monetary=0.486,
)

In [None]:
# CLV = AHP-weighted sum of the normalised Length, Recency, Frequency and
# Monetary values (summed in the order the weights are listed in AHP).
df_lrfm_all['CLV'] = sum(
    df_lrfm_all[metric + '_norm'] * weight for metric, weight in AHP.items()
)
display(df_lrfm_all['CLV'].describe())

# Scale by 100 so the scores are easier to read.
# (The earlier comment said 1000, but the factor applied has always been 100.)
df_lrfm_all['CLV'] = df_lrfm_all['CLV']*100
plt.title('Distribution of CLV')
# sns.distplot is deprecated in favour of histplot; use the replacement when
# the installed seaborn provides it and fall back otherwise.
if hasattr(sns, 'histplot'):
    sns.histplot(df_lrfm_all['CLV'], kde=False)
else:
    sns.distplot(df_lrfm_all['CLV'], kde=False)
plt.show()

In [None]:
# Preview the first rows of the merged table, now including the CLV column.
df_lrfm_all.head(7)

## Rank CLV

In [None]:
# Per-cluster means of every column, ordered from the highest to the lowest mean CLV.
cluster_means = df_lrfm_all.groupby('Cluster').mean()
cluster_means.sort_values(by='CLV', ascending=False)

In [None]:
def clv_values(df):
    """Summarise the real LRFM values and the CLV score per cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Cluster``, ``Length_real``, ``Recency_real``,
        ``Frequency_real``, ``Monetary_real`` and ``CLV``.

    Returns
    -------
    pandas.DataFrame
        One row per cluster with min / max / mean of each LRFM variable and
        min / max / mean / median of CLV, rounded to 0 decimals.
    """
    basic_stats = ['min', 'max', 'mean']
    lrfm_columns = ('Length_real', 'Recency_real', 'Frequency_real', 'Monetary_real')

    summary_spec = {column: list(basic_stats) for column in lrfm_columns}
    summary_spec['CLV'] = basic_stats + ['median']

    return df.groupby('Cluster').agg(summary_spec).round(0)

In [None]:
# Per-cluster summary table, ordered from the highest to the lowest mean CLV.
clv_values(df_lrfm_all).sort_values(by=[('CLV','mean')], ascending=False)

# Market Basket Analysis

In [None]:
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

## Join Dataframe

In [None]:
# Attach each customer's cluster to the cleaned transaction lines and keep
# only the columns needed for market basket analysis.
basket_columns = ['delivery_date', 'no_order', 'name', 'telp', 'produk', 'qty', 'Cluster']
df_basket_all = df_clean.merge(df_cluster_fix, on='telp', how='left')
df_basket_all = df_basket_all[basket_columns].copy()
df_basket_all.to_csv("Hasil/Data_TA Clustered_Transaksi.csv")

# Drop transaction lines whose customer has no cluster
# (e.g. customers removed as frequency outliers).
df_basket_all = df_basket_all.dropna(subset=['Cluster'])

print("Dataset dimension :", df_basket_all.shape)
print('Jumlah Customer :', len(pd.unique(df_basket_all['telp'])))
print('Jumlah Transaksi :', len(pd.unique(df_basket_all['no_order'])))

# Tidy product names and make order numbers strings.
df_basket_all['produk'] = df_basket_all['produk'].str.strip()
df_basket_all['no_order'] = df_basket_all['no_order'].astype('str')

In [None]:
#one hot encode the basket
def encode_units(x):
    """Binarise a purchased quantity for the one-hot basket matrix.

    Parameters
    ----------
    x : number
        Quantity of a product within one order.

    Returns
    -------
    int
        1 when the quantity is positive, 0 otherwise.

    Notes
    -----
    Any positive quantity counts as "bought", including fractional ones.
    Previously a quantity between 0 and 1 matched neither branch and returned
    None, which made the caller's ``dropna`` discard the whole order.
    """
    return 1 if x > 0 else 0

#create FP-Growth MBA for every cluster
def createMBA(basket_data, min_sup) :
    """Mine association rules from transaction lines with FP-Growth.

    Parameters
    ----------
    basket_data : pandas.DataFrame
        Transaction lines with at least ``no_order``, ``telp``, ``produk``
        and ``qty`` columns.
    min_sup : float
        Minimum support, as a fraction of the number of orders.

    Returns
    -------
    pandas.DataFrame
        Association rules sorted by support, highest first. The same frame is
        also stored on ``createMBA.rules`` for callers that read it from there.
    """
    # A transaction is one order, so count distinct order numbers rather than
    # rows (each row is a single product line of an order).
    totalTransactions = basket_data['no_order'].nunique()
    totalCustomers = basket_data['telp'].nunique()

    print('Jumlah Pelanggan = ', totalCustomers)
    print('Jumlah Transaksi yang dianalisis = ', totalTransactions)
    print('Nilai Support Minimum = ', round(min_sup*100, 4), '%')

    # Order x product matrix of total quantities; missing combinations become 0.
    basket = basket_data.groupby(['no_order', 'produk'])['qty'].sum().unstack().fillna(0)
    # One-hot encode: True when the product appears in the order with a
    # positive quantity. Done as a vectorised comparison instead of
    # DataFrame.applymap (deprecated), and fractional quantities no longer
    # cause the whole order to be dropped.
    basket_sets = basket > 0

    # Frequent itemsets at the requested minimum support
    frequent_itemsets = fpgrowth(basket_sets, min_support=min_sup, use_colnames=True)

    # min_threshold=0 on lift keeps every rule that can be generated.
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0)
    rules = rules.sort_values('support', ascending=False)

    createMBA.rules = rules

    return rules

#Apriori
from mlxtend.frequent_patterns import apriori

def aprioriMBA(basket_data, min_sup) :
    """Mine association rules from transaction lines with Apriori.

    Parameters
    ----------
    basket_data : pandas.DataFrame
        Transaction lines with at least ``no_order``, ``telp``, ``produk``
        and ``qty`` columns.
    min_sup : float
        Minimum support, as a fraction of the number of orders.

    Returns
    -------
    pandas.DataFrame
        Association rules sorted by support, highest first. The same frame is
        also stored on ``aprioriMBA.rules`` for callers that read it from there.
    """
    # A transaction is one order, so count distinct order numbers rather than
    # rows (each row is a single product line of an order).
    totalTransactions = basket_data['no_order'].nunique()
    totalCustomers = basket_data['telp'].nunique()

    print('Jumlah Pelanggan = ', totalCustomers)
    print('Jumlah Transaksi yang dianalisis = ', totalTransactions)
    print('Nilai Support Minimum = ', round(min_sup*100, 4), '%')

    # Order x product matrix of total quantities; missing combinations become 0.
    basket = basket_data.groupby(['no_order', 'produk'])['qty'].sum().unstack().fillna(0)
    # One-hot encode: True when the product appears in the order with a
    # positive quantity. Done as a vectorised comparison instead of
    # DataFrame.applymap (deprecated), and fractional quantities no longer
    # cause the whole order to be dropped.
    basket_sets = basket > 0

    # Frequent itemsets at the requested minimum support
    frequent_itemsets = apriori(basket_sets, min_support=min_sup, use_colnames=True)

    # min_threshold=0 on lift keeps every rule that can be generated.
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=0)
    rules = rules.sort_values('support', ascending=False)

    aprioriMBA.rules = rules

    return rules

In [None]:
# FP-Growth market basket analysis per cluster, minimum support 1%.
for i in range(1, K_fix+1) :
    print("Market Basket Analysis for Cluster", i)
    # Transaction lines of the customers in cluster i
    basket_data = df_basket_all[df_basket_all['Cluster'] == i]

    rules = createMBA(basket_data, 0.01)
    mean_support = rules['support'].mean()
    print('rules berhasil dibangkitkan = ', len(rules.index))
    print('rerata support = ', mean_support)

    display(rules.head(5))
    # The separator is passed by keyword: positional arguments after the path
    # are deprecated in DataFrame.to_csv.
    rules.to_csv("Hasil/Data_TA FPGrowth(minsup0.01)_Cluster_%s.csv" % (i,), sep=';')
    print("\n")

In [None]:
# Apriori market basket analysis per cluster, minimum support 1%.
for i in range(1, K_fix+1) :
    print("Market Basket Analysis for Cluster", i)
    # Transaction lines of the customers in cluster i
    basket_data = df_basket_all[df_basket_all['Cluster'] == i]

    rules = aprioriMBA(basket_data, 0.01)
    mean_support = rules['support'].mean()
    print('rules berhasil dibangkitkan = ', len(rules.index))
    print('rerata support = ', mean_support)

    display(rules.head(5))
    # The separator is passed by keyword: positional arguments after the path
    # are deprecated in DataFrame.to_csv.
    rules.to_csv("Hasil/Data_TA Apriori(minsup0.01)_Cluster_%s.csv" % (i,), sep=';')
    print("\n")

In [None]:
# FP-Growth market basket analysis per cluster, minimum support 0.5%.
for i in range(1, K_fix+1) :
    print("Market Basket Analysis for Cluster", i)
    # Transaction lines of the customers in cluster i
    basket_data = df_basket_all[df_basket_all['Cluster'] == i]

    rules = createMBA(basket_data, 0.005)
    print('rules berhasil dibangkitkan = ', len(rules.index))

    display(rules.head(10))

    # All clusters share one scatter plot; the label tells them apart.
    plt.scatter(rules['support'], rules['confidence'], alpha=0.5, label="Cluster %s" % (i,))

    # The separator is passed by keyword: positional arguments after the path
    # are deprecated in DataFrame.to_csv.
    rules.to_csv("Hasil/Data_TA FPGrowth (minsup0.005)_Cluster_%s.csv" % (i,), sep=';')
    print("\n")

# Finish the combined support-vs-confidence plot once every cluster is drawn.
plt.xlabel('support')
plt.ylabel('confidence')
plt.title('Support vs Confidence')
plt.legend()
plt.show()

# Experiment

In [None]:
# Experiment: FP-Growth on all customers together (no clustering), minimum support 1%.
rules = createMBA(df_basket_all, 0.01)
print('rules berhasil dibangkitkan = ', len(rules.index))

display(rules.head(10))
# The file name no longer carries a literal "%s": there is no cluster number
# to substitute in this experiment. The separator is passed by keyword because
# positional arguments after the path are deprecated in DataFrame.to_csv.
rules.to_csv("Hasil/Data_EXP FPGrowth_NO_Cluster.csv", sep=';')
print("\n \n")

In [None]:
# Experiment: FP-Growth per cluster after removing Jabmilk products, minimum support 1%.
jabmilk = ['JABMILK', 'JAB MILK']
df_basket_nonjabmilk = df_basket_all[~df_basket_all['produk'].str.contains('|'.join(jabmilk))]

for i in range(1, K_fix+1) :
    print("Market Basket Analysis (Without Jabmilk) for Cluster", i)
    # Transaction lines of the customers in cluster i
    basket_data = df_basket_nonjabmilk[df_basket_nonjabmilk['Cluster'] == i]

    rules = createMBA(basket_data, 0.01)
    print('rules berhasil dibangkitkan = ', len(rules.index))

    display(rules.head(10))
    # The separator is passed by keyword: positional arguments after the path
    # are deprecated in DataFrame.to_csv.
    rules.to_csv("Hasil/Data_EXP FPGrowth_nonjabmilk_Cluster_%s.csv" % (i,), sep=';')
    print("\n \n")

In [None]:
# Experiment: FP-Growth on all customers together, Jabmilk products removed,
# minimum support 0.7%.
rules = createMBA(df_basket_nonjabmilk, 0.007)
print('rules berhasil dibangkitkan = ', len(rules.index))
# The file name no longer carries a literal "%s": there is no cluster number
# to substitute in this experiment. The separator is passed by keyword because
# positional arguments after the path are deprecated in DataFrame.to_csv.
rules.to_csv("Hasil/Data_EXP FPGrowth_nonjabmilk_NO_Cluster.csv", sep=';')

display(rules.head(10))