In [1]:
# Optional one-time setup: uncomment if the packages are not installed yet.
# %pip install kmodes
# %pip install scikit-learn-extra

In [1]:
import pandas as pd
import numpy as np
import datetime, math
import sys, os, warnings
import matplotlib.pyplot as plt
from kmodes.kprototypes import KPrototypes
from kmodes.kmodes import KModes
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder,MinMaxScaler
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from PlotFigure import plot_values, plot_object
from Autoclass import find_best_cluster

# Silence every warning for the rest of the session, including deprecation
# notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

### Reading file

In [2]:
# Input CSV; it is loaded with pandas in the pre-processing cell.
filename = 'KmeanClustering_sampledata_v3.csv'

### Parameter setting 

In [3]:
# --- Run parameters -------------------------------------------------------
# Every setting is bound unconditionally so that later cells can reference it
# whichever mode is active (previously some names only existed on one branch).

pma = 38                       # value matched against the 'pma_no_fin' column
AutoCluster = True             # True: pick the cluster count automatically
category = ('personal', 'rfm', 'area', 'purchasetime', 'prefer')

method = 'KPrototypes'         # 'KMeans', 'KPrototypes', 'KMedoids', 'Kmodes'

# Manual mode uses a fixed cluster count; in automatic mode the count is left
# as None here and assigned by the "find the best number of clusters" cell.
numofCluster = None if AutoCluster else 4

# Criterion used by the automatic search (ignored in manual mode).
selectMethod = 'elbow'         # 'calinski-harabasz', 'davies-bouldin', 'silhouette', 'bic'

# RFM options (only consulted when 'rfm' is one of the selected categories).
rfm_level = 'median'           # 'median', 'mode', 'average'
select_rfm = True              # True: combine only two of the three RFM flags
select_two = ('f', 'm')        # which two of 'r', 'f', 'm' to combine

In [4]:
# Inspection only: displays the container type of `category` (the output below shows `tuple`).
type(category)

tuple

### Pre-processing

In [5]:
# Normalise the parameter containers to lists.
# `select_two` (and `select_rfm`) are only defined when the RFM options were
# configured, so the conversion is guarded instead of raising NameError.
if 'rfm' in category and select_rfm:
    select_two = list(select_two)
category = list(category)

# Per-category column definitions: each entry provides a 'key' list (column
# names) and a 'format' list ('object' or 'values') used below.
dconfig = pd.read_json('category_config.json')

# Columns that are always loaded, regardless of the selected categories.
readcols = {'key': ['mid', 'pma_no_fin', 'qty', 'avg_qty'],
            'fmt': ['object', 'object', 'values', 'values']}

# Append the columns (and their formats) of every selected category.
for cate in category:
    getkey = dconfig[cate]['key']
    getfmt = dconfig[cate]['format']
    readcols['key'].extend(getkey)
    readcols['fmt'].extend(getfmt)
readcols = pd.DataFrame(readcols)

# Load only the required columns, then keep the rows of the requested PMA.
df = pd.read_csv(filename, usecols=readcols['key'])
df = df[df['pma_no_fin'] == pma]


In [17]:
# Work on a copy so the filtered frame `df` stays untouched. Missing values
# become a single-space placeholder so the label encoders below see a value in
# every cell; the placeholder is turned into 0 once encoding is done.
df_preprocess = df.copy()
df_preprocess = df_preprocess.fillna(' ')

if 'rfm' in category:
    getkey = dconfig['rfm']['key']
    tempdf_rfm = pd.DataFrame()
    rfm_level_key = []
    for idx_rfm in getkey:
        # Coerce to numbers first: the ' ' placeholder (or any other
        # non-numeric entry) becomes NaN instead of breaking the conversion.
        rfm_values = pd.to_numeric(df_preprocess[idx_rfm], errors='coerce')

        # Threshold that splits the column into a "1" and a "0" level.
        if rfm_level == 'average':
            num_split = rfm_values.mean()
        elif rfm_level == 'median':
            num_split = rfm_values.median()
        elif rfm_level == 'mode':
            # numpy has no `mode`; pandas returns all modes sorted ascending,
            # so the smallest one is used when there are ties.
            num_split = rfm_values.mode().iloc[0]
        else:
            raise ValueError(
                f"rfm_level must be 'average', 'median' or 'mode', got {rfm_level!r}")

        idx_rfm_level = f'{idx_rfm}_level'
        rfm_level_key.append(idx_rfm_level)

        # Recency is flagged '1' at or below the threshold, the other columns
        # at or above it. NaN compares False both ways, so it is flagged '0'.
        if 'recency' in idx_rfm:
            is_level_one = rfm_values <= num_split
        else:
            is_level_one = rfm_values >= num_split
        # A plain list is assigned so the flags are stored positionally and
        # are not re-aligned against the filtered index of `df_preprocess`.
        tempdf_rfm[idx_rfm_level] = np.where(is_level_one, '1', '0').tolist()

    # The first three configured RFM columns are taken as r, f and m, in that order.
    r_flags = tempdf_rfm[tempdf_rfm.keys()[0]]
    f_flags = tempdf_rfm[tempdf_rfm.keys()[1]]
    m_flags = tempdf_rfm[tempdf_rfm.keys()[2]]

    # Decide once which flags are concatenated into the combined level code.
    if select_rfm:
        if 'r' not in select_two:
            chosen_flags = (f_flags, m_flags)
        elif 'f' not in select_two:
            chosen_flags = (r_flags, m_flags)
        elif 'm' not in select_two:
            chosen_flags = (r_flags, f_flags)
        else:
            raise ValueError(
                "select_two must leave out one of 'r', 'f', 'm' "
                f"when select_rfm is enabled, got {select_two!r}")
    else:
        chosen_flags = (r_flags, f_flags, m_flags)
    rfmlevel = [''.join(flags) for flags in zip(*chosen_flags)]

    # Store the combined code as an integer-encoded, object-typed column so it
    # is picked up as a categorical feature further down.
    df_preprocess['rfm_level'] = rfmlevel
    labelencoder_rfm = LabelEncoder()
    df_preprocess['rfm_level'] = labelencoder_rfm.fit_transform(df_preprocess['rfm_level'])
    df_preprocess['rfm_level'] = df_preprocess['rfm_level'].astype(object)

# Label-encode every 'object' column except the area columns. Each fitted
# encoder is published as the global `labelencoder_<column>` because the
# per-group export cell looks the encoders up under that name.
for objkey in readcols['key'][readcols['fmt']=='object']:
    if 'area' in objkey:
        continue
    encoder_name = f'labelencoder_{objkey}'
    try:
        column_encoder = LabelEncoder()
        encoded_values = column_encoder.fit_transform(df_preprocess[objkey])
    except Exception as err:
        # Best effort, as before: the column is left as-is, but the failure is
        # now reported and no unfitted/stale encoder is left behind.
        globals().pop(encoder_name, None)
        print(f'Skipping label encoding of {objkey!r}: {err}')
        continue
    globals()[encoder_name] = column_encoder
    df_preprocess[objkey] = encoded_values
    df_preprocess[objkey] = df_preprocess[objkey].astype(object)

# Remaining placeholders (columns that were not label-encoded) become 0.
df_preprocess = df_preprocess.replace(' ',0)

# Feature matrix for clustering: identifiers and the raw RFM columns are
# removed. Only columns that are actually present are dropped, so the cell
# also works when 'rfm' is not among the selected categories.
non_feature_cols = ['mid', 'pma_no_fin', 'rfm_recency', 'rfm_frequency', 'rfm_monetary']
df_4cluster = df_preprocess.drop(
    columns=[col for col in non_feature_cols if col in df_preprocess.columns])

# Names and positional indices of the categorical (object-typed) features.
catobj_Name =  list(df_4cluster.select_dtypes('object').columns)
catobj_Pos = [df_4cluster.columns.get_loc(col) for col in catobj_Name]




### Makedir

In [7]:
# Output directory for this PMA. When it already exists, the previous run's
# results are archived into a timestamped sub-directory instead of being
# overwritten.
savedir = f'PMA-{pma}'
if not os.path.isdir(savedir):
    os.makedirs(savedir)
else:
    # The archive folder is named after the directory's ctime.
    gettime = os.path.getctime(savedir)
    datetimeObj = datetime.datetime.fromtimestamp(gettime)
    dateStr = datetimeObj.strftime('%Y%m%d_%H%M%S')
    archive_dir = os.path.join(savedir, dateStr)
    os.makedirs(archive_dir, exist_ok=True)

    # Move the same entries the former shell globs selected (Group*, *.log,
    # *.png) with the standard library, so nothing is spawned through a shell
    # and a pattern without matches is simply a no-op.
    for entry in sorted(os.listdir(savedir)):
        if entry.startswith('.'):
            continue  # shell globs do not match hidden entries either
        if entry.startswith('Group') or entry.endswith(('.log', '.png')):
            os.replace(os.path.join(savedir, entry),
                       os.path.join(archive_dir, entry))
    

### Find the best number of clusters

In [8]:
# Automatic mode: the helper picks the cluster count using the configured
# selection criterion. In manual mode the cell does nothing.
if AutoCluster:
    numofCluster = find_best_cluster(
        method,
        df_4cluster,
        pma,
        savedir,
        catobj_Pos,
        selectMethod,
    )


### Beginning to cluster

In [9]:
# Fit the selected algorithm on the feature matrix and attach the resulting
# cluster id to a copy of the pre-processed frame.
df_label = df_preprocess.copy()

# Method names are compared case-insensitively.
method_key = method.casefold()
if method_key == 'kmeans':
    model_kmean = KMeans(n_clusters=numofCluster, init='random', random_state=50)
    result = model_kmean.fit(df_4cluster)
    label = result.labels_
elif method_key == 'kprototypes':
    # K-Prototypes needs the positions of the categorical columns.
    model_kproto = KPrototypes(n_clusters=numofCluster, init='Huang', random_state=50)
    label = model_kproto.fit_predict(df_4cluster.to_numpy(), categorical=catobj_Pos)
elif method_key == 'kmedoids':
    model_kmedoids = KMedoids(n_clusters=numofCluster, random_state=50)
    result = model_kmedoids.fit(df_4cluster)
    label = result.labels_
elif method_key == 'kmodes':
    model_kmodes = KModes(n_clusters=numofCluster, random_state=50)
    result = model_kmodes.fit(df_4cluster)
    label = result.labels_
else:
    # Previously an unknown name fell through silently and the missing 'label'
    # column only surfaced as a KeyError in a later cell.
    raise ValueError(
        f"Unknown clustering method {method!r}; expected one of "
        "'KMeans', 'KPrototypes', 'KMedoids', 'KModes'")

df_label['label'] = label

## Descriptive statistics

### Overall 

In [29]:
# Write the run summary (header + size of every group) to the PMA log file and
# plot the numeric features. The log is written directly from Python: the
# former shell script compared `True == 'True'` as a command, so it always
# reported "Manual", and it needed `selectMethod` to exist even in manual mode.
logfile = f'{savedir}/pma_{pma}.log'
lenfile = len(df_label)
printcluster = numofCluster-1

if AutoCluster:
    method_line = f'Cluster method: {method}/Auto/{selectMethod}'
else:
    method_line = f'Cluster method: {method}/Manual'

# Timestamp laid out like the output of the `date` command used before.
now = datetime.datetime.now().astimezone()
timestamp = f'{now:%a %b} {now.day} {now:%H:%M:%S %Z %Y}'

separator = '==========================================='
log_lines = [
    separator,
    timestamp,
    f'PMA: {int(pma)}',
    f'Datalength: {lenfile}',
    # Rendered without quotes, matching what the shell `echo` used to print.
    f"Features: [{', '.join(str(name) for name in category)}]",
    method_line,
    f'Num. of Group: {int(numofCluster)}',
    separator,
]

# One line per cluster id; a cluster without members is reported as 0 instead
# of raising KeyError, and empty input no longer divides by zero.
cnt_grouplen = df_label['label'].value_counts()
for i in range(numofCluster):
    cnt_group = int(cnt_grouplen.get(i, 0))
    pct_group = round(100*(cnt_group/lenfile),2) if lenfile else 0.0
    log_lines.append(f'Group {i}: {cnt_group} ({pct_group:.2f}%)')

# Mode 'w' replaces any previous log, as the `>` redirect did.
with open(logfile, 'w') as log_handle:
    log_handle.write('\n'.join(log_lines) + '\n')

# Plot the columns whose format is 'values'.
plot_key = readcols['key'][readcols['fmt']=='values'].tolist()
plot_values(df_label, savedir, pma, numofCluster, plot_key)


### Each Group 

In [20]:
# For every cluster: restore the original category values, then write the
# descriptive statistics, the object-column plots and the member details into
# the group's own folder.

# The 'object' columns are the same for every group, so look them up once.
object_keys = readcols['key'][readcols['fmt']=='object']
plot_key = object_keys.tolist()

for i in range(numofCluster):

    str_group = str(i).zfill(2)
    savegroupdir = f'{savedir}/Group_{str_group}'
    os.makedirs(savegroupdir, exist_ok=True)

    # Explicit copy: the columns below are overwritten, which must not happen
    # on a view of `df_label`.
    df_group = df_label[df_label['label']==i].copy()

    # Undo the label encoding using the encoders published by the
    # pre-processing cell as globals named `labelencoder_<column>`.
    for objkey in object_keys:
        if 'area' in objkey:
            continue
        column_encoder = globals().get(f'labelencoder_{objkey}')
        if column_encoder is None:
            continue  # no encoder was stored for this column; leave it as-is
        try:
            df_group[objkey] = column_encoder.inverse_transform(df_group[objkey].astype(int))
        except Exception as err:
            # Still best effort, but the failure is no longer hidden.
            print(f'Group {str_group}: could not decode {objkey!r}: {err}')
    if 'rfm_level' in df_group.keys():
        df_group['rfm_level'] = labelencoder_rfm.inverse_transform(df_group['rfm_level'].astype(int))

    # Summary statistics without the 'count' row and the bookkeeping columns.
    # `errors='ignore'` covers the case where a bookkeeping column is not
    # numeric and therefore never appears in describe().
    df_group_describe = df_group.describe()
    df_group_describe = df_group_describe.drop(labels='count')
    df_group_describe = df_group_describe.drop(columns=['label', 'pma_no_fin'], errors='ignore')

    df_group_describe.to_csv(f'{savegroupdir}/00.Descriptive_statistics_G{str_group}.csv')
    plot_object(df_group, savegroupdir, pma, str_group, plot_key)

    df_group.to_csv(f'{savegroupdir}/02.Detail_info_mid.csv', index=False)
    

['mid', 'pma_no_fin', 'gender', 'home_city', 'area_01', 'area_02', 'area_03', 'area_04', 'area_05', 'area_06', 'area_07', 'area_09', 'area_10', 'area_11', 'weekday', 'weekend']
['gender', 'home_city', 'area_01', 'weekday', 'rfm_level']
['mid', 'pma_no_fin', 'gender', 'home_city', 'area_01', 'area_02', 'area_03', 'area_04', 'area_05', 'area_06', 'area_07', 'area_09', 'area_10', 'area_11', 'weekday', 'weekend']
['gender', 'home_city', 'area_01', 'weekday', 'rfm_level']
['mid', 'pma_no_fin', 'gender', 'home_city', 'area_01', 'area_02', 'area_03', 'area_04', 'area_05', 'area_06', 'area_07', 'area_09', 'area_10', 'area_11', 'weekday', 'weekend']
['gender', 'home_city', 'area_01', 'weekday', 'rfm_level']
['mid', 'pma_no_fin', 'gender', 'home_city', 'area_01', 'area_02', 'area_03', 'area_04', 'area_05', 'area_06', 'area_07', 'area_09', 'area_10', 'area_11', 'weekday', 'weekend']
['gender', 'home_city', 'area_01', 'weekday', 'rfm_level']
