In [None]:
import pandas as pd
import math

In [None]:
# Load the transaction workbook into a DataFrame and preview the first rows.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
source_path = 'C:/Users/ETRI/ch12_data/Online_Retail.xlsx'
retail_df = pd.read_excel(source_path)
retail_df.head()

In [None]:
# Print column dtypes and non-null counts for the loaded frame.
retail_df.info()

In [None]:
# Keep only rows with a positive quantity, a positive unit price and a known customer.
# The three conditions are combined into a single mask and the selection is copied, so
# the CustomerID assignment below writes to an independent frame instead of a slice of
# the original (the slice form triggers pandas' SettingWithCopyWarning).
valid_rows = (
    (retail_df['Quantity'] > 0)
    & (retail_df['UnitPrice'] > 0)
    & retail_df['CustomerID'].notnull()
)
retail_df = retail_df[valid_rows].copy()

# With the null IDs removed, the column can be cast to integers.
retail_df['CustomerID'] = retail_df['CustomerID'].astype(int)

In [None]:
# Inspect the frame again: schema, number of nulls per column, and (rows, columns).
retail_df.info()
null_counts = retail_df.isnull().sum()
print(null_counts)
print(retail_df.shape)

In [None]:
# Remove fully duplicated rows (rebinding rather than mutating in place) and report the new size.
retail_df = retail_df.drop_duplicates()
print(retail_df.shape)

In [None]:
# One-row summary table: number of distinct stock codes, invoice numbers and customer IDs.
unique_counts = {
    'Product': retail_df['StockCode'].nunique(),
    'Transaction': retail_df['InvoiceNo'].nunique(),
    'Customer': retail_df['CustomerID'].nunique(),
}
pd.DataFrame([unique_counts],
             columns=['Product', 'Transaction', 'Customer'],
             index=['counts'])

In [None]:
# Number of rows per value of the Country column, most frequent first.
retail_df['Country'].value_counts()

In [None]:
# Add the per-row sale amount (unit price x quantity) as a new column and preview the result.
retail_df = retail_df.assign(
    SaleAmount=retail_df['UnitPrice'] * retail_df['Quantity']
)
retail_df.head()

In [None]:
# Collapse the rows to one record per customer.
# NOTE(review): 'count' tallies rows (non-null InvoiceNo values) per customer, not distinct
# invoices -- confirm this is the intended definition of purchase frequency.
aggregations = {
    'InvoiceNo': 'count',
    'SaleAmount': 'sum',
    'InvoiceDate': 'max',
}
customer_df = (
    retail_df
    .groupby('CustomerID')
    .agg(aggregations)
    .reset_index()  # turn CustomerID back into a regular column
)
customer_df.head()

In [None]:
# Give the aggregated columns their analysis names.
# The spelling 'ElaspedDays' is kept as-is because the following cells refer to the column by this name.
column_renames = {'InvoiceNo': 'Freq', 'InvoiceDate': 'ElaspedDays'}
customer_df = customer_df.rename(columns=column_renames)
customer_df.head()

In [None]:
import datetime
# Replace each customer's latest invoice timestamp with the time elapsed up to a fixed reference date.
# NOTE(review): 2011-12-10 is hard-coded; presumably chosen relative to the last invoice in the data -- confirm.
reference_date = datetime.datetime(2011, 12, 10)
customer_df['ElaspedDays'] = reference_date - customer_df['ElaspedDays']
customer_df.head()

In [None]:
def _whole_days_plus_one(delta):
    """Return the whole-day part of a timedelta, plus one."""
    return delta.days + 1


# Convert the elapsed time to an integer day count.
customer_df['ElaspedDays'] = customer_df['ElaspedDays'].map(_whole_days_plus_one)
customer_df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Side-by-side box plots of the three raw customer features; outliers drawn as blue dots.
raw_columns = ['Freq', 'SaleAmount', 'ElaspedDays']

fig, ax = plt.subplots()
ax.boxplot([customer_df[column] for column in raw_columns], sym='bo')
plt.xticks([1, 2, 3], raw_columns)
plt.show()

In [None]:
import numpy as np

# Add a log(1 + x) version of each feature, stored under '<name>_log'.
for source_column in ('Freq', 'SaleAmount', 'ElaspedDays'):
    customer_df[source_column + '_log'] = np.log1p(customer_df[source_column])

customer_df.head()


In [None]:
# Same box plots as for the raw features, now on the log-transformed columns.
log_columns = ['Freq_log', 'SaleAmount_log', 'ElaspedDays_log']

fig, ax = plt.subplots()
ax.boxplot([customer_df[column] for column in log_columns], sym='bo')
plt.xticks([1, 2, 3], log_columns)
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

In [None]:
# Feature matrix for clustering: the three log-transformed columns as a plain array.
feature_columns = ['Freq_log', 'SaleAmount_log', 'ElaspedDays_log']
X_features = customer_df[feature_columns].values

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize each feature column before clustering.
scaler = StandardScaler()
X_features_scaled = scaler.fit_transform(X_features)

In [None]:
# Elbow plot: fit K-means for k = 1..10 and record each model's inertia.
cluster_counts = range(1, 11)
distortions = []

for k in cluster_counts:
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(X_features_scaled)
    distortions.append(model.inertia_)

plt.plot(cluster_counts, distortions, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Distortion')
plt.show()

In [None]:
# Fit a 3-cluster K-means model (fixed seed) and take the label assigned to each customer.
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_features_scaled)
Y_labels = kmeans.labels_

In [None]:
# Store each customer's cluster label next to the features and preview the frame.
customer_df = customer_df.assign(ClusterLabel=Y_labels)
customer_df.head()

In [None]:
from matplotlib import cm

def silhouetteViz(n_cluster, X_features):
    """Fit K-means and draw a silhouette plot of the resulting clustering.

    One horizontal bar is drawn per sample, grouped by cluster and sorted within
    each cluster. A red vertical line marks the mean silhouette value, which is
    also shown (rounded to 3 decimals) in the title. The figure is displayed with
    ``plt.show()``; nothing is returned.

    Parameters
    ----------
    n_cluster : int
        Number of clusters passed to ``KMeans`` (fitted with ``random_state=0``).
    X_features : array-like
        Feature matrix passed to ``KMeans.fit_predict`` and ``silhouette_samples``.
    """
    kmeans = KMeans(n_clusters=n_cluster, random_state=0)
    Y_labels = kmeans.fit_predict(X_features)

    silhouette_values = silhouette_samples(X_features, Y_labels, metric='euclidean')

    # y_ax_lower is the vertical offset at which the next cluster's band of bars starts.
    y_ax_lower = 0
    y_ticks = []

    for c in range(n_cluster):
        # Boolean indexing yields a copy, so the in-place sort leaves silhouette_values untouched.
        c_silhouettes = silhouette_values[Y_labels == c]
        c_silhouettes.sort()
        y_ax_upper = y_ax_lower + len(c_silhouettes)
        color = cm.jet(float(c) / n_cluster)
        plt.barh(range(y_ax_lower, y_ax_upper), c_silhouettes,
                 height=1.0, edgecolor='none', color=color)
        # Put the cluster's tick at the middle of its band.
        y_ticks.append((y_ax_lower + y_ax_upper) / 2.)
        y_ax_lower = y_ax_upper

    silhouette_avg = np.mean(silhouette_values)

    plt.axvline(silhouette_avg, color='red', linestyle='-')
    plt.title('Number of Cluster : ' + str(n_cluster) + '\n' +
              'Silhouette Score : ' + str(round(silhouette_avg, 3)))
    plt.yticks(y_ticks, range(n_cluster))
    plt.xticks([0, 0.2, 0.4, 0.6, 0.8, 1])
    plt.ylabel('Cluster')
    # Spelling of the axis label corrected (was 'coeficients').
    plt.xlabel('Silhouette coefficients')
    plt.tight_layout()
    plt.show()

In [None]:
def clusterScatter(n_cluster, X_features):
    """Fit K-means and draw the clusters and their centroids on one scatter plot.

    Samples are plotted as circles colored by cluster, using columns 0 and 1 of
    ``X_features`` as the x and y coordinates. Each cluster center is overlaid as
    a larger triangle in the same color. The figure is displayed with
    ``plt.show()``; nothing is returned.

    Parameters
    ----------
    n_cluster : int
        Number of clusters passed to ``KMeans`` (fitted with ``random_state=0``).
    X_features : array supporting ``X_features[mask, column]`` indexing
        Feature matrix; only its first two columns are drawn.
    """
    kmeans = KMeans(n_clusters=n_cluster, random_state=0)
    Y_labels = kmeans.fit_predict(X_features)

    # One color per cluster, shared by the cluster's points and its centroid marker.
    c_colors = [cm.jet(float(i) / n_cluster) for i in range(n_cluster)]

    for i, c_color in enumerate(c_colors):
        in_cluster = Y_labels == i
        plt.scatter(X_features[in_cluster, 0],
                    X_features[in_cluster, 1], marker='o', color=c_color,
                    edgecolor='black', s=50, label='cluster ' + str(i))

    # Centroids are drawn after all points so they stay visible on top.
    for i, c_color in enumerate(c_colors):
        plt.scatter(kmeans.cluster_centers_[i, 0], kmeans.cluster_centers_[i, 1],
                    marker='^', color=c_color, edgecolor='w', s=200)

    # Finalize and show the figure once, after every centroid has been drawn.
    # Doing this inside the loop showed the figure after the first centroid and
    # pushed the remaining centroids onto separate, otherwise empty figures.
    plt.legend()
    plt.grid()
    plt.tight_layout()
    plt.show()

In [None]:
# Silhouette plot for 3 clusters on the standardized features.
silhouetteViz(n_cluster=3, X_features=X_features_scaled)

In [None]:
# Silhouette plot for 4 clusters on the standardized features.
silhouetteViz(n_cluster=4, X_features=X_features_scaled)

In [None]:
# Silhouette plot for 5 clusters on the standardized features.
silhouetteViz(n_cluster=5, X_features=X_features_scaled)

In [None]:
# Silhouette plot for 6 clusters on the standardized features.
silhouetteViz(n_cluster=6, X_features=X_features_scaled)

In [None]:
# Cluster scatter plot for 3 clusters on the standardized features.
clusterScatter(n_cluster=3, X_features=X_features_scaled)

In [None]:
# Cluster scatter plot for 4 clusters on the standardized features.
clusterScatter(n_cluster=4, X_features=X_features_scaled)

In [None]:
# Cluster scatter plot for 5 clusters on the standardized features.
clusterScatter(n_cluster=5, X_features=X_features_scaled)

In [None]:
# Cluster scatter plot for 6 clusters on the standardized features.
clusterScatter(n_cluster=6, X_features=X_features_scaled)

In [None]:
# Refit K-means with the chosen number of clusters and take the per-customer labels.
best_cluster = 4
kmeans = KMeans(n_clusters=best_cluster, random_state=0).fit(X_features_scaled)
Y_labels = kmeans.labels_

In [None]:
# Overwrite the ClusterLabel column with the labels from the latest fit and preview the frame.
customer_df = customer_df.assign(ClusterLabel=Y_labels)
customer_df.head()

In [None]:
# Write the labeled customer table to CSV. The row index is written as well, since it is not disabled.
# NOTE(review): absolute, machine-specific output path.
output_path = 'C:/Users/ETRI/ch12_data/Online_Retail_Customer_Cluster.csv'
customer_df.to_csv(output_path)

In [None]:
# Number of customers assigned to each cluster label.
customer_df.groupby('ClusterLabel')['CustomerID'].count()

In [None]:
# Copy of the customer table without the log-transformed helper columns; customer_df is left unchanged.
log_columns_to_drop = ['Freq_log', 'SaleAmount_log', 'ElaspedDays_log']
customer_cluster_df = customer_df.drop(columns=log_columns_to_drop)

In [None]:
# Average sale amount per counted row: total SaleAmount divided by Freq.
customer_cluster_df = customer_cluster_df.assign(
    SaleAmountAvg=customer_cluster_df['SaleAmount'] / customer_cluster_df['Freq']
)
customer_cluster_df.head()

In [None]:
# Per-cluster mean of every remaining column; CustomerID is dropped first because averaging an ID is meaningless.
(
    customer_cluster_df
    .drop(columns=['CustomerID'])
    .groupby('ClusterLabel')
    .mean()
)