In [40]:
# import required libraries for dataframe and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

# import required libraries for clustering
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from collections import Counter
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

import warnings 
warnings.filterwarnings('ignore')

In [6]:
# Load the merged sales table, parsing the invoice timestamp as datetime64.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of
# numbers and strings (the situation pandas reports as a DtypeWarning).
merged_table = pd.read_csv(
    'merged_table',
    parse_dates=['invoice_datetime'],
    low_memory=False,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [14]:
# The file marks missing entries with the literal text 'Null';
# swap that marker for a real missing value so isnull()/dropna() see it.
null_marker_map = {'Null': None}
merged_table = merged_table.replace(null_marker_map)

In [15]:
# Print each column's non-null count and dtype to spot columns with missing values
merged_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554417 entries, 0 to 554416
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   invoice_no        554417 non-null  object        
 1   product_id        546019 non-null  object        
 2   quantity          554417 non-null  int64         
 3   unit_price        554417 non-null  float64       
 4   total_sale        554417 non-null  float64       
 5   customer_id       415102 non-null  object        
 6   rating            554417 non-null  float64       
 7   review_count      554417 non-null  int64         
 8   country           554417 non-null  object        
 9   product_name      554417 non-null  object        
 10  product_type      554417 non-null  object        
 11  stock_code        554417 non-null  object        
 12  invoice_datetime  554417 non-null  datetime64[ns]
 13  day_of_week       554417 non-null  int64         
 14  mont

In [17]:
# Tally the missing entries per column
merged_table.isna().sum()

invoice_no               0
product_id            8398
quantity                 0
unit_price               0
total_sale               0
customer_id         139315
rating                   0
review_count             0
country                  0
product_name             0
product_type             0
stock_code               0
invoice_datetime         0
day_of_week              0
month                    0
hour                     0
year                     0
quarter                  0
invoice_date             0
dtype: int64

In [18]:
# Drop every row that has a missing value in any column.
# The explicit .copy() makes `retail` an independent frame, so adding columns
# to it later cannot trigger pandas' SettingWithCopyWarning.
retail = merged_table.dropna().copy()
retail.shape

(409035, 19)

In [20]:
# Preview the first two rows of `retail`
retail.head(2)

Unnamed: 0,invoice_no,product_id,quantity,unit_price,total_sale,customer_id,rating,review_count,country,product_name,product_type,stock_code,invoice_datetime,day_of_week,month,hour,year,quarter,invoice_date
0,78536597,B07GWKDLGT,4,496.95,1987.8,18011,4.6,1399,Germany,Nikon D3500 W/ AF-P DX NIKKOR 18-55mm f/3.5-5....,dslr camera,21703,2018-12-01 17:00:00,5,12,17,2018,4,2018-12-01
1,78536597,B01MTLH408,4,39.99,159.96,18011,4.6,289,Germany,"Manfrotto Element Aluminum 5-Section Monopod, ...",dslr camera,40001,2018-12-01 17:00:00,5,12,17,2018,4,2018-12-01


In [41]:
# Recency = whole days between each invoice and the most recent invoice in the data.
# Series.max() is vectorised and skips NaT, unlike the builtin max(); the column is
# already datetime64, so no extra pd.to_datetime() conversion is needed.
latest_invoice = retail['invoice_datetime'].max()

# assign() returns a new frame instead of writing into a frame derived from
# merged_table, which avoids SettingWithCopyWarning.
retail = retail.assign(
    max_date=latest_invoice,  # constant column, kept for backward compatibility
    recency=(latest_invoice - retail['invoice_datetime']).dt.days,
)

<h5> We want to use three features for k-means clustering: </h5>

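- monetary: 'total_rev', the sum of 'total_sale' per customer_id
- frequency: number of distinct invoices (orders) per customer_id
- recency: days between the customer's most recent purchase and the latest invoice date in the data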

In [45]:
# Build one row per customer with the three clustering features:
#   frequency - number of distinct invoices
#   total_rev - summed total_sale
#   recency   - smallest recency value among the customer's rows
rfm = (
    retail
    .groupby('customer_id')
    .agg(
        frequency=('invoice_no', 'nunique'),
        total_rev=('total_sale', 'sum'),
        recency=('recency', 'min'),
    )
    .reset_index()
)
rfm

Unnamed: 0,customer_id,frequency,total_rev,recency
0,12346,1,4433604.10,325
1,12347,7,210263.69,1
2,12348,4,174939.60,74
3,12349,1,42183.30,18
4,12350,1,17937.80,309
...,...,...,...,...
4334,18280,1,6720.42,277
4335,18281,1,2713.22,180
4336,18282,2,3304.10,7
4337,18283,16,112986.22,3
