In [83]:
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.cluster import KMeans

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import  TransformerMixin, BaseEstimator
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

import pandas as pd
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import seaborn as sns

In [84]:
# Load the raw transaction export. The encoding is passed explicitly
# (presumably the file is not valid UTF-8 -- confirm against the data source).
df = pd.read_csv(
    'data.csv',
    encoding="ISO-8859-1",
)

In [85]:
class rfm_processing(TransformerMixin, BaseEstimator):
    """Aggregate transaction rows into a per-customer RFM score table.

    The transformer is stateless: ``fit`` learns nothing, and ``transform``
    derives the snapshot date and the quartile edges from the frame it is
    given on every call.

    Parameters
    ----------
    date_col : str, default='InvoiceDate'
        Name of the column holding the transaction timestamp.
    """

    def __init__(self, *,
                 date_col='InvoiceDate',
                 ):
        self.date_col = date_col

    def fit(self, X, y=None):
        """No-op fit kept for scikit-learn compatibility; returns ``self``."""
        return self

    def transform(self, df, y=None):
        """Build the Recency / Frequency / MonetaryValue table with scores.

        Parameters
        ----------
        df : pandas.DataFrame
            Transaction rows. Must contain ``CustomerID``, ``InvoiceNo``,
            ``Quantity``, ``UnitPrice`` and the column named by ``date_col``.
            The input frame is not modified.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            One row per ``CustomerID`` (index) with columns ``Recency``,
            ``Frequency``, ``MonetaryValue``, ``R``, ``F``, ``M`` and
            ``RFM_Score``.

        Raises
        ------
        ValueError
            Propagated from ``pd.qcut`` when the quartile edges of a metric
            are not unique (e.g. too few customers or heavy ties).
        """
        # Parse dates first, then drop incomplete rows, and only THEN cast the
        # customer id to str. ``astype(str)`` turns NaN into the literal
        # string 'nan', which ``dropna`` no longer sees as missing; casting
        # first would lump every anonymous transaction into one bogus
        # customer called 'nan'.
        orders = (
            df.assign(**{self.date_col: pd.to_datetime(df[self.date_col])})
              .dropna()
        )
        orders = orders.assign(
            CustomerID=orders['CustomerID'].astype(str),
            TotalSum=orders['Quantity'] * orders['UnitPrice'],
        )

        # Reference point is one day after the latest transaction, so the
        # most recent customer gets a Recency of 1 rather than 0.
        snapshot_date = orders[self.date_col].max() + timedelta(days=1)

        data_process = orders.groupby('CustomerID').agg(
            Recency=(self.date_col, 'max'),
            Frequency=('InvoiceNo', 'count'),
            MonetaryValue=('TotalSum', 'sum'),
        )
        # Turn "last purchase date" into "whole days since last purchase".
        data_process['Recency'] = (
            snapshot_date - data_process['Recency']
        ).dt.days

        return self.rfm_score(data_process)

    @staticmethod
    def rfm_score(df):
        """Add quartile scores ``R``, ``F``, ``M`` and their sum ``RFM_Score``.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain ``Recency``, ``Frequency`` and ``MonetaryValue``.

        Returns
        -------
        pandas.DataFrame
            A new frame with the extra columns ``R``, ``F``, ``M``
            (categorical quartile labels 1-4) and ``RFM_Score`` (integer
            sum of the three labels, i.e. 3-12 when no label is missing).

        Raises
        ------
        ValueError
            Propagated from ``pd.qcut`` when quartile edges are not unique.
        """
        # Recency labels run 4 -> 1 so that a SMALL recency (recent purchase)
        # earns the HIGH score; frequency and monetary value score upwards.
        r_labels = range(4, 0, -1)
        f_labels = range(1, 5)
        m_labels = range(1, 5)

        scored = df.assign(
            R=pd.qcut(df['Recency'], q=4, labels=r_labels).values,
            F=pd.qcut(df['Frequency'], q=4, labels=f_labels).values,
            M=pd.qcut(df['MonetaryValue'], q=4, labels=m_labels).values,
        )

        # Categorical columns do not define a ``sum`` reduction, so convert to
        # plain Python objects explicitly instead of relying on pandas
        # coercing them implicitly. Missing labels count as 0 in the
        # row-wise sum, and the result is stored as an integer column.
        scored['RFM_Score'] = (
            scored[['R', 'F', 'M']].astype(object).sum(axis=1).astype(int)
        )

        return scored

In [86]:
# Smoke-test the transformer on its own. transform() is called without a
# prior fit(); that is fine here because rfm_processing.fit() learns nothing.
test = rfm_processing()
rfm_df = test.transform(df)
# Preview the first rows of the per-customer RFM table.
rfm_df.head()

Unnamed: 0_level_0,Recency,Frequency,MonetaryValue,R,F,M,RFM_Score
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12346.0,326,2,0.0,1,1,1,3
12347.0,2,182,4310.0,4,4,4,12
12348.0,75,31,1797.24,2,2,4,8
12349.0,19,73,1757.55,3,3,4,10
12350.0,310,17,334.4,1,1,2,4


In [87]:
class rfm_cluster(BaseEstimator):
    """Rule-based segment labeller for an RFM score table.

    There is nothing to learn: ``predict`` maps each row's ``RFM_Score`` to
    one of a fixed set of segment names using hard-coded thresholds.
    """

    # (minimum score, label) pairs, checked from the highest threshold down.
    # Scores below the last threshold (or NaN) fall through to
    # 'Require Activation'. Label spelling is kept verbatim because it is
    # part of the emitted output.
    _LEVELS = (
        (9, 'Can\'t Loose Them'),
        (8, 'Champions'),
        (7, 'Loyal'),
        (6, 'Potential'),
        (5, 'Promising'),
        (4, 'Needs Attention'),
    )

    def __init__(self):
        pass

    def fit(self, X=None, y=None):
        """No-op fit; returns ``self``.

        ``X`` and ``y`` are accepted and ignored so the estimator can be the
        final step of a ``Pipeline``, whose ``fit`` passes the transformed
        data to the last step. Calling ``fit()`` with no arguments still
        works.
        """
        return self

    def __sklearn_is_fitted__(self):
        """Report the estimator as fitted; it is stateless by design."""
        return True

    def predict(self, rfm_df, y=None):
        """Return a copy of ``rfm_df`` with an added ``RFM_Level`` column.

        Parameters
        ----------
        rfm_df : pandas.DataFrame
            Must contain an ``RFM_Score`` column. Not modified.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Copy of the input with the segment name in ``RFM_Level``.
        """
        rfm_df = rfm_df.copy()
        # Map the score column directly instead of building a Series for
        # every row with ``apply(axis=1)``; this also behaves sensibly on an
        # empty frame.
        rfm_df['RFM_Level'] = rfm_df['RFM_Score'].map(self._score_to_level)
        return rfm_df

    @staticmethod
    def _score_to_level(score):
        """Translate a single RFM score into its segment name."""
        for floor, label in rfm_cluster._LEVELS:
            if score >= floor:
                return label
        return 'Require Activation'

    @staticmethod
    def rfm_clustering(df):
        """Return the segment name for one row (anything indexable by
        ``'RFM_Score'``). Kept for callers that apply it row-wise."""
        return rfm_cluster._score_to_level(df['RFM_Score'])
        

In [88]:
# Label each customer in rfm_df with a segment derived from RFM_Score.
# NOTE: this rebinds `test`, which previously held the rfm_processing instance.
test = rfm_cluster()
pred = test.predict(rfm_df)
# Bare expression: display the labelled table.
pred

Unnamed: 0_level_0,Recency,Frequency,MonetaryValue,R,F,M,RFM_Score,RFM_Level
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12346.0,326,2,0.00,1,1,1,3,Require Activation
12347.0,2,182,4310.00,4,4,4,12,Can't Loose Them
12348.0,75,31,1797.24,2,2,4,8,Champions
12349.0,19,73,1757.55,3,3,4,10,Can't Loose Them
12350.0,310,17,334.40,1,1,2,4,Needs Attention
...,...,...,...,...,...,...,...,...
18281.0,181,7,80.82,1,1,1,3,Require Activation
18282.0,8,13,176.60,4,1,1,6,Potential
18283.0,4,756,2094.88,4,4,4,12,Can't Loose Them
18287.0,43,70,1837.28,3,3,4,10,Can't Loose Them


In [89]:
# Rule-based segmentation end to end: aggregate raw transactions into RFM
# scores, then attach a segment label to every customer.
rfm_model = Pipeline(
    steps=[
        ("rfm_processing", rfm_processing()),
        ("rfm_cluster", rfm_cluster()),
    ]
)

In [90]:
# Bare expression: display the rule-based pipeline and its steps.
rfm_model

In [91]:
# Run the rule-based pipeline on the raw transactions; the result is the
# per-customer RFM table with an RFM_Level column.
y_pred = rfm_model.predict(df)
# Seed the sample so the displayed rows are the same on every re-run.
y_pred.sample(10, random_state=42)

Unnamed: 0_level_0,Recency,Frequency,MonetaryValue,R,F,M,RFM_Score,RFM_Level
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
16912.0,23,105,2528.67,3,4,4,11,Can't Loose Them
14261.0,51,52,1163.45,2,3,3,8,Champions
13379.0,26,97,572.56,3,3,2,8,Champions
16930.0,26,54,380.55,3,3,2,8,Champions
14740.0,191,93,1423.21,1,3,3,7,Loyal
17571.0,50,299,1498.51,3,4,3,10,Can't Loose Them
13959.0,79,92,598.2,2,3,2,7,Loyal
17176.0,202,15,306.13,1,1,2,4,Needs Attention
17117.0,288,9,116.2,1,1,1,3,Require Activation
12997.0,22,67,1197.94,3,3,3,9,Can't Loose Them


In [92]:
# Fixed seed so the k-means++ initialisation, and therefore the cluster
# assignment, is reproducible across kernel restarts.
RANDOM_STATE = 42

model_cluster = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init='auto',
    random_state=RANDOM_STATE,
)

# Data-driven segmentation: RFM table -> standardised features -> KMeans.
model = Pipeline(steps=[
    ('RFM_processing', rfm_processing()),
    ('StandardScaler', StandardScaler()),
    ('Kmean', model_cluster)
])


In [93]:
# Bare expression: display the KMeans pipeline and its steps.
model

In [94]:
# Fit the scaler and KMeans on the RFM table that the first step derives from df.
model.fit(df)

In [99]:
# Assign a KMeans cluster to each customer of the same data used for fitting.
# NOTE: this rebinds y_pred, which earlier held the rule-based DataFrame.
y_pred = model.predict(df)

In [100]:
# Bare expression: display the array of cluster indices returned by the pipeline.
y_pred

array([0, 3, 1, ..., 3, 3, 4])