# Reference

## Data Source

https://www.kaggle.com/datasets/jihyeseo/online-retail-data-set-from-uci-ml-repo

# Importing Packages

In [1]:
# importing necessary packages
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from sklearn.model_selection  import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics

# Loading Data

In [2]:
# loading retail data
df = pd.read_excel('C:/Users/KodavaliPavanKumar/Desktop/Training/github_folders/Projects/MM_Sample/Data/Online Retail.xlsx')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


# Cleaning Data

In [3]:
# removing duplicates
df = df[~df.duplicated()]
df.shape

(536641, 8)

In [4]:
# removing all the invoice number who starts with 'C' as they are returned orders
df = df[df['InvoiceNo'].str.startswith('C')!=True]
df.shape

(527390, 8)

In [5]:
# keeping only those transactions that have successfully ordered
df = df[df['Quantity']>=0]
df.shape

(526054, 8)

In [6]:
# putting UK as one country and combine rest countries into one category
df['Country'] = df['Country'].apply(lambda x:'United Kingdom' if x=='United Kingdom' else 'Others')
df.Country.value_counts(normalize=True)

United Kingdom    0.914627
Others            0.085373
Name: Country, dtype: float64

In [7]:
# removing all the above entries
df = df[df['Description'].str.startswith('?')!=True]
df.shape

(526048, 8)

In [8]:
# checking the data where description = * and it is noted that customerid is NaN
df[df['Description'].str.startswith('*')==True]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
20749,538071,21120,*Boombox Ipod Classic,1,2010-12-09 14:09:00,16.98,,United Kingdom
35675,539437,20954,*USB Office Mirror Ball,1,2010-12-17 14:54:00,8.47,,United Kingdom
37095,539453,20954,*USB Office Mirror Ball,1,2010-12-17 17:08:00,8.47,,United Kingdom


In [9]:
# replacing with appropriate name
df['Description'] = df['Description'].replace(('*Boombox Ipod Classic','*USB Office Mirror Ball'),
                                             ('BOOMBOX IPOD CLASSIC','USB OFFICE MIRROR BALL'))

In [10]:
# removing all the above noises
df = df[df['Description'].str.islower()!=True]
df.shape

(525920, 8)

In [11]:
# removing all the above listed noises
df = df[df['Description'].str.istitle()!=True]
df.shape

(525452, 8)

In [12]:
df['Description'] = df['Description'].str.strip()

In [13]:
# removing entries where customer id is null
df = df[~df.CustomerID.isnull()]
df.shape

(392353, 8)

# Segmenting the Data

In [15]:
# getting the data from timestamp
df_c = df.copy()
df_c['Date'] = df_c['InvoiceDate'].dt.strftime('%Y-%m-%d')
df_c.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Date
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,2010-12-01
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12-01
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,2010-12-01
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12-01
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12-01


In [16]:
df_c = df_c[['InvoiceNo','StockCode','Quantity','UnitPrice','CustomerID','Date']]
df_c.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,UnitPrice,CustomerID,Date
0,536365,85123A,6,2.55,17850.0,2010-12-01
1,536365,71053,6,3.39,17850.0,2010-12-01
2,536365,84406B,8,2.75,17850.0,2010-12-01
3,536365,84029G,6,3.39,17850.0,2010-12-01
4,536365,84029E,6,3.39,17850.0,2010-12-01


In [17]:
df_c['Date'].unique()

array(['2010-12-01', '2010-12-02', '2010-12-03', '2010-12-05',
       '2010-12-06', '2010-12-07', '2010-12-08', '2010-12-09',
       '2010-12-10', '2010-12-12', '2010-12-13', '2010-12-14',
       '2010-12-15', '2010-12-16', '2010-12-17', '2010-12-19',
       '2010-12-20', '2010-12-21', '2010-12-22', '2010-12-23',
       '2011-01-04', '2011-01-05', '2011-01-06', '2011-01-07',
       '2011-01-09', '2011-01-10', '2011-01-11', '2011-01-12',
       '2011-01-13', '2011-01-14', '2011-01-16', '2011-01-17',
       '2011-01-18', '2011-01-19', '2011-01-20', '2011-01-21',
       '2011-01-23', '2011-01-24', '2011-01-25', '2011-01-26',
       '2011-01-27', '2011-01-28', '2011-01-30', '2011-01-31',
       '2011-02-01', '2011-02-02', '2011-02-03', '2011-02-04',
       '2011-02-06', '2011-02-07', '2011-02-08', '2011-02-09',
       '2011-02-10', '2011-02-11', '2011-02-13', '2011-02-14',
       '2011-02-15', '2011-02-16', '2011-02-17', '2011-02-18',
       '2011-02-20', '2011-02-21', '2011-02-22', '2011-

In [19]:
df_c_seg = df_c.loc[(df_c['Date'] >= '2011-01-04') & (df_c['Date'] <= '2011-12-09')]
df_c_seg.shape

(366708, 6)

# RFM metrics for transaction until 2011-06-05

In [20]:
df_cohort_06_05 = df_c_seg.loc[(df_c['Date']<= '2011-06-05')]
df_cohort_06_05.shape

(121856, 6)

In [21]:
df_cohort_06_05.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,UnitPrice,CustomerID,Date
42481,539993,22386,10,1.95,13313.0,2011-01-04
42482,539993,21499,25,0.42,13313.0,2011-01-04
42483,539993,21498,25,0.42,13313.0,2011-01-04
42484,539993,22379,5,2.1,13313.0,2011-01-04
42485,539993,20718,10,1.25,13313.0,2011-01-04


In [22]:
df_cohort_06_05['Amount'] = df_cohort_06_05['Quantity']*df_cohort_06_05['UnitPrice']
df_cohort_06_05.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,InvoiceNo,StockCode,Quantity,UnitPrice,CustomerID,Date,Amount
42481,539993,22386,10,1.95,13313.0,2011-01-04,19.5
42482,539993,21499,25,0.42,13313.0,2011-01-04,10.5
42483,539993,21498,25,0.42,13313.0,2011-01-04,10.5
42484,539993,22379,5,2.1,13313.0,2011-01-04,10.5
42485,539993,20718,10,1.25,13313.0,2011-01-04,12.5


In [23]:
df_cohort_06_05.shape

(121856, 7)

In [24]:
df_rfm_06_05 = df_cohort_06_05.copy()
df_rfm_06_05.shape

(121856, 7)

In [25]:
# extracting the RECENCY
recency = pd.DataFrame(df_rfm_06_05.groupby('CustomerID')['Date'].max().reset_index())
recency['Date'] = pd.to_datetime(recency['Date']).dt.date
recency['MaxDate'] = recency['Date'].max()
recency['recency'] = (recency['MaxDate'] - recency['Date']).dt.days + 1
recency = recency[['CustomerID','recency']]
recency.head()

Unnamed: 0,CustomerID,recency
0,12346.0,139
1,12347.0,60
2,12348.0,62
3,12350.0,124
4,12352.0,76


In [26]:
# extracting the FREQUENCY
frequency = pd.DataFrame(df_rfm_06_05.groupby('CustomerID')['InvoiceNo'].nunique().reset_index())
frequency.columns = ['fCustomerID','frequency']
frequency.head()

Unnamed: 0,fCustomerID,frequency
0,12346.0,1
1,12347.0,2
2,12348.0,2
3,12350.0,1
4,12352.0,4


In [28]:
# extracting the MONETARY
monetary = pd.DataFrame(df_rfm_06_05.groupby('CustomerID')['Amount'].sum().reset_index())
monetary.columns = ['mCustomerID','monetary']
monetary.head()

Unnamed: 0,mCustomerID,monetary
0,12346.0,77183.6
1,12347.0,1111.64
2,12348.0,594.44
3,12350.0,334.4
4,12352.0,721.51


In [29]:
# combining the three into one table
rfm_06_05 = pd.concat([recency,frequency,monetary], axis=1)
rfm_06_05.drop(['fCustomerID','mCustomerID'], axis=1, inplace=True)
rfm_06_05.head(10)

Unnamed: 0,CustomerID,recency,frequency,monetary
0,12346.0,139,1,77183.6
1,12347.0,60,2,1111.64
2,12348.0,62,2,594.44
3,12350.0,124,1,334.4
4,12352.0,76,4,721.51
5,12353.0,18,1,89.0
6,12354.0,46,1,1079.4
7,12355.0,28,1,459.4
8,12356.0,59,2,2753.08
9,12359.0,3,3,3495.73


# Getting the customer from 2011-06-05 to 2011-07-06 : label generation

In [32]:
df_d_seg = df_c.loc[(df_c['Date'] >= '2011-06-05') & (df_c['Date'] <= '2011-07-06')].CustomerID.unique()

In [33]:
type(df_d_seg)

numpy.ndarray

In [34]:
rfm_06_05['Label'] = np.where(rfm_06_05['CustomerID'].isin(df_d_seg), '1', '0')
rfm_06_05.head()

Unnamed: 0,CustomerID,recency,frequency,monetary,Label
0,12346.0,139,1,77183.6,0
1,12347.0,60,2,1111.64,1
2,12348.0,62,2,594.44,0
3,12350.0,124,1,334.4,0
4,12352.0,76,4,721.51,0


# Model

In [35]:
X_1 = rfm_06_05[['recency','frequency','monetary']]
y_1 = rfm_06_05[['Label']]

In [36]:
# test and train split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.3)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1745, 3) (748, 3) (1745, 1) (748, 1)


In [38]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 10) 
 
# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)
 
# performing predictions on the test dataset
y_pred = clf.predict(X_test)
 
# metrics are used to find accuracy or error
from sklearn import metrics 
print()
 
# using metrics module for accuracy calculation
print("ACCURACY OF THE MODEL ON TEST DATA SET: ", metrics.accuracy_score(y_test, y_pred))


ACCURACY OF THE MODEL ON TEST DATA SET:  0.7379679144385026


  


# Scoring Data

In [39]:
df_s_seg = df_c.loc[(df_c['Date'] >= '2011-01-04') & (df_c['Date'] <= '2011-07-06')]
df_s_seg.shape

(149211, 6)

In [40]:
df_cohort_07_06 = df_s_seg

In [41]:
df_cohort_07_06['Amount'] = df_cohort_07_06['Quantity']*df_cohort_07_06['UnitPrice']
df_cohort_07_06.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,InvoiceNo,StockCode,Quantity,UnitPrice,CustomerID,Date,Amount
42481,539993,22386,10,1.95,13313.0,2011-01-04,19.5
42482,539993,21499,25,0.42,13313.0,2011-01-04,10.5
42483,539993,21498,25,0.42,13313.0,2011-01-04,10.5
42484,539993,22379,5,2.1,13313.0,2011-01-04,10.5
42485,539993,20718,10,1.25,13313.0,2011-01-04,12.5


In [42]:
df_rfm_07_06 = df_cohort_07_06.copy()
df_rfm_07_06.shape

(149211, 7)

In [43]:
# extracting the RECENCY
recency = pd.DataFrame(df_rfm_07_06.groupby('CustomerID')['Date'].max().reset_index())
recency['Date'] = pd.to_datetime(recency['Date']).dt.date
recency['MaxDate'] = recency['Date'].max()
recency['recency'] = (recency['MaxDate'] - recency['Date']).dt.days + 1
recency = recency[['CustomerID','recency']]
recency.head()

Unnamed: 0,CustomerID,recency
0,12346.0,170
1,12347.0,28
2,12348.0,93
3,12350.0,155
4,12352.0,107


In [44]:
# extracting the FREQUENCY
frequency = pd.DataFrame(df_rfm_07_06.groupby('CustomerID')['InvoiceNo'].nunique().reset_index())
frequency.columns = ['fCustomerID','frequency']
frequency.head()

Unnamed: 0,fCustomerID,frequency
0,12346.0,1
1,12347.0,3
2,12348.0,2
3,12350.0,1
4,12352.0,4


In [45]:
# extracting the MONETARY
monetary = pd.DataFrame(df_rfm_07_06.groupby('CustomerID')['Amount'].sum().reset_index())
monetary.columns = ['mCustomerID','monetary']
monetary.head()

Unnamed: 0,mCustomerID,monetary
0,12346.0,77183.6
1,12347.0,1494.16
2,12348.0,594.44
3,12350.0,334.4
4,12352.0,721.51


In [51]:
# combining the three into one table
rfm_07_06 = pd.concat([recency,frequency,monetary], axis=1)
rfm_07_06.drop(['fCustomerID','mCustomerID'], axis=1, inplace=True)
rfm_07_06.head(10)

Unnamed: 0,CustomerID,recency,frequency,monetary
0,12346.0,170,1,77183.6
1,12347.0,28,3,1494.16
2,12348.0,93,2,594.44
3,12350.0,155,1,334.4
4,12352.0,107,4,721.51
5,12353.0,49,1,89.0
6,12354.0,77,1,1079.4
7,12355.0,59,1,459.4
8,12356.0,90,2,2753.08
9,12359.0,34,3,3495.73


# Scoring Predictions

In [52]:
X_score = rfm_07_06[['recency','frequency','monetary']]

In [53]:
# performing predictions on the test dataset
y_score_pred = clf.predict(X_score)

In [54]:
rfm_07_06['Predictions'] = y_score_pred
rfm_07_06.head(5)

Unnamed: 0,CustomerID,recency,frequency,monetary,Predictions
0,12346.0,170,1,77183.6,0
1,12347.0,28,3,1494.16,1
2,12348.0,93,2,594.44,1
3,12350.0,155,1,334.4,0
4,12352.0,107,4,721.51,0


In [66]:
rfm_07_06['Propensity'] = clf.predict_proba(X_score)[:,-1]
rfm_07_06.head()

Unnamed: 0,CustomerID,recency,frequency,monetary,Predictions,Propensity
0,12346.0,170,1,77183.6,0,0.4
1,12347.0,28,3,1494.16,1,0.9
2,12348.0,93,2,594.44,1,0.7
3,12350.0,155,1,334.4,0,0.4
4,12352.0,107,4,721.51,0,0.1
