In [1]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
%matplotlib inline

# Show every column when a DataFrame is displayed. The fully qualified option
# name is used because the bare "max_columns" pattern can match more than one
# registered option, in which case pandas raises OptionError.
pd.set_option("display.max_columns", None)

In [2]:
# Load the transaction-level retail data from the working directory.
df = pd.read_csv(filepath_or_buffer="online_retail_cleaned.csv")

In [3]:
# Goal: cluster customers on recency, frequency, and monetary (RFM) features,
# and then use the remaining variables to profile the resulting clusters.

# To do that, the dataset must first be transformed to compute recency, frequency,
# and monetary value for each CustomerID.

In [4]:
# Remove the "Unnamed: 0" column that came in with the CSV.
# Reassigning instead of using inplace=True, together with errors="ignore",
# lets this cell be re-run without raising KeyError once the column is gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [5]:
# Replace the current index with a fresh 0..n-1 range, discarding the old one.
df = df.reset_index(drop=True)

In [6]:
# Preview the first five transactions.
df.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,cancelled,revenue,discount,Continent,damaged,Date,Time
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,0.0,15.3,0,Europe,0,2010-12-01,08:26:00
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,0.0,20.34,0,Europe,0,2010-12-01,08:26:00
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,0.0,22.0,0,Europe,0,2010-12-01,08:26:00
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,0.0,20.34,0,Europe,0,2010-12-01,08:26:00
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,0.0,20.34,0,Europe,0,2010-12-01,08:26:00


In [7]:
# Treat the latest InvoiceDate in the data as the 'current' date.
# InvoiceDate holds strings at this point, so this is a string maximum.
current_date = df['InvoiceDate'].max()

In [8]:
# Show the rows stamped with the latest InvoiceDate. The comparison uses the
# current_date value computed above rather than a hard-coded timestamp, so it
# stays correct if the underlying data changes.
df.loc[df['InvoiceDate'] == current_date, 'InvoiceDate']

406814    2011-12-09 12:50:00
406815    2011-12-09 12:50:00
406816    2011-12-09 12:50:00
406817    2011-12-09 12:50:00
406818    2011-12-09 12:50:00
406819    2011-12-09 12:50:00
406820    2011-12-09 12:50:00
406821    2011-12-09 12:50:00
406822    2011-12-09 12:50:00
406823    2011-12-09 12:50:00
406824    2011-12-09 12:50:00
406825    2011-12-09 12:50:00
406826    2011-12-09 12:50:00
406827    2011-12-09 12:50:00
406828    2011-12-09 12:50:00
Name: InvoiceDate, dtype: object

In [9]:
# Parse the InvoiceDate strings explicitly with pandas, then truncate each
# timestamp to whole-day resolution. This avoids depending on NumPy's implicit
# element-by-element string parsing of an object array.
converted_dates = pd.to_datetime(df['InvoiceDate']).values.astype('datetime64[D]')

# The most recent day in the data is the reference point for recency.
current_day = converted_dates.max()
current_day

numpy.datetime64('2011-12-09')

In [10]:
# Elapsed time between each transaction's day and the reference day, computed as
# one vectorized NumPy subtraction instead of a Python loop over every row.
time_since = current_day - converted_dates

In [11]:
# Express the elapsed time as a float number of days. The value is recomputed
# from current_day and converted_dates rather than dividing time_since by a
# timedelta in place, so re-running this cell yields the same result instead of
# failing on an already-converted float array.
time_since = (current_day - converted_dates) / np.timedelta64(1, 'D')

In [12]:
# Attach the days-since-reference values to the transactions as a new column.
df = df.assign(time_since=time_since)

In [13]:
# Confirm that the time_since column is now present.
df.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,cancelled,revenue,discount,Continent,damaged,Date,Time,time_since
0,536365,85123A,white hanging heart t-light holder,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,0.0,15.3,0,Europe,0,2010-12-01,08:26:00,373.0
1,536365,71053,white metal lantern,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,0.0,20.34,0,Europe,0,2010-12-01,08:26:00,373.0
2,536365,84406B,cream cupid hearts coat hanger,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,0.0,22.0,0,Europe,0,2010-12-01,08:26:00,373.0
3,536365,84029G,knitted union flag hot water bottle,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,0.0,20.34,0,Europe,0,2010-12-01,08:26:00,373.0
4,536365,84029E,red woolly hottie white heart.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,0.0,20.34,0,Europe,0,2010-12-01,08:26:00,373.0


In [14]:
# Total revenue over the rows flagged as cancelled.
df.loc[df['cancelled'] == 1.0, 'revenue'].sum()

-611342.090000015

In [15]:
# Total revenue over the rows not flagged as cancelled.
df.loc[df['cancelled'] == 0.0, 'revenue'].sum()

8911407.904003216

In [16]:
# Negated cancelled revenue divided by non-cancelled revenue.
-df.loc[df['cancelled'] == 1.0, 'revenue'].sum() / df.loc[df['cancelled'] == 0.0, 'revenue'].sum()

0.06860218908006507

In [17]:
# Share of rows flagged as cancelled. Series.sum() is vectorized, whereas the
# builtin sum() iterates over the column one Python object at a time.
# Note: Series.sum() skips missing values, while builtin sum() would return NaN.
df['cancelled'].sum() / len(df)

0.021888803403887137

In [18]:
# Sorted array of the distinct CustomerID values.
customers = np.unique(df['CustomerID'].to_numpy())

In [19]:
# Largest time_since per customer, i.e. days since their earliest transaction.
time_since_first_purchase = df['time_since'].groupby(df['CustomerID']).max()

In [20]:
# Smallest time_since per customer, i.e. days since their latest transaction.
time_since_last_purchase = df['time_since'].groupby(df['CustomerID']).min()

In [21]:
# Number of non-null InvoiceNo entries (rows) per customer.
# NOTE(review): this counts invoice line items, not distinct invoices; if
# frequency is meant as the number of orders, nunique() may be intended - confirm.
frequency = df['InvoiceNo'].groupby(df['CustomerID']).count()

In [22]:
# Sum of revenue per customer.
total_revenue = df['revenue'].groupby(df['CustomerID']).sum()

In [23]:
# Smallest and largest single-row revenue per customer.
revenue_by_customer = df.groupby('CustomerID')['revenue']
min_spent = revenue_by_customer.min()
max_spent = revenue_by_customer.max()

In [24]:
# Median single-row revenue per customer.
median_spent = df['revenue'].groupby(df['CustomerID']).median()

In [25]:
# Per-customer summary statistics of the Quantity column.
quantity_by_customer = df.groupby('CustomerID')['Quantity']
mean_quantity = quantity_by_customer.mean()
median_quantity = quantity_by_customer.median()
max_quantity = quantity_by_customer.max()
min_quantity = quantity_by_customer.min()

In [26]:
# Mean of the cancelled flag per customer (share of their rows that are cancelled).
cancelled_mean = df['cancelled'].groupby(df['CustomerID']).mean()

In [27]:
# Sum of the cancelled flag per customer (number of cancelled rows).
cancelled_num = df['cancelled'].groupby(df['CustomerID']).sum()

In [28]:
# Display the per-customer sums of the cancelled flag computed above.
cancelled_num

CustomerID
12346.0     1.0
12347.0     0.0
12348.0     0.0
12349.0     0.0
12350.0     0.0
12352.0    10.0
12353.0     0.0
12354.0     0.0
12355.0     0.0
12356.0     0.0
12357.0     0.0
12358.0     0.0
12359.0     6.0
12360.0     0.0
12361.0     0.0
12362.0     8.0
12363.0     0.0
12364.0     0.0
12365.0     1.0
12367.0     0.0
12370.0     0.0
12371.0     0.0
12372.0     0.0
12373.0     0.0
12374.0     0.0
12375.0     1.0
12377.0     0.0
12378.0     0.0
12379.0     1.0
12380.0     1.0
           ... 
18245.0     2.0
18246.0     0.0
18248.0     2.0
18249.0     0.0
18250.0     1.0
18251.0     0.0
18252.0     0.0
18255.0     0.0
18256.0     4.0
18257.0     5.0
18259.0     0.0
18260.0     6.0
18261.0     0.0
18262.0     0.0
18263.0     1.0
18265.0     0.0
18268.0     1.0
18269.0     1.0
18270.0     2.0
18272.0     4.0
18273.0     0.0
18274.0    11.0
18276.0     2.0
18277.0     1.0
18278.0     0.0
18280.0     0.0
18281.0     0.0
18282.0     1.0
18283.0     0.0
18287.0     0.0
Name: cancell

In [29]:
# For each customer, the array of distinct StockCode values they appear with.
j = df['StockCode'].groupby(df['CustomerID']).unique()

In [30]:
# Count the distinct StockCodes for each customer, in the order of j.
unique_products = list(map(len, j))

In [31]:
# Label the per-customer counts with the index of the groupby result they were
# derived from (j). Taking the index from j guarantees that each count is paired
# with its own CustomerID, rather than relying on a separately computed array
# having the same length and order as the groupby output.
unique_products = pd.Series(data=unique_products, index=j.index)

In [32]:
# Ratio of distinct products to the number of StockCode rows for each customer
# (a fraction, not multiplied by 100).
stock_rows_per_customer = df.groupby('CustomerID')['StockCode'].count()
unique_products_perc = unique_products.div(stock_rows_per_customer)

In [33]:
# Preview the first five ratios.
unique_products_perc.head(n=5)

12346.0    0.500000
12347.0    0.565934
12348.0    0.709677
12349.0    1.000000
12350.0    1.000000
dtype: float64

In [34]:
# Pair every feature name with its per-customer Series in one mapping so that a
# label cannot drift out of step with its data.
rfm_features = {
    'time_since_first_purchase': time_since_first_purchase,
    'time_since_last_purchase': time_since_last_purchase,
    'frequency': frequency,
    'total_revenue': total_revenue,
    'min_spent': min_spent,
    'max_spent': max_spent,
    'median_spent': median_spent,
    'mean_quantity': mean_quantity,
    'median_quantity': median_quantity,
    'max_quantity': max_quantity,
    'min_quantity': min_quantity,
    'cancelled_mean': cancelled_mean,
    'cancelled_num': cancelled_num,
    'unique_products': unique_products,
    'unique_products_perc': unique_products_perc,
}

# Build one row per feature, then transpose so that each customer is a row and
# each feature is a column.
customer_rfm = pd.DataFrame(list(rfm_features.values()),
                            index=list(rfm_features.keys())).transpose()

In [35]:
# Preview the first five customers of the feature table.
customer_rfm.head(n=5)

Unnamed: 0_level_0,time_since_first_purchase,time_since_last_purchase,frequency,total_revenue,min_spent,max_spent,median_spent,mean_quantity,median_quantity,max_quantity,min_quantity,cancelled_mean,cancelled_num,unique_products,unique_products_perc
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
12346.0,325.0,325.0,2.0,0.0,-77183.6,77183.6,0.0,0.0,0.0,74215.0,-74215.0,0.5,1.0,1.0,0.5
12347.0,367.0,2.0,182.0,4310.0,5.04,249.6,17.0,13.505495,12.0,240.0,2.0,0.0,0.0,103.0,0.565934
12348.0,358.0,75.0,31.0,1797.24,13.2,240.0,41.76,75.516129,72.0,144.0,1.0,0.0,0.0,22.0,0.709677
12349.0,18.0,18.0,73.0,1757.55,6.64,300.0,17.7,8.643836,6.0,36.0,1.0,0.0,0.0,73.0,1.0
12350.0,310.0,310.0,17.0,334.4,8.5,40.0,19.8,11.588235,12.0,24.0,1.0,0.0,0.0,17.0,1.0


In [36]:
# Days between a customer's first and last transaction.
customer_rfm['time_as_customer'] = customer_rfm['time_since_first_purchase'].sub(
    customer_rfm['time_since_last_purchase']
)

In [37]:
# Confirm that the time_as_customer column was added.
customer_rfm.head(n=5)

Unnamed: 0_level_0,time_since_first_purchase,time_since_last_purchase,frequency,total_revenue,min_spent,max_spent,median_spent,mean_quantity,median_quantity,max_quantity,min_quantity,cancelled_mean,cancelled_num,unique_products,unique_products_perc,time_as_customer
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
12346.0,325.0,325.0,2.0,0.0,-77183.6,77183.6,0.0,0.0,0.0,74215.0,-74215.0,0.5,1.0,1.0,0.5,0.0
12347.0,367.0,2.0,182.0,4310.0,5.04,249.6,17.0,13.505495,12.0,240.0,2.0,0.0,0.0,103.0,0.565934,365.0
12348.0,358.0,75.0,31.0,1797.24,13.2,240.0,41.76,75.516129,72.0,144.0,1.0,0.0,0.0,22.0,0.709677,283.0
12349.0,18.0,18.0,73.0,1757.55,6.64,300.0,17.7,8.643836,6.0,36.0,1.0,0.0,0.0,73.0,1.0,0.0
12350.0,310.0,310.0,17.0,334.4,8.5,40.0,19.8,11.588235,12.0,24.0,1.0,0.0,0.0,17.0,1.0,0.0


In [38]:
# Persist the per-customer feature table (indexed by CustomerID) as a CSV file.
customer_rfm.to_csv(path_or_buf="customer_rfm2.csv")

In [39]:
# Persist the transaction-level data, now including time_since, as a CSV file.
# NOTE(review): the row index is written as well, which appears as an
# "Unnamed: 0" column when the file is read back - confirm whether downstream
# readers expect that column before changing it.
df.to_csv(path_or_buf="new_onlineretail.csv")