### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pySankey.sankey import sankey

# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option("display.max_columns", None)

# NOTE(review): wildcard imports hide where names come from. The helpers used
# later in this notebook (get_rfm_bin, save_csv_custom) are presumably defined
# in these project modules — confirm and import them by name.
from datavis_fun import *
from rfm_fun import *

import jupyter_black

# Activate the jupyter_black extension for this kernel session
# (presumably auto-formats executed cells with Black — confirm).
jupyter_black.load()

### Preparing dataframes to calculate RFM score

In [2]:
# Read the cleaned retail transactions (one row per invoice line) from CSV.
retail_csv_path = "retail_clean_2009-11.csv"
clean_df = pd.read_csv(retail_csv_path)

In [3]:
# Spend per invoice line: units bought multiplied by the unit price.
clean_df["total_exp"] = clean_df["quantity"].mul(clean_df["price"])

# Preview the rows ordered by quantity, then price, to eyeball the extremes.
clean_df.sort_values(by=["quantity", "price"])

Unnamed: 0,invoice,stockcode,description,quantity,invoicedate,price,customer_id,country,total_exp
31226,494914,PADS,PADS TO MATCH ALL CUSHIONS,1,2010-01-19 17:04:00,0.001,16705.0,United Kingdom,0.001
37835,496222,PADS,PADS TO MATCH ALL CUSHIONS,1,2010-01-29 13:53:00,0.001,13583.0,United Kingdom,0.001
39597,496473,PADS,PADS TO MATCH ALL CUSHIONS,1,2010-02-01 15:38:00,0.001,17350.0,United Kingdom,0.001
40671,496643,PADS,PADS TO MATCH ALL CUSHIONS,1,2010-02-03 11:58:00,0.001,13408.0,United Kingdom,0.001
46888,497935,PADS,PADS TO MATCH ALL CUSHIONS,1,2010-02-15 10:47:00,0.001,13408.0,United Kingdom,0.001
...,...,...,...,...,...,...,...,...,...
67632,501534,21099,SET/6 STRAWBERRY PAPER CUPS,12960,2010-03-17 13:09:00,0.100,13902.0,Denmark,1296.000
67634,501534,21091,SET/6 WOODLAND PAPER PLATES,12960,2010-03-17 13:09:00,0.100,13902.0,Denmark,1296.000
46921,497946,37410,BLACK AND WHITE PAISLEY FLOWER MUG,19152,2010-02-15 11:57:00,0.100,13902.0,Denmark,1915.200
316195,541431,23166,MEDIUM CERAMIC TOP STORAGE JAR,74215,2011-01-18 10:01:00,1.040,12346.0,United Kingdom,77183.600


In [4]:
# The timestamps stored in the CSV look like "2010-01-19 17:04:00" (dashes,
# with seconds), so the format string has to describe exactly that layout.
# The previous "%Y/%m/%d %H:%M" only worked because older pandas versions
# silently fell back to ISO parsing; pandas 2.x enforces the format and raises.
# The constant is also named so that it no longer shadows the built-in `format`.
INVOICE_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
clean_df["invoicedate"] = pd.to_datetime(
    clean_df["invoicedate"], format=INVOICE_DATE_FORMAT
)

In [5]:
# Sanity check after the conversions above: column dtypes (invoicedate should
# now be datetime64) and non-null counts per column.
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 582857 entries, 0 to 582856
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   invoice      582857 non-null  int64         
 1   stockcode    582857 non-null  object        
 2   description  582857 non-null  object        
 3   quantity     582857 non-null  int64         
 4   invoicedate  582857 non-null  datetime64[ns]
 5   price        582857 non-null  float64       
 6   customer_id  582857 non-null  float64       
 7   country      582857 non-null  object        
 8   total_exp    582857 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(2), object(3)
memory usage: 40.0+ MB


In [6]:
# Lookup table for RFM segments; the scoring loop below reads its `segment`,
# `description_eng` and `label_eng` columns.
rfm_labels_path = "rfm3366_labels_eng_ru.xlsx"
lab_df = pd.read_excel(rfm_labels_path)

### Calculating RFM scores 

In [7]:
from datetime import timedelta

# Recency is measured from a reference date placed this many days after the
# last invoice date available within each year.
RECENCY_OFFSET_DAYS = 7

# Build, save and print one RFM table per calendar year of invoices.
group = clean_df.groupby(clean_df["invoicedate"].dt.year)
for d_crit, gdf in group:
    print("Year:{}".format(d_crit))
    # Vectorised max on the datetime column, reduced to a calendar date
    # (instead of Python's max() over a column of date objects).
    set_date = gdf["invoicedate"].max().date()
    print("Last available date in a dataset: {}".format(set_date))
    set_date += timedelta(days=RECENCY_OFFSET_DAYS)
    print(
        "Date for counting recency: {} (+{} days)".format(
            set_date, RECENCY_OFFSET_DAYS
        )
    )
    # Midnight timestamp of the reference date, so the subtraction below stays
    # in datetime64/timedelta64 and the group itself is never mutated.
    recency_ref = pd.Timestamp(set_date)
    df = (
        gdf.groupby(gdf["customer_id"])
        .agg(
            # NOTE(review): this counts rows (invoice lines) per customer, not
            # distinct invoices — confirm this is the intended "frequency".
            frequency=("customer_id", "count"),
            monetary=("total_exp", "sum"),
            last_active=("invoicedate", "max"),
        )
        # Whole days between the reference date and the customer's last
        # purchase day; normalize() drops the time of day before subtracting.
        .assign(
            recency=lambda x: (recency_ref - x["last_active"].dt.normalize()).dt.days
        )
        .drop(columns=["last_active"])
        .sort_values(["recency", "frequency", "monetary"])
        .reset_index()
    )
    # Project helper; judging by the printed table it adds the R, F, M and RFM
    # score columns — see its definition for the binning rules.
    rfm_df = get_rfm_bin(
        df=df, r_vals="recency", fr_vals="frequency", mon_vals="monetary"
    )
    # Attach the English segment description and label. Joining on the key
    # Series themselves (rather than on column names) makes pandas add a
    # synthetic `key_0` column to the result, which is dropped right away.
    fin_df = rfm_df.merge(
        lab_df[["description_eng", "label_eng"]],
        left_on=rfm_df.RFM,
        right_on=lab_df.segment,
        how="left",
    ).drop(columns="key_0")
    fin_df["year"] = d_crit
    fin_df["RFM"] = fin_df["RFM"].astype(str)
    save_csv_custom(
        df=fin_df, filename=f"rfm_tab_{d_crit}", foldertosave="rfm_tables"
    )
    print(fin_df)

Year:2009
Last available date in a dataset: 2009-12-23
Date for counting recency: 2009-12-30 (+7 days)
Getting threshold quantile values from CURRENT dataframe
The dataframe was saved into d:\python\projects\rfm_retail\rfm_tables
     customer_id  frequency  monetary  recency  R  F  M  RFM  \
0        17819.0          4   1157.10        7  3  1  3  313   
1        18252.0         11    222.77        7  3  2  2  322   
2        18092.0         13   1483.26        7  3  2  3  323   
3        17867.0         17    110.55        7  3  2  1  321   
4        14151.0         22    328.65        7  3  2  2  322   
..           ...        ...       ...      ... .. .. ..  ...   
940      17984.0         40    181.49       29  1  3  1  131   
941      12758.0         42   1191.25       29  1  3  3  133   
942      13526.0         44   1182.00       29  1  3  3  133   
943      17611.0         67    393.84       29  1  3  3  133   
944      16393.0         91    259.90       29  1  3  2  132   

 