### Importing libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pySankey.sankey import sankey

# Show every column when a DataFrame is displayed instead of eliding the middle
# ones with "...".
pd.set_option("display.max_columns", None)

# Project-local helper modules, star-imported.
# NOTE(review): the helpers and colour constants used further down
# (get_csv_fnames_from_folder, lolipop_percent_plot, save_fig_custom, RED,
# TRAFFIC_Y, TRAFFIC_G, BG_WHITE, ...) are not defined in this notebook, so they
# presumably come from these two modules - confirm, and consider explicit imports.
from datavis_fun import *
from rfm_fun import *

import jupyter_black

# Activate the jupyter_black extension for this notebook session.
jupyter_black.load()

### Load data

In [None]:
# Folder holding the yearly RFM tables (one ';'-separated CSV per year).
# NOTE(review): absolute, machine-specific path - point this at your own copy of
# the data (ideally via a path relative to the project) before re-running.
RFM_TABLES_DIR = "D:\\python\\projects\\rfm_retail\\rfm_tables"

# Columns kept from every yearly table, in the order the plots below expect.
RFM_COLUMNS = [
    "customer_id",
    "recency",
    "frequency",
    "monetary",
    "RFM",
    "label_eng",
    "year",
]

filestoread = get_csv_fnames_from_folder(folder_path=RFM_TABLES_DIR, path=True)

print(filestoread)


# One DataFrame per CSV file, in the order the helper returned the files.
dfs_y = []
for f in filestoread:
    # Select the columns and cast the RFM code to text in a single chain.
    # `.assign` returns a new frame, so nothing is written into a column
    # selection of the freshly read table (the original
    # `d = d[[...]]; d["RFM"] = ...` pattern is what triggers pandas'
    # SettingWithCopyWarning when copy-on-write is not enabled).
    d = pd.read_csv(f, sep=";")[RFM_COLUMNS].assign(
        RFM=lambda table: table["RFM"].astype(str)
    )
    dfs_y.append(d)


dfs_y

### Comparison Graphs

In [None]:
# Colour -> segment labels drawn in that colour (red / yellow / green groups).
# This is the mapping handed to `get_key(label, color_dict)` by the plotting
# cells to look up the colour of a segment.
color_dict = dict(
    [
        (RED, ["outflow", "risk of outflow"]),
        (TRAFFIC_Y, ["drifting", "sleeping"]),
        (TRAFFIC_G, ["VIP", "loyal"]),
    ]
)

# Flat segment label -> colour lookup (the same pairing as above, inverted).
# NOTE(review): the "_san" suffix suggests this feeds the sankey diagram - confirm.
color_dict_san = dict(
    zip(
        ["outflow", "risk of outflow", "drifting", "sleeping", "loyal", "VIP"],
        [RED, RED, TRAFFIC_Y, TRAFFIC_Y, TRAFFIC_G, TRAFFIC_G],
    )
)

In [None]:
# Side-by-side lollipop charts: share of customers per RFM segment, one panel
# per yearly table.
# The number of panels follows the number of loaded tables instead of being
# fixed at 3, so the cell neither raises IndexError with more tables nor leaves
# empty panels with fewer. `squeeze=False` + `ravel()` keeps `axs` a 1-D array
# even when there is a single table.
fig, axs = plt.subplots(figsize=(25, 10), ncols=len(dfs_y), squeeze=False)
axs = axs.ravel()

for ax, yearly_df in zip(axs, dfs_y):
    lolipop_percent_plot(
        df=yearly_df,
        coltocount="label_eng",
        ax=ax,
        # Each table is expected to hold one year; if it holds several, only
        # the first unique value ends up in the title (extra format args are
        # ignored by str.format).
        ptitle="Percent of Observations by RFM Segment in {}".format(
            *yearly_df.year.unique()
        ),
    )

fig.tight_layout()

save_fig_custom(
    filename="seg_distr_2009-11", foldername="comparison_vis", figtosave=fig
)

In [None]:
# All yearly tables stacked into one frame so segments can be compared by year.
df_c = pd.concat(dfs_y)

# Group on the scalar column name, not a one-element list: with a list key,
# pandas >= 2.0 yields each group key as a 1-tuple such as ("VIP",), which makes
# the `cri != "VIP"` check always true and `cri.capitalize()` raise
# AttributeError. A scalar key yields the plain label string on every version.
grpby = df_c.groupby("label_eng")
cols = ["recency", "frequency", "monetary"]

# One figure per segment: three stacked violin panels (R, F, M), split by year.
for cri, df in grpby:
    fig, axs = plt.subplots(figsize=(40, 30), nrows=3)
    for ax, valcol in zip(axs, cols):
        vplot_mcat_onevar(
            df=df,
            catcol="year",
            valcol=valcol,
            ptitle="Distribution of {} Values by Year (Outliers removed)".format(
                valcol.capitalize()
            ),
            ax=ax,
            drop_outliers=True,
        )

    # "VIP" is an acronym: str.capitalize() would turn it into "Vip".
    ftitletext = cri if cri == "VIP" else cri.capitalize()
    fig.suptitle(
        "'{}' Segment".format(ftitletext),
        fontsize=40,
        fontfamily="Bahnschrift",
        fontweight="bold",
    )
    # Leave the top 5% of the figure free for the suptitle.
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    save_fig_custom(
        filename="{}_vals_distr".format(cri),
        foldername="comparison_vis",
        figtosave=fig,
    )

From these plots we can conclude that, on average, all three values (recency, frequency and monetary) increased in every segment compared to 2009. The segments in 2010 and 2011, however, are very similar to each other in terms of mean, median and distribution. In practice, this means that customers within each segment started, on average, to spend more money and to buy from the retail store's website more frequently.

### Yearly RFM summary graphs

In [None]:
# ---------------------------------------------------------------------------
# Yearly RFM dashboard: one large figure per yearly table, saved as
# "rfm_summary_<year>" in the "rfm_yearly_sum" folder.
# ---------------------------------------------------------------------------

# Every panel of the dashboard is placed on one shared 25-row x 10-column grid.
GRID_SHAPE = (25, 10)

# Parallel-coordinates panels, in drawing order:
# (segment label, panel title, line alpha, top-left grid cell).
# Each panel spans 2 rows x 3 columns in the top-centre block of the figure.
PARALLEL_PANELS = [
    ("outflow", "Outflow", 0.01, (0, 2)),
    ("risk of outflow", "Risk of Outflow", 0.01, (0, 5)),
    ("drifting", "Drifting", 0.01, (2, 2)),
    ("sleeping", "Sleeping", 0.01, (2, 5)),
    ("loyal", "Loyal", 0.01, (4, 2)),
    ("VIP", "VIP", 0.03, (4, 5)),
]

# Full-width violin panels for the non-VIP segments, in drawing order:
# (value column, panel title, first grid row). Each panel spans 4 rows.
VIOLIN_PANELS = [
    ("monetary", "Distribution of Total Sales by RFM Label (Outliers Removed)", 9),
    (
        "frequency",
        "Distribution of Frequency Values by RFM Label (Outliers Removed)",
        13,
    ),
    ("recency", "Distribution of Recency Values by RFM Label (Outliers Removed)", 17),
]


def _grid_ax(fig, loc, rowspan, colspan):
    """Create one dashboard panel on `fig` at grid cell `loc` with the given span."""
    return plt.subplot2grid(
        shape=GRID_SHAPE, loc=loc, rowspan=rowspan, colspan=colspan, fig=fig
    )


for yearly_df in dfs_y:
    # Year(s) present in this table; only the first value is used in the titles.
    year = yearly_df.year.unique()
    # `drop` returns a new frame, so the table stored in `dfs_y` is left untouched.
    vis_df = yearly_df.drop(columns=["year", "customer_id"])

    # A bare figure: `plt.subplots()` would also create a full-figure Axes that
    # is never drawn on and, on recent matplotlib versions, stays visible behind
    # the grid panels.
    fig = plt.figure(figsize=(80, 60))
    fig.set_facecolor(BG_WHITE)

    # ---- Panel layout ------------------------------------------------------
    ax_percent = _grid_ax(fig, (0, 0), rowspan=6, colspan=2)  # segment share (%)
    ax_count = _grid_ax(fig, (0, 8), rowspan=6, colspan=2)  # segment counts
    ax_rm = _grid_ax(fig, (6, 0), rowspan=3, colspan=2)  # plot_RM scatter
    ax_rf = _grid_ax(fig, (6, 8), rowspan=3, colspan=2)  # plot_RF scatter
    ax_bins = _grid_ax(fig, (6, 2), rowspan=3, colspan=6)  # counts per RFM bin
    parallel_axes = [
        _grid_ax(fig, loc, rowspan=2, colspan=3) for *_, loc in PARALLEL_PANELS
    ]
    violin_axes = [
        _grid_ax(fig, (first_row, 0), rowspan=4, colspan=10)
        for *_, first_row in VIOLIN_PANELS
    ]
    ax_vip = _grid_ax(fig, (21, 0), rowspan=4, colspan=10)  # VIP-only violins

    # ---- Count / percent plots ----------------------------------------------
    lolipop_percent_plot(
        df=vis_df,
        coltocount="label_eng",
        ax=ax_percent,
        ptitle="Percent of Observations by RFM Label",
        ticksize=30,
        axlabsize=30,
        textlabsize=30,
        ptitlesize=40,
    )
    plot_count_h(
        df=vis_df,
        col="label_eng",
        ax=ax_count,
        ptitle="Number of Observations by RFM label",
        ticksize=30,
        textlabsize=30,
        ptitlesize=40,
    )
    plot_count_v(
        vis_df,
        col="RFM",
        mycolor=TRAFFIC_Y,
        ax=ax_bins,
        ticksize=30,
        textlabsize=30,
        ptitle="Count of Observations per RFM Bin",
        ptitlesize=40,
    )

    # ---- Scatter plots -------------------------------------------------------
    plot_RM(
        data=vis_df,
        monvals_c="monetary",
        rvals_c="recency",
        ax=ax_rm,
        ticksize=20,
        axlabsize=20,
        ptitlesize=40,
    )
    plot_RF(
        data=vis_df,
        freqvals_c="frequency",
        rvals_c="recency",
        ax=ax_rf,
        ticksize=20,
        axlabsize=20,
        ptitlesize=40,
    )

    # ---- Parallel-coordinates plots, one per segment -------------------------
    for (label, title, alpha, _loc), ax in zip(PARALLEL_PANELS, parallel_axes):
        parallel_coordinates_custom(
            df=vis_df.loc[vis_df.label_eng == label],
            ax=ax,
            alpha_val=alpha,
            ptitle=title,
            color=get_key(label, color_dict),
            ptitlesize=40,
            ticktextsize=30,
            axsw=1.5,
            lw=5,
        )

    # ---- Violin plots: R / F / M by segment, VIP excluded --------------------
    for (valcol, title, _first_row), ax in zip(VIOLIN_PANELS, violin_axes):
        vplot_mcat_onevar(
            # A fresh slice per call, as in the original, in case the helper
            # modifies the frame it receives.
            df=vis_df.loc[vis_df.label_eng != "VIP"],
            catcol="label_eng",
            valcol=valcol,
            ptitle=title,
            ptitlesize=40,
            ticksize=30,
            textlabsize=30,
            ax=ax,
            drop_outliers=True,
        )

    # ---- Violin plots: R / F / M for the VIP segment only --------------------
    vplot_mvars(
        vis_df.loc[vis_df.label_eng == "VIP"],
        ticksize=30,
        textlabsize=30,
        ptitle="Distribution of R/F/M Values for VIP Label (Outliers Removed)",
        ptitlesize=40,
        vars=["recency", "monetary", "frequency"],
        ax=ax_vip,
        drop_outliers=True,
    )

    # Leave the top 5% of the figure free for the suptitle added next.
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    fig.suptitle(
        "RFM Analysis Visualization for Online Retail Store ({})".format(*year),
        fontsize=60,
        fontfamily="Bahnschrift",
    )
    save_fig_custom(
        filename="rfm_summary_{}".format(*year),
        foldername="rfm_yearly_sum",
        figtosave=fig,
    )