# Author = Alexandros Ioannidis

# Import packages

In [1]:
import numpy as np  
import pandas as pd
from numpy import genfromtxt
import functools
import csv

In [2]:
import warnings
# Suppress DeprecationWarning messages for the remainder of the run so they do
# not clutter the printed results.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

#  1. GLOBAL CONSTANTS  
# Application and Run Type constants

In [3]:
# Which partition of the customer data this run processes: "TRAINING" or "TEST".
RUN_TYPE = "TRAINING"
RUN_TITLE = "BLOOD TRANSFUSION RFM " + RUN_TYPE + " DATASET"

# --- Data file names ---------------------------------------------------------
CustMasterDSFile = "transfusion.data.full.txt"
TrainingDSFile = "transfusion.trainingdata.txt"
TestDSFile = "transfusion.testdata.txt"

# --- Data frame column names -------------------------------------------------
idColName = "ID"
recencyColName = "Recency (months)"
frequencyColName = "Frequency (times)"
monetaryColName = "Monetary (cc)"
timesColName = "Time (months)"
churnColName = "Churn (0/1)"
rScoreColName = "R-Score"
fScoreColName = "F-Score"
mScoreColName = "M-Score"
rfmScoreColName = "RFM-Score"

# --- RFM segmentation and scoring --------------------------------------------
rfmRecencyClusters = 5
rfmFrequencyClusters = 5
rfmMonetaryClusters = 5

# When True, the segmentation matrices are taken from CUSTSEGMMATRICE_LST
# instead of being computed from the data.
RFM_MATRICES_PROVIDED = True

# Each matrix is a flat, row-major list that is later reshaped to 3 columns:
# (Cluster No, Start Value, Score).
RECENCY = [
    1, 0, 5,
    2, 3, 4,
    3, 4, 3,
    4, 11, 2,
    5, 16, 1,
]
FREQUENCY = [
    1, 1, 1,
    2, 2, 2,
    3, 3, 3,
    4, 5, 4,
    5, 8, 5,
]
MONETARY = [
    1, 250, 1,
    2, 500, 2,
    3, 750, 3,
    4, 1250, 4,
    5, 2000, 5,
]
CUSTSEGMMATRICE_LST = [RECENCY, FREQUENCY, MONETARY]

# Weights applied to the R, F and M scores when they are summed into RFM-Score.
rfmCoefficients = [1, 1, 1]

#  2. FUNCTIONS   
#### 2.1 BUILD CLUSTER MATRIX FOR AN ATTRIBUTE ###

In [4]:
# This function creates a Cluster Matrix for the specified Feature column, after sorting the Feature column in the requested direction (sortDesc). The Cluster Matrix describes the StartValue and Score for each cluster ID.
def createFeatureClusterMatrix(featureColumn, noOfClusters, sortDesc):
    """Build the cluster (segmentation) matrix for one feature column.

    The values are sorted (descending when ``sortDesc`` is true, ascending
    otherwise) and cut into ``noOfClusters`` groups of roughly equal size.
    After each cut the position is advanced past the run of values equal to
    the cluster's last value, so the next cluster starts on a different value.
    Clusters that end up empty because of such runs are dropped.

    Args:
        featureColumn: Iterable of mutually comparable values (list, NumPy
            array, pandas Series, ...).
        noOfClusters: Requested number of clusters.
        sortDesc: True when larger values should receive the higher scores,
            False when smaller values should.

    Returns:
        A float ``numpy.ndarray`` of shape ``(k, 3)`` with ``k <= noOfClusters``
        and columns (ClusterID, StartValue, Score). StartValue is the smallest
        value of the cluster; Score is ``noOfClusters - ClusterID + 1``. Rows
        are sorted ascending by StartValue, then Score. Empty input or a
        non-positive ``noOfClusters`` yields an array of shape ``(0, 3)``.
    """
    values = sorted(featureColumn, reverse=bool(sortDesc))
    total = len(values)
    if total == 0 or noOfClusters < 1:
        return np.zeros((0, 3))

    clusterMatrix = np.zeros((noOfClusters, 3))
    clusterElements = round(total / noOfClusters)  # ideal number of elements per cluster
    lastValue = values[-1]

    # Positions are kept 1-based (position p is values[p - 1]) so that
    # "clusterID * clusterElements" is directly the ideal last position of a cluster.
    dataPos = 1
    noMoreClustersToProcess = False

    for clusterID in range(1, noOfClusters + 1):
        row = clusterMatrix[clusterID - 1]
        idealEnd = clusterID * clusterElements
        isLastCluster = clusterID == noOfClusters

        # Skip a whole cluster when the data is exhausted, or when a run of equal
        # values already carried the position past this cluster's ideal end.
        if noMoreClustersToProcess or (dataPos > idealEnd and not isLastCluster):
            row[0] = -1  # marks the row for removal below
            continue

        startValue = values[dataPos - 1]
        # The last cluster absorbs everything that is left; rounding can push the
        # ideal end of earlier clusters past the data, hence the clamp.
        dataPos = total if isLastCluster else min(idealEnd, total)
        endValue = values[dataPos - 1]

        row[0] = clusterID
        # Store the lower bound of the cluster whichever way the data is sorted.
        row[1] = endValue if sortDesc else startValue
        row[2] = noOfClusters - clusterID + 1

        # Advance to the first value that differs from endValue; it becomes the
        # start of the next cluster.
        for pos in range(dataPos, total + 1):
            if values[pos - 1] != endValue:
                dataPos = pos
                break
        noMoreClustersToProcess = lastValue == endValue

    clusterMatrix = clusterMatrix[clusterMatrix[:, 0] > 0]  # drop skipped clusters
    # np.lexsort treats the LAST key as primary: StartValue first, then Score.
    order = np.lexsort((clusterMatrix[:, 2], clusterMatrix[:, 1]))
    return clusterMatrix[order]


### 2.2 CALCULATE SCORE FOR AN ATTRIBUTE VALUE ###

In [5]:
def featureScore(featureValue, featureClusterMatrix):
    """Return the score of the cluster that ``featureValue`` belongs to.

    The matrix rows are scanned from the last one to the first one; the first
    row whose start value (column position 1) is less than or equal to
    ``featureValue`` supplies the score (column position 2).

    Args:
        featureValue: The attribute value to score.
        featureClusterMatrix: pandas DataFrame whose columns are, by position,
            (Cluster No, Start Value, Score). It is presumably sorted ascending
            by start value -- the scan order relies on that; confirm at the
            producer.

    Returns:
        The matching row's score, or 0 when ``featureValue`` is below every
        start value (or the matrix has no rows).
    """
    # Every row is examined, including row 0: the earlier version offset the row
    # position by one and therefore could never return the first cluster's score.
    for rowPos in range(len(featureClusterMatrix) - 1, -1, -1):
        if featureValue >= featureClusterMatrix.iloc[rowPos, 1]:
            return featureClusterMatrix.iloc[rowPos, 2]
    return 0

### 2.2 CALCULATE SCORE FOR AN ATTRIBUTE VALUE ### specifically for R-score

In [60]:
def featureRScore(featureValue, featureClusterMatrix):
    """Return the recency score of the cluster that ``featureValue`` belongs to.

    All matrix rows are scanned from the last one to the first one; the first
    row whose start value (column position 1) is less than or equal to
    ``featureValue`` supplies the score (column position 2).

    Args:
        featureValue: The recency value to score.
        featureClusterMatrix: pandas DataFrame whose columns are, by position,
            (Cluster No, Start Value, Score).

    Returns:
        The matching row's score, or 0 when ``featureValue`` is below every
        start value (or the matrix has no rows).
    """
    # Positional .iloc access replaces DataFrame.ix, which no longer exists in
    # pandas 1.0 and later.
    for rowPos in reversed(range(len(featureClusterMatrix))):
        if featureValue >= featureClusterMatrix.iloc[rowPos, 1]:
            return featureClusterMatrix.iloc[rowPos, 2]
    return 0

# 2.3 CALCULATE MOVING AVERAGE  

In [120]:
def calcMovingAverage(v, m):
    """Return the centred moving average of ``v`` with half-window ``m``.

    Element ``i`` of the result is the mean of ``v[i - m] .. v[i + m]``
    (both ends included), with the window cut off at the first and last
    element of ``v``.

    Args:
        v: Indexable sequence of numbers supporting ``len(v)`` and ``v[j]``
            for ``j`` in ``0 .. len(v) - 1``. Elements are read with ``v[j]``;
            for a pandas Series with an integer index that is a label lookup,
            so pass a list (or a Series indexed 0..n-1 in the wanted order)
            when positional order matters.
        m: Non-negative number of neighbours taken on each side.

    Returns:
        A list of ``len(v)`` averages (empty for empty input).

    Raises:
        ValueError: If ``m`` is negative.
    """
    if m < 0:
        raise ValueError("m must be non-negative")

    lastPos = len(v) - 1
    averages = []
    for i in range(lastPos + 1):
        # 0-based, inclusive window clamped to the sequence bounds.
        startPos = max(0, i - m)
        endPos = min(lastPos, i + m)
        windowTotal = 0
        for j in range(startPos, endPos + 1):
            windowTotal += v[j]
        averages.append(windowTotal / (endPos - startPos + 1))
    return averages

### 2.4  PARTITION CUSTOMER DATA SET TO TRAINING & TEST DATA SETS ###

In [8]:
def partitionCustomerDSToTrainingTestDS(CustMasterDSFile, TrainingDSFile, TestDSFile, RunType):
    """Split the customer master file into a training file and a test file.

    The master CSV is read with its header row, a running-number ID column is
    put in front, rows with ID <= 500 are written to ``TrainingDSFile`` and the
    remaining rows to ``TestDSFile`` (both comma separated, without the index).

    Args:
        CustMasterDSFile: Path of the comma-separated master data file.
        TrainingDSFile: Path the training partition is written to.
        TestDSFile: Path the test partition is written to.
        RunType: "TRAINING" selects the training file as the result; any other
            value selects the test file.

    Returns:
        The path of the file the current run should process.
    """
    print("partitionCustomerDSToTrainingTestDS()")

    masterFrame = pd.read_csv(CustMasterDSFile, sep=",", header=0)
    # Autonumber column that serves as the customer id.
    masterFrame.insert(0, idColName, range(len(masterFrame)))

    belongsToTraining = masterFrame[idColName] <= 500
    masterFrame[belongsToTraining].to_csv(TrainingDSFile, sep=",", index=False)
    masterFrame[~belongsToTraining].to_csv(TestDSFile, sep=",", index=False)

    return TrainingDSFile if RunType == "TRAINING" else TestDSFile

### 2.6 RFM DATASET PREPARATION ###  RunFile schema : (ID, Recency, Frequency, Monetary, Time, Churn)

In [9]:
def prepareRFMTCdataset(RunFile):
    """Load a partition file and shape it into the RFMTC working data frame.

    The file's first column is replaced by a fresh running-number ID, the next
    five columns are kept and relabelled with the module's column-name
    constants, and four score columns initialised to 0 are appended.

    Args:
        RunFile: Path of a comma-separated file with a header row and the
            schema (ID, Recency, Frequency, Monetary, Time, Churn).

    Returns:
        A new DataFrame with the ID, R, F, M, T, C columns followed by the
        R-Score, F-Score, M-Score and RFM-Score columns.

    Raises:
        KeyError: If the file has no ID column.
    """
    print("prepareRFMTCdataset()")
    rawDF = pd.read_csv(RunFile, sep=",", header=0)

    # Explicit schema check; previously this only happened as a side effect of
    # computing an otherwise unused length from the ID column.
    if idColName not in rawDF.columns:
        raise KeyError(idColName)

    # Keep the five data columns and regenerate the ids as a running number.
    rfmtcDF = rawDF.iloc[:, 1:6].copy()
    rfmtcDF.insert(0, idColName, range(len(rfmtcDF)))
    rfmtcDF.columns = [
        idColName,
        recencyColName,
        frequencyColName,
        monetaryColName,
        timesColName,
        churnColName,
    ]

    # Blank score columns, filled in later by the scoring step.
    for scoreColName in (rScoreColName, fScoreColName, mScoreColName, rfmScoreColName):
        rfmtcDF[scoreColName] = 0
    return rfmtcDF

### 2.7 CALCULATE & STORE RFM SEGMENTATION MATRICES ###

In [11]:
def calcRFMSegmentationMatrices(rfmtcReadyDF):
    """Obtain the R, F and M segmentation matrices and store them as CSV files.

    When ``RFM_MATRICES_PROVIDED`` is false the matrices are computed from the
    data with ``createFeatureClusterMatrix`` (recency sorted ascending,
    frequency and monetary sorted descending); otherwise they are taken from
    ``CUSTSEGMMATRICE_LST``. Each matrix is written without header or index to
    RECENCY_/FREQUENCY_/MONETARY_SEGMENTATION_MATRIX.csv.

    Args:
        rfmtcReadyDF: DataFrame holding the recency, frequency and monetary
            columns; only read when the matrices have to be computed.

    Returns:
        None. The result is the three files written to the working directory.
    """
    print("calcRFMSegmentationMatrices()")

    if not RFM_MATRICES_PROVIDED:
        matrices = [
            createFeatureClusterMatrix(rfmtcReadyDF[recencyColName], rfmRecencyClusters, False),
            createFeatureClusterMatrix(rfmtcReadyDF[frequencyColName], rfmFrequencyClusters, True),
            createFeatureClusterMatrix(rfmtcReadyDF[monetaryColName], rfmMonetaryClusters, True),
        ]
    else:
        matrices = [CUSTSEGMMATRICE_LST[0], CUSTSEGMMATRICE_LST[1], CUSTSEGMMATRICE_LST[2]]

    outputFiles = (
        "RECENCY_SEGMENTATION_MATRIX.csv",
        "FREQUENCY_SEGMENTATION_MATRIX.csv",
        "MONETARY_SEGMENTATION_MATRIX.csv",
    )
    for matrix, outputFile in zip(matrices, outputFiles):
        # reshape(-1, 3) accepts flat lists as well as ready-made n x 3 arrays and
        # does not assume exactly five clusters.
        rows = np.array(matrix).reshape(-1, 3)
        pd.DataFrame(rows).to_csv(outputFile, sep=",", index=False, header=False)

# make my own paste() method for python equivalent to paste() in R ## Start ##

In [12]:
def reduce_concat(x, sep=""):
    """Concatenate the string form of every element of ``x``.

    Args:
        x: Iterable of arbitrary objects; each one is converted with ``str``.
        sep: String placed between consecutive elements.

    Returns:
        The joined string. A single element is returned as its string form and
        an empty iterable gives ``""`` (previously a lone element came back
        unconverted and an empty iterable raised ``TypeError``).
    """
    return sep.join(str(item) for item in x)

In [13]:
def paste(*lists, sep=" ", collapse=None):
    """Python counterpart of R's ``paste()`` built on ``reduce_concat``.

    The given sequences are walked in lockstep (stopping at the shortest one)
    and the elements of each step are joined with ``sep``.

    Args:
        *lists: Sequences to combine element by element.
        sep: Separator used inside each combined element.
        collapse: When not None, the combined elements are additionally joined
            into one value using this separator.

    Returns:
        A list with one combined entry per step, or the single collapsed value
        when ``collapse`` is given.
    """
    combined = [reduce_concat(group, sep=sep) for group in zip(*lists)]
    if collapse is None:
        return combined
    return reduce_concat(combined, sep=collapse)
                                                                     ## End ##

### 2.8 LOAD RFM SEGMENTATION MATRICES ###

In [14]:
def loadRFMSegmentationMatrices():
    """Read the three stored segmentation matrices back from disk.

    The header-less files RECENCY_, FREQUENCY_ and
    MONETARY_SEGMENTATION_MATRIX.csv are loaded from the working directory and
    their columns are labelled "Cluster No", "Start Value" and "Score".

    Returns:
        A list ``[recency, frequency, monetary]`` of DataFrames.
    """
    print("loadRFMSegmentationMatrices()")

    matrixFiles = (
        "RECENCY_SEGMENTATION_MATRIX.csv",
        "FREQUENCY_SEGMENTATION_MATRIX.csv",
        "MONETARY_SEGMENTATION_MATRIX.csv",
    )
    loadedMatrices = []
    for matrixFile in matrixFiles:
        matrix = pd.read_csv(matrixFile, sep=",", header=None)
        matrix.columns = ["Cluster No", "Start Value", "Score"]
        loadedMatrices.append(matrix)
    return loadedMatrices

### 2.9 CALCULATE R,F,M SCORES & RFM TOTAL SCORE ###

In [82]:
def rfmScoreCalculation(rfmtcReadyDF, SEGMMATRICE_LST):
    """Fill in the R, F, M scores and the weighted RFM total score.

    Recency values are scored with ``featureRScore`` and frequency/monetary
    values with ``featureScore``; a frequency or monetary score of 0 is raised
    to 1. The total is the sum of the three scores weighted by
    ``rfmCoefficients``.

    Args:
        rfmtcReadyDF: DataFrame with the recency, frequency and monetary
            columns. Its four score columns are overwritten in place.
        SEGMMATRICE_LST: ``[recency, frequency, monetary]`` segmentation
            matrices as accepted by the scoring functions.

    Returns:
        A copy of the frame sorted by R-Score, F-Score and M-Score, all in
        descending order.
    """
    print("rfmScoreCalculation()")

    # Scores are built in row order and assigned positionally, so the result does
    # not depend on the frame's index labels being 0..n-1.
    rScores = [
        featureRScore(value, SEGMMATRICE_LST[0])
        for value in rfmtcReadyDF[recencyColName]
    ]
    fScores = [
        featureScore(value, SEGMMATRICE_LST[1])
        for value in rfmtcReadyDF[frequencyColName]
    ]
    mScores = [
        featureScore(value, SEGMMATRICE_LST[2])
        for value in rfmtcReadyDF[monetaryColName]
    ]

    # A score of 0 means "no cluster matched"; such customers get the lowest score.
    fScores = [1 if score == 0 else score for score in fScores]
    mScores = [1 if score == 0 else score for score in mScores]

    rfmtcReadyDF[rScoreColName] = rScores
    rfmtcReadyDF[fScoreColName] = fScores
    rfmtcReadyDF[mScoreColName] = mScores
    rfmtcReadyDF[rfmScoreColName] = (
        rfmCoefficients[0] * rfmtcReadyDF[rScoreColName]
        + rfmCoefficients[1] * rfmtcReadyDF[fScoreColName]
        + rfmCoefficients[2] * rfmtcReadyDF[mScoreColName]
    )

    return rfmtcReadyDF.sort_values(
        by=[rScoreColName, fScoreColName, mScoreColName],
        ascending=[False, False, False],
    )

### 2.10 CALCULATE P[B] & RFM RESPONSE PROBABILITY & STORE RESULTS ###

In [113]:
def calculatePB_RespProb(rfmtcScoredDF, m):
    """Compute P[B] and the per-segment RFM response probability, then store them.

    The rows are sorted by RFM-Score (descending), recency (descending),
    frequency (ascending) and monetary (ascending). P[B] is the moving average
    of the churn column taken in that sorted order, and "RFM Resp Prob" is the
    mean P[B] of all rows sharing the row's RFM-Score. The result is written
    to ``"OUR_RFM_" + RUN_TYPE + ".csv"``.

    Args:
        rfmtcScoredDF: Scored DataFrame containing the churn, recency,
            frequency, monetary and RFM-Score columns. It is not modified.
        m: Window parameter forwarded unchanged to ``calcMovingAverage``.

    Returns:
        The sorted DataFrame with the added "P[B]" and "RFM Resp Prob" columns;
        the original index labels are kept.
    """
    print("calculatePB_RespProb()")  # m comes from RFMTC training set calculations

    rfmtcScoredDF = rfmtcScoredDF.sort_values(
        by=[rfmScoreColName, recencyColName, frequencyColName, monetaryColName],
        ascending=[False, False, True, True],
    )

    # Hand over a plain list and assign the resulting list back: both steps are
    # then positional, so the average really follows the sorted row order.
    # (Label-based access on the sorted Series would silently fall back to the
    # original row order.)
    churnInSortedOrder = rfmtcScoredDF[churnColName].tolist()
    rfmtcScoredDF["P[B]"] = calcMovingAverage(churnInSortedOrder, m)

    # select RFM-Score, avg(P[B]) from rfmtcScoredDF group by RFM-Score
    segmentResponseProb = rfmtcScoredDF.groupby(rfmScoreColName)["P[B]"].mean()

    # Look up every customer's segment probability by its RFM-Score.
    rfmtcScoredDF["RFM Resp Prob"] = rfmtcScoredDF[rfmScoreColName].map(segmentResponseProb)

    rfmtcScoredDF.to_csv("OUR_RFM_" + RUN_TYPE + ".csv", sep=",")
    return rfmtcScoredDF

### 3. MAIN PROGRAM ### ## PART A  convert transaction datasets to RFM-RFMTC customer datasets

In [121]:
print(RUN_TITLE, "-------------------------------", sep="\n")

# PART B: partition the customer dataset into training and test datasets and
# pick the file that matches the current run type.
RunFileName = partitionCustomerDSToTrainingTestDS(
    CustMasterDSFile,
    TrainingDSFile,
    TestDSFile,
    RUN_TYPE,
)

# PART C: RFM dataset preparation.
rfmtcReadyDF = prepareRFMTCdataset(RunFileName)

# The segmentation matrix files are only (re)written during a TRAINING run;
# any other run type reuses the files already on disk.
if RUN_TYPE == "TRAINING":
    calcRFMSegmentationMatrices(rfmtcReadyDF)

# Read the segmentation matrices from the external files.
SEGMMATRICE_LST = loadRFMSegmentationMatrices()

# Calculate R, F, M scores and the RFM total score.
rfmtcScoredDF = rfmScoreCalculation(rfmtcReadyDF, SEGMMATRICE_LST)

# Calculate P[B] and the RFM response probability, and store the results.
m = 4
rfmtcScoredDF = calculatePB_RespProb(rfmtcScoredDF, m)
print(rfmtcScoredDF)

BLOOD TRANSFUSION RFM TRAINING DATASET
-------------------------------
partitionCustomerDSToTrainingTestDS()
prepareRFMTCdataset()
calcRFMSegmentationMatrices()
loadRFMSegmentationMatrices()
rfmScoreCalculation()
calculatePB_RespProb()
      ID  Recency (months)  Frequency (times)  Monetary (cc)  Time (months)  \
35    35                 2                  8           2000             28   
61    61                 2                  8           2000             35   
8      8                 2                  9           2250             22   
46    46                 2                  9           2250             36   
140  140                 2                  9           2250             74   
12    12                 2                 10           2500             28   
62    62                 2                 10           2500             49   
109  109                 2                 10           2500             64   
50    50                 2                 11        