<a href="https://colab.research.google.com/github/kasprova/advanced_customer_analytics/blob/master/notebooks/markov_chain_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## MARKOV CHAIN MODEL (update: 2019-10-26)

26 Oct,2019: #clusters = 25

1. Calculate the **transition matrix** (averaged over 12 months, rather than tracking the full history of each household): count how many households moved from each cluster to every other cluster, then convert the counts into the probability of moving to another segment

2. Calculate **average revenue** for each cluster

3. Calculate **average churn score** for each cluster

4. Prediction: `transition_matrix^(number of months) * revenue * (1 - churn_score)` = `transition_matrix^(number of months) * revenue * ave_retention_score`

5. Evaluation: compare the estimated values with the actual figures (2nd year of transaction data)

In [0]:
# Load libraries and set the display defaults used by the whole notebook
import numpy as np
import pandas as pd
# show every column when a DataFrame is rendered
pd.set_option('display.max_columns', None)
# render floats with thousands separators and two decimals (fixed width of 20)
pd.options.display.float_format = '{:20,.2f}'.format

import matplotlib.pyplot as plt
%matplotlib inline 
plt.style.use('ggplot')
#plt.style.use('default')
# default size for every figure
plt.rcParams["figure.figsize"] = (10,5)

In [16]:
# Load data from Google Drive

# Mount Google Drive at /content/drive so the data files can be read
# (google.colab is only available inside a Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
# List the files available in the Drive analytics folder
!ls drive/My\ Drive/analytics/

cluster_labels_25clstrs_191026.parquet
cluster_labels_50clstrs_191020.parquet
customer_analytics.zip
DATA.zip
scaled_df_kmeans_201509_201608.zip
universal_features_manual_cat_201509_201608__.zip
universal_features_manual_cat_201609_201709__.zip


In [4]:
# Cluster assignments from the 25-cluster run; the preview below shows the
# month_id, household_id and labels columns
df = pd.read_parquet("drive/My Drive/analytics/cluster_labels_25clstrs_191026.parquet")
df.head(2)

Unnamed: 0,month_id,household_id,labels
0,201509,100200513,5
1,201509,100212142,6


### 1. Transition Matrix (average of a year-long-range)

In [0]:
# Number of distinct cluster labels present in the data; sizes every per-cluster matrix below
NUM_UNIQUE_CLUSTERS = len(df['labels'].unique())

In [0]:
def get_history_of_cluster_move(df):
  """Pivot the long cluster-label table into one row per household.

  Parameters
  ----------
  df : pd.DataFrame
    Long-format table with `month_id`, `household_id` and `labels` columns,
    one row per (month, household). Any other columns are ignored, so extra
    columns can no longer produce duplicated month columns in the result.

  Returns
  -------
  pd.DataFrame
    A `household_id` column followed by one column per month_id (sorted),
    holding the household's cluster label in that month. The value is NaN
    where the household has no row for that month.

  Raises
  ------
  ValueError
    Raised by `unstack` when a (household_id, month_id) pair occurs more
    than once.
  """
  # select `labels` explicitly so the pivot has exactly one value per cell
  labels_by_key = df.set_index(['household_id', 'month_id'])['labels']
  history_cluster_move = labels_by_key.unstack(level='month_id')
  # drop the axis name so the month ids read as plain column labels
  history_cluster_move.columns.name = None

  return history_cluster_move.reset_index()

# wide household-by-month table of cluster labels; input for the transition counts below
history_cluster_move = get_history_of_cluster_move(df)

In [0]:
def get_transition_matrix(df, month_id1, month_id2, num_unique_clusters = NUM_UNIQUE_CLUSTERS):
  """Count households moving between clusters from one month to another.

  Parameters
  ----------
  df : pd.DataFrame
    Wide history table with one column of cluster labels per month
    (as returned by `get_history_of_cluster_move`).
  month_id1, month_id2 : column labels of `df`
    Origin month and destination month.
  num_unique_clusters : int
    Size K of the matrix. Clusters are assumed to be labelled 0..K-1.

  Returns
  -------
  pd.DataFrame
    K x K float matrix with a default row index and columns 0..K-1.
    Entry [i, j] is the number of households in cluster i in `month_id1`
    and in cluster j in `month_id2`; pairs that were never observed are 0.
    Households without a label in either month are not counted, and
    labels outside 0..K-1 are ignored.
  """
  # size everything from the parameter (the previous version used the global
  # NUM_UNIQUE_CLUSTERS here, which broke any non-default matrix size)
  cluster_ids = np.arange(0, num_unique_clusters, 1)

  # a household missing from either month cannot be counted as a mover
  pairs = df[[month_id1, month_id2]].dropna()
  origin = pairs[month_id1].values.astype(int)
  destination = pairs[month_id2].values.astype(int)

  # keep only labels that fit inside the K x K matrix
  in_range = (
    (origin >= 0) & (origin < num_unique_clusters) &
    (destination >= 0) & (destination < num_unique_clusters)
  )

  # unbuffered add so repeated (origin, destination) pairs are all counted
  counts = np.zeros((num_unique_clusters, num_unique_clusters))
  np.add.at(counts, (origin[in_range], destination[in_range]), 1)

  return pd.DataFrame(counts, columns = cluster_ids)

In [0]:
# Build one transition matrix per pair of consecutive months
# (12 months of history -> 11 matrices), keyed by (month_id1, month_id2).
# Sort explicitly: unique() returns values in order of appearance, which is
# chronological only if df happens to be ordered by month.
month_ids = np.sort(df.month_id.unique())
month_id1_list = month_ids[:-1]
month_id2_list = month_ids[1:]
transition_matrix_dict = {
  (month_id1, month_id2): get_transition_matrix(df = history_cluster_move, month_id1 = month_id1, month_id2 = month_id2)
  for month_id1, month_id2 in zip(month_id1_list, month_id2_list)
}

In [9]:
# Average the monthly transition-count matrices element-wise and round to whole households
monthly_matrices = list(transition_matrix_dict.values())
n = len(monthly_matrices)   # number of matrices
s = sum(monthly_matrices)   # element-wise sum of matrices

transition_matrix = (s/n).round(0).astype(int)
transition_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
0,98,43,1,3,33,112,17,32,121,10,42,25,71,53,58,7,1,20,22,20,107,50,0,185,62
1,40,398,2,4,56,169,21,49,212,30,58,44,91,67,111,24,2,36,74,43,137,137,1,396,81
2,0,2,354,75,1,21,47,1,4,3,1,1,2,1,1,319,10,2,33,11,2,74,4,10,1
3,2,5,80,364,3,43,68,2,14,8,4,2,5,3,5,264,5,3,58,155,8,169,16,34,6
4,33,55,1,3,341,153,21,30,143,15,40,28,79,51,76,12,1,29,36,25,129,89,0,238,57
5,108,173,22,40,153,4774,322,119,368,71,134,34,309,207,205,244,11,87,423,251,433,956,2,52,254
6,16,21,46,68,23,327,2985,22,5,23,26,3,34,24,31,260,12,14,52,113,40,8,1,0,31
7,33,46,1,3,34,122,24,228,126,13,52,26,82,53,65,11,1,17,22,19,103,66,0,212,58
8,118,207,3,15,145,348,4,127,1093,73,143,195,291,218,227,64,6,104,248,107,463,528,1,1487,280
9,11,34,2,7,18,70,24,12,69,627,13,14,32,17,24,25,2,13,35,23,40,87,1,156,20


In [10]:
# Probability matrix: divide every row of counts by its row total, so each
# origin cluster's row holds the probabilities of moving to every cluster.
# DataFrame.div(..., axis=0) aligns the totals on the row index; the previous
# `Series[:, None]` indexing is deprecated in pandas 1.x and an error in pandas 2.
row_totals = transition_matrix.sum(axis=1)
transition_matrix_prob = transition_matrix.div(row_totals, axis=0)
# a cluster with no observed households gives a 0/0 row -> fill NaN with 0
transition_matrix_prob = transition_matrix_prob.fillna(0)
transition_matrix_prob.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
0,0.08,0.04,0.0,0.0,0.03,0.09,0.01,0.03,0.1,0.01,0.04,0.02,0.06,0.04,0.05,0.01,0.0,0.02,0.02,0.02,0.09,0.04,0.0,0.16,0.05
1,0.02,0.17,0.0,0.0,0.02,0.07,0.01,0.02,0.09,0.01,0.03,0.02,0.04,0.03,0.05,0.01,0.0,0.02,0.03,0.02,0.06,0.06,0.0,0.17,0.04
2,0.0,0.0,0.36,0.08,0.0,0.02,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33,0.01,0.0,0.03,0.01,0.0,0.08,0.0,0.01,0.0
3,0.0,0.0,0.06,0.27,0.0,0.03,0.05,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.04,0.12,0.01,0.13,0.01,0.03,0.0
4,0.02,0.03,0.0,0.0,0.2,0.09,0.01,0.02,0.08,0.01,0.02,0.02,0.05,0.03,0.05,0.01,0.0,0.02,0.02,0.01,0.08,0.05,0.0,0.14,0.03


### 2. Average Revenue per cluster


In [0]:
# Unpack the first-year (201509-201608) feature archive from Drive into the working directory (-q: quiet)
!unzip -q "drive/My Drive/analytics/universal_features_manual_cat_201509_201608__.zip"

In [0]:
# Monthly household features for 201509-201608, joined with each household's cluster label
universal_features = "universal_features_manual_cat_201509_201608__.parquet"
universal_features_df = pd.read_parquet(universal_features)

join_keys = ['month_id', 'household_id']
feature_columns = universal_features_df[join_keys + ['monetary', 'churn']]
cluster_columns = df[join_keys + ['labels']]

# left join: keep every feature row, even when it has no cluster label
data = feature_columns.merge(cluster_columns,
                             how = 'left',
                             left_on = join_keys,
                             right_on = join_keys)

In [20]:
# preview the merged features + cluster labels
data.head(2)

Unnamed: 0,month_id,household_id,monetary,churn,labels
0,201509,100200513,48.44,0,5
1,201509,100212142,342.95,0,6


In [21]:
# total revenue (sum of `monetary`) per month and cluster
monthly_keys = ['month_id', 'labels']
temp = data.groupby(monthly_keys)['monetary'].sum().reset_index()
temp.head(2)

Unnamed: 0,month_id,labels,monetary
0,201509,0,43684.49
1,201509,1,113546.81


In [22]:
# average, across months, of each cluster's monthly revenue total
ave_revenue = temp.groupby('labels')[['monetary']].mean()
ave_revenue.head(2)

Unnamed: 0_level_0,monetary
labels,Unnamed: 1_level_1
0,56074.56
1,163016.39


### 3. Average Retention Score = (1-churn_score)

In [23]:
# same merged frame as above; the `churn` flag drives the retention score below
data.head(2)

Unnamed: 0,month_id,household_id,monetary,churn,labels
0,201509,100200513,48.44,0,5
1,201509,100212142,342.95,0,6


In [0]:
# Retention score per (month, cluster) = 1 - churned households / households in the cluster
group_keys = ['month_id', 'labels']

# households per month and cluster
cluster_sizes = (
  data[['month_id','labels','household_id']]
    .groupby(group_keys)
    .count()
    .reset_index()
    .rename(columns={'household_id': 'num_ppl_in_cluster'})
)

# households flagged as churned (churn == 1) per month and cluster
churned_rows = data.loc[data['churn']==1, ['month_id','labels','churn']]
churned_counts = (
  churned_rows
    .groupby(group_keys)
    .count()
    .reset_index()
    .rename(columns={'churn': 'num_churned'})
)

# left join keeps groups without any churned household; their count becomes 0
temp = cluster_sizes.merge(churned_counts,
                           how = 'left',
                           left_on = group_keys,
                           right_on = group_keys)
temp['num_churned'] = temp['num_churned'].fillna(0)
temp['retention_score'] = (1 - temp['num_churned']/temp['num_ppl_in_cluster']).round(4)

In [25]:
# preview the per-month, per-cluster retention scores
temp.head(10)

Unnamed: 0,month_id,labels,num_ppl_in_cluster,num_churned,retention_score
0,201509,0,1686,0.0,1.0
1,201509,1,2627,0.0,1.0
2,201509,2,558,0.0,1.0
3,201509,3,817,0.0,1.0
4,201509,4,2051,0.0,1.0
5,201509,5,10935,0.0,1.0
6,201509,6,4485,0.0,1.0
7,201509,7,1949,1.0,1.0
8,201509,8,7857,0.0,1.0
9,201509,9,1428,0.0,1.0


In [0]:
# average retention score of each cluster across all months
ave_retention_score = (
  temp.groupby('labels')[['retention_score']]
    .mean()
    .rename(columns={'retention_score': 'ave_retention_score'})
)

In [27]:
# preview the average retention score per cluster
ave_retention_score.head(10)

Unnamed: 0_level_0,ave_retention_score
labels,Unnamed: 1_level_1
0,0.9
1,0.9
2,1.0
3,1.0
4,0.9
5,1.0
6,1.0
7,0.91
8,1.0
9,0.93


### 4. Prediction

In [0]:
# Revenue prediction: column 0 holds the average revenue per cluster; every further
# column applies one monthly step, i.e. the retained share of the previous column's
# revenue redistributed across clusters by the transition probabilities.
retention_by_cluster = ave_retention_score.values.flatten()
step_probabilities = transition_matrix_prob.values

revenue_prediction_matrix = np.zeros(shape=(NUM_UNIQUE_CLUSTERS,14))
revenue_prediction_matrix[:,0] = ave_revenue.values.flatten()
for month_idx in range(1,14):
  retained_revenue = revenue_prediction_matrix[:,month_idx-1]*retention_by_cluster
  revenue_prediction_matrix[:,month_idx] = retained_revenue@step_probabilities

In [33]:
# 14 months prediction: one column label per predicted month, 201608 through 201709
month_id_list = [
  '201608','201609','201610','201611','201612',
  '201701','201702','201703','201704','201705',
  '201706','201707','201708','201709',
]
cluster_index = np.arange(NUM_UNIQUE_CLUSTERS)
revenue_prediction_df = pd.DataFrame(data = revenue_prediction_matrix,
                                     index = cluster_index,
                                     columns = month_id_list)
revenue_prediction_df

Unnamed: 0,201608,201609,201610,201611,201612,201701,201702,201703,201704,201705,201706,201707,201708,201709
0,56074.56,115341.94,140598.45,151451.25,155148.27,155041.03,152854.85,149532.74,145607.78,141387.03,137049.21,132698.75,128396.55,124177.73
1,163016.39,249104.8,285735.09,300753.75,304826.43,302971.49,297836.83,290901.58,283016.25,274677.64,266178.24,257690.74,249316.74,241115.26
2,671771.19,450994.35,334087.78,265617.66,223246.18,195925.44,177526.27,164485.71,154700.16,146921.62,140407.78,134715.53,129578.88,124836.75
3,503307.64,426096.68,354779.74,302980.5,267340.77,242677.98,225001.06,211677.65,201068.89,192178.4,184405.58,177388.47,170907.67,164828.77
4,91618.38,171802.75,208331.84,223945.47,229095.15,228731.41,225378.52,220402.16,214569.38,208320.48,201911.14,195490.54,189145.55,182926.19
5,1258264.65,1415334.6,1495607.53,1517156.61,1504852.47,1474809.78,1436068.26,1393468.27,1349556.75,1305660.91,1262463.37,1220305.8,1179348.95,1139657.46
6,796139.22,838941.61,841273.06,821071.21,790296.0,755896.98,721502.2,688784.67,658353.21,630280.27,604396.44,580446.02,558163.91,537309.47
7,69976.55,136221.8,167003.76,180390.42,184994.17,184948.73,182375.53,178425.98,173747.56,168711.94,163534.87,158342.17,153207.03,148171.61
8,650783.11,812793.88,878739.84,904747.89,908226.42,898581.69,881361.04,859877.85,836122.84,811297.3,786127.92,761054.11,736340.23,712143.14
9,173454.84,183770.1,189745.12,191630.76,190556.96,187556.81,183386.62,178552.64,173378.39,168065.05,162735.36,157463.02,152291.37,147245.15


In [0]:
#prediction: Sept 2016
#pred1 = pd.DataFrame(transition_matrix_prob.values*ave_retention_score.values*ave_revenue.values, columns = np.arange(0,50,1))
#pred1

In [36]:
# Cluster size prediction: column 0 is the number of households per cluster
# (row totals of the average transition counts); every further column moves
# the previous column's households through the transition probabilities.
size_step_probabilities = transition_matrix_prob.values

cluster_size_prediction_matrix = np.zeros(shape=(NUM_UNIQUE_CLUSTERS,14))
cluster_size_prediction_matrix[:,0] = transition_matrix.sum(axis = 1)
for month_idx in range(1,14):
  previous_sizes = cluster_size_prediction_matrix[:,month_idx-1]
  cluster_size_prediction_matrix[:,month_idx] = previous_sizes@size_step_probabilities

# display as whole households
pd.DataFrame(cluster_size_prediction_matrix,
             columns = month_id_list,
             index = np.arange(NUM_UNIQUE_CLUSTERS)).astype(int)

Unnamed: 0,201608,201609,201610,201611,201612,201701,201702,201703,201704,201705,201706,201707,201708,201709
0,1193,1166,1145,1126,1109,1093,1078,1063,1048,1034,1020,1007,994,981
1,2283,2262,2226,2190,2156,2125,2095,2066,2038,2010,1984,1957,1932,1906
2,980,978,980,980,976,968,959,949,938,927,916,904,893,881
3,1326,1341,1340,1334,1324,1311,1297,1282,1267,1251,1235,1220,1204,1189
4,1685,1713,1694,1669,1644,1620,1597,1575,1554,1533,1513,1493,1473,1454
5,9752,9795,9786,9720,9624,9513,9396,9277,9157,9039,8921,8805,8691,8577
6,4185,4178,4173,4163,4146,4121,4090,4053,4013,3970,3925,3879,3832,3785
7,1417,1395,1370,1348,1327,1308,1289,1272,1254,1238,1221,1205,1189,1174
8,6495,6585,6460,6346,6244,6151,6064,5980,5898,5819,5741,5665,5591,5517
9,1376,1373,1359,1342,1324,1306,1288,1271,1254,1237,1221,1205,1189,1173


In [0]:
#QA
#transition_matrix/np.sum(transition_matrix, axis=1)[:,None]
#is equal to:
#for i in range(transition_matrix.shape[0]):
#    total = sum(transition_matrix.iloc[i,:])
#    for j in range(transition_matrix.shape[1]): 
#         transition_matrix.iloc[i,j] = transition_matrix.iloc[i,j]/total

#### References:
1. https://medium.com/jbennetcodes/how-to-rewrite-your-sql-queries-in-pandas-and-more-149d341fc53e