# Implementation of Dynamic Classification of Online Customers

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import sys  
import hmcb



In [3]:
# LOAD DATA
seq_file = str(in_dir + '\\seq_df.csv')
seq_df = pd.read_csv(seq_file)
seq_df.head()

Unnamed: 0,session_id_hash,outcome,click_seq,first_event
0,00000114e1075962f022114fcfc17f2d874e694ac5d201...,0,PDAPPPPPPPDPPDPPDP,P
1,00000277639fc5c6f816654b78bf3654ece7fd53a7338f...,0,PSPPPPP,P
2,000009f36a40de1d557afc083dbb3fc03eef2473337bad...,0,PP,P
3,00000e812c3076d18245710a31b348d3f23314b7d0dc90...,0,P,P
4,000010504025397b03290c7457e0e7ef7ae93529f21eae...,0,DPDPDPPDPPPDDPDPPDDPPDADPDPDPDPPPPDPAPPPPPDPPD...,D


## Split the data
Here, we first split the data into  train and test set. We then further split the train dataset into BUY sessions and NOBUY session, to estimate two separate MC.

In [4]:
seq_train, seq_test = train_test_split(seq_df, test_size=0.2, random_state=123)

In [5]:
BUY_train = seq_train[seq_train['outcome']==1]
NOBUY_train = seq_train[seq_train['outcome']!=1]

## Create first-order MC transition probability matrix from observed data

In [6]:
BUY_mc = hmcb.HMCB().from_pandas(BUY_train, 'click_seq')

In [7]:
NOBUY_mc = hmcb.HMCB().from_pandas(NOBUY_train, 'click_seq')

In [8]:
pd.DataFrame(BUY_mc.observed_p_matrix, columns=BUY_mc.signals, index=BUY_mc.signals)

Unnamed: 0,A,D,P,R,S
A,0.045491,0.044465,0.784391,0.106299,0.019354
D,0.092849,0.107597,0.785217,0.010475,0.003862
P,0.03426,0.220705,0.679666,0.04193,0.023438
R,0.034948,0.00099,0.444478,0.519464,0.00012
S,0.000495,0.00066,0.996371,0.000247,0.002227


In [9]:
pd.DataFrame(NOBUY_mc.observed_p_matrix, columns=NOBUY_mc.signals, index=NOBUY_mc.signals)

Unnamed: 0,A,D,P,R,S
A,0.049053,0.097175,0.746101,0.087913,0.019758
D,0.011827,0.113046,0.869389,0.001206,0.004532
P,0.006091,0.355495,0.603971,0.004197,0.030246
R,0.088345,0.00178,0.522342,0.387158,0.000376
S,0.00037,0.000897,0.9952,5.6e-05,0.003476


In [10]:
np.savetxt("BUY_tpm.csv", BUY_mc.observed_p_matrix, delimiter=",")
np.savetxt("NOBUY_tpm.csv", BUY_mc.observed_p_matrix, delimiter=",")

In [11]:
BUY_first_event_uni, BUY_first_event_cnt = np.unique(BUY_train.first_event.values, return_counts=True)
BUY_init_prob = BUY_first_event_cnt / sum(BUY_first_event_cnt)
pd.DataFrame(BUY_init_prob, index=BUY_first_event_uni)

Unnamed: 0,0
A,0.00695
B,0.035883
D,0.153591
P,0.79189
R,0.004005
S,0.007681


In [12]:
NOBUY_first_event_uni, NOBUY_first_event_cnt = np.unique(NOBUY_train.first_event.values, return_counts=True)
NOBUY_init_prob = NOBUY_first_event_cnt / sum(NOBUY_first_event_cnt)
pd.DataFrame(NOBUY_init_prob, index=NOBUY_first_event_uni)

Unnamed: 0,0
A,0.000339
D,0.22012
P,0.771389
R,0.000126
S,0.008026


### Signals key
* A -> add
* B -> purchase (BUY)
* D -> detail
* P -> page view
* R -> remove
* S -> search query

## Purchase probability 

Let P(B|session) be the probability of purchase given the session, we start from an initial estimate of the probability, and update it using Baye's Rule throughout the session. 

We consider the following initial probabilities: 
* the observed frequency of sessions with at least one purchase 
* 0.5

In [13]:
rho = BUY_train.shape[0]/seq_train.shape[0]
p_b_init = BUY_train.shape[0]/seq_train.shape[0]
# p_b_init = 0.5

In [14]:
seq_test.shape

(986940, 4)

In [15]:
# naive implementation of the purchase probability computation using Baye's Rule

prob_chain = []
final_prob = []
final_cat = []
for i in range(seq_test.shape[0]):
    temp_prob_chain = []
    temp_click_seq = seq_test.click_seq.values[i]
    PB, PN = 1,1
    for j in range(len(temp_click_seq)):
        if j==0:
            temp_BUY_init_signal_idx = np.where(BUY_first_event_uni==temp_click_seq[j])[0][0]
            temp_NOBUY_init_signal_idx = np.where(NOBUY_first_event_uni==temp_click_seq[j])[0][0]
            
            PB *= BUY_init_prob[temp_BUY_init_signal_idx]
            PN *= NOBUY_init_prob[temp_NOBUY_init_signal_idx]
        else:
            temp_BUY_row_idx = np.where(BUY_mc.signals==temp_click_seq[j-1])[0][0]
            temp_BUY_col_idx = np.where(BUY_mc.signals==temp_click_seq[j])[0][0]
            temp_NOBUY_row_idx = np.where(BUY_mc.signals==temp_click_seq[j-1])[0][0]
            temp_NOBUY_col_idx = np.where(BUY_mc.signals==temp_click_seq[j])[0][0]
        
            PB *= BUY_mc.observed_p_matrix[temp_BUY_row_idx][temp_BUY_col_idx]
            PN *= NOBUY_mc.observed_p_matrix[temp_NOBUY_row_idx][temp_NOBUY_col_idx]
            
        temp_prob = (rho * PB) / ((rho * PB) + ((1-rho) * PN))
        temp_prob_chain.append(temp_prob)
    prob_chain.append(temp_prob_chain)
    final_prob.append(temp_prob_chain[-1])
    
    if temp_prob_chain[-1] < (1 - temp_prob_chain[-1]):
        final_cat.append(0)
    else:
        final_cat.append(1)

In [16]:
seq_test['pred_prob_chain'] = prob_chain
seq_test['pred_final_prob'] = final_prob
seq_test['pred_final_cat'] = final_cat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## model evaluation

In [17]:
# accuracy
accuracy_score(seq_test.outcome.values, seq_test.pred_final_cat.values)

0.9876709830384826

In [18]:
# confusion matrix
print(confusion_matrix(seq_test.outcome.values, seq_test.pred_final_cat.values))
tn, fp, fn, tp = confusion_matrix(seq_test.outcome.values, seq_test.pred_final_cat.values).ravel()
tn, fp, fn, tp

[[972080   4095]
 [  8073   2692]]


(972080, 4095, 8073, 2692)

In [19]:
# F1 score
f1_score(seq_test.outcome.values, seq_test.pred_final_cat.values)

0.3067456700091158