In [26]:
import pandas as pd
from itertools import product
import numpy as np

In [27]:
# Toggle IPython's automatic post-mortem debugger; the recorded output for this run shows the call switched it OFF.
%pdb

Automatic pdb calling has been turned OFF


In [50]:
def Naive_Markov(filename):
    """Estimate a first-order Markov transition matrix from a sessions CSV.

    The file is read without a header row and must contain exactly the 25
    positional columns listed in ``session_columns``. Only ``sessionid``,
    ``createddate`` and ``eventtype`` are used. Rows with a missing session id
    are discarded. Within each session, events are ordered by ``createddate``
    and every consecutive pair of events is counted as one transition, plus a
    synthetic start transition (code 0 -> first event) and a synthetic end
    transition (last event -> code 100).

    Parameters
    ----------
    filename : str
        Path to the sessions CSV file.

    Returns
    -------
    transition_probs : numpy.ndarray, shape (10, 10)
        Row-normalised transition counts. Row/column positions follow
        ``events_map`` below (e.g. start = 0, end = 9). The end state is made
        absorbing. Rows for states with no observed outgoing transition have
        a zero row sum and therefore come out as NaN.
    session_actions : list of int
        Number of events in each session, in order of first appearance of the
        session id in the file.

    Raises
    ------
    KeyError
        If a session contains an event type outside ``possible_events``.
    """
    #Read in Sessions .csv file (no header row; columns are positional)
    session_columns = [
        'sessionid',
        'category',
        'imageurl',
        'createddate',
        'pagetitle',
        'pageurl',
        'userid',
        'fullurl',
        'providerid',
        'productid',
        'normalizedpageurl',
        'rawpageurl',
        'referrerurl',
        'rawreferrerurl',
        'utmsource',
        'utmmedium',
        'utmcontent',
        'utmcampaign',
        'utmterm',
        'ipaddress',
        'deviceid',
        'requesttype',
        'eventtype',
        'quantity',
        'price'
    ]
    session = pd.read_csv(filename, header = None)
    session.columns = session_columns

    #Keep only the variables used below (copy so later assignment is safe)
    session = session[['sessionid', 'createddate', 'eventtype']].copy()

    #Format date/time variables
    session['createddate'] = pd.to_datetime(session['createddate'])

    #Drop sessionID NA's, since can't be sure they're always same session
    session = session[session['sessionid'].notnull()]

    #Initialize dictionary for possible transition counts
    start_code, end_code = 0, 100
    possible_events = [-1, 0, 1, 3, 4, 6, 7, 8, 104, 100]
    events_dict = dict.fromkeys(product(possible_events, repeat=2), 0)

    #Session length in actions, one entry per session
    session_actions = []

    #Add to transition counts. A single groupby pass replaces re-filtering
    #the whole frame per session; sort=False keeps first-appearance order.
    for _, one_session in session.groupby('sessionid', sort=False):
        events = one_session.sort_values('createddate')['eventtype'].tolist()
        session_actions.append(len(events))
        events_dict[(start_code, events[0])] += 1
        events_dict[(events[-1], end_code)] += 1
        #n events have n-1 consecutive pairs; zip over the shifted lists
        #visits all of them, including the final pair.
        for current_event, next_event in zip(events[:-1], events[1:]):
            events_dict[(current_event, next_event)] += 1

    #Map event codes to matrix row/column positions
    events_map = {-1 : 8
                  ,0 : 0
                  ,1 : 1
                  ,3 : 2
                  ,4 : 3
                  ,6 : 6
                  ,7 : 7
                  ,8 : 4
                  ,100 : 9
                  ,104 : 5}

    #convert to numpy matrix of transition counts
    transition_counts = np.zeros((len(possible_events),len(possible_events)))
    for (from_event, to_event), count in events_dict.items():
        transition_counts[events_map[from_event], events_map[to_event]] = count

    #Convert to transition probability matrix.
    #The end state never has outgoing transitions, so make it absorbing to
    #give its row a non-zero sum.
    end_idx = events_map[end_code]
    transition_counts[end_idx][end_idx] = 1
    transition_probs = transition_counts.astype(float) / transition_counts.sum(axis = 1,keepdims = True)

    return transition_probs, session_actions

In [51]:
# Fit on the sample file: `a` is the transition-probability matrix, `b` the list of per-session action counts.
a, b = Naive_Markov('data/session_data_sample_030113.csv')

In [53]:
# Sanity check: sum each row of the transition-probability matrix (each should be 1).
a.sum(axis = 1,keepdims = True)

array([[ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.]])

In [54]:
# Number of sessions and mean number of actions per session (Python 2 print statement).
print len(b), np.mean(b)

32489 3.0849210502


In [None]:
# Human-readable label for each event-type code.
events_desc = {
    -1: 'Delete from Wishlist',
    1: 'Page View',
    3: 'Add to Cart',
    4: 'Add to Wishlist',
    6: 'Purchase',
    7: 'Remove from Cart',
    8: 'Add to Watchlist',
    104: 'Begin Checkout',
}

In [65]:
# Execute Markovify.py in this notebook's namespace; the cells below use a `test` object that is presumably created by that script — TODO confirm.
%run 'Markovify.py'

In [79]:
# Column index of the largest entry in each row of test.Markov_mat.
np.argmax(test.Markov_mat,axis=1)

array([0, 0, 0, 0, 4, 5, 6, 7])

In [67]:
# Length and mean of test.action_counts (Python 2 print statement).
print len(test.action_counts), np.mean(test.action_counts)

32489 3.0849210502


In [78]:
# Set the model's current state to event code 104, then print the result of
# five successive predict_next() calls.
# NOTE(review): whether predict_next() advances current_state is defined in Markovify.py — confirm there.
test.current_state = 104
for i in range (5):
    print(test.predict_next())

Begin Checkout
Begin Checkout
Begin Checkout
Begin Checkout
Begin Checkout


In [82]:
# Event-type codes that can occur inside a session.
possible_events = [-1, 1, 3, 4, 6, 7, 8, 104]

# Human-readable label for each event-type code, including the synthetic
# session start (0) and session end (100) markers.
events_desc = {
    -1: 'Delete from Wishlist',
    1: 'Page View',
    3: 'Add to Cart',
    4: 'Add to Wishlist',
    6: 'Purchase',
    7: 'Remove from Cart',
    8: 'Add to Watchlist',
    104: 'Begin Checkout',
    0: 'Session Start',
    100: 'Session End',
}

# Event-type code -> row/column position used to index the transition matrix.
events_row_map = {
    -1: 7,
    1: 0,
    3: 1,
    4: 2,
    6: 5,
    7: 6,
    8: 3,
    104: 4,
}

In [89]:
def pred_next(current_state, markov_mat=None, row_map=None):
    """Return the event code with the highest transition probability.

    Parameters
    ----------
    current_state : int
        Event-type code of the current state; must be a key of ``row_map``.
    markov_mat : array-like, optional
        Square transition matrix whose rows are indexed by the positions in
        ``row_map``. Defaults to the notebook-level ``test.Markov_mat``.
    row_map : dict, optional
        Mapping from event-type code to matrix row/column position. Defaults
        to the notebook-level ``events_row_map``.

    Returns
    -------
    int
        Event code whose column holds the largest value in the current
        state's row (first such column on ties).

    Raises
    ------
    KeyError
        If ``current_state`` is not a key of ``row_map``.
    IndexError
        If no event code maps to the selected column.
    """
    if markov_mat is None:
        markov_mat = test.Markov_mat
    if row_map is None:
        row_map = events_row_map

    # Read the current state's row directly. Masking the whole matrix with a
    # one-hot vector and taking a flat argmax gives the same answer for a
    # finite non-negative matrix, but 0 * NaN is NaN, so a NaN in any other
    # row would hijack the argmax.
    row = np.asarray(markov_mat)[row_map[current_state]]
    next_idx = int(np.argmax(row))

    # Invert the code -> position mapping for the chosen column.
    for event_code, position in row_map.items():
        if position == next_idx:
            return event_code
    raise IndexError('no event code maps to matrix column %d' % next_idx)

In [95]:
# Roll the greedy prediction forward five steps starting from event code 6,
# feeding each predicted code back in as the next current state.
current_state = 6
for i in range(5):
    current_state = pred_next(current_state)
    print current_state

6
6
6
6
6


In [112]:
# For n = 1..10, raise test.Markov_mat to the n-th power and print, for each
# row, the column index of its largest entry.
from numpy import linalg as LA
for i in range(10):
    print LA.matrix_power(test.Markov_mat,i+1).argmax(axis = 1)

[0 0 0 0 4 5 6 7]
[0 0 0 0 4 5 0 7]
[0 0 0 0 4 5 0 7]
[0 0 0 0 5 5 0 7]
[0 0 0 0 5 5 0 7]
[0 0 0 0 5 5 0 7]
[0 0 0 0 5 5 0 0]
[0 0 0 0 5 5 0 0]
[0 0 0 0 5 5 0 0]
[0 0 0 0 0 5 0 0]
