In [26]:
import pandas as pd
from itertools import product
import numpy as np

In [27]:
# Toggle IPython's automatic post-mortem debugger; the recorded output of this
# cell shows that this particular call switched it OFF.
%pdb

Automatic pdb calling has been turned OFF


In [50]:
def Naive_Markov(filename):
    """Estimate a first-order Markov transition matrix from a session log.

    The header-less CSV at ``filename`` must contain exactly the columns
    listed in ``session_columns``. Rows without a ``sessionid`` are discarded,
    the remaining rows are grouped by session and ordered by ``createddate``,
    and every consecutive pair of ``eventtype`` values inside a session is
    counted as one transition. Two synthetic states bracket each session:
    ``0`` (start) precedes its first event and ``100`` (end) follows its last.

    Parameters
    ----------
    filename : str
        Path of the sessions .csv file (passed straight to ``pd.read_csv``).

    Returns
    -------
    transition_probs : numpy.ndarray, shape (10, 10)
        ``transition_probs[i][j]`` is the estimated probability of moving
        from state ``i`` to state ``j``, where ``i`` and ``j`` are the matrix
        indices assigned by ``events_map``. Rows of states that were observed
        sum to 1; the end state is absorbing; a state with no outgoing
        transitions in the data gets an all-zero row.
    session_actions : list of int
        Number of logged actions per session, ordered by the first appearance
        of each session in the file.

    Raises
    ------
    KeyError
        If an ``eventtype`` value is not one of ``possible_events``.
    """

    #Read in Sessions .csv file
    session_columns = [
        'sessionid',
        'category',
        'imageurl',
        'createddate',
        'pagetitle',
        'pageurl',
        'userid',
        'fullurl',
        'providerid',
        'productid',
        'normalizedpageurl',
        'rawpageurl',
        'referrerurl',
        'rawreferrerurl',
        'utmsource',
        'utmmedium',
        'utmcontent',
        'utmcampaign',
        'utmterm',
        'ipaddress',
        'deviceid',
        'requesttype',
        'eventtype',
        'quantity',
        'price'
    ]
    session = pd.read_csv(filename, header = None)
    session.columns = session_columns

    #Keep only the variables the model uses (copy so the assignment below
    #works on an independent frame rather than a view of the raw data)
    session = session[['sessionid', 'createddate', 'eventtype']].copy()

    #Format date/time variables
    session['createddate'] = pd.to_datetime(session['createddate'])

    #Drop sessionID NA's, since can't be sure they're always same session
    session = session[session['sessionid'].notnull()]

    #Initialize dictionary for possible transition counts
    possible_events = [-1, 0, 1, 3, 4, 6, 7, 8, 104, 100]
    events_dict = {pair: 0 for pair in product(possible_events, repeat=2)}

    #Session length in actions, one entry per session
    session_actions = []

    #Add to transition counts. A single groupby pass replaces re-filtering the
    #whole frame once per session; sort=False keeps first-appearance order.
    for _, one_session in session.groupby('sessionid', sort = False):
        #Stable sort so that events sharing a timestamp keep their file order
        ordered = one_session.sort_values('createddate', kind = 'mergesort')
        events = ordered['eventtype'].tolist()
        session_actions.append(len(events))
        events_dict[(0, events[0])] += 1
        events_dict[(events[-1], 100)] += 1
        #n events give n-1 transitions; pairing each event with its successor
        #covers all of them, including the last one
        for step in zip(events[:-1], events[1:]):
            events_dict[step] += 1

    #Map raw event codes to row/column indices of the matrix
    events_map = {-1 : 8
                  ,0 : 0
                  ,1 : 1
                  ,3 : 2
                  ,4 : 3
                  ,6 : 6
                  ,7 : 7
                  ,8 : 4
                  ,100 : 9
                  ,104 : 5}

    #Convert to numpy matrix of transition counts
    n_states = len(possible_events)
    transition_counts = np.zeros((n_states, n_states))
    for (src, dst), count in events_dict.items():
        transition_counts[events_map[src], events_map[dst]] = count

    #Make the end state absorbing so its row is a valid distribution
    end_idx = events_map[100]
    transition_counts[end_idx][end_idx] = 1

    #Convert to transition probability matrix; rows with no observations are
    #left at zero instead of becoming NaN through 0/0
    row_totals = transition_counts.sum(axis = 1, keepdims = True)
    transition_probs = np.divide(transition_counts, row_totals,
                                 out = np.zeros_like(transition_counts),
                                 where = row_totals > 0)

    return transition_probs, session_actions

In [51]:
# Fit the model on the sample session log: `a` receives the transition
# probability matrix and `b` the list of per-session action counts.
a, b = Naive_Markov('data/session_data_sample_030113.csv')

In [53]:
# Sanity check: sum each row of the transition probability matrix (a row that
# is a proper probability distribution sums to 1).
a.sum(axis = 1,keepdims = True)

array([[ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.]])

In [54]:
# Number of sessions and mean number of actions per session. Written as a
# single parenthesised expression so the line is valid on Python 2 and 3.
print('%d %s' % (len(b), np.mean(b)))

32489 3.0849210502


In [None]:
# Human-readable labels for the eventtype codes. Codes 0 and 100, which
# Naive_Markov uses as synthetic session-start / session-end states, have no
# entry here.
events_desc = {
    -1: 'Delete from Wishlist',
    1: 'Page View',
    3: 'Add to Cart',
    4: 'Add to Wishlist',
    6: 'Purchase',
    7: 'Remove from Cart',
    8: 'Add to Watchlist',
    104: 'Begin Checkout',
}

In [3]:
# Run the external script Markovify.py. The following cells use a `test`
# object that is presumably created by that script — TODO confirm.
%run 'Markovify.py'

In [4]:
# Inspect the Markov_mat attribute of the `test` object.
test.Markov_mat

array([[  0.00000000e+00,   9.77592416e-01,   1.09575549e-02,
          4.58616763e-03,   0.00000000e+00,   1.56976207e-03,
          1.35430453e-03,   3.69355782e-03,   2.46237188e-04,
          0.00000000e+00],
       [  0.00000000e+00,   4.62056021e-01,   8.60092405e-02,
          1.45105400e-02,   1.44383483e-05,   5.60207912e-03,
          1.22725960e-03,   4.92347675e-03,   8.95177592e-04,
          4.24761767e-01],
       [  0.00000000e+00,   5.41877023e-01,   1.47572816e-01,
          7.89644013e-03,   0.00000000e+00,   1.09644013e-01,
          6.60194175e-03,   6.29126214e-02,   1.55339806e-03,
          1.21941748e-01],
       [  0.00000000e+00,   4.08353808e-01,   5.40540541e-02,
          3.85257985e-01,   0.00000000e+00,   1.96560197e-03,
          4.91400491e-04,   4.91400491e-03,   1.52334152e-02,
          1.29729730e-01],
       [  0.00000000e+00,   1.42857143e-01,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00

In [7]:
# Number of entries in test.action_counts and their mean. Written as a single
# parenthesised expression so the line is valid on Python 2 and 3.
print('%d %s' % (len(test.action_counts), np.mean(test.action_counts)))

32489 3.0849210502
