In [1]:
import pandas as pd
session = pd.read_csv('data/session_data_sample_030113.csv', header = None)
session_columns = [
    'sessionid',
    'category',
    'imageurl',
    'createddate',
    'pagetitle',
    'pageurl',
    'userid',
    'fullurl',
    'providerid',
    'productid',
    'normalizedpageurl',
    'rawpageurl',
    'referrerurl',
    'rawreferrerurl',
    'utmsource',
    'utmmedium',
    'utmcontent',
    'utmcampaign',
    'utmterm',
    'ipaddress',
    'deviceid',
    'requesttype',
    'eventtype',
    'quantity',
    'price'
]
session.columns = session_columns

In [2]:
import pandas as pd
device = pd.read_csv('data/devices_data_sample_030113.csv', header = None)
device_columns = [
    'deviceid',
    'devicecategory',
    'devicetype',
    'agenttype',
    'os',
    'osversion',
    'useragent',
    'providerid',
    'createddate',
    'userid',
    'authtype'
]
device.columns = device_columns

In [3]:
def clean_swym_device_data(device):
    
    df = device.copy()
    
    category_list = ['iPhone','Windows PC','Android phone','Mac','iPad','Linux PC'
                    ,'Android PC','Android tablet','Windows phone']
    df['devicecategory'] = df['devicecategory'].apply(lambda x: x if x in category_list else 'Other')
    
    type_list = ['Smartphone','Personal computer', 'Tablet']
    df['devicetype'] = df['devicetype'].apply(lambda x: x if x in type_list else 'Other')
    
    agent_list = ['Mobile Browser','Browser']
    df['agenttype'] = df['agenttype'].apply(lambda x: x if x in agent_list else 'Other')
    
    os_list = ['iOS','Android','Windows','OS X', 'Linux']
    df['os'] = df['os'].apply(lambda x: x if x in os_list else 'Other')
    
    df.drop(['osversion','useragent','providerid','createddate','userid','authtype']
           , axis = 1, inplace = True)
    df = df[df.notnull()]
    
    return df

In [4]:
import numpy as np
import pandas as pd
from urlparse import urlparse

def clean_swym_data(session, device):
    
    df = session.copy()
    df2 = device.copy()
    
    #Drop unnecessary columns
    df.drop(['imageurl','pageurl','fullurl','normalizedpageurl','rawpageurl','rawreferrerurl'
            ,'utmsource','utmmedium','utmcontent','utmcampaign','utmterm','ipaddress','requesttype']
            ,axis = 1, inplace = True)
    
    #Drop null sessionid, createddate and eventtype
    #Affect ability to derive predicted variable
    df = df[df['sessionid'].notnull()]
    df = df[df['createddate'].notnull()]
    df = df[df['eventtype'].notnull()]
    
    #Feature engineering
    df['category'] = df['category'].fillna('Unknown')
    df['createddate'] = pd.to_datetime(df['createddate'])
    df['dayofweek'] = df['createddate'].dt.dayofweek
    df['hour'] = df['createddate'].dt.hour
    df['pagetitle'] = df['pagetitle'].fillna('Unknown')
    df['userid'] = df['userid'].fillna('Unknown')
    df['providerid'] = df['providerid'].fillna('Unknown')
    df['productid'] = df['productid'].fillna(0.0)
    df['referrerurl'] = df['referrerurl'].fillna('Unknown')
    df['referrerurl'] = df['referrerurl'].apply(urlparse)
    df['referrerurl'] = df['referrerurl'].apply(lambda x: x.netloc)
    df['deviceid'] = df['deviceid'].fillna('Unknown')
    df['quantity'] = df['quantity'].fillna(0.0)
    df['price'] = df['price'].fillna(0.0)
    
    #Join on device data
    df2 = clean_swym_device_data(df2)
    df2.set_index('deviceid', inplace = True)
    df = df.join(df2, on = 'deviceid', how = 'left')
    df['devicecategory'] = df['devicecategory'].fillna('Unknown')
    df['devicetype'] = df['devicetype'].fillna('Unknown')
    df['agenttype'] = df['agenttype'].fillna('Unknown')
    df['os'] = df['os'].fillna('Unknown')
    
    #Prior history within timeframe feature
    
    return df

In [5]:
from sklearn.model_selection import train_test_split

def split_swym_data(data):
    
    df = data.copy()
    unique_sessions = df['sessionid'].unique()
    train_sess, test_sess = train_test_split(unique_sessions)
    train = df[df['sessionid'].isin(train_sess)]
    test = df[df['sessionid'].isin(test_sess)]
    
    return train, test

In [6]:
def swym_next_action(data):
    
    df = data.copy()
    output_columns = list(df.columns)
    output_columns.append('elapsedtime')
    output_columns.append('nextaction')
    output = pd.DataFrame(columns = output_columns)
    for i in df['sessionid'].unique():
        one_session = df[df['sessionid'] == i].sort_values('createddate')
        elapsedtime = np.zeros(one_session.shape[0],dtype = int)
        nextaction = np.zeros(one_session.shape[0],dtype = int)
        for j in range(one_session.shape[0]):
            if j > 0:
                timedelta = one_session['createddate'].iloc[j]-one_session['createddate'].iloc[j-1]
                elapsedtime[j] = (timedelta/np.timedelta64(1,'s')).astype(int)
            if j < one_session.shape[0]-1:
                nextaction[j] = one_session['eventtype'].iloc[j+1]
        one_session['elapsedtime'] = elapsedtime
        one_session['nextaction'] = nextaction
        one_session = one_session[one_session['nextaction'] != 0]
        output = output.append(one_session, ignore_index = True)
    return output

In [26]:
def swym_next_action_v2(data):
    
    df = data.copy()
    output_columns = list(df.columns)
    output_columns.append('elapsedtime')
    output_columns.append('nextaction')
    output_columns.append('nextnextaction')
    output = pd.DataFrame(columns = output_columns)
    for i in df['sessionid'].unique():
        one_session = df[df['sessionid'] == i].sort_values('createddate')
        elapsedtime = np.zeros(one_session.shape[0],dtype = int)
        nextaction = np.zeros(one_session.shape[0],dtype = int)
        nextnextaction = np.zeros(one_session.shape[0],dtype = int)
        for j in range(one_session.shape[0]):
            if j > 0:
                timedelta = one_session['createddate'].iloc[j]-one_session['createddate'].iloc[j-1]
                elapsedtime[j] = (timedelta/np.timedelta64(1,'s')).astype(int)
            if j < one_session.shape[0]-1:
                nextaction[j] = one_session['eventtype'].iloc[j+1]
            if j < one_session.shape[0]-2:
                nextnextaction[j] = one_session['eventtype'].iloc[j+2]
        one_session['elapsedtime'] = elapsedtime
        one_session['nextaction'] = nextaction
        one_session['nextnextaction'] = nextnextaction
        one_session = one_session[one_session['nextnextaction'] != 0]
        output = output.append(one_session, ignore_index = True)
    return output

In [27]:
df = clean_swym_data(session, device)
swym_next_next = swym_next_action_v2(df)

In [29]:
x2, y2 = swym_featurize_v2(swym_next_next)

In [32]:
print x2.shape, y2.shape

(54049, 673) (54049,)


In [31]:
rfc2 = RandomForestClassifier()
print np.mean(cross_val_score(rfc2,x2,y2,cv=5))

0.705287253192


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

def swym_featurize(data):
    
    df = data.copy()
    
    #Dependent Variable
    y = df['nextaction']
    
    #Create dummy variables for session data
    events_desc = {
        -1: 'Delete from Wishlist',
        1: 'Page View',
        3: 'Add to Cart',
        4: 'Add to Wishlist',
        6: 'Purchase',
        7: 'Remove from Cart',
        8: 'Add to Watchlist',
        104: 'Begin Checkout'
    }
    for i, j in events_desc.items():
        df[j] = df['eventtype'].apply(lambda x: 1 if x == i else 0)
    df.drop('Delete from Wishlist', axis = 1, inplace = True)

    dow_desc = {
        0.0: 'Monday',
        1.0: 'Tuesday',
        2.0: 'Wednesday',
        3.0: 'Thursday',
        4.0: 'Friday',
        5.0: 'Saturday',
        6.0: 'Sunday'
    }
    for i, j in dow_desc.items():
        df[j] = df['dayofweek'].apply(lambda x: 1 if x == i else 0)
    df.drop('Monday', axis = 1, inplace = True)
    
    hour_desc = {}
    for a in range(24):
        hour_desc[float(a)] = 'Hour '+str(a)
    for i, j in hour_desc.items():
        df[j] = df['hour'].apply(lambda x: 1 if x == i else 0)
    df.drop('Hour 0', axis = 1, inplace = True)
    
    #Device dummies
    category_list = ['iPhone','Windows PC','Android phone','Mac','iPad','Linux PC'
                    ,'Android PC','Android tablet','Windows phone']
    for a in category_list:
        df[a] = df['devicecategory'].apply(lambda x: 1 if x == a else 0)
        
    type_list = ['Smartphone','Personal computer', 'Tablet']
    for a in type_list:
        df[a] = df['devicetype'].apply(lambda x: 1 if x == a else 0)
        
    agent_list = ['Mobile Browser','Browser']
    for a in agent_list:
        df[a] = df['agenttype'].apply(lambda x: 1 if x == a else 0)
        
    os_list = ['iOS','Android','Windows','OS X', 'Linux']
    for a in os_list:
        df[a] = df['os'].apply(lambda x: 1 if x == a else 0)
        
    #NLP variables
    
    tf_idf = TfidfVectorizer(stop_words = 'english')
    tf_idf.fit(df['referrerurl'])
    referrer_vect = tf_idf.transform(df['referrerurl'])
    referrer_columns = tf_idf.get_feature_names()
    referrer_df = pd.DataFrame(referrer_vect.toarray(), columns = referrer_columns)
    df = pd.concat([df,referrer_df], axis = 1)
    
    tf_idf2 = TfidfVectorizer(stop_words = 'english', max_features = 100)
    tf_idf2.fit(df['category'])
    category_vect = tf_idf2.transform(df['category'])
    category_columns = tf_idf2.get_feature_names()
    category_df = pd.DataFrame(category_vect.toarray(), columns = category_columns)
    df = pd.concat([df,category_df], axis = 1)
    
    #Drop variables
    df.drop(['sessionid','createddate','userid','deviceid','nextaction','providerid','productid'
            ,'referrerurl','category','pagetitle'
            ,'eventtype','dayofweek','hour'
            ,'devicecategory','devicetype','agenttype','os']
            , axis = 1, inplace = True)
    
    return df, y

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

def swym_featurize_v2(data):
    
    df = data.copy()
    
    #Dependent Variable
    y = df['nextnextaction']
    
    #Create dummy variables for session data
    events_desc = {
        -1: 'Delete from Wishlist',
        1: 'Page View',
        3: 'Add to Cart',
        4: 'Add to Wishlist',
        6: 'Purchase',
        7: 'Remove from Cart',
        8: 'Add to Watchlist',
        104: 'Begin Checkout'
    }
    for i, j in events_desc.items():
        df[j] = df['eventtype'].apply(lambda x: 1 if x == i else 0)
    df.drop('Delete from Wishlist', axis = 1, inplace = True)
    
    events_desc_v2 = {
        -1: 'Delete from Wishlist 2',
        1: 'Page View 2',
        3: 'Add to Cart 2',
        4: 'Add to Wishlist 2',
        6: 'Purchase 2',
        7: 'Remove from Cart 2',
        8: 'Add to Watchlist 2',
        104: 'Begin Checkout 2'
    }
    for i, j in events_desc_v2.items():
        df[j] = df['nextaction'].apply(lambda x: 1 if x == i else 0)
    df.drop('Delete from Wishlist 2', axis = 1, inplace = True)

    dow_desc = {
        0.0: 'Monday',
        1.0: 'Tuesday',
        2.0: 'Wednesday',
        3.0: 'Thursday',
        4.0: 'Friday',
        5.0: 'Saturday',
        6.0: 'Sunday'
    }
    for i, j in dow_desc.items():
        df[j] = df['dayofweek'].apply(lambda x: 1 if x == i else 0)
    df.drop('Monday', axis = 1, inplace = True)
    
    hour_desc = {}
    for a in range(24):
        hour_desc[float(a)] = 'Hour '+str(a)
    for i, j in hour_desc.items():
        df[j] = df['hour'].apply(lambda x: 1 if x == i else 0)
    df.drop('Hour 0', axis = 1, inplace = True)
    
    #Device dummies
    category_list = ['iPhone','Windows PC','Android phone','Mac','iPad','Linux PC'
                    ,'Android PC','Android tablet','Windows phone']
    for a in category_list:
        df[a] = df['devicecategory'].apply(lambda x: 1 if x == a else 0)
        
    type_list = ['Smartphone','Personal computer', 'Tablet']
    for a in type_list:
        df[a] = df['devicetype'].apply(lambda x: 1 if x == a else 0)
        
    agent_list = ['Mobile Browser','Browser']
    for a in agent_list:
        df[a] = df['agenttype'].apply(lambda x: 1 if x == a else 0)
        
    os_list = ['iOS','Android','Windows','OS X', 'Linux']
    for a in os_list:
        df[a] = df['os'].apply(lambda x: 1 if x == a else 0)
        
    #NLP variables
    
    tf_idf = TfidfVectorizer(stop_words = 'english')
    tf_idf.fit(df['referrerurl'])
    referrer_vect = tf_idf.transform(df['referrerurl'])
    referrer_columns = tf_idf.get_feature_names()
    referrer_df = pd.DataFrame(referrer_vect.toarray(), columns = referrer_columns)
    df = pd.concat([df,referrer_df], axis = 1)
    
    tf_idf2 = TfidfVectorizer(stop_words = 'english', max_features = 100)
    tf_idf2.fit(df['category'])
    category_vect = tf_idf2.transform(df['category'])
    category_columns = tf_idf2.get_feature_names()
    category_df = pd.DataFrame(category_vect.toarray(), columns = category_columns)
    df = pd.concat([df,category_df], axis = 1)
    
    #Drop variables
    df.drop(['sessionid','createddate','userid','deviceid','nextnextaction','providerid','productid'
            ,'referrerurl','category','pagetitle'
            ,'eventtype','dayofweek','hour', 'nextaction'
            ,'devicecategory','devicetype','agenttype','os']
            , axis = 1, inplace = True)
    
    return df, y

In [8]:
df = clean_swym_data(session, device)
#train_df, test_df = split_swym_data(df)
swym_next = swym_next_action(df)

In [22]:
x, y = swym_featurize(swym_next)

In [33]:
print x.shape

(67737, 778)


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

rfc = RandomForestClassifier()
#gbc = GradientBoostingClassifier()
print np.mean(cross_val_score(rfc,x,y,cv=5))
#print np.mean(cross_val_score(gbc,test_x,test_y,cv=5))

0.697551376711


In [91]:
df['referrerurl'].value_counts()

                                                       24306
www.charliesproject.com                                10597
www.bellerose.be                                        5589
www.google.com                                          3617
www.fabulegsmelissa.com                                 3288
www.oillife.com                                         3205
www.alrugaibfurniture.com                               2931
m.facebook.com                                          2645
www.annsfabulousfinds.com                               2263
www.swaggsauce.com                                      2143
shop.harpdesignco.com                                   1857
keller4salon.com                                        1464
www.southernshirt.com                                   1143
www.bestswimwear.com                                     963
straightrazordesigns.com                                 951
www.poppoly.com                                          890
www.thelongbowshop.com  

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf2 = TfidfVectorizer(stop_words = 'english')
tf_idf2.fit(swym_next['category'])
category_vect = tf_idf2.transform(swym_next['category'])
category_columns = tf_idf2.get_feature_names()
category_df = pd.DataFrame(category_vect.toarray(), columns = category_columns)
swym_next = pd.concat([swym_next,category_df], axis = 1)

In [44]:
category_vect.toarray().shape

(67737, 4343)

In [16]:
columns

[u'10adspay',
 u'2u',
 u'3426',
 u'460',
 u'521836',
 u'561juices',
 u'aa',
 u'abcostore',
 u'adeptmobility',
 u'admin',
 u'adorehervirginhair',
 u'ads',
 u'ae',
 u'africa',
 u'aliceandjames',
 u'allstuffshop',
 u'alltrueist',
 u'almasgioielli',
 u'alphaomegafurniture',
 u'alrugaibfurniture',
 u'amenity',
 u'amenityshoes',
 u'americanteak',
 u'amikiknits',
 u'amis',
 u'ampproject',
 u'android',
 u'animefit',
 u'annieandoak',
 u'annsfabulouscloseouts',
 u'annsfabulousfinds',
 u'aphroditeandantoinette',
 u'apogeetoronto',
 u'app',
 u'apps',
 u'apusworld',
 u'archive1',
 u'arm',
 u'arrows',
 u'artistshopbirmingham',
 u'ask',
 u'asylumzone',
 u'au',
 u'augustdays',
 u'australianbladeforums',
 u'autopilotandchill',
 u'awesometoolboxes',
 u'ayspremiergifts',
 u'bagsmasters',
 u'balancedayspa',
 u'bandmax',
 u'barbeedollboutique',
 u'bathbomb',
 u'beauchapeau',
 u'beerrepublic',
 u'bella',
 u'bellepromenade',
 u'bellerose',
 u'bellolane',
 u'bestswimwear',
 u'bg',
 u'bhtk',
 u'bighaat',
 u'bi

In [10]:
test_xy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17013 entries, 0 to 17012
Data columns (total 16 columns):
sessionid      17013 non-null object
category       17013 non-null object
createddate    17013 non-null datetime64[ns]
pagetitle      17013 non-null object
userid         17013 non-null object
providerid     17013 non-null object
productid      17013 non-null float64
referrerurl    17013 non-null object
deviceid       17013 non-null object
eventtype      17013 non-null float64
quantity       17013 non-null float64
price          17013 non-null float64
dayofweek      17013 non-null float64
hour           17013 non-null float64
elapsedtime    17013 non-null float64
nextaction     17013 non-null float64
dtypes: datetime64[ns](1), float64(8), object(7)
memory usage: 2.1+ MB


In [19]:
test_xy['hour'].value_counts()

17.0    1173
19.0    1141
1.0     1068
21.0    1061
2.0     1006
22.0     954
20.0     943
3.0      936
16.0     848
18.0     763
14.0     741
0.0      716
13.0     683
4.0      662
15.0     603
23.0     603
12.0     487
5.0      444
11.0     433
9.0      421
10.0     356
8.0      331
7.0      331
6.0      309
Name: hour, dtype: int64

In [7]:
df = clean_swym_data(session)
train_df, test_df = split_swym_data(df)
test_xy = swym_next_action(test_df)
test_xy.head()

Unnamed: 0,sessionid,category,createddate,pagetitle,userid,providerid,productid,referrerurl,deviceid,eventtype,quantity,price,dayofweek,hour,elapsedtime,nextaction
0,p3zh6dakjgonbx2aw7ueow6doit8nj3a60jjcxcr1isnw6...,Paisly Prints,2017-03-01 00:00:01,Margarita Paisley,kutupmcf@yahoo.com,/JnX+WmfJI4b2+NBp7+e81TFoCCSGT8QarGV6f1hHf0=,32028940000.0,www.charliesproject.com,968b6735-e18e-4ec2-b8d8-2f64cbd8fae9,1.0,0.0,18.0,2.0,0.0,0.0,1.0
1,p3zh6dakjgonbx2aw7ueow6doit8nj3a60jjcxcr1isnw6...,Paisly Prints,2017-03-01 00:01:37,Purple Paisley,kutupmcf@yahoo.com,/JnX+WmfJI4b2+NBp7+e81TFoCCSGT8QarGV6f1hHf0=,31360180000.0,www.charliesproject.com,968b6735-e18e-4ec2-b8d8-2f64cbd8fae9,1.0,0.0,18.0,2.0,0.0,96.0,1.0
2,p3zh6dakjgonbx2aw7ueow6doit8nj3a60jjcxcr1isnw6...,"Fun Prints!,Plaids & Camo",2017-03-01 00:01:58,Black Polka Dots,kutupmcf@yahoo.com,/JnX+WmfJI4b2+NBp7+e81TFoCCSGT8QarGV6f1hHf0=,33115720000.0,www.charliesproject.com,968b6735-e18e-4ec2-b8d8-2f64cbd8fae9,1.0,0.0,18.0,2.0,0.0,21.0,3.0
3,p3zh6dakjgonbx2aw7ueow6doit8nj3a60jjcxcr1isnw6...,Unknown,2017-03-01 00:02:40,Black Polka Dots,kutupmcf@yahoo.com,/JnX+WmfJI4b2+NBp7+e81TFoCCSGT8QarGV6f1hHf0=,33115720000.0,,968b6735-e18e-4ec2-b8d8-2f64cbd8fae9,3.0,1.0,25.0,2.0,0.0,42.0,1.0
4,p3zh6dakjgonbx2aw7ueow6doit8nj3a60jjcxcr1isnw6...,Solid Tops,2017-03-01 00:03:00,Modal long Sleeve Top with Slits,kutupmcf@yahoo.com,/JnX+WmfJI4b2+NBp7+e81TFoCCSGT8QarGV6f1hHf0=,30181090000.0,www.charliesproject.com,968b6735-e18e-4ec2-b8d8-2f64cbd8fae9,1.0,0.0,30.0,2.0,0.0,20.0,104.0


In [100]:
test_xy[test_xy['sessionid']==test_xy['sessionid'][500]].T

Unnamed: 0,490,491,492,493,494,495,496,497,498,499,500,501,502,503
sessionid,mrpbtc7b4a71anb8hxnzsemi2bh1hnulparr30mf44dral...,mrpbtc7b4a71anb8hxnzsemi2bh1hnulparr30mf44dral...,mrpbtc7b4a71anb8hxnzsemi2bh1hnulparr30mf44dral...,mrpbtc7b4a71anb8hxnzsemi2bh1hnulparr30mf44dral...,mrpbtc7b4a71anb8hxnzsemi2bh1hnulparr30mf44dral...,mrpbtc7b4a71anb8hxnzsemi2bh1hnulparr30mf44dral...,mrpbtc7b4a71anb8hxnzsemi2bh1hnulparr30mf44dral...,mrpbtc7b4a71anb8hxnzsemi2bh1hnulparr30mf44dral...,mrpbtc7b4a71anb8hxnzsemi2bh1hnulparr30mf44dral...,mrpbtc7b4a71anb8hxnzsemi2bh1hnulparr30mf44dral...,mrpbtc7b4a71anb8hxnzsemi2bh1hnulparr30mf44dral...,mrpbtc7b4a71anb8hxnzsemi2bh1hnulparr30mf44dral...,mrpbtc7b4a71anb8hxnzsemi2bh1hnulparr30mf44dral...,mrpbtc7b4a71anb8hxnzsemi2bh1hnulparr30mf44dral...
category,"$100.00 - $250.00,All Modern Ventless Fireplac...","$100.00 - $250.00,All Modern Ventless Fireplac...","$100.00 - $250.00,All Modern Ventless Fireplac...","$100.00 - $250.00,All Modern Ventless Fireplac...","$100.00 - $250.00,All Modern Ventless Fireplac...","$100.00 - $250.00,All Modern Ventless Fireplac...","$100.00 - $250.00,All Modern Ventless Fireplac...","$100.00 - $250.00,All Modern Ventless Fireplac...","$100.00 - $250.00,All Modern Ventless Fireplac...","$500.00 - $1,000.00,All Modern Ventless Firepl...","$500.00 - $1,000.00,All Modern Ventless Firepl...","$1,000.00 & Above,All Modern Ventless Fireplac...","$1,000.00 & Above,All Modern Ventless Fireplac...","$1,000.00 & Above,All Modern Ventless Fireplac..."
createddate,2017-03-01 00:42:01,2017-03-01 00:42:03,2017-03-01 00:45:34,2017-03-01 00:50:42,2017-03-01 00:58:06,2017-03-01 00:59:00,2017-03-01 01:07:42,2017-03-01 01:07:47,2017-03-01 01:13:02,2017-03-01 01:30:26,2017-03-01 01:30:31,2017-03-01 01:40:37,2017-03-01 01:41:11,2017-03-01 01:49:15
pagetitle,Eco-Feu Sunset Table Top Ethanol Fireplace - M...,Eco-Feu Sunset Table Top Ethanol Fireplace - M...,Eco-Feu Sunset Table Top Ethanol Fireplace - S...,Eco-Feu Superior Quality Bio-Ethanol Fuel - 12...,Eco-Feu Toulouse Table Top Ethanol Fireplace -...,Eco-Feu Toulouse Table Top Ethanol Fireplace -...,Eco-Feu Toulouse Table Top Ethanol Fireplace -...,Eco-Feu Toulouse Table Top Ethanol Fireplace -...,Eco-Feu Toulouse Table Top Ethanol Fireplace -...,Eco-Feu Vision I Free Standing Ethanol Firepla...,Eco-Feu Vision I Free Standing Ethanol Firepla...,"Eco-Feu Vision III - 51"" Free Standing Ethanol...","Eco-Feu Vision III - 51"" Free Standing Ethanol...","Eco-Feu Wellington - 33.5"" UL Listed Built-in/..."
userid,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
providerid,syTNykmRUcbYNPUy2D025EL6wcWqGrpILDNIDO/OZIE=,syTNykmRUcbYNPUy2D025EL6wcWqGrpILDNIDO/OZIE=,syTNykmRUcbYNPUy2D025EL6wcWqGrpILDNIDO/OZIE=,syTNykmRUcbYNPUy2D025EL6wcWqGrpILDNIDO/OZIE=,syTNykmRUcbYNPUy2D025EL6wcWqGrpILDNIDO/OZIE=,syTNykmRUcbYNPUy2D025EL6wcWqGrpILDNIDO/OZIE=,syTNykmRUcbYNPUy2D025EL6wcWqGrpILDNIDO/OZIE=,syTNykmRUcbYNPUy2D025EL6wcWqGrpILDNIDO/OZIE=,syTNykmRUcbYNPUy2D025EL6wcWqGrpILDNIDO/OZIE=,syTNykmRUcbYNPUy2D025EL6wcWqGrpILDNIDO/OZIE=,syTNykmRUcbYNPUy2D025EL6wcWqGrpILDNIDO/OZIE=,syTNykmRUcbYNPUy2D025EL6wcWqGrpILDNIDO/OZIE=,syTNykmRUcbYNPUy2D025EL6wcWqGrpILDNIDO/OZIE=,syTNykmRUcbYNPUy2D025EL6wcWqGrpILDNIDO/OZIE=
productid,1.64496e+10,1.64496e+10,1.64501e+10,1.64449e+10,1.64621e+10,1.64621e+10,1.64637e+10,1.64637e+10,1.64639e+10,1.65469e+10,1.65469e+10,1.65484e+10,1.65484e+10,1.65381e+10
referrerurl,,,,,,,,,,,,,,
deviceid,b69cd6ba-346b-4913-ac51-a7b82f2c5785,b69cd6ba-346b-4913-ac51-a7b82f2c5785,b69cd6ba-346b-4913-ac51-a7b82f2c5785,b69cd6ba-346b-4913-ac51-a7b82f2c5785,b69cd6ba-346b-4913-ac51-a7b82f2c5785,b69cd6ba-346b-4913-ac51-a7b82f2c5785,b69cd6ba-346b-4913-ac51-a7b82f2c5785,b69cd6ba-346b-4913-ac51-a7b82f2c5785,b69cd6ba-346b-4913-ac51-a7b82f2c5785,b69cd6ba-346b-4913-ac51-a7b82f2c5785,b69cd6ba-346b-4913-ac51-a7b82f2c5785,b69cd6ba-346b-4913-ac51-a7b82f2c5785,b69cd6ba-346b-4913-ac51-a7b82f2c5785,b69cd6ba-346b-4913-ac51-a7b82f2c5785
eventtype,1,4,1,1,1,1,1,4,1,1,4,1,4,1


In [None]:
df['referrerurl'].value_counts()

In [106]:
print len(df['category'].unique())
df['category'].value_counts()

9961


Unknown                                                                                                                                                                                                     17304
FabuLegs                                                                                                                                                                                                     3276
CLOSE OUT                                                                                                                                                                                                    2454
Animal Prints                                                                                                                                                                                                1895
Fun Prints!                                                                                                                                                     

In [26]:
df = session.copy()
df['referrerurl'] = df['referrerurl'].fillna('None')
df['rawreferrerurl'] = df['rawreferrerurl'].fillna('None')
df['equivalent'] = df['referrerurl']==df['rawreferrerurl']

In [44]:
df[df['equivalent']==False].T

Unnamed: 0,27,48,52,58,60,61,63,64,69,70,...,100001,100012,100016,100038,100044,100056,100090,100113,100138,100193
sessionid,v84c9we2re77x3de41imb1emi1gbe7q0yf93ksph99kx7d...,4odaws1c3div0207xke7y14iu8zwf1ypsyvijirpgi0h33...,g4l69ftj4ixslnxw2902h982gldinppic1o23ses714i0y...,8fcvocywah5miuazx8qdaq0k9sjnykg1fw29fqzlzfiggb...,iipsk3p7gb95ya4tkgwrk94z53wn7zgds4i18otmgmvlso...,yqecxytx1orumqt42qehfr37kjqe3766reb3az2phqia4i...,cxvol8tzkt9nlz213y2e29q19uftakyb9i3b2u26s9k9c3...,q6lvfpf1orjil3sg7gb9zyb34k7qfrev30jp2e29mqke4d...,af5ekfls1yvigyej8suac3dilc3symlmzpvikl05cheyiu...,qgx03sor4dk33ul3di270zkedzvkj4in36cgcc5l8frgsq...,...,ciua0fcqy9he7z2xhgl9885mie5zfks0hugvfnv0thdw9a...,ajexnpf8h1tnujamfdi6a0hmx2rl6ddxifnvacbi8aj81q...,bbkyc2wmxu85dxer1snfav2t9vuvrysmgpyx2iunasq3er...,ajexnpf8h1tnujamfdi6a0hmx2rl6ddxifnvacbi8aj81q...,ajexnpf8h1tnujamfdi6a0hmx2rl6ddxifnvacbi8aj81q...,vzubmmi5nk92cvcywp1qldxggxo6m4ted5y8rcp3jkfi73...,x09oale1h68e5nqndlmp5akh4xr8upj0cihmxkk4vo59ic...,r2vd7l4cvbsup38yyamuuywz9cbiwqvybjdqqql6us4sgq...,5c1t75rm9snyqhg18nyw0icnmidrpl935nc2bacgj311ul...,w8fhs15paal7i1c596civn296a0zot8flgw4n43glbmbnh...
category,CLOSE OUT,"All Products,Computer Accessories,Home page","All Products,Games & Consoles,Home page,Specia...","All Products,Games & Consoles,Home page","All Products,Camera & Audio,Home page,Special ...","All Products,Computer Accessories,Security Device","All Products,Cell Phone Accessories,Home page","All Products,Games & Consoles,Home page,Specia...","All Products,Games & Consoles,Home page,Specia...","All Products,Games & Consoles,Home page",...,"Books and Media,Books and Media > Books,Busine...","Clocks,Wall Clocks","Browse All Products,Discount Allowed Also,Filt...","Clocks,Wall Clocks","Clocks,Wall Clocks","Bright Ideas,Close to Ceiling Lights,Discount ...","Campus Classics,Gifts For Him,Men,Men / Polos,...",Ros?,"Furniture,Harp Design,Tables","Beauty,Beauty > Apparel > Accessories,Cases & ..."
imageurl,//cdn.shopify.com/s/files/1/1264/3941/products...,//cdn.shopify.com/s/files/1/1377/5865/products...,//cdn.shopify.com/s/files/1/1377/5865/products...,//cdn.shopify.com/s/files/1/1377/5865/products...,//cdn.shopify.com/s/files/1/1377/5865/products...,//cdn.shopify.com/s/files/1/1377/5865/products...,//cdn.shopify.com/s/files/1/1377/5865/products...,//cdn.shopify.com/s/files/1/1377/5865/products...,//cdn.shopify.com/s/files/1/1377/5865/products...,//cdn.shopify.com/s/files/1/1377/5865/products...,...,//cdn.shopify.com/s/files/1/1514/9080/products...,//cdn.shopify.com/s/files/1/1234/4330/products...,//cdn.shopify.com/s/files/1/0382/8869/products...,//cdn.shopify.com/s/files/1/1234/4330/products...,//cdn.shopify.com/s/files/1/1234/4330/products...,//cdn.shopify.com/s/files/1/0382/8869/products...,//cdn.shopify.com/s/files/1/0704/5125/products...,//cdn.shopify.com/s/files/1/1468/2294/products...,//cdn.shopify.com/s/files/1/1393/7989/products...,//cdn.shopify.com/s/files/1/1514/9080/products...
createddate,2017-03-01 00:00:30.000,2017-03-01 00:00:52.000,2017-03-01 00:00:54.000,2017-03-01 00:00:58.000,2017-03-01 00:00:58.000,2017-03-01 00:00:59.000,2017-03-01 00:01:00.000,2017-03-01 00:01:00.000,2017-03-01 00:01:01.000,2017-03-01 00:01:02.000,...,2017-03-01 23:56:27.000,2017-03-01 23:56:36.000,2017-03-01 23:56:47.000,2017-03-01 23:57:07.000,2017-03-01 23:57:13.000,2017-03-01 23:57:21.000,2017-03-01 23:57:52.000,2017-03-01 23:58:13.000,2017-03-01 23:58:34.000,2017-03-01 23:59:28.000
pagetitle,Neon Paisley,Mini Keyboard iPazzPort Voice 2.4 G,G300 Wired Gaming Mouse 2500dpi Logitech,MD2 MD3 Pocket Handheld Game Player Built-in 6...,LC-8200 Stereo bluetooth headset support two d...,NETAC 16G USB 2.0 Flash Drive Keypad Lock AES ...,Bonorda 64GB USB2.0 Flash Drive For iPhone iPad,G300 Wired Gaming Mouse 2500dpi Logitech,James Donkey 007 Gaming Wired Mouse USB 2.0 35...,BETOP BTP-3189 Shock Computer Driving Game Rac...,...,Essential Life Book - 3rd Edition,Wooden Skeleton Wall Clock with White Trim 76cms,0-004450>American Patriot Musical Clock Oak,Black Metal Skeleton Wall Clock 90cms,Churchill Wall Clock 60cm,0-016972>Belmont 2-Light Semi-Flush Burnished ...,Heritage Performance Polo,Monowai Estate Pinot Noir Ros? 2015,Chevron Side Table,Leather Case Keychain - Holds 3 Sample Vials -...
pageurl,https://www.charliesproject.com/products/neon-...,https://epicbuy.org/products/mini-keyboard-ipa...,https://epicbuy.org/products/g300-wired-gaming...,https://epicbuy.org/products/md2-md3-pocket-ha...,https://epicbuy.org/products/lc-8200-stereo-bl...,https://epicbuy.org/products/netac-16g-usb-2-0...,https://epicbuy.org/products/copy-of-azm-roll-...,https://epicbuy.org/products/g300-wired-gaming...,https://epicbuy.org/products/usb-2-0-james-don...,https://epicbuy.org/products/betop-btp-3189-sh...,...,https://www.oillife.com/products/essential-lif...,https://retailtherapyinteriors.co.uk/products/...,http://www.lampsusa.com/products/rhythm-clocks...,https://retailtherapyinteriors.co.uk/products/...,https://retailtherapyinteriors.co.uk/products/...,http://www.lampsusa.com/products/capital-light...,https://www.southernshirt.com/products/heritag...,https://winesonline.co.nz/products/monowai-est...,https://shop.harpdesignco.com/products/chevron...,https://www.oillife.com/products/leather-case-...
userid,,,,,,,,,,,...,,,,,,,,shirties@xtra.co.nz,,
fullurl,https://www.charliesproject.com/collections/cl...,https://epicbuy.org/products/mini-keyboard-ipa...,https://epicbuy.org/products/g300-wired-gaming...,https://epicbuy.org/products/md2-md3-pocket-ha...,https://epicbuy.org/products/lc-8200-stereo-bl...,https://epicbuy.org/products/netac-16g-usb-2-0...,https://epicbuy.org/products/copy-of-azm-roll-...,https://epicbuy.org/products/g300-wired-gaming...,https://epicbuy.org/products/usb-2-0-james-don...,https://epicbuy.org/products/betop-btp-3189-sh...,...,https://www.oillife.com/products/essential-lif...,https://retailtherapyboutique.myshopify.com/co...,http://www.lampsusa.com/search?q=School+house&...,https://retailtherapyboutique.myshopify.com/co...,https://retailtherapyboutique.myshopify.com/co...,http://www.lampsusa.com/search?q=Semi+flush+mo...,https://www.southernshirt.com/collections/men-...,https://winesonline.co.nz/collections/rose/pro...,https://shop.harpdesignco.com/collections/furn...,https://www.oillife.com/products/leather-case-...
providerid,/JnX+WmfJI4b2+NBp7+e81TFoCCSGT8QarGV6f1hHf0=,zAjRHLisdPvCheN4fPbJ77kTr1H7uRNzoMOeznuEUn4=,zAjRHLisdPvCheN4fPbJ77kTr1H7uRNzoMOeznuEUn4=,zAjRHLisdPvCheN4fPbJ77kTr1H7uRNzoMOeznuEUn4=,zAjRHLisdPvCheN4fPbJ77kTr1H7uRNzoMOeznuEUn4=,zAjRHLisdPvCheN4fPbJ77kTr1H7uRNzoMOeznuEUn4=,zAjRHLisdPvCheN4fPbJ77kTr1H7uRNzoMOeznuEUn4=,zAjRHLisdPvCheN4fPbJ77kTr1H7uRNzoMOeznuEUn4=,zAjRHLisdPvCheN4fPbJ77kTr1H7uRNzoMOeznuEUn4=,zAjRHLisdPvCheN4fPbJ77kTr1H7uRNzoMOeznuEUn4=,...,yS2PwSnfvUUqbb3lUe+neF9YlNTd1jELqV4q0Z8LQK4=,0O5sp2+KhmZj9Y1MwVs4U6d6gANM5BZvcdnAW3ZMJ5Y=,2bhOZF0TwixpAPVPM0cw0YB/hjasuebKjH3Cb8HlHHg=,0O5sp2+KhmZj9Y1MwVs4U6d6gANM5BZvcdnAW3ZMJ5Y=,0O5sp2+KhmZj9Y1MwVs4U6d6gANM5BZvcdnAW3ZMJ5Y=,2bhOZF0TwixpAPVPM0cw0YB/hjasuebKjH3Cb8HlHHg=,8s0Xm3m4OAS6c8RtSSY6IxlFs0n7imnqpCEpes5ubzI=,DNwtk3eoK9R244smT8pSdYOEQYvtqnnrc40M968U9Zc=,5yonMWG4/yzCmpJkd1oruhYBmNRqJ04Y09syjoCr2Fc=,yS2PwSnfvUUqbb3lUe+neF9YlNTd1jELqV4q0Z8LQK4=
productid,3.24374e+10,3.23769e+10,3.2427e+10,3.24398e+10,3.06752e+10,2.4264e+10,2.56932e+10,3.2427e+10,3.26345e+10,3.13841e+10,...,2.86607e+10,3.81138e+10,1.00293e+09,3.82179e+10,3.84436e+10,2.40088e+10,2.68195e+10,3.2087e+10,2.53862e+10,2.5483e+10


In [28]:
df['equivalent'].value_counts()

True     94680
False     5552
Name: equivalent, dtype: int64

In [48]:
num = 27
print df['pageurl'][num]
print df['fullurl'][num]
print df['rawpageurl'][num]
print df['referrerurl'][num]
print df['rawreferrerurl'][num]
print df['utmsource'][num]
print df['utmmedium'][num]
print df['utmcontent'][num]
print df['utmcampaign'][num]
print df['utmterm'][num]

https://www.charliesproject.com/products/neon-paisley-leggings
https://www.charliesproject.com/collections/close-out/products/neon-paisley-leggings
https://www.charliesproject.com/products/neon-paisley-leggings
https://www.charliesproject.com/collections/close-out
https://www.charliesproject.com/collections/close-out?utm_campaign=CLOSE-OUTS%2B%252B%2B20%2525%2Boff%2Bthis%2Bweek%2BOnly%2521_58b31bf7597ed702d689977f&utm_medium=email&utm_source=newsletter
nan
nan
nan
nan
nan


In [36]:
events_row_map = {
            0: 1,
            1: 3,
            2: 4,
            3: 8,
            4: 104,
            5: 6,
            6: 7,
            7: -1
        }
max_prob = [0, 0, 0, 0, 4, 5, 6, 7]
max_prob_dict = {}
for i in range(len(max_prob)):
    max_prob_dict[events_row_map[i]] = events_row_map[max_prob[i]]
max_prob_dict

{-1: -1, 1: 1, 3: 1, 4: 1, 6: 6, 7: 7, 8: 1, 104: 104}

In [42]:
a = swym_next['eventtype'].map(max_prob_dict)
b = swym_next['nextaction']
c = a == b

In [44]:
c.value_counts()

True     52446
False    15291
dtype: int64

In [45]:
52446./(52446+15291)

0.774259267460915

In [46]:
b.value_counts()

 1.0      48613
 3.0       8233
 104.0     3753
 6.0       2570
 4.0       2099
 7.0       1728
-1.0        734
 8.0          7
Name: nextaction, dtype: int64

In [47]:
48613./52446

0.9269153033596461