In [12]:
import pickle as pkl
import pandas as pd
import random
import numpy as np
from collections import Counter

In [2]:
RAW_DATA_FILE = 'data/UserBehavior_sample.csv'  # input
MAX_LEN_ITEM = 200   ### TODO: Get length distribution from *edav*

In [3]:
def remap(df):  
    '''
    remap every id into [0,total_feature_num]. For embedding table use.
    '''
    ##### map item id
    item_key = sorted(df['iid'].unique().tolist())
    item_len = len(item_key)
    item_map = dict(zip(item_key, range(item_len)))
    df['iid'] = df['iid'].map(lambda x: item_map[x])
    ##### map user id
    user_key = sorted(df['uid'].unique().tolist())
    user_len = len(user_key)
    user_map = dict(zip(user_key, range(item_len, item_len + user_len)))
    df['uid'] = df['uid'].map(lambda x: user_map[x])
    ## map category id
    cate_key = sorted(df['cid'].unique().tolist())
    cate_len = len(cate_key)
    cate_map = dict(zip(cate_key, range(user_len + item_len, user_len + item_len + cate_len)))
    df['cid'] = df['cid'].map(lambda x: cate_map[x])
    return df, item_len

In [4]:
def gen_dataset(user_df, item_df, item_cnt):

    user_last_touch_time = []  ## the last interaction time of each user ###

    for uid, history in user_df:  ###history is a dataframe for each user, with column ["uid","iid","cid","btag","time"], sorted by timestamp
        user_last_touch_time.append(list(history['time'])[-1])

    user_last_touch_time_sorted = sorted(user_last_touch_time)
    #### split test and training set with time, to prevent future information leak ####
    split_time = user_last_touch_time_sorted[int(len(user_last_touch_time_sorted) * 0.7)] 

    cnt = 0
    test_uid=[]
    #### Create feature from raw data
    uid_list=[]
    hist_len=[]
    target_iid=[]
    target_cid=[]
    label=[]
    most_freq_cid=[]
    most_freq_iid=[]
    buy_num=[]
    fav_num=[]
    cart_num=[]
    pv_num=[]
    for uid, history in user_df:
        
        ######### get history #######
        item_hist = list(history['iid'])
        cate_hist = list(history['cid'])
        btag_hist = list(history['btag'])

        ######## target item is the last touch ########
        target_item = item_hist[-1]
        uid_list.append(uid)
        hist_len.append(len(item_hist)-1)
        target_iid.append(target_item)
        target_cid.append(cate_hist[-1])
        label.append(1)  # positive sample
        most_freq_cid.append(history.cid.mode()[0])
        most_freq_iid.append(history.iid.mode()[0])
        buy_num.append(Counter(history.btag)['buy'])
        fav_num.append(Counter(history.btag)['fav'])
        pv_num.append(Counter(history.btag)['pv'])
        cart_num.append(Counter(history.btag)['cart'])
        if list(history['time'])[-1] > split_time: # decide whether test or train
            test_uid.append(uid)
        

        #############    negative sampling   ###############
        ## TODO: only use random negative sampling now. Should change it to in-batch
        ## negative sampling, with hard negative sampling method or negative sampling
        ## based on frequency prediction (Youtube Two Tower)

        neg = random.randint(0, 1)
        #if neg == 1:
        
        while target_item == item_hist[-1]: ## must be different than target item
            label.append(0)
            target_item = random.randint(0, item_cnt - 1)
            target_item_cate = item_df.get_group(target_item)['cid'].tolist()[0]

            uid_list.append(uid)
            hist_len.append(len(item_hist)-1)
            target_iid.append(target_item)
            target_cid.append(target_item_cate)
            most_freq_cid.append(history.cid.mode()[0])
            most_freq_iid.append(history.iid.mode()[0])
            buy_num.append(Counter(history.btag)['buy'])
            fav_num.append(Counter(history.btag)['fav'])
            pv_num.append(Counter(history.btag)['pv'])
            cart_num.append(Counter(history.btag)['cart'])
    
    data=pd.DataFrame()
    data['uid']=uid_list
    data['hist_len']=hist_len
    data['most_freq_cid']=most_freq_cid
    data['most_freq_iid']=most_freq_iid
    data['buy_num']=buy_num
    data['fav_num']=fav_num
    data['pv_num']=pv_num
    data['cart_num']=cart_num
    data['target_iid']=target_iid
    data['target_cid']=target_cid
    data['label']=label
    
    dev_df=data[~data['uid'].isin(test_uid)]
    dev_X = dev_df.drop(columns=['label','uid'])
    dev_y = dev_df['label']
    
    test_df=data[data['uid'].isin(test_uid)]
    test_X = test_df.drop(columns=['label','uid'])
    test_y = test_df['label']

    return dev_X, dev_y, test_X, test_y

In [5]:
df = pd.read_csv(RAW_DATA_FILE, header=None, names=['uid', 'iid', 'cid', 'btag', 'time'])

In [6]:
df, item_cnt= remap(df)

In [7]:
item_key = sorted(df['iid'].unique().tolist())
item_len = len(item_key) #count number of unique item
item_map = dict(zip(item_key, range(item_len)))

In [8]:
user_df = df.sort_values(['uid', 'time']).groupby('uid')
item_df = df.sort_values(['iid', 'time']).groupby('iid')

In [9]:
item_df.get_group(114843)

Unnamed: 0,uid,iid,cid,btag,time
402974,242381,114843,251340,pv,1511942464
370826,246945,114843,251340,pv,1512037321
180558,245295,114843,251340,pv,1512222109
405240,242858,114843,251340,pv,1512281441
116092,244712,114843,251340,pv,1512309260


# Default random forest

In [13]:
dev_X, dev_y, test_X, test_y = gen_dataset(user_df, item_df, item_cnt)

In [14]:
dev_X.head()

Unnamed: 0,hist_len,most_freq_cid,most_freq_iid,buy_num,fav_num,pv_num,cart_num,target_iid,target_cid
0,54,249485,232527,0,0,55,0,26715,248483
1,54,249485,232527,0,0,55,0,144320,251302
2,97,250022,152225,8,6,84,0,56808,249332
3,97,250022,152225,8,6,84,0,78261,251302
4,80,250730,85376,0,12,67,2,164381,249242


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
rfc = RandomForestClassifier(n_estimators=2000, max_depth=5)
rfc.fit(dev_X, dev_y)

y_pred = rfc.predict(test_X)

In [16]:
Counter(y_pred)

Counter({0: 1546, 1: 1366})

In [17]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(test_y, y_pred)
metrics.auc(fpr, tpr)

0.540521978021978

## Tune hyperparameters 

In [18]:
from sklearn.model_selection import GridSearchCV
rf = RandomForestClassifier(max_features='auto', oob_score=True, random_state=1, n_jobs=-1,warm_start=True)
param_grid = { "min_samples_leaf" : [1, 5, 10], "max_depth" : [4,5,6], "n_estimators": [100,500,1000,1500]}
gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)
gs = gs.fit(dev_X, dev_y)

  "X does not have valid feature names, but"


In [19]:
gs.best_params_

{'max_depth': 6, 'min_samples_leaf': 1, 'n_estimators': 1500}

In [20]:
gs.best_score_

0.562011678200692

In [21]:
pred_y=gs.best_estimator_.predict(test_X)

In [22]:
fpr, tpr, thresholds = metrics.roc_curve(test_y, pred_y)
metrics.auc(fpr, tpr)

0.5436126373626373

In [23]:
metrics.accuracy_score(test_y, pred_y)

0.5436126373626373