In [2]:
import numpy as np
import pandas as pd

In [8]:
class RandomPred:
    '''
    RandomPred()

    Baseline predictor that assigns an independent random score to every requested item.

    '''
    def fit(self, data):
        '''
        No-op training step; the random baseline learns nothing from the data.

        Parameters
        ----------
        data: pandas.DataFrame
            Training data (session IDs, item IDs and event timestamps). It is accepted and ignored.
        '''
        return None

    def predict_next(self, session_id, input_item_id, predict_for_item_ids):
        '''
        Returns one random score per requested item.

        Parameters
        ---------
        session_id : int or string
            The session ID of the event. Not used by this predictor.
        input_item_id : int or string
            The item ID of the event. Not used by this predictor.
        predict_for_item_ids : 1D array
            IDs of the items to score.

        Returns
        ---------
        out : pandas.Series
            Random scores drawn from NumPy's global random state (uniform on [0, 1)), indexed by the item IDs.

        '''
        n_items = len(predict_for_item_ids)
        # Drawn from the global NumPy RNG, so np.random.seed() controls reproducibility.
        scores = np.random.random_sample(n_items)
        return pd.Series(scores, index=predict_for_item_ids)

In [9]:
class Pop:
    '''
    Pop(top_n=100, item_key='ItemId', support_by_key=None)

    Popularity predictor that gives higher scores to items with larger support.

    The score is given by:

    .. math::
        r_{i}=\\frac{supp_i}{(1+supp_i)}

    Parameters
    --------
    top_n : int
        Only give back non-zero scores to the top N ranking items. Should be higher or equal than the cut-off of your evaluation. (Default value: 100)
    item_key : string
        The header of the item IDs in the training data. (Default value: 'ItemId')
    support_by_key : string or None
        If not None, count the number of unique values of the attribute of the training data given by the specified header. If None, count the events. (Default value: None)

    '''
    def __init__(self, top_n = 100, item_key = 'ItemId', support_by_key = None):
        self.top_n = top_n
        self.item_key = item_key
        self.support_by_key = support_by_key

    def fit(self, data):
        '''
        Trains the predictor: computes the popularity score of every item and keeps the top_n best.

        Parameters
        --------
        data: pandas.DataFrame
            Training data with a header. It must contain the column named by item_key and, when support_by_key is set, the column named by support_by_key.

        '''
        grp = data.groupby(self.item_key)
        if self.support_by_key is None:
            # Support = number of events per item.
            support = grp.size()
        else:
            # Support = number of distinct values of the chosen attribute per item.
            support = grp[self.support_by_key].nunique()
        scores = support / (support + 1)
        # A stable sort keeps tied items in their grouped order, so the top_n
        # cut-off is resolved the same way on every run.
        ranked = scores.sort_values(ascending=False, kind='mergesort')
        self.pop_list = ranked.head(self.top_n)

    def predict_next(self, session_id, input_item_id, predict_for_item_ids):
        '''
        Gives prediction scores for a selected set of items on how likely they be the next item in the session.

        Parameters
        --------
        session_id : int or string
            The session ID of the event. Not used by this predictor.
        input_item_id : int or string
            The item ID of the event. Not used by this predictor.
        predict_for_item_ids : 1D array-like
            IDs of items for which the predictor should give scores. IDs that are not among the top_n items kept by fit() receive a score of 0.

        Returns
        --------
        out : pandas.Series
            Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs.

        '''
        # Accept lists / tuples as well as arrays: boolean-mask indexing below needs an ndarray.
        item_ids = np.asarray(predict_for_item_ids)
        preds = np.zeros(len(item_ids))
        mask = np.isin(item_ids, self.pop_list.index)
        if mask.any():
            # .loc makes the lookup label-based regardless of the index dtype.
            preds[mask] = self.pop_list.loc[item_ids[mask]].values
        return pd.Series(data=preds, index=item_ids)

In [None]:
class SessionPop:
    '''
    SessionPop(top_n=100, item_key='ItemId', support_by_key=None)

    Session popularity predictor that gives higher scores to items with higher number of occurrences in the session. Ties are broken up by adding the popularity score of the item.

    The score is given by:

    .. math::
        r_{s, i} = supp_{s,i} + \\frac{supp_i}{(1+supp_i)}

    Parameters
    --------
    top_n : int
        Only the top N items by global popularity receive a popularity term; all other items get 0 for that term. Should be higher or equal than the cut-off of your evaluation. (Default value: 100)
    item_key : string
        The header of the item IDs in the training data. (Default value: 'ItemId')
    support_by_key : string or None
        If not None, count the number of unique values of the attribute of the training data given by the specified header. If None, count the events. (Default value: None)

    '''
    def __init__(self, top_n = 100, item_key = 'ItemId', support_by_key = None):
        self.top_n = top_n
        self.item_key = item_key
        self.support_by_key = support_by_key

    def fit(self, data):
        '''
        Trains the predictor: computes the global popularity score of every item, keeps the top_n best and clears the per-session counters.

        Parameters
        --------
        data: pandas.DataFrame
            Training data with a header. It must contain the column named by item_key and, when support_by_key is set, the column named by support_by_key.

        '''
        grp = data.groupby(self.item_key)
        if self.support_by_key is None:
            # Support = number of events per item.
            support = grp.size()
        else:
            # Support = number of distinct values of the chosen attribute per item.
            support = grp[self.support_by_key].nunique()
        scores = support / (support + 1)
        # The popularity term is always < 1, so it can only break ties between
        # items that occurred equally often in the session.
        ranked = scores.sort_values(ascending=False, kind='mergesort')
        self.pop_list = ranked.head(self.top_n)
        # Per-session state: the session currently being followed and its item counts.
        self.prev_session_id = None
        self.pers = {}

    def predict_next(self, session_id, input_item_id, predict_for_item_ids):
        '''
        Gives prediction scores for a selected set of items on how likely they be the next item in the session.

        The call is stateful: input_item_id is added to the occurrence counts of the current session, and the counts are reset whenever session_id differs from the one seen in the previous call.

        Parameters
        --------
        session_id : int or string
            The session ID of the event.
        input_item_id : int or string
            The item ID of the event.
        predict_for_item_ids : 1D array-like
            IDs of items for which the predictor should give scores.

        Returns
        --------
        out : pandas.Series
            Session occurrence count plus popularity score for each selected item. Indexed by the item IDs.

        '''
        if session_id != self.prev_session_id:
            # A new session starts: forget the counts of the previous one.
            self.prev_session_id = session_id
            self.pers = {}
        self.pers[input_item_id] = self.pers.get(input_item_id, 0) + 1

        # Accept lists / tuples as well as arrays: boolean-mask indexing below needs an ndarray.
        item_ids = np.asarray(predict_for_item_ids)
        preds = np.zeros(len(item_ids))

        pop_mask = np.isin(item_ids, self.pop_list.index)
        if pop_mask.any():
            preds[pop_mask] = self.pop_list.loc[item_ids[pop_mask]].values

        session_counts = pd.Series(self.pers)
        session_mask = np.isin(item_ids, session_counts.index)
        if session_mask.any():
            preds[session_mask] += session_counts.loc[item_ids[session_mask]].values

        return pd.Series(data=preds, index=item_ids)

In [4]:
# Load the raw action log; the 'sess_dt' column is parsed as dates on read.
df_action = pd.read_csv(
    '../input/제6회 L.POINT Big Data Competition-분석용데이터-01.온라인 행동 정보.csv',
    parse_dates=['sess_dt'],
)

In [5]:
# Peek at the last five rows of the loaded log.
df_action.tail(5)

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm
3196357,31040,1,1,0,A02,2019-08-15,03:47,13525,,다이슨 에어랩,4.0,62.0,unknown,mobile_web
3196358,4129,1,2,0,A02,2019-07-21,01:22,250099,,여성메탈시계,3.0,250.0,unknown,mobile_web
3196359,4129,1,1,0,A02,2019-07-21,01:18,1525,,여성메탈시계,3.0,250.0,unknown,mobile_web
3196360,54403,3,1,0,A02,2019-07-31,20:14,16905,,비비고왕교자,3.0,68.0,unknown,mobile_app
3196361,54403,4,1,0,A02,2019-08-18,16:35,82700,,에포테쉬볼륨팝브러쉬,5.0,101.0,unknown,mobile_app


In [7]:
# Keep only the rows that carry a transaction id and renumber them from 0.
df0 = df_action[df_action['trans_id'].notna()].reset_index(drop=True)
print(df0.shape)
df0.head()

(56989, 14)


Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm
0,28304,1,22,6,A03,2019-07-16,11:41,1212297,51903.0,,56.0,1303.0,PUSH,mobile_web
1,24276,17,31,6,A03,2019-08-06,16:52,983154,69420.0,,26.0,998.0,DIRECT,
2,24276,1,87,6,A03,2019-07-02,21:42,8131793,40228.0,,99.0,8132.0,PUSH,
3,47444,27,40,6,A03,2019-09-30,19:56,2612017,112652.0,,70.0,2612.0,DIRECT,PC
4,47444,12,27,6,A03,2019-08-18,12:05,1146060,78445.0,,47.0,1146.0,DIRECT,PC
