# Simple Collaborative Filtering model using Matrix Factorization

In [1]:
import pandas as pd
import yaml

# Load Config


In [2]:
%cd ..
with open("params.yaml") as config_file:
    config = yaml.safe_load(config_file)

/Users/yingkang/4thBrain/GNN-eCommerce


# Load Raw Dataset

In [3]:
u_i_e_df = pd.read_csv(config['data']['preprocessed'] + "user_item_event.csv")
u_i_e_df

Unnamed: 0.1,Unnamed: 0,user_id,item_id,event_type
0,0,595414620,5809910,view
1,1,595414640,5812943,view
2,2,595412617,5798924,view
3,3,420652863,5793052,view
4,4,484071203,5899926,view
...,...,...,...,...
20692835,20692835,429913900,5848062,remove_from_cart
20692836,20692836,556138645,5885631,view
20692837,20692837,564441099,5784898,view
20692838,20692838,429913900,5870838,view


### A small dataset used for code development only

In [10]:
mini = u_i_e_df.head(100000)

## Map eventType to eventWeight, compute rawEdgeWeight

In [16]:
def raw_edge_weight(u_i_e, eType_weights_config):
    # Map eventWeight
    u_i_e['weight'] = u_i_e.event_type.apply(lambda x: eType_weights_config[x])
    # compute raw edge weight
    view = eType_weights_config['view']
    u_i_e = u_i_e.groupby(['user_id', 'item_id']).agg({
        'event_type': list,
        'weight': lambda x: view if x.sum() < view else x.sum()}).reset_index()
    u_i_e['purchased'] = u_i_e.event_type.apply(lambda x: x.__contains__('purchase'))
    u_i_e = u_i_e.rename(columns={"event_type": "event_type_list"})
    return u_i_e

In [6]:
eType_weights_config1 = config['training']['event_type_weights_v1']
eType_weights_config2 = config['training']['event_type_weights_v2']

In [7]:
u_i_weight_1 = raw_edge_weight(u_i_e_df, eType_weights_config1)
u_i_weight_1

Unnamed: 0,user_id,item_id,event_type,weight,purchased
0,465496,5769989,[view],0.01,False
1,465496,5865524,[view],0.01,False
2,465496,5865526,[view],0.01,False
3,1120748,5240,[view],0.01,False
4,1180452,5881337,[view],0.01,False
...,...,...,...,...,...
10157403,622090043,5850628,[view],0.01,False
10157404,622090052,5688691,[view],0.01,False
10157405,622090052,5931986,[view],0.01,False
10157406,622090098,5650609,[view],0.01,False


In [22]:
u_i_weight_2 = raw_edge_weight(u_i_e_df, eType_weights_config2)
u_i_weight_2

Unnamed: 0,user_id,item_id,event_type_list,weight,purchased
0,465496,5769989,[view],0.15,False
1,465496,5865524,[view],0.15,False
2,465496,5865526,[view],0.15,False
3,1120748,5240,[view],0.15,False
4,1180452,5881337,[view],0.15,False
...,...,...,...,...,...
10157403,622090043,5850628,[view],0.15,False
10157404,622090052,5688691,[view],0.15,False
10157405,622090052,5931986,[view],0.15,False
10157406,622090098,5650609,[view],0.15,False


## Compute proper edge weight

In [17]:
def proper_edge_weight(raw_u_i_e_w):
    # exceeds 1.0 and True --> 1.0
    # exceeds 1.0 and False --> 0.5
    raw_u_i_e_w.loc[(raw_u_i_e_w['weight']>1.0) & (raw_u_i_e_w['purchased']), 'weight'] = 1.0
    raw_u_i_e_w.loc[(raw_u_i_e_w['weight']>1.0) & (~raw_u_i_e_w['purchased']), 'weight'] = 0.5
    u_i_weight = raw_u_i_e_w[['user_id', 'item_id', 'weight']]
    return u_i_weight

In [18]:
u_i_weight_1 = proper_edge_weight(u_i_weight_1)
u_i_weight_1

Unnamed: 0,user_id,item_id,weight
0,465496,5769989,0.01
1,465496,5865524,0.01
2,465496,5865526,0.01
3,1120748,5240,0.01
4,1180452,5881337,0.01
...,...,...,...
10157403,622090043,5850628,0.01
10157404,622090052,5688691,0.01
10157405,622090052,5931986,0.01
10157406,622090098,5650609,0.01


In [23]:
u_i_weight_2 = proper_edge_weight(u_i_weight_2)
u_i_weight_2

Unnamed: 0,user_id,item_id,weight
0,465496,5769989,0.15
1,465496,5865524,0.15
2,465496,5865526,0.15
3,1120748,5240,0.15
4,1180452,5881337,0.15
...,...,...,...
10157403,622090043,5850628,0.15
10157404,622090052,5688691,0.15
10157405,622090052,5931986,0.15
10157406,622090098,5650609,0.15


## Persist 2 version u_i_weight data into csv

In [21]:
u_i_weight_1.to_csv(config['data']['preprocessed'] + "u_i_weight_0.01_0.1_-0.09.csv")

In [25]:
u_i_weight_2.to_csv(config['data']['preprocessed'] + "u_i_weight_0.15_0.35_-0.2.csv")

# Further Work
Use event time to influence the weight of the event. i.e. More recent events carry more weight.