# Simple Collaborative Filtering model using Matrix Factorization

In [None]:
import pandas as pd
import yaml
import glob

# Load Config


In [None]:
# Switch the working directory to the parent folder so that the relative path
# "params.yaml" used in the next cell resolves from there.
# NOTE(review): re-running this cell moves up one more level each time.
%cd ..

In [None]:
# Parse the project parameters from params.yaml (relative to the current
# working directory). The encoding is set explicitly because open() otherwise
# falls back to the platform's locale encoding.
with open("params.yaml", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# Bare expression so the notebook displays the loaded configuration.
config

# Load Raw Dataset

In [None]:
# Get CSV files list of cosmetic shop dataset (file names starting with "20")
path = config['data']['cosmetic_shop']
# glob returns matches in arbitrary order; sorting keeps the row order of the
# concatenated frame reproducible between runs.
csv_files = sorted(glob.glob(path + "/20*.csv"))
if not csv_files:
    # Fail with a clear message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No CSV files matching '20*.csv' found in {path!r}")

# Read each CSV file into a DataFrame. This is a generator of DataFrames
# (not a list); pd.concat consumes it below.
df_list = (pd.read_csv(file) for file in csv_files)

# Concatenate all DataFrames into one frame with a fresh 0..n-1 index
all_events = pd.concat(df_list, ignore_index=True)

In [None]:
# Print the (rows, columns) shape of the combined events frame
print(all_events.shape)
# Display 10 randomly drawn rows as a quick sanity check of the loaded data
all_events.sample(10)

In [None]:
# Distinct users (dropna=False: a missing id, if present, counts as one value)
users_count = all_events['user_id'].nunique(dropna=False)
print("Unique users: ", users_count)

# Distinct products, counted the same way
products_count = all_events['product_id'].nunique(dropna=False)
print("Unique products: ", products_count)

# Distinct event types present in the data
print("Event types: ", all_events['event_type'].unique().tolist())


In [None]:
# Development dataset: only the three columns needed for the interaction
# matrix. NOTE: no rows are dropped here, so it has as many rows as
# `all_events`.
all_events_mini = all_events[['user_id', 'product_id', 'event_type']]


In [None]:
def generate_interaction_matrix(events, event_type_weights):
    """Aggregate raw events into one weighted row per (user, product) pair.

    Args:
        events: DataFrame with at least the columns ``user_id``,
            ``product_id`` and ``event_type``. It is not modified.
        event_type_weights: Mapping from event type to numeric weight,
            looked up as ``event_type_weights[event_type]``.

    Returns:
        DataFrame with the columns ``user_id``, ``product_id``,
        ``event_type`` (list of the pair's event types in input order) and
        ``weight`` (float). The weight is the sum of the pair's event
        weights, replaced by 0.01 when that sum is negative and capped at
        1.0 when it exceeds 1.0.

    Raises:
        KeyError: If an event type present in ``events`` has no entry in
            ``event_type_weights``.
    """
    # Explicit copy: adding a column to a bare column selection triggers
    # pandas' SettingWithCopyWarning.
    events = events[['user_id', 'product_id', 'event_type']].copy()

    # Resolve each distinct event type once (raising KeyError for unknown
    # types) instead of running a Python-level lookup for every row.
    weight_by_type = {
        event_type: event_type_weights[event_type]
        for event_type in events['event_type'].unique()
    }
    events['weight'] = events['event_type'].map(weight_by_type)

    i_matrix = events.groupby(['user_id', 'product_id'], as_index=False) \
        .agg({'event_type': list, 'weight': 'sum'})

    # Negative totals become 0.01, totals above 1.0 are capped at 1.0, and
    # totals within [0, 1] are kept unchanged.
    total = i_matrix['weight'].astype(float)
    i_matrix['weight'] = total.clip(upper=1.0).mask(total < 0, 0.01)

    return i_matrix

In [None]:
# Build the interaction matrix from the raw events, using the per-event-type
# weights configured under training.event_type_weights.
interaction_events = all_events[['user_id', 'product_id', 'event_type']]
event_type_weights = config['training']['event_type_weights']
im = generate_interaction_matrix(interaction_events, event_type_weights)

In [None]:
import os

# Save interaction matrix to csv. os.path.join inserts the path separator when
# the configured directory does not end with one; plain string concatenation
# would otherwise produce "<dir>interaction_matrix.csv".
im.to_csv(os.path.join(config['data']['preprocessed'], "interaction_matrix.csv"))

##### Reference code for relabeling user_nodes, item_nodes

In [None]:
# Reference code for relabeling user_nodes, item_nodes
from sklearn.model_selection import train_test_split

# `df` is not defined anywhere else in this notebook, so build it from the
# interaction matrix. `product_id` is exposed as `item_id`, the column name
# the encoding and filtering cells below use.
df = im.rename(columns={'product_id': 'item_id'})

# Hold out 20% of the interactions as the test set; the fixed random_state
# keeps the split reproducible.
train, test = train_test_split(df.values, test_size=0.2, random_state=16)
train_df = pd.DataFrame(train, columns=df.columns)
test_df = pd.DataFrame(test, columns=df.columns)

In [None]:
from sklearn import preprocessing as pp

# One encoder per id space; both are fitted on the training ids only.
le_user = pp.LabelEncoder()
le_item = pp.LabelEncoder()

# Append the encoded ids as the new columns `user_id_idx` and `item_id_idx`.
train_df = train_df.assign(
    user_id_idx=le_user.fit_transform(train_df['user_id'].values),
    item_id_idx=le_item.fit_transform(train_df['item_id'].values),
)

In [None]:
# Ids that occur in the training set
train_user_ids = train_df['user_id'].unique()
train_item_ids = train_df['item_id'].unique()

print('Unique train set user_ids/ item_ids:', len(train_user_ids), len(train_item_ids))

# Keep only test rows whose user AND item both occur in the training set
user_seen_in_train = test_df['user_id'].isin(train_user_ids)
item_seen_in_train = test_df['item_id'].isin(train_item_ids)
test_df = test_df[user_seen_in_train & item_seen_in_train]
print('Size of test set before/ after(remove user/item nodes not in train set):', len(test), len(test_df))

In [None]:
# Encode the test ids with the encoders fitted on the training set. `assign`
# returns a new frame instead of writing columns into the boolean-filtered
# `test_df`, which would trigger pandas' SettingWithCopyWarning.
test_df = test_df.assign(
    user_id_idx=le_user.transform(test_df['user_id'].values),
    item_id_idx=le_item.transform(test_df['item_id'].values),
)

# Further Work
Use the event time to influence the weight of each event, i.e. more recent events carry more weight.