In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets, cluster
import matplotlib.pyplot as plt
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

import os, pickle
from time import time
from tqdm import tqdm 
from proj_util import cal_avg_precision, get_user_product_matrix

from sklearn.decomposition import NMF
import scipy


# Base directory for this notebook's artefacts: every pickle load/dump below
# joins it with the 'pickle' sub-folder. With the empty string those paths
# resolve relative to the current working directory.
FOLDER_PATH = ''


In [None]:
# Load the train / validation / test sets from <FOLDER_PATH>/pickle.
# Each file is opened in a context manager so its handle is closed as soon as
# the object has been read, instead of lingering until garbage collection.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(os.path.join(FOLDER_PATH, 'pickle', 'train_set.p'), 'rb') as fh:
    train_set = pickle.load(fh)
with open(os.path.join(FOLDER_PATH, 'pickle', 'validation_set.p'), 'rb') as fh:
    validation_set = pickle.load(fh)
with open(os.path.join(FOLDER_PATH, 'pickle', 'test_set.p'), 'rb') as fh:
    test_set = pickle.load(fh)

In [2]:
# Load the id-translation tables that are later passed to Series.map() for the
# user_id and product_id columns.
# Context managers guarantee the file handles are closed after reading.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(os.path.join(FOLDER_PATH, 'pickle', 'user_mapping.p'), 'rb') as fh:
    user_mapping = pickle.load(fh)
with open(os.path.join(FOLDER_PATH, 'pickle', 'product_mapping.p'), 'rb') as fh:
    product_mapping = pickle.load(fh)

In [None]:
# Stack the train and validation rows into a single "prior" table and discard
# the order_id column, which is not used by anything that follows.
order_prior_set = pd.concat([train_set, validation_set]).drop(columns='order_id')

# The two source frames are no longer needed; release them.
del train_set, validation_set

In [None]:
# Translate the user and product ids of both tables through the loaded
# mapping tables (prior table first, then the test table).
# Series.map() yields NaN for any id that is absent from its mapping.
for frame in (order_prior_set, test_set):
    frame['user_id'] = frame['user_id'].map(user_mapping)
    frame['product_id'] = frame['product_id'].map(product_mapping)

In [None]:
# Count how many prior rows exist for every (user_id, product_id) pair.
user_product_prior = (
    order_prior_set
    .groupby(['user_id', 'product_id'])
    .size()
    .reset_index(name='quantity')
)
qty_max = int(user_product_prior.quantity.max())

# Min-max scale the counts to [0, 1]: a count of 1 maps to 0.0 and qty_max to 1.0.
# When every pair occurs exactly once (qty_max == 1) the span would be zero, so
# fall back to a divisor of 1; all quantities then scale to 0.0 instead of
# raising a division-by-zero error.
# NOTE(review): pairs seen exactly once become 0.0 -- if the matrix built from
# this table is sparse they are indistinguishable from unobserved pairs;
# confirm that this is intended.
qty_span = max(qty_max - 1, 1)
user_product_prior.quantity = (user_product_prior.quantity - 1) / float(qty_span)

In [None]:
# Turn the long-format (user_id, product_id, quantity) table into the matrix
# that NMF factorizes. get_user_product_matrix comes from proj_util.
# NOTE(review): the `_coo` suffix suggests a scipy sparse COO matrix, and the
# later scoring code assumes a particular row/column orientation -- confirm
# both against the helper's implementation.
user_product_prior_coo = get_user_product_matrix(user_product_prior)

In [None]:
# Fit a 120-component non-negative matrix factorization with a fixed seed so
# that the factorization is reproducible.
# NOTE(review): the `alpha` keyword was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of `alpha_W` / `alpha_H`; on a recent install this
# call raises TypeError. Confirm the pinned scikit-learn version before running.
nmf = NMF(n_components=120, random_state=421, alpha=0.001) 
nmf.fit(user_product_prior_coo)

In [None]:
# Extract the two factor matrices from the fitted model:
#   V -- transform() output: one latent-factor row per ROW of the input matrix
#   U -- components_ transposed: one latent-factor row per COLUMN of the input
# NOTE(review): the scoring loop indexes U by user id and treats row indices of
# V as product ids, which only holds if the factorized matrix has products on
# its rows and users on its columns -- confirm against get_user_product_matrix.
V = nmf.transform(user_product_prior_coo)
U = nmf.components_.T

In [None]:
# Persist both factor matrices under <FOLDER_PATH>/pickle.
# Writing inside a context manager guarantees the data is flushed and the
# handle closed before the cell finishes, rather than whenever the anonymous
# file object happens to be garbage collected.
with open(os.path.join(FOLDER_PATH, 'pickle', 'final-nmf-prior-V.p'), 'wb') as fh:
    pickle.dump(V, fh)
with open(os.path.join(FOLDER_PATH, 'pickle', 'final-nmf-prior-U.p'), 'wb') as fh:
    pickle.dump(U, fh)

In [None]:
# Score the top-10 recommendations for every row of the test set and collect
# the average precision returned by cal_avg_precision (from proj_util).
scores = []

# Gather each user's ground-truth product ids once, up front, instead of
# re-scanning the whole test table with DataFrame.query on every iteration.
# Row order inside each group matches the order of the original table.
gt_products_by_user = {
    uid: grp.values
    for uid, grp in test_set.groupby('user_id')['product_id']
}

# NOTE(review): this iterates over every test ROW, so a user with several test
# rows is scored once per row and weighs proportionally more in the mean.
# Iterate over test_set.user_id.unique() if a per-user mean is intended.
for c, u in enumerate(test_set.user_id, start=1):
    # Fix: the row index was previously the undefined name `mapped_u`, which
    # raises NameError on a fresh kernel; the (already mapped) id is `u`.
    user_item_score = np.matmul(U[u:u+1], V.T)[0]

    # Indices of the 10 highest scores, best first.
    recomm_items = np.argsort(user_item_score)[-1:-11:-1]

    gt_product = gt_products_by_user[u]
    score = cal_avg_precision(gt_product, recomm_items)
    scores.append(score)

    # Lightweight progress indicator every 10,000 rows.
    if c % 10000 == 0:
        print(c, end=' | ')
    

In [None]:
# Mean of the average-precision values collected in `scores`; left as the last
# expression so the notebook displays it. The trailing number is presumably the
# value observed in an earlier run -- re-check it after re-executing the loop.
np.mean(scores) # 0.5057114628611186