In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets, cluster
import matplotlib.pyplot as plt
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

import os, pickle
from time import time
from tqdm import tqdm 
from proj_util import cal_avg_precision, get_user_product_matrix

from sklearn.decomposition import NMF
import scipy


FOLDER_PATH = ''


In [None]:
train_set = pickle.load(open(os.path.join(FOLDER_PATH,'pickle','train_set.p'),'rb'))
validation_set = pickle.load(open(os.path.join(FOLDER_PATH,'pickle','validation_set.p'),'rb'))
test_set = pickle.load(open(os.path.join(FOLDER_PATH,'pickle','test_set.p'),'rb'))

user_mapping = pickle.load(open(os.path.join(FOLDER_PATH,'pickle','user_mapping.p'),'rb'))
product_mapping = pickle.load(open(os.path.join(FOLDER_PATH,'pickle','product_mapping.p'),'rb'))

In [None]:
order_prior_set = pd.concat([train_set,validation_set])
order_prior_set.drop('order_id',axis=1, inplace=True)
del train_set, validation_set

In [None]:
order_prior_set.user_id = order_prior_set.user_id.map(user_mapping)
order_prior_set.product_id = order_prior_set.product_id.map(product_mapping)

test_set.user_id = test_set.user_id.map(user_mapping)
test_set.product_id = test_set.product_id.map(product_mapping)

In [None]:
user_product_prior = order_prior_set.groupby(['user_id', 'product_id']).size().reset_index().rename(columns={0: 'quantity'})
qty_max = max(user_product_prior.quantity)

user_product_prior.quantity = user_product_prior.quantity.apply(lambda x: (x-1)/float(qty_max-1))

In [None]:
user_product_prior_coo = get_user_product_matrix(user_product_prior)

### Co-Clustering

In [15]:
from coclust.coclustering import *
import coclust
from sklearn.decomposition import NMF

In [17]:
model = pickle.load(open(os.path.join(FOLDER_PATH,'pickle','coclust-model-prior.p'),'rb'))

In [18]:
cluster_with_uid= {}
cluseter_with_pid= {}
for uid, uc in enumerate(model.column_labels_):
    try:
        cluster_with_uid[uc].append(uid)
    except:
        cluster_with_uid[uc] = [uid]

for pid, pc in enumerate(model.row_labels_):
    try:
        cluseter_with_pid[pc].append(pid)
    except:
        cluseter_with_pid[pc] = [pid]

In [19]:
user_cluster_mapping = {i:model.column_labels_[i] for i in range(len(model.column_labels_))}

In [20]:
delta_kl_matrix = model.delta_kl_
delta_kl_matrix.shape

(10, 40)

In [22]:
# prior then test test_set
start = time()
scores = []
c = -1
for u in (test_set.user_id.unique()):

    user_cluster = user_cluster_mapping[u]
    user_item_cluster_scores = delta_kl_matrix[:,user_cluster].T
    best_product_cluster = np.argsort(np.array(user_item_cluster_scores))[0][-1]

    user_item_cluster = model.get_submatrix(user_product_prior_coo.tocsc(),best_product_cluster, user_cluster).todense()

    nmf = pickle.load(open(os.path.join(FOLDER_PATH,'pickle','coclust-nmf-prior',f'{best_product_cluster}-{user_cluster}-nmf.p'),'rb'))

    V = nmf.transform(user_item_cluster)
    U = nmf.components_.T
    
    uids = cluster_with_uid[user_cluster]
    uid_idx = uids.index(u)

    user_item_score = np.matmul(U[uid_idx:uid_idx+1], V.T)[0]
    top10_item_idx = np.argsort(user_item_score)[-1:-11:-1]

    recomm_items = [cluseter_with_pid[best_product_cluster][x] for x in top10_item_idx]

    gt_product = test_set.query('user_id=={}'.format(u)).product_id.values
    score = cal_avg_precision(gt_product, recomm_items)
    scores.append(score)
    
    del nmf, U, V, user_item_cluster, user_item_cluster_scores
    c+=1
    if c % 500 == 0:
        print(f'{c}:{time()-start}', end=' | ') 
        pickle.dump(scores,open(os.path.join(FOLDER_PATH,'output','coclust-nmf-prior','coclust-nmf-prior-test_set_scores.p'),'wb'))
    if c> 0 and c % 10000 == 0:   
        print()

pickle.dump(scores,open(os.path.join(FOLDER_PATH,'output','coclust-nmf-prior','coclust-nmf-prior-test_set_scores.p'),'wb'))

0:10.444486618041992 | 500:2092.6367037296295 | 

In [None]:
np.mean(scores)