In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import heapq
import os
import time
import math

In [92]:
# Dataset to evaluate; 'movielens' is read from u.data, any other value from train.txt/test.txt.
dataset = 'movielens'
# Model name; it selects the embeds/, scores/ and results/ sub-directories used by later cells.
algorithm = 'gcn'

In [93]:
# Every working directory is resolved against the current working directory.
# The generator performs one os.getcwd()/join per path, in the listed order.
ds_path, embed_path, plot_path, scores_path, results_path = (
    os.path.join(os.getcwd(), relative_dir)
    for relative_dir in (
        f'Data/{dataset}',
        f'embeds/{dataset}/{algorithm}',
        f'plots/{dataset}',
        f'scores/{dataset}/{algorithm}',
        f'results/{dataset}/{algorithm}',
    )
)

# Echo the resolved locations so a wrong working directory is noticed early.
print("Dataset path:", ds_path)
print("Embed path:", embed_path)
print("Plots path:", plot_path)
print("Score path:", scores_path)
print("Results path:", results_path)

Dataset path: /Users/johnhannebery/gitrepos/recsys/Data/movielens
Embed path: /Users/johnhannebery/gitrepos/recsys/embeds/movielens/gcn
Plots path: /Users/johnhannebery/gitrepos/recsys/plots/movielens
Score path: /Users/johnhannebery/gitrepos/recsys/scores/movielens/gcn
Results path: /Users/johnhannebery/gitrepos/recsys/results/movielens/gcn


In [94]:
def _load_ratings(file_name):
    """Read a user-interaction file into an interaction array and a per-user dict.

    Each non-blank line holds whitespace-separated integers: a user id followed
    by the ids of the items that user interacted with. Item ids repeated on a
    line are kept once, in first-seen order.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A tuple ``(inter_mat, user_dict)``. ``inter_mat`` is an integer array of
        shape ``(n_interactions, 2)`` with ``[user_id, item_id]`` rows (shape
        ``(0, 2)`` when there are none). ``user_dict`` maps every user id that
        has at least one item to the list of its distinct item ids.

    Raises:
        ValueError: If a token on a line cannot be parsed as an integer.
    """
    user_dict = dict()
    inter_mat = list()

    # The context manager guarantees the handle is closed, even on a parse error.
    with open(file_name, 'r') as ratings_file:
        for line in ratings_file:
            # split() without an argument tolerates tabs and repeated spaces.
            tokens = line.split()
            if not tokens:
                continue  # blank line, e.g. a trailing newline at end of file

            inters = [int(token) for token in tokens]
            u_id = inters[0]
            # dict.fromkeys de-duplicates while keeping a deterministic order.
            pos_ids = list(dict.fromkeys(inters[1:]))

            for i_id in pos_ids:
                inter_mat.append([u_id, i_id])

            if len(pos_ids) > 0:
                user_dict[u_id] = pos_ids

    # reshape keeps the two-column layout even when no interaction was read.
    return np.array(inter_mat, dtype=int).reshape(-1, 2), user_dict

In [95]:
# Any dataset other than 'movielens' is read from train.txt and test.txt under ds_path.
# Each call returns the [user_id, item_id] interaction array and the per-user item dict.
if dataset != 'movielens':
    train, train_user_dict = _load_ratings(os.path.join(ds_path,'train.txt'))
    test, test_user_dict = _load_ratings(os.path.join(ds_path,'test.txt'))

In [96]:
if dataset == 'movielens':
    cols = ['userId', 'movieId', 'rating', 'timestamp']

    # One chain: read the tab-separated ratings, keep ratings of 4 or more as
    # interactions, switch to the user_id/item_id naming and drop the other columns.
    df = (
        pd.read_csv(os.path.join(ds_path, 'u.data'), sep='\t', names=cols)
        .loc[lambda ratings: ratings['rating'] >= 4]
        .rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
        [['user_id', 'item_id']]
    )

In [97]:
if dataset != 'movielens':
    # The interaction arrays loaded earlier become two-column frames.
    cols = ['user_id', 'item_id']
    train = pd.DataFrame(train, columns=cols)
    test = pd.DataFrame(test, columns=cols)
else:
    # MovieLens has no predefined split: hold out 10% of the interactions with a fixed seed.
    train, test = train_test_split(df, random_state=1234, test_size=0.1)

In [98]:
# Keep only test interactions whose user and item both occur in the training split.
test = test[
    test['user_id'].isin(train['user_id'].unique())
    & test['item_id'].isin(train['item_id'].unique())
]

In [99]:
# Show the embeddings directory that the next cell reads from.
embed_path

'/Users/johnhannebery/gitrepos/recsys/embeds/movielens/gcn'

In [100]:
user_embed = pd.read_csv(os.path.join(embed_path, 'user_embed.csv'))
# For 'kgat' the item embeddings are read from entity_embed.csv, otherwise from item_embed.csv.
item_embed = pd.read_csv(os.path.join(
    embed_path,
    'entity_embed.csv' if algorithm == 'kgat' else 'item_embed.csv',
))

In [101]:
# Read the recommendation scores from scores_path; the frame displayed in the next
# cell has user_id, item_id, score and rank columns.
scores_df = pd.read_csv(os.path.join(scores_path,'scores_df.csv'))

In [102]:
# Display the loaded scores table for inspection.
scores_df

Unnamed: 0.1,Unnamed: 0,user_id,item_id,score,rank
0,555385,428,313,12.460766,1
1,445460,748,50,12.120422,1
2,555114,428,300,11.930076,2
3,977000,519,1612,11.883427,1
4,666420,507,300,11.832407,1
...,...,...,...,...,...
18835,636462,286,172,3.864752,18
18836,636901,286,781,3.854414,19
18837,636575,286,269,3.827556,20
18838,836764,914,1041,3.791851,19


In [103]:
# Number of training interactions per item, as an item_id / n_ratings table.
item_counts = (
    train.groupby('item_id')['user_id']
    .count()
    .rename('n_ratings')
    .reset_index()
)
# Novelty is -log2 of the ratio between an item's interaction count and the
# number of distinct training users.
item_counts['novelty'] = -np.log2(item_counts['n_ratings'] / train['user_id'].nunique())

class metrics():
    """Namespace for a stateless discounted-cumulative-gain helper."""

    @staticmethod
    def dcg_at_k(r, k, method: int = 1) -> float:
        """Discounted cumulative gain of the first ``k`` relevance scores.

        Args:
            r: Relevance scores in ranked order (best-ranked item first).
            k: Number of leading scores to take into account.
            method: ``0`` leaves the first score undiscounted and divides the
                score at 0-based position ``i >= 1`` by ``log2(i + 1)``;
                ``1`` divides the score at position ``i`` by ``log2(i + 2)``.

        Returns:
            The DCG value, or ``0.0`` when there are no scores to evaluate.

        Raises:
            ValueError: If ``method`` is neither 0 nor 1 and at least one score
                is available.
        """
        # np.asfarray was removed in NumPy 2.0; asarray with dtype=float is equivalent.
        r_k = np.asarray(r, dtype=float)[:k]
        if r_k.size:
            if method == 0:
                return r_k[0] + np.sum(r_k[1:] / np.log2(np.arange(2, r_k.size + 1)))
            elif method == 1:
                return np.sum(r_k / np.log2(np.arange(2, r_k.size + 2)))
            else:
                raise ValueError('method must be 0 or 1.')
        return 0.0

class Metrics():
    """Top-k ranking metrics computed from one ranked relevance list."""

    def __init__(self, r: list, k: int, all_pos_num: int):
        """Store the inputs shared by every metric.

        Args:
            r: Relevance of each recommended item in ranked order; the caller
                in this file passes 1 for a hit and 0 for a miss.
            k: Cut-off; only the first ``k`` entries of ``r`` are scored.
            all_pos_num: Denominator used by ``recall_at_k``.
        """
        self.r = r
        self.k = k
        self.all_pos_num = all_pos_num

    def precision_at_k(self) -> float:
        """Mean relevance of the first ``k`` entries.

        The mean is taken over the entries that exist, so a list shorter than
        ``k`` is averaged over its own length. An empty list yields ``0.0``
        instead of NaN.
        """
        r_k = np.asarray(self.r)[:self.k]
        if not r_k.size:
            return 0.
        return np.mean(r_k)

    def dcg_at_k(self, reverse_r=False, method: int = 1) -> float:
        """Discounted cumulative gain of the first ``k`` entries.

        Args:
            reverse_r: When true, the entries are sorted in descending order
                before the cut-off is applied.
            method: ``0`` leaves the first entry undiscounted and divides the
                entry at 0-based position ``i >= 1`` by ``log2(i + 1)``;
                ``1`` divides the entry at position ``i`` by ``log2(i + 2)``.

        Returns:
            The DCG value, or ``0.0`` when there is nothing to score.

        Raises:
            ValueError: If ``method`` is neither 0 nor 1 and at least one entry
                is available.
        """
        ranked = sorted(self.r, reverse=True) if reverse_r else self.r
        # np.asfarray was removed in NumPy 2.0; asarray with dtype=float is equivalent.
        r_k = np.asarray(ranked, dtype=float)[:self.k]
        if r_k.size:
            if method == 0:
                return r_k[0] + np.sum(r_k[1:] / np.log2(np.arange(2, r_k.size + 1)))
            elif method == 1:
                return np.sum(r_k / np.log2(np.arange(2, r_k.size + 2)))
            else:
                raise ValueError('method must be 0 or 1.')
        return 0.

    def ndcg_at_k(self, method: int = 1) -> float:
        """DCG divided by the DCG of the same list sorted in descending order.

        Returns ``0.0`` when the sorted-list DCG is zero.
        """
        dcg_max = self.dcg_at_k(reverse_r=True, method=method)
        if not dcg_max:
            return 0.
        return self.dcg_at_k(method=method) / dcg_max

    def recall_at_k(self) -> float:
        """Sum of the first ``k`` entries divided by ``all_pos_num``.

        Returns ``0.0`` when ``all_pos_num`` is zero instead of dividing by zero.
        """
        if not self.all_pos_num:
            return 0.
        r_k = np.asarray(self.r, dtype=float)[:self.k]
        return np.sum(r_k) / self.all_pos_num

    def hit_at_k(self) -> float:
        """Return ``1.0`` if the first ``k`` entries sum to more than zero, else ``0.0``."""
        r_k = np.array(self.r)[:self.k]
        return 1. if np.sum(r_k) > 0 else 0.

    @staticmethod
    def F1(pre: float, rec: float) -> float:
        """Harmonic mean of ``pre`` and ``rec``; ``0.0`` when their sum is not positive."""
        if pre + rec > 0:
            return (2.0 * pre * rec) / (pre + rec)
        return 0.


# COMMAND ----------

def calculate_diversity(item_ids, item_embed):
    """Mean pairwise Euclidean distance between the embeddings of ``item_ids``.

    Args:
        item_ids: Ids of the recommended items.
        item_embed: DataFrame with an ``item_id`` column; every other column is
            treated as one component of that item's embedding vector.

    Returns:
        ``1`` when exactly one id is given. ``0.0`` when fewer than two
        embedding rows match the ids (empty list, unknown ids). Otherwise the
        distance averaged over all ordered pairs of distinct matching rows.
    """
    if len(item_ids) == 1:
        return 1

    df_embed = item_embed[item_embed['item_id'].isin(item_ids)]
    np_embed = np.asarray(df_embed.drop(columns='item_id'), dtype=float)

    # Count the rows that were actually found: ids missing from item_embed or
    # repeated in item_ids would otherwise index past the end of np_embed.
    n_items = len(np_embed)
    if n_items < 2:
        return 0.0

    # Broadcasting yields the full (n, n) distance matrix in one step. Its
    # diagonal is zero, so the total equals the sum over all pairs with i != j.
    pairwise_diff = np_embed[:, None, :] - np_embed[None, :, :]
    distances = np.linalg.norm(pairwise_diff, axis=-1)

    return distances.sum() / (n_items * (n_items - 1))

# COMMAND ----------

def calculate_novelty(item_ids, novelty_table=None):
    """Average novelty of ``item_ids``.

    Args:
        item_ids: Ids of the recommended items.
        novelty_table: DataFrame with ``item_id`` and ``novelty`` columns.
            Defaults to the module-level ``item_counts`` table.

    Returns:
        The summed novelty of the ids found in the table divided by
        ``len(item_ids)``; ids absent from the table add nothing to the sum
        but still count in the denominator. ``0.0`` for an empty ``item_ids``.
    """
    n_items = len(item_ids)
    if n_items == 0:
        # Avoids the 0 / 0 that would otherwise produce NaN.
        return 0.0

    if novelty_table is None:
        novelty_table = item_counts

    df_novelty = novelty_table[novelty_table['item_id'].isin(item_ids)]

    return np.sum(df_novelty['novelty']) / n_items


In [104]:
# Cut-offs K at which test_all_users reports every top-K metric.
Ks = [1, 5, 10, 20]


def test_all_users(test_df, scores, user_embed, item_embed):
    """Average top-K recommendation metrics over every user in ``test_df``.

    Args:
        test_df: DataFrame with ``user_id`` and ``item_id`` columns holding the
            held-out positive interactions.
        scores: DataFrame with ``user_id`` and ``item_id`` columns. For each
            user the rows are used in the order they appear, so the frame must
            already list the best recommendation first.
        user_embed: Not used; kept so existing calls keep working.
        item_embed: DataFrame with an ``item_id`` column plus embedding
            columns. Its item ids are the catalogue for coverage and the frame
            is passed on to ``calculate_diversity``.

    Returns:
        DataFrame with one row per entry of the module-level ``Ks`` and the
        columns precision, recall, ndcg, hit_ratio, F1, diversity, novelty,
        coverage and K.
    """
    # Deepest list any cut-off needs (previously a hard-coded 20).
    max_k = max(Ks)

    per_user_metrics = ['precision', 'recall', 'ndcg', 'hit_ratio', 'F1', 'diversity', 'novelty']
    result = {name: np.zeros(len(Ks)) for name in per_user_metrics}
    result['coverage'] = np.zeros(len(Ks))

    # Group the held-out positives once instead of re-filtering the frame for
    # every user; dict order follows first appearance of each user in test_df.
    positives_by_user = {}
    for user, item in zip(test_df['user_id'].tolist(), test_df['item_id'].tolist()):
        positives_by_user.setdefault(user, []).append(item)
    n_users = len(positives_by_user)

    # First max_k recommended items of every test user, in the order of `scores`.
    recs_by_user = {}
    for user, item in zip(scores['user_id'].tolist(), scores['item_id'].tolist()):
        if user in positives_by_user:
            user_recs = recs_by_user.setdefault(user, [])
            if len(user_recs) < max_k:
                user_recs.append(item)

    # Distinct items recommended to anyone within each cut-off, for coverage.
    recommended_at_k = {K: set() for K in Ks}

    for user, user_pos_test in positives_by_user.items():
        top_items = recs_by_user.get(user, [])
        if not top_items:
            # Nothing was recommended to this user: they add zero to every
            # average but still count in the denominator.
            continue

        positives = set(user_pos_test)
        r = [1 if item in positives else 0 for item in top_items]

        for i, K in enumerate(Ks):
            metric_K = Metrics(r, K, len(user_pos_test))
            top_k_items = top_items[:K]

            precision = metric_K.precision_at_k()
            recall = metric_K.recall_at_k()

            result['precision'][i] += precision / n_users
            result['recall'][i] += recall / n_users
            result['ndcg'][i] += metric_K.ndcg_at_k() / n_users
            result['hit_ratio'][i] += metric_K.hit_at_k() / n_users
            result['F1'][i] += metric_K.F1(precision, recall) / n_users
            result['diversity'][i] += calculate_diversity(top_k_items, item_embed) / n_users
            result['novelty'][i] += calculate_novelty(top_k_items) / n_users

            recommended_at_k[K].update(top_k_items)

    # Coverage is the share of catalogue items recommended at least once.
    catalogue = set(item_embed['item_id'].tolist())
    for i, K in enumerate(Ks):
        result['coverage'][i] = len(recommended_at_k[K]) / len(catalogue) if catalogue else 0.0

    result_df = pd.DataFrame(result)
    result_df['K'] = Ks

    return result_df

In [105]:
# Split the test interactions into high- and low-activity groups.
# Activity is the number of training interactions per user and per item.
user_counts_data = train.groupby('user_id',as_index=False).agg({'item_id':'count'}).rename(columns={'item_id':'count'})
item_counts_data = train.groupby('item_id',as_index=False).agg({'user_id':'count'}).rename(columns={'user_id':'count'})

# Counts at or below the 80th percentile are "low activity".
user_cutoff = user_counts_data['count'].quantile(0.8)
print(user_cutoff)

item_cutoff = item_counts_data['count'].quantile(0.8)
print(item_cutoff)

user_low_activity = user_counts_data[user_counts_data['count'] <= user_cutoff]['user_id'].tolist()

item_low_activity = item_counts_data[item_counts_data['count'] <= item_cutoff]['item_id'].tolist()

user_low_activity_test = test[test['user_id'].isin(user_low_activity)]
user_high_activity_test = test[~test['user_id'].isin(user_low_activity)]

# item_low_activity holds item ids, so the match must be on item_id (it was
# previously matched against user_id, which mixed up the two id spaces).
item_low_activity_test = test[test['item_id'].isin(item_low_activity)]
item_high_activity_test = test[~test['item_id'].isin(item_low_activity)]

87.0
54.799999999999955


In [106]:
# Evaluate on the whole filtered test split, print the elapsed seconds, save the table.
t1 = time.time()
results_all = test_all_users(
    test_df=test,
    scores=scores_df,
    user_embed=user_embed,
    item_embed=item_embed,
)
print(time.time() - t1)
results_all.to_csv(os.path.join(results_path, 'results_all.csv'))

4.086740016937256


In [107]:
# Evaluate on the low-activity users only, print the elapsed seconds, save the table.
t1 = time.time()
results_low_user = test_all_users(
    test_df=user_low_activity_test,
    scores=scores_df,
    user_embed=user_embed,
    item_embed=item_embed,
)
print(time.time() - t1)
results_low_user.to_csv(os.path.join(results_path, 'results_low_user.csv'))

3.289842128753662


In [108]:
# Evaluate on the high-activity users only, print the elapsed seconds, save the table.
t1 = time.time()
results_high_user = test_all_users(
    test_df=user_high_activity_test,
    scores=scores_df,
    user_embed=user_embed,
    item_embed=item_embed,
)
print(time.time() - t1)
results_high_user.to_csv(os.path.join(results_path, 'results_high_user.csv'))

0.8882946968078613


In [109]:
# Evaluate on the item_low_activity_test subset, print the elapsed seconds, save the table.
t1 = time.time()
results_low_item = test_all_users(
    test_df=item_low_activity_test,
    scores=scores_df,
    user_embed=user_embed,
    item_embed=item_embed,
)
print(time.time() - t1)
results_low_item.to_csv(os.path.join(results_path, 'results_low_item.csv'))

2.702528953552246


In [110]:
# Evaluate on the item_high_activity_test subset, print the elapsed seconds, save the table.
t1 = time.time()
results_high_item = test_all_users(
    test_df=item_high_activity_test,
    scores=scores_df,
    user_embed=user_embed,
    item_embed=item_embed,
)
print(time.time() - t1)
results_high_item.to_csv(os.path.join(results_path, 'results_high_item.csv'))

1.3516180515289307
