In [1]:
import numpy as np
from numpy.linalg import norm
import pandas as pd
from tqdm import tqdm
from config import *

## UCF

In [5]:
class UCF:
    def __init__(self, np_user, np_item, df_inter):
        self.np_user = np_user
        self.np_item = np_item
        self.df_inter = df_inter

    def substract_mean(self):
        ls_mean = []
        for user_id in tqdm(self.np_user, desc='Substract Mean'):
            mask = self.df_inter['user_id'] == user_id
            tmp_mean = self.df_inter.loc[mask, 'rating'].mean()
            self.df_inter.loc[mask, 'rating'] = self.df_inter.loc[mask, 'rating'] - tmp_mean
            ls_mean.append(tmp_mean)
        self.np_mean = np.array(ls_mean)

    def calc_similarity(self, user1, user2, type='consine'):
        df_u1 = self.df_inter.loc[self.df_inter['user_id'] == user1].drop(columns=['user_id'])
        df_u2 = self.df_inter.loc[self.df_inter['user_id'] == user2].drop(columns=['user_id'])
        df_merge = df_u1.merge(df_u2, on='item_id', how='outer')
        if type == 'consine':
            df_merge = df_merge.fillna(0)
        elif type == 'pearson':
            df_merge = df_merge.dropna()

        fraction = (norm(df_merge['rating_x']) * norm(df_merge['rating_y']))
        if len(df_merge) == 0 or fraction == 0:
            return 0
        else:
            return np.dot(df_merge['rating_x'], df_merge['rating_y']) / fraction

    def calc_all_similarity(self, type='consine'):
        ls_sim = []
        for user1 in (self.np_user):
            for user2 in tqdm(self.np_user):
                if user1 == user2:
                    continue
                sim = self.calc_similarity(user1, user2, type=type)
                ls_sim.append([user1, user2, sim])
        self.df_similarity = pd.DataFrame(ls_sim, columns=['user1_id', 'user2_id', 'similarity'])



### douban

In [6]:
df_douban_inter = pd.read_csv(data_preprocessing + 'douban_train_interaction.csv')
np_douban_user = np.load(data_preprocessing + 'douban_user_unique.npy')
np_douban_item = np.load(data_preprocessing + 'douban_item_unique.npy')

In [7]:
ucf_douban = UCF(np_douban_user, np_douban_item, df_douban_inter)
ucf_douban.substract_mean()
ucf_douban.calc_all_similarity(type='consine')

Substract Mean: 100%|██████████| 13024/13024 [00:29<00:00, 435.77it/s]
100%|██████████| 13024/13024 [00:59<00:00, 218.59it/s]
100%|██████████| 13024/13024 [00:59<00:00, 217.58it/s]
100%|██████████| 13024/13024 [00:58<00:00, 221.64it/s]
100%|██████████| 13024/13024 [00:59<00:00, 218.22it/s]
100%|██████████| 13024/13024 [00:59<00:00, 217.53it/s]
  2%|▏         | 271/13024 [00:01<00:57, 221.87it/s]


KeyboardInterrupt: 