# 6조 (17011709 정선아, 17011741 문성용, 17011742 김소영)

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

In [2]:
MovieLens_df = pd.read_table("ratings.dat", sep='::', header=None, names=['UserID','MovieID','Rating','Timestamp'])
MovieLens_df.drop(['Timestamp'], axis=1, inplace=True)
MovieLens_pivot = MovieLens_df.pivot_table(index='UserID', columns='MovieID', values='Rating')
row_index = MovieLens_pivot.isnull().sum(axis = 1).sort_values() # NaN이 적은 순으로 행 뽑아내기
MovieLens_pivot = pd.DataFrame(MovieLens_pivot, index=row_index.index)
# rating이 많은 사용자 1,000명
MovieLens_pivot = MovieLens_pivot.iloc[:1000,:]
col_index = MovieLens_pivot.isnull().sum(axis = 0).sort_values() # NaN이 적은 순으로 열 뽑아내기
MovieLens_pivot = pd.DataFrame(MovieLens_pivot, columns=col_index.index)
# rating이 많은 영화 1,000개
MovieLens_pivot = MovieLens_pivot.iloc[:,:1000]
user_index = MovieLens_pivot.index.values # User ID 순서 기억하기
user_index = pd.Series(user_index) # Index rename 인자로 넣기 위해 형태 변경
MovieLens_pivot.fillna(0, inplace=True)
MovieLens_pivot = MovieLens_pivot.astype('int64')
MovieLens_pivot = np.array(MovieLens_pivot) # 함수 인자로 넣기 위해 numpy array 형태로 변경

# 1. Cosine Simiarity

In [3]:
def COS(data):
    size = np.size(data, axis = 0)
    simCOS = np.zeros(shape = (size, size)) # 0으로 초기화 된 행렬 생성
    
    for u in range(0, size): # 각 유저별로 for문 반복
        arridx_u = np.where(data[u, ] == 0)
        for v in range(u + 1, size):
            arridx_v = np.where(data[v, ] == 0)
            arridx = np.unique(np.concatenate((arridx_u, arridx_v), axis = None))
            
            U = np.delete(data[u, ], arridx)
            V = np.delete(data[v, ], arridx)
            
            if(np.linalg.norm(U) == 0 or np.linalg.norm(V) == 0):
                simCOS[u, v] = 0
            else:
                simCOS[u, v] = np.dot(U, V) / (np.linalg.norm(U) * np.linalg.norm(V))
            
            simCOS[v, u] = simCOS[u, v]
    return simCOS # 최종 행렬값 반환

# 2. Pearson Correlation Coefficient

In [4]:
def PCC(data):
    size = np.size(data, axis = 0)
    simPCC = np.full((size, size), -1.0) #-1로 초기화
    
    mean = np.nanmean(np.where(data != 0, data, np.nan), axis = 1)
    
    for u in range(0, size):
        arridx_u = np.where(data[u, ] == 0)
        for v in range(u + 1, size):
            arridx_v = np.where(data[v, ] == 0)
            arridx = np.unique(np.concatenate((arridx_u, arridx_v), axis = None))
            
            U = np.delete(data[u, ], arridx) - mean[u]
            V = np.delete(data[v, ], arridx) - mean[v]
            
            if(np.linalg.norm(U) == 0 or np.linalg.norm(V) == 0):
                simPCC[u, v] = 0
            else:
                simPCC[u, v] = np.dot(U, V) / (np.linalg.norm(U) * np.linalg.norm(V))
            
            simPCC[v, u] = simPCC[u, v]
        
        return simPCC

# Rating

In [15]:
A = np.array([[4, 1, 1, 4, 2, 3, 5, 0, 4, 0],
              [0, 4, 2, 0, 3, 2, 5, 0, 4, 3],
              [2, 0, 4, 5, 2, 0, 1, 3, 4, 2],
              [1, 4, 0, 1, 4, 5, 0, 3, 1, 2],
              [1, 2, 3, 4, 0, 3, 4, 4, 2, 5]])

In [80]:
# 임시로 만들어 놓기
def basic_mean(mat, sim, k):
    predicted_rating = np.zeros(mat.shape)
    
    mean = np.nanmean(np.where(mat != 0, mat, np.nan), axis=1)

    if(sim == 'COS'):
        Sim = COS(mat)
    elif(sim == 'PCC'):
        Sim = PCC(mat)
        
    k_neighbors = np.argsort(-Sim)
    k_neighbors = np.delete(k_neighbors, np.s_[k:], 1)

    NumUsers = np.size(mat, axis=0)
    
    for u in range(0, NumUsers):
        list_sim = Sim[u, k_neighbors[u]]
        list_rating = mat[k_neighbors[u]]
        list_mean = mean[k_neighbors[u]]
        
        mom = np.sum(list_sim)
        sun = np.sum(list_sim.reshape(-1,1) * (list_rating - list_mean.reshape(-1,1)), axis=0)
        predicted_rating[u] = mean[u] + sun / mom
        
    return predicted_rating

In [100]:
def basic_baseline(data, sim, k):
    # initializing (1000, 1000) numpy array with zeros 
    pred_rating = np.zeros(data.shape)

    # calculating means
    mean = np.nanmean(np.where(data != 0, data, np.nan)) # the mean of all ratings
    mean_u = np.nanmean(np.where(data != 0, data, np.nan), axis=1) # the mean of all users
    mean_i = np.nanmean(np.where(data != 0, data, np.nan), axis=0) # the mean of all items
    
    # base user, item
    b_u = mean_u - mean 
    b_i = mean_i - mean
    # b_ui = mean + b_u + b_i

    # selecting similarity fuction
    if(sim == 'COS'):
        Sim = COS(data)
    elif(sim == 'PCC'):
        Sim = PCC(data)

    # selecting top k users by sorting similarity array
    k_users = np.argsort(-Sim)
    k_users = np.delete(k_users, np.s_[k:], 1) # delete colomn under k
        
    # number of users with axis = 0 condition
    num_users = np.size(data, axis=0)
    num_items = np.size(data, axis=1)
    
    # calculating predicted ratings
    for u in range(0, num_users):
        list_sim = Sim[u, k_users[u]] # selecting top k similarity
        for i in range(0, num_items):
            list_rating = data[k_users[u], i].astype('float64') # k users' ratings on item i

            b_ui = mean + b_u[u] + b_i[i] # scalar
            b_vi = mean + b_u[k_users[u]] + b_i[i] # list
            
            # explanation of varialbles
            # mean_u[u] : user u의 평균
            # mean_i[i] : item i의 평균
            # b_u[u] : user u의 baseline
            # b_i[i] : item i의 baseline

            # calculation
            mom = np.sum(list_sim) #분모
            son = np.sum(list_sim * (list_rating - b_vi)) #분자
            pred_rating[u, i] = b_ui + son / mom
            
    return pred_rating

In [99]:
basic_baseline(MovieLens_pivot, 'COS', 2)

[[897 999 979 ... 899 822 489]
 [822 561 386 ... 822 979 456]
 [999 897 822 ... 579 742 336]
 ...
 [172 179 828 ... 172 672 183]
 [179 172 172 ... 891 179 179]
 [  0   1   2 ... 997 998 999]]


array([[ 0.90143893,  1.39986697,  0.90143893, ..., -0.59384522,
        -0.59384522, -0.59384522],
       [ 4.33672454,  4.33672454,  4.33672454, ..., -0.66327546,
         1.85843357,  1.31935732],
       [ 1.45751178,  1.95745313,  1.45751178, ..., -0.04231225,
        -0.04231225, -0.04231225],
       ...,
       [ 4.52037615,  4.52037615,  2.62035971, ...,  0.553704  ,
         2.58703186,  0.553704  ],
       [ 4.9088151 ,  4.9088151 ,  2.82060205, ...,  0.87941075,
         2.8500064 ,  0.87941075],
       [ 5.50970633,  4.01671684,  4.01204316, ...,  1.00736949,
         0.50970633,  5.00736949]])