# 6조 (17011709 정선아, 17011741 문성용, 17011742 김소영)

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

In [2]:
# Load the ratings file; every record is UserID::MovieID::Rating::Timestamp.
# A separator longer than one character is only handled by pandas' Python
# parser, so request that engine explicitly instead of relying on the silent
# fallback (which emits a ParserWarning).
MovieLens_df = pd.read_table(
    "ratings.dat",
    sep='::',
    header=None,
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    engine='python',
)
MovieLens_df.drop(['Timestamp'], axis=1, inplace=True)

# User x movie rating matrix; NaN marks "user did not rate this movie".
MovieLens_pivot = MovieLens_df.pivot_table(index='UserID', columns='MovieID', values='Rating')

# Order the rows by ascending NaN count and keep the 1,000 users with the most ratings.
row_index = MovieLens_pivot.isnull().sum(axis=1).sort_values()
MovieLens_pivot = MovieLens_pivot.reindex(index=row_index.index).iloc[:1000, :]

# Within those users, order the columns by ascending NaN count and keep the
# 1,000 movies with the most ratings.
col_index = MovieLens_pivot.isnull().sum(axis=0).sort_values()
MovieLens_pivot = MovieLens_pivot.reindex(columns=col_index.index).iloc[:, :1000]

# Remember which UserID sits in which row (kept as a Series so it can be
# passed to an index rename later).
user_index = pd.Series(MovieLens_pivot.index.values)

# Encode "not rated" as 0 and hand the similarity functions a plain integer
# numpy array. fillna returns a new frame here rather than mutating a slice
# in place.
MovieLens_pivot = MovieLens_pivot.fillna(0).astype('int64')
MovieLens_pivot = np.array(MovieLens_pivot)

# 1. Cosine Similarity

In [18]:
def COS(data):
    """Return the user-user cosine similarity matrix over co-rated items.

    Parameters
    ----------
    data : 2-D numpy array
        One row per user, one column per item; 0 means "not rated".

    Returns
    -------
    numpy.ndarray
        Symmetric (n_users, n_users) float matrix. Entry [u, v] is the cosine
        of the two users' rating vectors restricted to the items both of them
        rated, or 0 when that restricted vector has zero norm for either user
        (for example when they share no rated item). The diagonal is left at 0.
    """
    size = np.size(data, axis=0)
    simCOS = np.zeros(shape=(size, size))  # result matrix, initialised to 0

    rated = data != 0  # True where a user actually rated the item

    for u in range(size):
        # Only the upper triangle is computed; the value is mirrored below.
        for v in range(u + 1, size):
            common = rated[u] & rated[v]  # items rated by both u and v
            U = data[u, common]
            V = data[v, common]

            denominator = np.linalg.norm(U) * np.linalg.norm(V)
            if denominator != 0:
                simCOS[u, v] = np.dot(U, V) / denominator
            # else: keep the initial 0

            simCOS[v, u] = simCOS[u, v]
    return simCOS

# 2. Pearson Correlation Coefficient

In [19]:
def PCC(data):
    """Return the user-user Pearson correlation matrix over co-rated items.

    Parameters
    ----------
    data : 2-D numpy array
        One row per user, one column per item; 0 means "not rated".

    Returns
    -------
    numpy.ndarray
        Symmetric (n_users, n_users) float matrix. Each user's ratings are
        centred on that user's mean over all of their non-zero ratings, then
        the cosine of the centred vectors restricted to the co-rated items is
        taken. Pairs whose restricted centred vector has zero norm for either
        user get 0. The diagonal keeps its initial value of -1.
    """
    size = np.size(data, axis=0)
    simPCC = np.full((size, size), -1.0)  # initialised to -1

    rated = data != 0  # True where a user actually rated the item
    # Per-user mean over rated items only (zeros are masked out as NaN).
    mean = np.nanmean(np.where(rated, data, np.nan), axis=1)

    for u in range(size):
        # Only the upper triangle is computed; the value is mirrored below.
        for v in range(u + 1, size):
            common = rated[u] & rated[v]  # items rated by both u and v
            U = data[u, common] - mean[u]
            V = data[v, common] - mean[v]

            denominator = np.linalg.norm(U) * np.linalg.norm(V)
            if denominator == 0:
                simPCC[u, v] = 0
            else:
                simPCC[u, v] = np.dot(U, V) / denominator

            simPCC[v, u] = simPCC[u, v]

    # Return only after every user pair has been processed.
    return simPCC

# 레이팅

In [70]:
def basic_baseline(data, sim, k):
    """Predict ratings with user-based k-NN on top of a baseline estimate.

    The baseline for user u and item i is ``mu + b_u + b_i`` where ``mu`` is
    the mean of all non-zero ratings, ``b_u`` is user u's mean rating minus
    ``mu`` and ``b_i`` is item i's mean rating minus ``mu``. The prediction
    adds the similarity-weighted average of the neighbours' deviations from
    their own baselines::

        pred[u, i] = b[u, i] + sum_v sim(u, v) * (r[v, i] - b[v, i]) / sum_v sim(u, v)

    where v ranges over those of u's k most similar users that rated item i.

    Parameters
    ----------
    data : 2-D numpy array
        One row per user, one column per item; 0 means "not rated".
    sim : str
        'COS' selects cosine similarity; any other value selects Pearson.
    k : int
        Number of most-similar users kept as neighbours for each user.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape as ``data`` holding the predictions.
        Where none of the neighbours rated an item, or their similarities sum
        to 0, the plain baseline is returned for that entry.
    """
    data = np.asarray(data)
    pre_rating = np.zeros(data.shape)

    observed = data != 0  # True where a rating exists
    n_observed = observed.sum()
    if n_observed == 0:
        # Nothing was rated, so there is nothing to base a prediction on.
        return pre_rating

    # Unrated cells are stored as 0, so plain sums already ignore them; only
    # the counts need the mask.
    mu = data.sum() / n_observed
    user_counts = observed.sum(axis=1)
    item_counts = observed.sum(axis=0)
    # Users/items without any rating fall back to the global mean.
    mean = np.divide(data.sum(axis=1), user_counts,
                     out=np.full(data.shape[0], mu, dtype='float64'),
                     where=user_counts > 0)
    mean_i = np.divide(data.sum(axis=0), item_counts,
                       out=np.full(data.shape[1], mu, dtype='float64'),
                       where=item_counts > 0)

    # baseline[u, i] = mu + (mean[u] - mu) + (mean_i[i] - mu)
    baseline = mean[:, np.newaxis] + mean_i[np.newaxis, :] - mu

    # selecting similarity function
    if sim == 'COS':
        Sim = COS(data)
    else:
        Sim = PCC(data)

    # Column 0 of the argsort of -Sim is the most similar user; keep the first k.
    k_users = np.argsort(-Sim, axis=1)[:, :k]

    # Deviation of each observed rating from its baseline; unrated cells
    # contribute nothing.
    deviation = np.where(observed, data - baseline, 0.0)
    rated_weight = observed.astype('float64')

    num_users = np.size(data, axis=0)
    for u in range(num_users):
        neighbours = k_users[u]
        list_sim = Sim[u, neighbours]  # similarities to the k neighbours

        son = list_sim @ deviation[neighbours]      # numerator, one value per item
        mom = list_sim @ rated_weight[neighbours]   # denominator, one value per item

        # Fall back to the baseline alone where the denominator is 0.
        adjustment = np.divide(son, mom, out=np.zeros_like(son), where=mom != 0)
        pre_rating[u] = baseline[u] + adjustment

    return pre_rating

In [7]:
# Scratch cell: replays the first steps of basic_baseline at module level on
# the top-left 20 x 20 corner of the rating matrix.
sim = 'COS'
k = 2
data = MovieLens_pivot[:20, :20]

# Prediction buffer with the same shape as the slice, filled with zeros
pre_rating = np.zeros(data.shape)

# Mean of the non-zero ratings per row (mean) and per column (mean_i)
mean = np.nanmean(np.where(data == 0, np.nan, data), axis=1)
mean_i = np.nanmean(np.where(data == 0, np.nan, data), axis=0)

# Pick the similarity function
Sim = COS(data) if sim == 'COS' else PCC(data)

# argsort of the negated similarities lists the most similar users first;
# keep only the first k columns
k_users = np.argsort(-Sim)[:, :k].copy()
"""
    #number of users with axis = 0 condition
num_users = np.size(data, axis = 0)
    
    
for u in range(0, num_users):
    list_sim = Sim[u, k_users[u, ]] #top k similarity list
    list_rating = data[k_users[u, ], ].astype('float64') #top krating list
    
    base_user = np.subtract(mean[u], mean.mean()) #baseline user
    base_item = np.subtract(mean_i[u], mean.mean()) #baseline item
    baseline = np.add(mean, base_user, base_item) #baseline on u, i
        
    #calculation
    mom = np.sum(list_sim) #분모
    son = np.sum(list_sim * (list_rating - __buj__), axis = 0)
    pre_rating[u, ] = __bui__ + son/mom
"""

"\n    #number of users with axis = 0 condition\nnum_users = np.size(data, axis = 0)\n    \n    \nfor u in range(0, num_users):\n    list_sim = Sim[u, k_users[u, ]] #top k similarity list\n    list_rating = data[k_users[u, ], ].astype('float64') #top krating list\n    \n    base_user = np.subtract(mean[u], mean.mean()) #baseline user\n    base_item = np.subtract(mean_i[u], mean.mean()) #baseline item\n    baseline = np.add(mean, base_user, base_item) #baseline on u, i\n        \n    #calculation\n    mom = np.sum(list_sim) #분모\n    son = np.sum(list_sim * (list_rating - __buj__), axis = 0)\n    pre_rating[u, ] = __bui__ + son/mom\n"