# 6조 (17011709 정선아, 17011741 문성용, 17011742 김소영)

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

In [2]:
MovieLens_df = pd.read_table("ratings.dat", sep='::', header=None, names=['UserID','MovieID','Rating','Timestamp'])
MovieLens_df.drop(['Timestamp'], axis=1, inplace=True)
MovieLens_pivot = MovieLens_df.pivot_table(index='UserID', columns='MovieID', values='Rating')
row_index = MovieLens_pivot.isnull().sum(axis = 1).sort_values() # NaN이 적은 순으로 행 뽑아내기
MovieLens_pivot = pd.DataFrame(MovieLens_pivot, index=row_index.index)
# rating이 많은 사용자 1,000명
MovieLens_pivot = MovieLens_pivot.iloc[:1000,:]
col_index = MovieLens_pivot.isnull().sum(axis = 0).sort_values() # NaN이 적은 순으로 열 뽑아내기
MovieLens_pivot = pd.DataFrame(MovieLens_pivot, columns=col_index.index)
# rating이 많은 영화 1,000개
MovieLens_pivot = MovieLens_pivot.iloc[:,:1000]
user_index = MovieLens_pivot.index.values # User ID 순서 기억하기
user_index = pd.Series(user_index) # Index rename 인자로 넣기 위해 형태 변경
MovieLens_pivot.fillna(0, inplace=True)
MovieLens_pivot = MovieLens_pivot.astype('int64')
MovieLens_pivot = np.array(MovieLens_pivot) # 함수 인자로 넣기 위해 numpy array 형태로 변경

# 1. Cosine Simiarity

In [16]:
def COS(data):
    size = np.size(data, axis = 0)
    simCOS = np.zeros(shape = (size, size)) # 0으로 초기화 된 행렬 생성
    
    for i in range(0, size): # 각 유저별로 for문 반복
        for j in range(i, size):
            normI = np.linalg.norm(data[i, ]) # i벡터의 크기 계산
            normJ = np.linalg.norm(data[j, ]) # j벡터의 크기 계산
            inputData = np.dot(data[i, ], data[j, ])/(normI * normJ) # Cosine similarity 공식
            simCOS[i, j] = inputData # 행렬에 계산 값 대입하기
            simCOS[j, i] = inputData # 대각선 값 대입하기
    
    return simCOS # 최종 행렬값 반환

# 2. Pearson Correlation Coefficient

In [17]:
def cal_PCC(i, j): # 두 벡터의 PCC값을 계산해 주는 함수
    i = np.array(i) # numpy array로 형변환
    j = np.array(j) # numpy array로 형변환
    
    # 벡터의 0인 원소를 null값으로 바꾸고 null값을 제외한 평균값
    mean_i = np.nanmean(np.where(i != 0, i, np.nan)) 
    mean_j = np.nanmean(np.where(j != 0, j, np.nan))
    
    # 벡터의 원소가 0인 부분의 인덱스를 저장한 리스트 생성
    zero_i = np.where(i == 0)
    zero_j = np.where(j == 0)
    
    # zero_i와 zero_j 벡터를 하나로 합쳐준다.
    zeros = np.concatenate((zero_i, zero_j), axis = None)
    
    # 각 벡터의 원소가 0인 인덱스를 삭제
    del_i = np.delete(i, zeros) 
    del_j = np.delete(j, zeros)
    
    # 각 벡터의 원소에서 평균값을 빼준다.
    del_i = del_i - mean_i
    del_j = del_j - mean_j
    
    return np.dot(del_i, del_j)/(np.linalg.norm(del_i) * np.linalg.norm(del_j))
    # PCC 공식을 통한 similarity 값 반환

def PCC(data):
    size = np.size(data, axis = 0)
    simPCC = np.zeros(shape = (size, size)) # 0으로 초기화 된 행렬 생성
    
    for i in range(0, size):
        for j in range(i, size):
            inputData = cal_PCC(data[i, ], data[j, ]) # cal_PCC함수를 이용
            simPCC[i, j] = inputData
            simPCC[j, i] = inputData
    
    return simPCC

# 레이팅

In [70]:
def basic_baseline(data, sim, k):
    #initializing (1000, 1000) numpy array with zeros 
    pre_rating = np.zeros(data.shape)
    
    #the mean of all ratings
    mean = np.nanmean(np.where(data != 0, data, np.nan), axis = 1)
    mean_i = np.nanmean(np.where(data != 0, data, np.nan), axis = 0)
    
    #selecting similarity fuction
    if(sim == 'COS'):
        Sim = COS(data)
    else:
        Sim = PCC(data)
    
    #selecting top k users by sorting similarity array
    k_users = np.argsort(-Sim) #index zero is the biggest
    k_users = np.delete(k_users, np.s_[k:], 1) #delete colomn under k
    
    #number of users with axis = 0 condition
    num_users = np.size(data, axis = 0)
    
    #test area
    #print(mean.shape)
    #print(mean_i.shape)
    
    for u in range(0, num_users):
        list_sim = Sim[u, k_users[u, ]] #similarity list
        list_rating = data[k_users[u, ], ].astype('float64') #rating list
        mean_user = mean[k_users[u, ], ] #the mean of all ratings given by user u
        mean_item = mean_i[k_users[u, ], ] #the mean of all ratings given to item i
        #print(mean_user.shape)
        base_user = np.subtract(mean_user, mean.reshape(-1, 1)) #baseline user
        base_item = np.subtract(mean_item, mean.reshape(-1, 1)) #baseline item
        baseline = np.add(mean, base_user, base_item) #baseline on u, i
        
        #calculation
        mom = np.sum(list_sim) #denominator
        son = np.sum(list_sim.reshape(-1, 1) * (list_rating - baseline), axis = 0)
        pre_rating[u, ] = baseline[u, ] + son/mom
    
    return predicted

In [105]:
data = MovieLens_pivot[:20, :20]
sim = 'COS'
k = 2
#initializing (1000, 1000) numpy array with zeros 
pre_rating = np.zeros(data.shape)
    
    #the mean of all ratings
mean = np.nanmean(np.where(data != 0, data, np.nan), axis = 1)
mean_i = np.nanmean(np.where(data != 0, data, np.nan), axis = 0)
    
    #selecting similarity fuction
if(sim == 'COS'):
    Sim = COS(data)
else:
    Sim = PCC(data)
    
    #selecting top k users by sorting similarity array
k_users = np.argsort(-Sim) #index zero is the biggest
k_users = np.delete(k_users, np.s_[k:], 1) #delete colomn under k
    
    #number of users with axis = 0 condition
num_users = np.size(data, axis = 0)
    
    
for u in range(0, num_users):
    list_sim = Sim[u, k_users[u, ]] #top k similarity list
    list_rating = data[k_users[u, ], ].astype('float64') #top krating list
    
    base_user = np.subtract(mean[u], mean.mean()) #baseline user
    base_item = np.subtract(mean_i[u], mean.mean()) #baseline item
    baseline = np.add(mean, base_user, base_item) #baseline on u, i
        
    #calculation
    mom = np.sum(list_sim) #분모
    son = np.sum(list_sim * (list_rating - __buj__), axis = 0)
    pre_rating[u, ] = __bui__ + son/mom

TypeError: return arrays must be of ArrayType

In [115]:
list_sim

array([1.       , 0.9868704])

In [93]:
mean

array([3.77345538, 3.74473358, 4.32469136, 3.26097867, 2.9352518 ,
       3.04418262, 3.10098177, 3.17062937, 2.57463884, 3.87749288,
       3.49284785, 3.50402145, 3.62386511, 3.97319035, 2.98347107,
       2.97335203, 3.91439689, 3.33541342, 3.81923077, 4.06925208])