# 6조 (17011709 정선아, 17011741 문성용, 17011742 김소영)

In [30]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

In [31]:
MovieLens_df = pd.read_table("ratings.dat", sep='::', header=None, names=['UserID','MovieID','Rating','Timestamp'])
MovieLens_df.drop(['Timestamp'], axis=1, inplace=True)
MovieLens_pivot = MovieLens_df.pivot_table(index='UserID', columns='MovieID', values='Rating')
row_index = MovieLens_pivot.isnull().sum(axis = 1).sort_values() # NaN이 적은 순으로 행 뽑아내기
MovieLens_pivot = pd.DataFrame(MovieLens_pivot, index=row_index.index)
# rating이 많은 사용자 1,000명
MovieLens_pivot = MovieLens_pivot.iloc[:1000,:]
col_index = MovieLens_pivot.isnull().sum(axis = 0).sort_values() # NaN이 적은 순으로 열 뽑아내기
MovieLens_pivot = pd.DataFrame(MovieLens_pivot, columns=col_index.index)
# rating이 많은 영화 1,000개
MovieLens_pivot = MovieLens_pivot.iloc[:,:1000]
user_index = MovieLens_pivot.index.values # User ID 순서 기억하기
user_index = pd.Series(user_index) # Index rename 인자로 넣기 위해 형태 변경
MovieLens_pivot.fillna(0, inplace=True)
MovieLens_pivot = MovieLens_pivot.astype('int64')
MovieLens_pivot = np.array(MovieLens_pivot) # 함수 인자로 넣기 위해 numpy array 형태로 변경

# 1. Cosine Similarity without NULL

In [26]:
def COS(data):
    size = np.size(data, axis = 0)
    simCOS = np.zeros(shape = (size, size)) # 0으로 초기화 된 행렬 생성
    
    for u in range(0, size): # 각 유저별로 for문 반복
        arridx_u = np.where(data[u, ] == 0)
        for v in range(u + 1, size):
            arridx_v = np.where(data[v, ] == 0)
            arridx = np.unique(np.concatenate((arridx_u, arridx_v), axis = None))
            
            U = np.delete(data[u, ], arridx)
            V = np.delete(data[v, ], arridx)
            
            if(np.linalg.norm(U) == 0 or np.linalg.norm(V) == 0):
                simCOS[u, v] = 0
            else:
                simCOS[u, v] = np.dot(U, V) / (np.linalg.norm(U) * np.linalg.norm(V))
            
            simCOS[v, u] = simCOS[u, v]
    return simCOS # 최종 행렬값 반환

# 2. Pearson Correlation Coefficient

In [27]:
def PCC(data):
    size = np.size(data, axis = 0)
    simPCC = np.full((size, size), -1.0) #-1로 초기화
    
    mean = np.nanmean(np.where(data != 0, data, np.nan), axis = 1)
    
    for u in range(0, size):
        arridx_u = np.where(data[u, ] == 0)
        for v in range(u + 1, size):
            arridx_v = np.where(data[v, ] == 0)
            arridx = np.unique(np.concatenate((arridx_u, arridx_v), axis = None))
            
            U = np.delete(data[u, ], arridx) - mean[u]
            V = np.delete(data[v, ], arridx) - mean[v]
            
            if(np.linalg.norm(U) == 0 or np.linalg.norm(V) == 0):
                simPCC[u, v] = 0
            else:
                simPCC[u, v] = np.dot(U, V) / (np.linalg.norm(U) * np.linalg.norm(V))
            
            simPCC[v, u] = simPCC[u, v]
        
    return simPCC

# 3. CF algorithm with baseline rating

In [28]:
def basic_baseline(data, sim, k):
    # initializing (1000, 1000) numpy array with zeros 
    pred_rating = np.zeros(data.shape)

    # calculating means
    mean = np.nanmean(np.where(data != 0, data, np.nan)) # the mean of all ratings
    mean_u = np.nanmean(np.where(data != 0, data, np.nan), axis=1) # the mean of all users
    mean_i = np.nanmean(np.where(data != 0, data, np.nan), axis=0) # the mean of all items

    # base user, item
    b_u = mean_u - mean 
    b_i = mean_i - mean
    # b = mean + b_u + b_i

    # selecting similarity fuction
    if(sim == 'COS'):
        Sim = COS(data)
    elif(sim == 'PCC'):
        Sim = PCC(data)

    # selecting top k users by sorting similarity array
    k_users = np.argsort(-Sim) 
    k_users = np.delete(k_users, np.s_[k:], 1) # delete colomn under k

    # number of users with axis = 0 condition
    num_users = np.size(data, axis=0)
    num_items = np.size(data, axis=1)
    
    # calculating predicted ratings
    for u in range(0, num_users):
        list_sim = Sim[u, k_users[u]] # selecting top k similarity
        for i in range(0, num_items):
            list_rating = data[k_users[u], i].astype('float64') # k users' ratings on item i

            b_ui = mean + b_u[u] + b_i[i] # scalar
            b_vi = mean + b_u[k_users[u]] + b_i[i] # list
            
            #explanation of varialbles
            # mean_u[u] : user u의 평균
            # mean_i[i] : item i의 평균
            # b_u[u] : user u의 b
            # b_i[i] : item i의 b

            # calculation
            mom = np.sum(list_sim) # 분모
            son = np.sum(list_sim * (list_rating - b_vi)) #분 자
            pred_rating[u,i] = b_ui + son / mom
    
    return pred_rating

In [41]:
prediction = basic_baseline(MovieLens_pivot, 'COS', 2)
pd.DataFrame(prediction) # row: users, column: items

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,1.816553,1.816553,1.816553,-0.680315,-0.680315,1.317180,-0.680315,-0.680315,3.820311,4.319685,...,-0.680315,-0.680315,-0.680315,-0.680315,-0.680315,-0.680315,-0.680315,-0.680315,-0.680315,1.317180
1,3.421797,3.922534,3.421797,2.922534,3.423270,0.421797,0.925480,2.423270,3.922534,1.919588,...,0.925480,-0.576730,-0.576730,-0.576730,-0.576730,-0.576730,-0.576730,-0.576730,1.919588,-0.576730
2,4.586480,5.086484,5.086484,2.086467,3.586488,4.586480,2.086467,2.086467,5.086484,5.086484,...,2.086467,2.086467,0.086484,0.086484,0.086484,0.086484,0.086484,0.086484,0.086484,2.086501
3,1.135598,-0.864693,1.135598,-0.864693,-0.864693,-0.864693,-0.864693,1.135598,3.135162,3.635235,...,-0.864693,-0.864693,-0.864693,-0.864693,1.135598,-0.864693,-0.864693,-0.864693,-0.864693,-0.864693
4,3.128561,3.628865,1.127343,0.630082,0.630082,3.129169,0.630082,1.130387,3.129169,3.628865,...,-1.371135,-1.371135,-1.371135,-1.371135,-1.371135,-1.371135,-1.371135,-1.371135,-1.371135,0.627648
5,1.316994,0.817548,0.817548,0.817548,-0.181346,1.316994,0.817548,0.817548,3.819761,3.819761,...,-1.180239,-1.180239,-1.180239,-1.180239,-1.180239,-1.180239,-1.180239,-1.180239,-1.180239,-1.180239
6,3.690046,1.191169,3.690046,0.689148,1.188923,0.690945,1.188923,1.188923,3.189822,3.690046,...,-1.309954,-1.309954,-1.309954,0.689148,-1.309954,0.189372,-1.309954,-1.309954,-1.309954,0.690945
7,1.273142,1.273142,1.273142,0.774730,1.273142,1.273142,1.273142,-1.218916,3.781084,1.289027,...,-1.218916,-1.218916,-1.218916,-1.218916,-1.218916,0.276319,-1.218916,-1.218916,-1.218916,-1.218916
8,-1.654667,0.337459,0.337459,0.835491,-1.654667,-1.654667,-1.654667,-1.654667,2.847301,3.345333,...,-1.654667,-1.654667,-1.654667,-1.654667,-1.654667,-1.654667,-1.654667,-1.654667,-1.654667,-1.654667
9,1.394674,1.893996,1.893996,-0.602618,1.394674,1.394674,-0.602618,-0.602618,4.397382,4.397382,...,-0.602618,-0.602618,-0.602618,-0.602618,-0.602618,-0.602618,-0.602618,-0.602618,-0.602618,1.394674


# 4. Extra Point

### 1) Item based

In [42]:
def basic_baseline_item(data, sim, k):
    # initializing (1000, 1000) numpy array with zeros 
    pred_rating = np.zeros(data.shape)
    
    # transpose the data -> fitting data for item based
    data = data.T

    # calculating means
    mean = np.nanmean(np.where(data != 0, data, np.nan)) # the mean of all ratings
    mean_u = np.nanmean(np.where(data != 0, data, np.nan), axis=1) # the mean of all users
    mean_i = np.nanmean(np.where(data != 0, data, np.nan), axis=0) # the mean of all items

    # base user, item
    b_u = mean_u - mean 
    b_i = mean_i - mean
    # b = mean + b_u + b_i

    # selecting similarity fuction
    if(sim == 'COS'):
        Sim = COS(data)
    elif(sim == 'PCC'):
        Sim = PCC(data)

    # selecting top k users by sorting similarity array
    k_users = np.argsort(-Sim) 
    k_users = np.delete(k_users, np.s_[k:], 1) # delete colomn under k

    # number of users with axis = 0 condition
    num_users = np.size(data, axis=0)
    num_items = np.size(data, axis=1)
    
    # calculating predicted ratings
    for u in range(0, num_users):
        list_sim = Sim[u, k_users[u]] # selecting top k similarity
        for i in range(0, num_items):
            list_rating = data[k_users[u], i].astype('float64') # k users' ratings on item i

            b_ui = mean + b_u[u] + b_i[i] # scalar
            b_vi = mean + b_u[k_users[u]] + b_i[i] # list
            
            #explanation of varialbles
            # mean_u[u] : user u의 평균
            # mean_i[i] : item i의 평균
            # b_u[u] : user u의 b
            # b_i[i] : item i의 b

            # calculation
            mom = np.sum(list_sim) # 분모
            son = np.sum(list_sim * (list_rating - b_vi)) #분 자
            pred_rating[u,i] = b_ui + son / mom
    
    return pred_rating

In [43]:
prediction_i = basic_baseline_item(MovieLens_pivot, 'COS', 2)
pd.DataFrame(prediction_i) # row: items column: users

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,4.825905,3.321843,4.825905,4.825905,3.327260,1.831322,3.825905,3.825905,3.329968,4.825905,...,4.825905,4.825905,-0.174095,4.825905,3.823197,4.825905,1.831322,-0.174095,2.332677,4.324551
1,5.064456,5.064456,5.064456,5.064456,3.565575,2.068931,3.563338,4.064456,3.066694,4.563338,...,5.064456,5.064456,0.064456,5.064456,4.563338,5.064456,2.068931,2.068931,0.064456,4.062219
2,4.679494,2.181668,4.679494,2.181668,1.180798,1.178190,1.681233,4.179059,0.680364,4.179929,...,4.179929,2.181668,-0.320506,2.181668,2.181668,2.181668,1.178190,2.177320,-0.320506,2.181668
3,4.030906,3.531667,4.531160,3.030906,2.531160,3.531667,4.031413,1.030400,2.031413,4.531160,...,1.530146,1.530146,0.530653,1.532173,-0.468840,4.031413,1.532173,1.530146,1.532173,1.532173
4,4.627157,3.127538,4.627157,4.627157,3.627157,3.627157,3.127031,3.627157,4.127031,4.127031,...,4.627157,4.627157,-0.372843,4.627157,3.127284,4.627157,3.627157,1.627664,2.126524,3.127031
5,2.218833,4.221555,4.221555,2.218833,2.722235,-0.277765,1.719514,1.719514,3.224277,2.218833,...,4.722235,2.218833,-0.277765,2.218833,4.722235,2.218833,-0.277765,-0.277765,-0.277765,4.221555
6,3.379884,4.379884,4.879147,4.379884,4.879147,2.382831,2.879147,2.879147,2.378411,3.877674,...,3.379884,2.375463,-0.120853,-0.120853,1.882094,4.879147,2.382831,-0.120853,-0.120853,2.382831
7,3.980871,3.980871,4.481094,4.481094,2.481094,-0.518906,2.980871,2.480648,1.481094,3.980871,...,1.979979,4.481094,-0.518906,3.980871,1.979979,4.481094,-0.518906,0.981763,-0.518906,1.979979
8,2.512886,2.015062,4.513853,0.014095,1.514820,1.513370,0.014095,2.512886,2.515303,2.013128,...,4.514337,0.014095,0.014095,0.014095,2.515303,0.014095,1.513370,2.512886,0.014095,2.015062
9,2.473576,4.468727,4.468727,0.969939,1.965090,1.972364,1.471151,-0.032486,4.466302,2.473576,...,2.461452,-0.032486,-0.032486,-0.032486,4.466302,-0.032486,-0.032486,2.473576,-0.032486,4.468727
