In [1]:
import pandas as pd

## CSV 파일 읽기

In [14]:
# Ratings table; later cells use its userId, movieId and rating columns.
ratings = pd.read_csv('ml-latest-small/ratings.csv')
# Evaluation rows; later cells read the userId and movieId columns of this frame.
test = pd.read_csv('test.csv')

In [3]:
# Preview the first two rows of the ratings table.
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179


## User X Movie Matrix 만들기

In [4]:
# Reshape the long ratings table into a wide frame: one row per userId, one
# column per movieId, each cell holding the rating. User/movie pairs that have
# no rating become NaN.
UM_matrix_ds = ratings.pivot(index='userId', columns='movieId', values='rating')

# Compare the element count of the wide matrix with that of the long table.
print("UM Matrix value size", UM_matrix_ds.values.size)
print("ratings value size", ratings.values.size)

UM Matrix value size 6083286
ratings value size 400016


In [5]:
# Preview the first two user rows of the user x movie matrix.
UM_matrix_ds.head(2)

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,


## 최근접 이웃을 찾아보자

In [6]:
import math
from operator import itemgetter
from scipy.spatial import distance

def distance_cosine(a, b):
    """Return the cosine similarity between the vectors ``a`` and ``b``.

    SciPy's ``distance.cosine`` is a distance (1 - cosine similarity), so it is
    subtracted from 1 to get a score where larger means more alike.
    """
    cosine_dist = distance.cosine(a, b)
    return 1 - cosine_dist

def distance_correlation(a, b):
    """Return the correlation-based similarity between ``a`` and ``b``.

    Computed as 1 minus SciPy's correlation distance, so larger values mean the
    two vectors are more alike.
    """
    correlation_dist = distance.correlation(a, b)
    return 1 - correlation_dist

def distance_euclidean(a, b):
    """Return a similarity score derived from the Euclidean distance.

    The distance ``d`` is mapped to ``1 / (d + 1)``: identical vectors score 1
    and the score shrinks towards 0 as the vectors move apart.
    """
    gap = distance.euclidean(a, b)
    return 1 / (gap + 1)

In [7]:
def nearest_neighbor_user(user, topN, simFunc):
    """Find the ``topN`` users most similar to ``user`` by brute force.

    Every other row of the module-level ``UM_matrix_ds`` is compared with
    ``user`` on the movies both have rated. Rows sharing fewer than 3 rated
    movies with ``user`` are ignored, as are rows whose similarity is NaN.

    Parameters
    ----------
    user
        Row label of the target user in ``UM_matrix_ds``.
    topN : int
        Maximum number of neighbours to return.
    simFunc : callable
        ``simFunc(ratings_a, ratings_b) -> float``; receives two equal-length
        lists of ratings (target user first) and returns a similarity where
        larger means more alike.

    Returns
    -------
    list of (user label, similarity) tuples, highest similarity first.

    Raises
    ------
    KeyError
        If ``user`` is not a row label of ``UM_matrix_ds``.
    """
    target = UM_matrix_ds.loc[user].dropna()
    target_values = target.values

    # Slice the matrix down to the target's rated movies once, instead of doing
    # a scalar lookup per (user, movie) pair inside nested Python loops.
    candidates = UM_matrix_ds.loc[:, target.index].drop(user)

    ## At least 3 intersection items
    shared_counts = candidates.notna().sum(axis=1)
    candidates = candidates[shared_counts >= 3]

    nn = {}

    ## Brute Force Compute
    for uid, row in candidates.iterrows():
        row_values = row.values
        shared = ~pd.isnull(row_values)

        ## similarity function (called with plain lists, target user first)
        sim = simFunc(target_values[shared].tolist(), row_values[shared].tolist())

        if not math.isnan(sim):
            nn[uid] = sim

    ## top N returned
    # Ascending sort read backwards: highest similarity first; among equal
    # scores the user that appears later in the matrix comes first.
    return sorted(nn.items(), key=itemgetter(1))[:-(topN+1):-1]

In [18]:
import time

# Time a top-3 neighbour lookup (Euclidean similarity) for every test user.
# perf_counter() is the clock meant for measuring elapsed time: unlike
# time.time() it is unaffected by system clock adjustments during the run.
st = time.perf_counter()
for user in test['userId']:
    print(nearest_neighbor_user(user, 3, distance_euclidean))
print(time.perf_counter()-st, 'sec')

[(354, 1.0), (115, 0.6666666666666666), (544, 0.585786437626905)]
[(539, 0.38742588672279304), (50, 0.3761785115301142), (444, 0.3567891723253309)]
[(634, 0.6666666666666666), (35, 0.38742588672279304), (1, 0.38742588672279304)]
[(76, 1.0), (515, 0.585786437626905), (485, 0.585786437626905)]
[(618, 1.0), (391, 0.6666666666666666), (211, 0.6666666666666666)]
[(668, 0.585786437626905), (154, 0.585786437626905), (331, 0.5358983848622454)]
[(631, 1.0), (252, 0.6666666666666666), (194, 0.6666666666666666)]
[(397, 0.4142135623730951), (113, 0.32037724101704074), (129, 0.2989350844248255)]
[(223, 1.0), (524, 0.585786437626905), (208, 0.5358983848622454)]
[(569, 1.0), (139, 1.0), (125, 0.6666666666666666)]
33.21187615394592 sec


In [9]:
def predictRating(userId, nn=50, simFunc=distance_euclidean):
    """Predict movie ratings for ``userId`` from its nearest neighbours.

    The prediction for a movie is the similarity-weighted mean of the ratings
    given to it by the neighbours returned by ``nearest_neighbor_user``. Only
    movies rated by at least 4 of those neighbours are scored.

    Parameters
    ----------
    userId
        Row label of the target user in ``UM_matrix_ds``.
    nn : int, default 50
        Number of nearest neighbours to use.
    simFunc : callable, default ``distance_euclidean``
        Similarity function forwarded to ``nearest_neighbor_user``.

    Returns
    -------
    list of ``[movieId, predictedRating]`` pairs, in matrix column order.
    Movies whose contributing similarities sum to zero are left out, because a
    weighted mean is undefined for them.
    """
    ## neighborhood
    neighbor = nearest_neighbor_user(userId, nn, simFunc)
    neighbor_sim = dict(neighbor)
    neighbor_ids = [uid for uid, _ in neighbor]

    ## neighborhood's movie : at least 4 ratings
    # Only `thresh` is passed: pandas rejects `how` together with `thresh`
    # (older versions silently ignored `how`), and `axis` must be a keyword.
    neighbor_movie = UM_matrix_ds.loc[neighbor_ids].dropna(axis=1, thresh=4)

    ret = []  # ['movieID', 'predictedRate']

    ## rating predict by my similarities
    # .items() replaces .iteritems(), which was removed in pandas 2.0.
    for movieId, column in neighbor_movie.items():
        sim_total, weighted_total = 0, 0
        for uid, rating in column.dropna().items():
            sim = neighbor_sim.get(uid, 0)
            sim_total += sim
            weighted_total += rating * sim
        if sim_total == 0:
            # Similarities can cancel out or all be zero (e.g. correlation);
            # skip the movie rather than divide by zero.
            continue
        ret.append([movieId, weighted_total / sim_total])

    return ret

In [66]:
# For every test row, predict ratings from the 100 nearest neighbours and print
# the [movieId, predictedRating] entry for that row's movie, if one was produced.
# Pairing userId with movieId directly replaces the manual row counter and the
# chained label lookup, which only worked when `test` had a default 0..n-1 index.
for user, target_movie in zip(test['userId'], test['movieId']):
    ans = predictRating(user, 100)
    for a in ans:
        if a[0] == target_movie:
            print(a)
        

[1356, 4.1089823775966527]
[555, 4.0031304509580083]
[6212, 2.5542517080017473]
