In [1]:
import pandas as pd
import csv
from collections import defaultdict
from datetime import datetime
import matplotlib.patches as mpatches
import matplotlib
import time
import math
from operator import itemgetter
from scipy.spatial import distance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
ratings = pd.read_csv('./data/ml-20m/ratings.csv')
ratings = ratings.iloc[:1000000, :]
ratings.tail(5)

Unnamed: 0,userId,movieId,rating,timestamp
999995,6743,1580,4.0,1117760757
999996,6743,1584,3.0,1117760943
999997,6743,1586,3.0,1117761831
999998,6743,1589,4.0,1117761990
999999,6743,1590,4.0,1117761913


In [3]:
movies = pd.read_csv('./data/ml-20m/movies.csv')
movies.tail(5)

Unnamed: 0,movieId,title,genres
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)
27277,131262,Innocence (2014),Adventure|Fantasy|Horror


In [4]:
selected_user = pd.DataFrame(columns=ratings.columns)
for i in range(0, 10000, 1000):
    selected_user=selected_user.append(ratings.iloc[i])
selected_user.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1.0,2.0,3.5,1112486000.0
1000,11.0,527.0,4.5,1251171000.0
2000,18.0,4967.0,5.0,1196423000.0
3000,24.0,4321.0,3.0,994232800.0
4000,35.0,110.0,4.5,1164499000.0
5000,50.0,1094.0,4.0,1182678000.0
6000,54.0,3198.0,2.0,975440600.0
7000,59.0,1.0,4.5,1380401000.0
8000,70.0,2916.0,2.0,1020294000.0
9000,83.0,342.0,3.5,1112724000.0


In [5]:
UM_matrix_ds = ratings.pivot(index='userId', columns='movieId', values='rating')
UM_matrix_ds.head(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,129350,129354,129428,129707,130052,130073,130219,130462,130490,130642
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.5,,,,,,,,,...,,,,,,,,,,
2,,,4.0,,,,,,,,...,,,,,,,,,,
3,4.0,,,,,,,,,,...,,,,,,,,,,
4,,,,,,3.0,,,,4.0,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,


In [12]:
def distance_cosine(a, b):
    return 1-distance.cosine(a,b)
def nearest_neighbor_user(user, topN, simFunc):
    ul= UM_matrix_ds.loc[user].dropna()
    ratedIndex = ul.index
    nn = {}
    
    for uid, row in UM_matrix_ds.iterrows():
        interSectionU1 = []
        interSectionU2 = []
        if uid == user: continue
            
        for i in ratedIndex:
            if not math.isnan(row[i]):
                interSectionU1.append(ul[i])
                interSectionU2.append(row[i])
        interSectionLen = len(interSectionU1)
        
        if interSectionLen < 3: continue
        sim = simFunc(interSectionU1, interSectionU2)
        
        if not math.isnan(sim): nn[uid] = sim
    return sorted(nn.items(), key=itemgetter(1))[:-(topN+1):-1]

In [13]:
for user in selected_user['userId']:
    print('User {0} neighbors : {1}'.format(user, nearest_neighbor_user(int(user), 3, distance_cosine)))

User 1.0 neighbors : [(6564, 1.0), (6310, 1.0), (5992, 1.0)]
User 11.0 neighbors : [(1801, 1.0), (992, 1.0), (743, 1.0)]
User 18.0 neighbors : [(6098, 1.0), (4929, 1.0), (2615, 1.0)]
User 24.0 neighbors : [(6036, 0.9994838709333602), (6441, 0.9992360733058688), (2725, 0.9989922980237592)]
User 35.0 neighbors : [(6185, 1.0), (5432, 1.0), (4753, 1.0)]
User 50.0 neighbors : [(6541, 1.0), (5641, 1.0), (5308, 1.0)]
User 54.0 neighbors : [(5109, 1.0), (3156, 1.0), (876, 1.0)]
User 59.0 neighbors : [(5733, 1.0), (4992, 1.0), (4416, 1.0)]
User 70.0 neighbors : [(6036, 1.0), (4262, 1.0), (4122, 1.0)]
User 83.0 neighbors : [(6033, 1.0), (5992, 1.0), (5909, 1.0)]


In [14]:
def predict_rating(userid, nn=100, simFunc=distance_cosine):
    neighbor = nearest_neighbor_user(userid, nn, simFunc)
    neighbor_id = [id for id, sim in neighbor]
    
    neighbor_movie = UM_matrix_ds.loc[neighbor_id].dropna(1, how='all', thresh=4)
    neighbor_dict = (dict(neighbor))
    ret =[]
    
    for movieId, row in neighbor_movie.iteritems():
        jsum, wsum = 0, 0
        for v in row.dropna().iteritems():
            sim = neighbor_dict.get(v[0], 0)
            jsum += sim
            wsum += (v[1]*sim)
        ret.append([movieId, wsum/jsum])
        
    return ret

In [15]:
result = []
for i in range(10):
    userId = int(selected_user.iloc[i].userId)
    movieId = int(selected_user.iloc[i].movieId)
    predict = predict_rating(userId, 300, distance_cosine)
    
    for movie in predict:
        if movieId == movie[0]:
            result.append([int(userId), int(movieId), movie[1]])
            
resultdf = pd.DataFrame(result, columns=['userId', 'movieId', 'rating'])
resultdf

Unnamed: 0,userId,movieId,rating
0,1,2,3.970709
1,11,527,4.568207
2,35,110,4.600021
3,50,1094,3.570565
4,59,1,4.668322
5,83,342,3.789501


In [16]:
realdata_rating = []
for userid in resultdf['userId']:
    realdata_rating.append(float(selected_user[selected_user['userId']==userid]['rating']))
    
resultdata_rating = resultdf.rating.tolist()

error_rate_absol = mean_absolute_error(realdata_rating, resultdata_rating)
error_rate_squared = mean_squared_error(realdata_rating, resultdata_rating)
print("\nError Rate(Absolute) : ", error_rate_absol)
print("Error Rate(Squared) : ", error_rate_squared)


Error Rate(Absolute) :  0.25436584678973356
Error Rate(Squared) :  0.08879684437379105
