In [14]:
import pandas as pd
import numpy as np
import time

def timeit(method):
    """Decorator that measures how long each call to `method` takes.

    The elapsed time is reported in milliseconds:

    * if the call receives a ``log_time`` keyword argument (a dict), the
      truncated integer duration is stored in it under ``log_name`` (or the
      upper-cased function name when ``log_name`` is not given);
    * otherwise the duration is printed.

    Note that ``log_time`` / ``log_name`` are also forwarded to `method`,
    so a function timed that way must accept them (e.g. through ``**kw``).

    Returns the wrapped function, which returns whatever `method` returns.
    """
    import functools  # local import: keeps this cell self-contained

    # wraps() preserves __name__/__doc__ so the decorated function still
    # identifies itself correctly (help(), tracebacks, nested decorators).
    @functools.wraps(method)
    def timed(*args, **kw):
        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted.
        ts = time.perf_counter()
        result = method(*args, **kw)
        te = time.perf_counter()
        elapsed_ms = (te - ts) * 1000
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int(elapsed_ms)
        else:
            print('%r  %2.2f ms' % (method.__name__, elapsed_ms))
        return result
    return timed

In [15]:
!ls ml-latest-small/

README.txt  links.csv   movies.csv  ratings.csv tags.csv


In [16]:
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

In [17]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [19]:
# Attach the average rating of every film to `movies` as `mean_rating`
# (films without any rating get NaN from the left join).
movies = movies.join(
    ratings.groupby('movieId')['rating'].mean().rename('mean_rating'),
    on='movieId',
)

In [127]:
def sim(id1, id2):
    """Similarity between two films, computed from the pandas frames.

    Uses the module-level `ratings` frame (userId, movieId, rating) and the
    `mean_rating` column of `movies`. The score is a Pearson-style
    correlation over the users who rated both films, where each film's
    ratings are centred on its global mean rating.

    Parameters
    ----------
    id1, id2 : int or anything accepted by int()
        Movie ids.

    Returns
    -------
    0 when either film has no ratings or the denominator vanishes,
    otherwise the correlation as a float.
    """
    # movieId is an integer column (see ratings.head()); comparing it with
    # str ids never matches, so ids are normalised to int instead.
    id1, id2 = int(id1), int(id2)

    # Ratings of each film, indexed by the user who gave them
    by_user1 = ratings[ratings.movieId == id1].set_index('userId').rating
    by_user2 = ratings[ratings.movieId == id2].set_index('userId').rating
    if by_user1.empty or by_user2.empty:  # Non-existing index
        return 0

    # Users who have seen both films. Selecting by user label guarantees
    # that position j in both vectors belongs to the same user, whatever
    # the row order of `ratings` is.
    users = by_user1.index.intersection(by_user2.index)
    rating1 = by_user1.loc[users].values
    rating2 = by_user2.loc[users].values

    # Pearson correlation (centred on the global mean of each film)
    mean1 = movies[movies.movieId == id1].mean_rating.iloc[0]
    mean2 = movies[movies.movieId == id2].mean_rating.iloc[0]
    num = np.dot(rating1 - mean1, rating2 - mean2)

    sd1 = np.sqrt(np.sum((rating1 - mean1)**2))
    sd2 = np.sqrt(np.sum((rating2 - mean2)**2))
    den = sd1 * sd2

    # Special cases: no common users or no variance
    if den == 0:
        return 0

    return num / den

In [157]:
sim(34,2334)

-0.12468244196632929

In [98]:
@timeit
def pred(user, film):
    """Predict the rating `user` would give `film` (pandas version).

    The prediction is the film's mean rating plus a weighted average of the
    user's deviations from the mean rating of every film they rated, the
    weights being ``abs(sim(film, movie))``.

    Parameters
    ----------
    user : id compared against ``ratings.userId``
    film : id compared against ``movies.movieId``

    Returns
    -------
    The predicted rating; the film's mean rating when all weights are 0.

    Raises
    ------
    IndexError
        If `film` is not present in `movies`.
    """
    mean_rating = movies[movies.movieId == film].mean_rating.iloc[0]
    # Select the user's rows once: each row already carries the rating, so
    # there is no need to re-filter the whole frame for every rated film.
    user_rows = ratings[ratings.userId == user]
    num, den = 0, 0
    for movie, r in zip(user_rows.movieId, user_rows.rating):
        s = abs(sim(film, movie))
        mean_r = movies[movies.movieId == movie].mean_rating.iloc[0]

        num += s * (r - mean_r)
        den += s
    # Special case: nothing similar to lean on, fall back to the film mean
    if den == 0:
        return mean_rating
    return mean_rating + num / den

In [219]:
pred(1,5)

3.9995398554255206

# Versión Real I

In [43]:
import csv

# movie id -> list of (user id, rating) and user id -> list of (movie id, rating).
# Every value is kept as the raw CSV string; the consumers below convert
# ratings with float() when they need numbers.
_movie_ratings = {}
_user_ratings = {}
# `with` closes the file even if a malformed row raises inside the loop;
# newline='' is what the csv module asks for when reading files.
with open('ml-latest-small/ratings.csv', "r", encoding="utf8", newline='') as f:
    reader = csv.reader(f)
    next(reader)  # skips header line
    for line in reader:
        userid = line[0]
        movieid = line[1]
        rating = line[2]
        # ignore line[3], timestamp
        _user_ratings.setdefault(userid, []).append((movieid, rating))
        _movie_ratings.setdefault(movieid, []).append((userid, rating))

In [44]:
def get_rating(userid, movieid):
    """Return the rating `userid` gave to `movieid` as a float.

    Both ids are converted with str() before the lookup in `_user_ratings`.
    Returns None when the user has not rated that film; an unknown user
    raises KeyError.
    """
    user_key, movie_key = str(userid), str(movieid)
    matching = (float(entry[1])
                for entry in _user_ratings[user_key]
                if entry[0] == movie_key)
    # First hit wins, exactly like a linear scan with an early return.
    return next(matching, None)

In [45]:
def sim2(id1, id2):
    """Similarity between two films, using the list-based dictionaries.

    Ids are converted with str(). The score is a Pearson-style correlation
    over the users who rated both films, with each film's ratings centred
    on the mean of *all* its ratings in `_movie_ratings`.

    Returns 0 if either film is unknown or the denominator is 0.
    """
    key1, key2 = str(id1), str(id2)

    # Unknown film -> no similarity
    if key1 not in _movie_ratings or key2 not in _movie_ratings:
        return 0

    entries1 = _movie_ratings[key1]
    entries2 = _movie_ratings[key2]

    # Users who rated both films
    raters1 = set([entry[0] for entry in entries1])
    raters2 = set([entry[0] for entry in entries2])
    shared = list(raters1 & raters2)

    # Deviations of the shared users' ratings from each film's global mean
    global_mean1 = np.mean([float(entry[1]) for entry in entries1])
    global_mean2 = np.mean([float(entry[1]) for entry in entries2])
    dev1 = np.array([get_rating(u, key1) for u in shared]) - global_mean1
    dev2 = np.array([get_rating(u, key2) for u in shared]) - global_mean2

    numerator = np.dot(dev1, dev2)
    denominator = np.sqrt(np.sum(dev1**2)) * np.sqrt(np.sum(dev2**2))

    # No overlap or no variance
    if denominator == 0:
        return 0
    return numerator / denominator

In [99]:
@timeit
def pred2(user, film):
    """Predict the rating `user` would give `film` (list-based dictionaries).

    Film mean plus the weighted average of the user's deviations from the
    mean of every film they rated; weights are ``abs(sim2(film, movie))``.
    Prints a message and returns None when the film or the user is unknown.
    """
    user_key, film_key = str(user), str(film)
    if film_key not in _movie_ratings:
        print('Film not existing')
        return
    film_mean = np.mean([float(entry[1]) for entry in _movie_ratings[film_key]])
    if user_key not in _user_ratings:
        print('User not existing')
        return

    weighted_sum, weight_total = 0, 0
    for rated in _user_ratings[user_key]:
        movie = rated[0]
        weight = abs(sim2(film_key, movie))
        user_score = get_rating(user_key, movie)
        movie_mean = np.mean([float(entry[1]) for entry in _movie_ratings[movie]])

        weighted_sum += weight * (user_score - movie_mean)
        weight_total += weight

    # No usable neighbour: fall back to the film's own mean
    if weight_total == 0:
        return film_mean
    return film_mean + weighted_sum / weight_total

# Versión Real II
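Leyendo los datos como en el script `.py`: cada usuario (y cada película) es un diccionario anidado en lugar de una lista de tuplas.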

In [47]:
import csv

# Nested dictionaries with ratings already converted to float:
#   _user_ratings2[user id][movie id]  -> rating
#   _movie_ratings2[movie id][user id] -> rating
_movie_ratings2 = {}
_user_ratings2 = {}
# `with` closes the file even if a malformed row raises inside the loop;
# newline='' is what the csv module asks for when reading files.
with open('ml-latest-small/ratings.csv', "r", encoding="utf8", newline='') as f:
    reader = csv.reader(f)
    next(reader)  # skips header line
    for line in reader:
        userid = line[0]
        movieid = line[1]
        rating = line[2]
        # ignore line[3], timestamp
        # each user is a dict with movies and ratings (and vice versa)
        _user_ratings2.setdefault(userid, {})[movieid] = float(rating)
        _movie_ratings2.setdefault(movieid, {})[userid] = float(rating)

In [48]:
def sim3(id1, id2):
    """Similarity between two films, using the nested dictionaries.

    Ids are converted with str(). The score is a Pearson-style correlation
    over the users who rated both films, with each film's ratings centred
    on the mean of *all* its ratings in `_movie_ratings2`.

    Returns 0 if either film is unknown or the denominator is 0.
    """
    key1, key2 = str(id1), str(id2)

    # Unknown film -> no similarity
    if key1 not in _movie_ratings2 or key2 not in _movie_ratings2:
        return 0

    by_user1 = _movie_ratings2[key1]
    by_user2 = _movie_ratings2[key2]

    # Users who rated both films
    shared = list(set(by_user1.keys()) & set(by_user2.keys()))

    # Deviations of the shared users' ratings from each film's global mean
    global_mean1 = np.mean(list(by_user1.values()))
    global_mean2 = np.mean(list(by_user2.values()))
    dev1 = np.array([by_user1[u] for u in shared]) - global_mean1
    dev2 = np.array([by_user2[u] for u in shared]) - global_mean2

    numerator = np.dot(dev1, dev2)
    denominator = np.sqrt(np.sum(dev1**2)) * np.sqrt(np.sum(dev2**2))

    # No overlap or no variance
    if denominator == 0:
        return 0
    return numerator / denominator

In [104]:
#@timeit
def pred3(user, film):
    """Predict the rating `user` would give `film` (nested dictionaries).

    Film mean plus the weighted average of the user's deviations from the
    mean of every film they rated; weights are ``abs(sim3(film, movie))``.
    Prints a message and returns None when the film or the user is unknown.
    """
    user_key, film_key = str(user), str(film)
    if film_key not in _movie_ratings2:
        print('Film not existing')
        return
    film_mean = np.mean(list(_movie_ratings2[film_key].values()))
    if user_key not in _user_ratings2:
        print('User not existing')
        return

    seen = _user_ratings2[user_key]
    weighted_sum, weight_total = 0, 0
    for movie in list(seen.keys()):
        weight = abs(sim3(film_key, movie))
        movie_mean = np.mean(list(_movie_ratings2[movie].values()))

        weighted_sum += weight * (seen[movie] - movie_mean)
        weight_total += weight

    # No usable neighbour: fall back to the film's own mean
    if weight_total == 0:
        return film_mean
    return film_mean + weighted_sum / weight_total

In [101]:
pred3(2,34)

'pred3'  4.98 ms


3.70786866593081

In [102]:
pred2(2,34)

'pred2'  28.83 ms


3.70786866593081

In [103]:
pred(2,34)

'pred'  1955.37 ms


3.70786866593081

In [105]:
# Score every film that user 7 has not rated yet.
user = '7'
films = set([entry[0] for entry in _user_ratings[user]])  # already rated
recom = []
for movie in _movie_ratings:
    if movie in films:
        continue
    recom.append((movie, pred3(user, movie)))

In [110]:
# Best predictions first (stable: ties keep their insertion order)
recom.sort(key=lambda pair: pair[1], reverse=True)

In [112]:
recom[:7]

[('131724', 5.0),
 ('5746', 5.0),
 ('6835', 5.0),
 ('3851', 5.0),
 ('1151', 5.0),
 ('1631', 5.0),
 ('2075', 5.0)]

In [128]:
def pred4(rating_list, film):
    """Predict the rating for `film` from an ad-hoc list of ratings.

    Same formula as pred3, but the "user" is given directly as a list of
    ratings instead of being looked up in `_user_ratings2`.

    Parameters
    ----------
    rating_list : iterable of (movie id, rating)
        Ratings given by the user. Movie ids may be str or int; they are
        normalised with str(), like `film`.
    film : str or int
        Film to predict.

    Returns
    -------
    0 (after printing a message) if `film` is unknown; the film's mean
    rating if no rated film carries any weight; otherwise the prediction.
    """
    film = str(film)
    # Shouldn't happen
    if film not in _movie_ratings2:
        print('Film not existing')
        return 0
    mean_rating = np.mean(list(_movie_ratings2[film].values()))
    num, den = 0, 0
    for x in rating_list:
        # Normalise the id the same way `film` is normalised, so the
        # dictionary lookup below also works for integer ids.
        movie = str(x[0])
        # A film missing from the dataset has similarity 0 (see sim3) and
        # would contribute nothing; skip it rather than fail on its mean.
        if movie not in _movie_ratings2:
            continue
        s = abs(sim3(film, movie))
        r = x[1] # user rating of movie
        mean_r = np.mean(list(_movie_ratings2[movie].values()))

        num += s * (r - mean_r)
        den += s
    # Special case
    if den == 0:
        return mean_rating
    return mean_rating + num / den

In [131]:
pred4([('1', 0.5),('5', 5.0)], 10)

4.986328938635185

In [154]:
# Ad-hoc profile: (movie id, rating) pairs of a user not in the dataset
rating_list = [
    ('1', 0.5), ('5', 5.0), ('7', 3.5), ('11', 2.5), ('15', 0.5),
    ('1328', 5.0), ('55292', 4.0), ('109569', 3.0),
    ('5323', 3.5), ('27793', 4.5), ('3768', 0.5),
]
films = set([entry[0] for entry in rating_list])  # already rated
# Score every film that is not part of the profile
recom = []
for movie in _movie_ratings2:
    if movie in films:
        continue
    recom.append((movie, pred4(rating_list, movie)))

In [161]:
# Best predictions first (stable: ties keep their insertion order)
recom.sort(key=lambda pair: pair[1], reverse=True)
recom

[('1341', 8.0),
 ('57536', 7.5),
 ('52975', 7.33840723198408),
 ('55908', 7.1875),
 ('7321', 7.0),
 ('1987', 6.75),
 ('7025', 6.75),
 ('904', 6.667567312638946),
 ('53956', 6.5696114853870755),
 ('104457', 6.569507646715775),
 ('858', 6.5670929310063935),
 ('1221', 6.566498601645964),
 ('760', 6.5625),
 ('1437', 6.5625),
 ('4826', 6.5625),
 ('60753', 6.5),
 ('1210', 6.498937859414375),
 ('94985', 6.479186234895414),
 ('1089', 6.422516657331739),
 ('86345', 6.396673774551851),
 ('1997', 6.3962971082138855),
 ('1230', 6.391174859838973),
 ('57504', 6.390803636150061),
 ('2329', 6.365703154068187),
 ('106642', 6.348570985948923),
 ('110', 6.332018715573728),
 ('27584', 6.319507646715774),
 ('4011', 6.31101524000032),
 ('8360', 6.273426726507561),
 ('26629', 6.25),
 ('57274', 6.241488032335175),
 ('7361', 6.236183372066481),
 ('65216', 6.217872590808152),
 ('118888', 6.178571428571429),
 ('127152', 6.178571428571429),
 ('5995', 6.171503301827748),
 ('1247', 6.159035799911251),
 ('94405', 6

In [None]:
# LSH set-up (first variant): m hash tables, each keyed by k sampled positions
k = 3
m = 100
hashes = [{} for _ in range(m)]

# One row of k positions (each in 0..9) per table; seeded for reproducibility
np.random.seed(12345)
hashbits = np.random.randint(10, size=(m, k))

In [None]:
def hashcode(movie, i):
    """ get the i'th hash code of movie(0 <= i < m)

    The movie's (user, rating) pairs are ordered from highest to lowest
    rating; the code is the concatenation of the user ids found at the
    positions listed in hashbits[i]. A position past the end of the list
    falls back to position 0.
    """
    by_rating = sorted(_movie_ratings2[movie].items(),
                       key=lambda item: -item[1])

    picked = []
    for pos in hashbits[i]:
        if pos >= len(by_rating):
            pos = 0
        picked.append(by_rating[pos][0])
    return "".join(picked)

In [None]:
# LSH set-up (second variant): m hash tables, each sampling k bit positions
k = 20
m = 1
hashes = [{} for _ in range(m)]

# Length of the bit string the positions are drawn from
strlen = 10 * len(_user_ratings2)  # 10 possible ratings

# One row of k positions (each in 0..strlen-1) per table; seeded for reproducibility
np.random.seed(123)
hashbits = np.random.randint(strlen, size=(m, k))

In [None]:
def hashcode(movie, i):
    """ get the i'th hash code of movie(0 <= i < m)

    Each position x in hashbits[i] addresses one bit of a string that holds
    10 bits per user: ``x // 10`` selects the user and ``x % 10`` the bit.
    The bit is '1' when it is below twice the user's rating of the movie
    (an unrated movie counts as rating 0, i.e. all bits '0').

    Returns the sampled bits as a string of '0'/'1' characters.
    """
    movie_ratings = _movie_ratings2[movie]
    # x // 10 is a user *index* in 0..n_users-1, whereas the dictionaries
    # are keyed by user *id* (ids start at 1 in ratings.head()). Using the
    # index as an id made index 0 match nobody and left the last user
    # unreachable, so translate index -> id through the key order.
    user_ids = list(_user_ratings2)

    bits = []
    for x in hashbits[i]:
        hash_user, hash_rating = divmod(x, 10)
        if hash_user < len(user_ids):
            rating = movie_ratings.get(user_ids[hash_user], 0)
        else:
            # position beyond the known users: treat as unrated
            rating = 0

        bits.append('1' if hash_rating < int(2 * rating) else '0')
    return "".join(bits)

In [None]:
def hash_all_movies():
    """ go through all movies and store them in hash table(s)

    For each of the m tables, every movie id in `_movie_ratings2` is
    appended to the bucket named after its hash code in that table.
    """
    for movie in _movie_ratings2:
        for table_idx in range(m):
            code = hashcode(movie, table_idx)
            # buckets hold movie ids only, not the rating data
            hashes[table_idx].setdefault(code, []).append(movie)
    return

In [None]:
def candidates(movie):
    """ given movie, return set of indices of matching candidates

    A candidate is any movie stored in the same bucket as `movie` in at
    least one of the m hash tables.
    """
    matches = set()
    for table_idx in range(m):
        bucket = hashes[table_idx].get(hashcode(movie, table_idx), ())
        matches.update(bucket)
    return matches

In [None]:
hash_all_movies()

In [None]:
len(candidates('260'))