In [19]:
import numpy as np
from collections import defaultdict

In [20]:
def parse_and_prep_movielens_data():
    """Load the MovieLens files and build the dense user x item ratings matrix.

    Three files are read from the current working directory:

    * ``u.data``     -- tab-separated (userID, itemID, rating, timestamp) rows.
    * ``u_user.csv`` -- comma-separated user records; only the row count is
      used here (it fixes the number of rows of the ratings matrix).
    * ``u_item.csv`` -- comma-separated item id followed by 19 genre values.

    Returns
    -------
    ratings_matrix : numpy.ndarray, shape (nusers, nitems)
        ``ratings_matrix[userID - 1, itemID - 1]`` holds the rating found in
        ``u.data`` for that pair; entries without a rating stay 0.
    data_items_genre : numpy structured array
        Item records with fields ``itemID`` and ``genre``, sorted by
        ``itemID``.
    """
    # parse u.data (one rating per row)
    data_ratings = np.genfromtxt('u.data',delimiter='\t',
                                 dtype=[('userID',int),
                                        ('itemID',int),
                                        ('rating',float),
                                        ('timestamp',int)])

    # parse the user table (u_user.csv)
    data_user = np.genfromtxt('u_user.csv',delimiter=',',
                              dtype=[('userID',int),
                                     ('age',int),
                                     ('gender','S1'),
                                     ('occupation','S20'),
                                     ('zipcode',int)])

    # parse the item/genre table (u_item.csv)
    data_items_genre = np.genfromtxt('u_item.csv',delimiter=',',
                                     dtype=[('itemID',int),
                                            ('genre','19float')])

    # Sort IN PLACE: np.sort() returns a sorted copy, so calling it without
    # using the result leaves the array untouched.  The rows have to be in
    # itemID order because the genre rows are later addressed with the same
    # (itemID - 1) index as the columns of the ratings matrix.
    data_items_genre.sort(order='itemID')
    nitems = data_items_genre.shape[0]
    nusers = data_user.shape[0]

    # prepare the ratings matrix of size nusers x nitems
    ratings_matrix = np.zeros((nusers,nitems))
    # Single indexed assignment instead of a Python-level loop; when a
    # (user, item) pair occurs more than once the last row wins, exactly as
    # with a sequential loop.
    ratings_matrix[data_ratings['userID']-1,
                   data_ratings['itemID']-1] = data_ratings['rating']
    return ratings_matrix,data_items_genre

In [21]:
def pearson_similarity_population(ratings_matrix,avg_ratings_of_users,i,j):
    """Pearson correlation between rows ``i`` and ``j`` of ``ratings_matrix``.

    Only the columns where both rows hold a value greater than zero take part
    in the computation.

    Returns 0 when fewer than two such columns exist, or when the product of
    the two standard deviations is below 1e-6; otherwise returns the
    covariance divided by that product.
    """
    row_i = ratings_matrix[i,:]
    row_j = ratings_matrix[j,:]
    both_rated = (row_i > 0) & (row_j > 0)

    # Zero or one shared column: a correlation cannot be computed.
    if np.sum(both_rated) < 2:
        return 0

    centred_i = row_i[both_rated] - avg_ratings_of_users[i]
    centred_j = row_j[both_rated] - avg_ratings_of_users[j]

    # 2x2 matrix: variances on the diagonal, covariance off the diagonal.
    cov = np.cov(centred_i,centred_j)
    scale = np.sqrt(cov[0,0]) * np.sqrt(cov[1,1])

    # Guard against dividing by (almost) zero when a row is constant.
    if scale < 1e-6:
        return 0
    return cov[0,1]/scale

In [22]:
def ratings_based_on_genre(ratings,items_and_their_genre,uid,itemid):
    """Estimate a rating for ``itemid`` from the per-genre means of row ``uid``.

    For every genre column, the mean of the non-zero entries of
    ``ratings[uid]`` over the items flagged for that genre is computed.  The
    means of the genres that ``itemid`` itself is flagged for are weighted by
    the item's genre value and averaged.

    Parameters
    ----------
    ratings : 2-D array; row ``uid`` is read, 0 is treated as "no rating".
    items_and_their_genre : mapping / structured array whose ``'genre'`` entry
        is a 2-D (items x genres) array.
    uid : row index into ``ratings``.
    itemid : row index into the genre matrix.

    Returns
    -------
    The averaged value, or 0 when no genre of the item contributes.
    """
    genre_matrix = items_and_their_genre['genre']
    # Take the number of genres from the data instead of hard-coding 19.
    num_genre = genre_matrix.shape[1]

    # Loop-invariant: this row and its "has a rating" mask.
    user_ratings = ratings[uid,:]
    rated = (user_ratings != 0.0)

    ratings_genre_based = 0.0
    denom = 0.0
    for genre in range(num_genre):
        item_weight = genre_matrix[itemid,genre]
        if not item_weight > 0.0:
            continue

        genre_column = genre_matrix[:,genre]
        mask = np.logical_and((genre_column != 0.0),rated)
        sum_mask = np.sum(mask)
        # Nothing rated in this genre: it must not contribute.  (Previously
        # the mean of the preceding genre leaked into this iteration because
        # the variable holding it was never reset.)
        if sum_mask == 0:
            continue

        ratings_genre_user = np.dot(user_ratings[mask],genre_column[mask])/sum_mask
        if ratings_genre_user > 0.0:
            ratings_genre_based += item_weight * ratings_genre_user
            denom += 1.0

    if denom > 0.0:
        return ratings_genre_based/denom
    return 0

In [23]:
def _rank_weighted_totals(weighted_totals,sim_sums,min_total_similarity):
    """Turn similarity-weighted totals into an array sorted by descending rating.

    Each entry becomes ``(itemid + 1, total / sim_sums[itemid])``; the rating
    is left at 0.0 when the summed similarity does not exceed
    ``min_total_similarity``.
    """
    rankings = np.zeros(len(weighted_totals),
                        dtype=[('itemid',int),('rating',float)])
    for slot,(itemid,total) in enumerate(weighted_totals.items()):
        tot_sim = sim_sums[itemid]
        recommended_rating = 0.0
        if tot_sim > min_total_similarity:
            recommended_rating = total/tot_sim
        rankings[slot] = (itemid + 1,recommended_rating)

    rankings.sort(order='rating')
    # Explicit reversed copy rather than assigning a reversed view onto the
    # same buffer.
    return rankings[::-1].copy()


def recommendations_genre_based(ratings,
                                items_and_genre,
                                avg_ratings_of_users,
                                similarity_metric,
                                similarity_threshold,
                                given_id):
    """Rank the items row ``given_id`` has not rated, using similar rows.

    For every column whose entry in row ``given_id`` is below 1, each other
    row holding a value of at least 1 in that column is considered.  If
    ``similarity_metric(ratings, avg_ratings_of_users, given_id, uid)`` is at
    least ``similarity_threshold``, that row contributes its actual rating
    and its genre-based rating (``ratings_based_on_genre``), both weighted by
    the similarity.

    Parameters
    ----------
    ratings : 2-D array (rows x columns), 0 meaning "no rating".
    items_and_genre : passed unchanged to ``ratings_based_on_genre``.
    avg_ratings_of_users : passed unchanged to ``similarity_metric``.
    similarity_metric : callable ``(ratings, avg_ratings_of_users, i, j)``.
    similarity_threshold : minimum similarity for a row to contribute.
    given_id : row index of the user to recommend for.

    Returns
    -------
    (rankings, rankings_genre) : two structured arrays with fields
        ``itemid`` (column index + 1) and ``rating``, sorted by descending
        ``rating``.  The first uses actual ratings, the second genre-based
        ones.
    """
    totals = defaultdict(float)
    totals_genre = defaultdict(float)
    sim_sums = defaultdict(float)
    EPS = 1e-6
    alluserIDs = ratings.shape[0]
    allitemIDs = ratings.shape[1]

    # The similarity between given_id and another row does not depend on the
    # item, so compute it at most once per row instead of once per
    # (item, row) pair.
    similarity_of = {}

    for itemid in range(allitemIDs):
        # Skip items the given user already rated.
        if not ratings[given_id,itemid] < (1.0-EPS):
            continue
        for uid in range(alluserIDs):
            if uid == given_id or not ratings[uid,itemid] > (1-EPS):
                continue
            if uid not in similarity_of:
                similarity_of[uid] = similarity_metric(ratings,
                                                       avg_ratings_of_users,
                                                       given_id,uid)
            sim = similarity_of[uid]
            if sim >= similarity_threshold:
                ratings_genre_based = ratings_based_on_genre(ratings,
                                                             items_and_genre,
                                                             uid,
                                                             itemid)
                totals[itemid] += sim*ratings[uid,itemid]
                totals_genre[itemid] += sim*ratings_genre_based
                sim_sums[itemid] += sim

    # NOTE(review): a rating is only produced when the summed similarity
    # exceeds 1 + EPS -- presumably to require more than one strong
    # neighbour; confirm the intended cut-off.
    min_total_similarity = 1.0 + EPS
    rankings = _rank_weighted_totals(totals,sim_sums,min_total_similarity)
    rankings_genre = _rank_weighted_totals(totals_genre,sim_sums,
                                           min_total_similarity)
    return rankings,rankings_genre

In [24]:
def average_ratings_based_on_genre(ratings,items_and_their_genre):
    """Compute a genre-based rating for every (user, item) pair.

    Step 1: for each user and genre, take the genre-weighted sum of the
    user's ratings over the items flagged for that genre, divided by the
    number of flagged items.
    Step 2: for each item, average those per-genre values over the genres
    the item is flagged for.

    NOTE(review): unlike ``ratings_based_on_genre``, step 1 divides by the
    count of ALL items in the genre, so unrated items (0) pull the mean
    down -- confirm this is intended.

    Parameters
    ----------
    ratings : 2-D array (users x items).
    items_and_their_genre : mapping / structured array whose ``'genre'`` entry
        is a 2-D (items x genres) array.

    Returns
    -------
    numpy.ndarray, shape (users, items), with the genre-based ratings.
    """
    num_users = ratings.shape[0]
    num_items = ratings.shape[1]

    genre_matrix = items_and_their_genre['genre']
    # Take the number of genres from the data instead of hard-coding 19.
    num_genre = genre_matrix.shape[1]
    # Membership flags do not depend on the user; compute them once.
    in_genre = (genre_matrix != 0.0)

    ratings_genre_user = np.zeros((num_users,num_genre))
    ratings_genre_based = np.zeros((num_users,num_items))
    for uid in range(num_users):
        user_ratings = ratings[uid,:]
        for genre in range(num_genre):
            mask = in_genre[:,genre]
            sum_mask = np.sum(mask)
            if sum_mask > 0:
                ratings_genre_user[uid,genre] = np.dot(user_ratings[mask],
                                                       genre_matrix[mask,genre])/sum_mask

        user_genre_means = ratings_genre_user[uid,:]
        # One pass per item is enough: the value does not depend on a genre
        # loop variable, so the former inner loop over genres only repeated
        # the same assignment.
        for itemid in range(num_items):
            mask = in_genre[itemid,:]
            sum_mask = np.sum(mask)
            if sum_mask > 0:
                ratings_genre_based[uid,itemid] = np.dot(user_genre_means[mask],
                                                         genre_matrix[itemid,mask])/sum_mask

    # The result was previously computed and then dropped.
    return ratings_genre_based

In [25]:
def topN_PPM(ratings_matrix,N):
    """Return the columns that are in the top ``N`` by popularity or by mean.

    Popularity is the number of entries greater than zero in a column; the
    mean is the column sum divided by that count.

    Parameters
    ----------
    ratings_matrix : 2-D array (users x items), 0 meaning "no rating".
    N : how many columns to take from each of the two orderings.

    Returns
    -------
    numpy.ndarray of sorted, unique 0-based column indices (the union of the
    two top-``N`` lists).
    """
    number_of_users_for_items = (ratings_matrix > 0).sum(axis=0)

    # Column indices, most-rated first.
    most_rated_first = np.argsort(number_of_users_for_items)[::-1]

    # Columns without any rating get an average of 0 instead of 0/0 = NaN;
    # NaN sorts last and would therefore end up at the top after reversing.
    avg_ratings_of_items = np.zeros(number_of_users_for_items.shape[0])
    np.divide(ratings_matrix.sum(axis=0),
              number_of_users_for_items,
              out=avg_ratings_of_items,
              where=number_of_users_for_items > 0)

    # Column indices, best average first.
    best_rated_first = np.argsort(avg_ratings_of_items)[::-1]

    total_movies = np.concatenate((most_rated_first[0:N],
                                   best_rated_first[0:N]))
    return np.unique(total_movies)

In [26]:
def main():
    """Run the recommender for a fixed set of users and write the results.

    For each user the collaborative rankings (RS) and the genre-based
    rankings are computed.  Items that also appear in the popular/primitive
    list returned by ``topN_PPM`` are dropped, the rest is filtered by
    rating, and the first rows of RS and of the filtered genre-based list
    are written to ``plot_new_serendipity.dat``.
    """
    ratings_matrix,items_and_their_genre = parse_and_prep_movielens_data()

    # Mean of the given ratings per user; users without any rating get 0
    # instead of a 0/0 division.
    ratings_per_user = (ratings_matrix>0).sum(axis=1)
    avg_ratings_of_users = np.zeros(ratings_matrix.shape[0])
    np.divide(ratings_matrix.sum(axis=1),ratings_per_user,
              out=avg_ratings_of_users,where=ratings_per_user>0)

    # topN_PPM returns 0-based column indices, whereas the rankings carry
    # item ids shifted by +1; bring both onto the same numbering before
    # comparing them.
    PPM = topN_PPM(ratings_matrix,100) + 1
    users = [5,15,47]
    similarity_threshold = 0.8
    useful_rating_cutoff = 3      # RS rating above which an item is "useful"
    max_rating = 5.0              # used to normalise rating differences
    min_normalized_difference = -0.05
    rows_to_write = 20

    # `with` guarantees the file is closed even if a user fails midway.
    with open("plot_new_serendipity.dat","w") as oupt_file:
        for user in users:
            RS,RS_using_genre = recommendations_genre_based(
                ratings_matrix,
                items_and_their_genre,
                avg_ratings_of_users,
                pearson_similarity_population,
                similarity_threshold,
                user-1
            )

            #in RS but not in PPM
            UNEXPECT = RS[np.isin(RS['itemid'],PPM,invert=True)]

            #compute the list of useful items
            USEFUL = UNEXPECT[UNEXPECT['rating'] > useful_rating_cutoff]

            #the genre based ratings of all the USEFUL movies
            USEFUL_GENRE = RS_using_genre[np.isin(RS_using_genre['itemid'],
                                                  USEFUL['itemid'])]

            # Align both arrays on itemid so that the element-wise
            # difference and the mask derived from it refer to the same
            # item at each position.
            useful_by_item = np.sort(USEFUL,order='itemid')
            genre_by_item = np.sort(USEFUL_GENRE,order='itemid')
            ratings_difference = useful_by_item['rating'] - genre_by_item['rating']

            normalized_ratings = np.divide(ratings_difference,max_rating)

            mask = (normalized_ratings >= min_normalized_difference)

            # Apply the mask to the itemid-ordered array it was built from,
            # then order the survivors by rating for output.
            USEFUL_GENRE = np.sort(genre_by_item[mask],order = 'rating')

            oupt_file.write("# user = %s \n" % user)
            oupt_file.write("# RS \n")
            np.savetxt(oupt_file,RS[0:rows_to_write],
                       fmt='%d %1.2e',delimiter='',newline='\n',header='',footer='',comments='# ')

            oupt_file.write("\n")
            oupt_file.write("# USEFUL_GENRE \n")
            np.savetxt(oupt_file,USEFUL_GENRE[0:rows_to_write],
                       fmt='%d %1.2e',delimiter='',newline='\n',header='',footer='',comments='# ')
            oupt_file.write("\n")

In [27]:
# Entry point: run the whole pipeline only when this code is executed as the
# main module (not when it is imported).
if __name__=="__main__":
    main()