# Recommender Systems

### Load information about users and movies

In [1]:
from users import *

##### [0] Let's take a look at the users, movies, and their ratings.  Data is stored as a hash of hashes (a Python dict of dicts).

In [2]:
users

{'Claudia Puig': {'Just My Luck': 3.0,
  'Snakes on a Plane': 3.5,
  'Superman Returns': 4.0,
  'The Night Listener': 4.5,
  'You, Me and Dupree': 2.5},
 'Gene Seymour': {'Just My Luck': 1.5,
  'Lady in the Water': 3.0,
  'Snakes on a Plane': 3.5,
  'Superman Returns': 5.0,
  'The Night Listener': 3.0,
  'You, Me and Dupree': 3.5},
 'Jack Matthews': {'Lady in the Water': 3.0,
  'Snakes on a Plane': 4.0,
  'Superman Returns': 5.0,
  'The Night Listener': 3.0,
  'You, Me and Dupree': 3.5},
 'Lisa Rose': {'Just My Luck': 3.0,
  'Lady in the Water': 2.5,
  'Snakes on a Plane': 3.5,
  'Superman Returns': 3.5,
  'The Night Listener': 3.0,
  'You, Me and Dupree': 2.5},
 'Michael Phillips': {'Lady in the Water': 2.5,
  'Snakes on a Plane': 3.0,
  'Superman Returns': 3.5,
  'The Night Listener': 4.0},
 'Mick LaSalle': {'Just My Luck': 2.0,
  'Lady in the Water': 3.0,
  'Snakes on a Plane': 4.0,
  'Superman Returns': 3.0,
  'The Night Listener': 3.0,
  'You, Me and Dupree': 2.0},
 'Toby': {'Snak

##### [1] All users

In [3]:
sorted(users.keys())

['Claudia Puig',
 'Gene Seymour',
 'Jack Matthews',
 'Lisa Rose',
 'Michael Phillips',
 'Mick LaSalle',
 'Toby']

##### [2] All the movies

In [4]:
# Every movie title that appears in at least one user's ratings,
# de-duplicated and returned in alphabetical order.
sorted({movie for ratings in users.values()
              for movie in ratings})

['Just My Luck',
 'Lady in the Water',
 'Snakes on a Plane',
 'Superman Returns',
 'The Night Listener',
 'You, Me and Dupree']

##### [3] The movies / ratings of Toby

In [5]:
users['Toby']

{'Snakes on a Plane': 4.5, 'Superman Returns': 4.0, 'You, Me and Dupree': 1.0}

##### [4] Rating of a particular movie  (recall the hash-of-hashes data structure)

In [6]:
users['Toby']['Superman Returns']

4.0

### Load similarity metrics

    sim.euclidean_distance( prefs, p1, p2 )
    sim.pearson( prefs, p1, p2 )

In [7]:
import math  # for sqrt
import similarity as sim

##### [5] The Euclidean distance and Pearson coefficient representing the similarity between two people.

In [8]:
# A routine to help with pretty printing
from pprint import pprint as pp
def pt( expr ):
    """Echo an expression, evaluate it, pretty-print the value and return it.

    Parameters
    ----------
    expr : str
        Python source for a single expression.  It is printed after a blank
        line with a ``=== `` prefix and then evaluated with ``eval``.

    Returns
    -------
    The value the expression evaluated to (also pretty-printed via ``pp``).
    """
    print()
    print("=== " + expr)
    # NOTE: eval executes arbitrary code -- only pass trusted, hand-written
    # expression strings to this helper.
    result = eval(expr)
    pp(result)
    return result

pt( "sim.euclidean_distance( users, 'Mick LaSalle', 'Gene Seymour' )" )
pt( "sim.pearson( users, 'Mick LaSalle', 'Gene Seymour' ) ")
print() 
pt( "sim.euclidean_distance( users, 'Lisa Rose', 'Jack Matthews' )" )
pt( "sim.pearson( users, 'Lisa Rose', 'Jack Matthews' )" )


=== sim.euclidean_distance( users, 'Mick LaSalle', 'Gene Seymour' )
0.12903225806451613

=== sim.pearson( users, 'Mick LaSalle', 'Gene Seymour' ) 
0.41176470588235276


=== sim.euclidean_distance( users, 'Lisa Rose', 'Jack Matthews' )
0.21052631578947367

=== sim.pearson( users, 'Lisa Rose', 'Jack Matthews' )
0.7470178808339965


0.7470178808339965

#### Exercise: Write a Python list expression which returns a list of tuples giving the similarity of Toby to everyone else. Your calculated result will be along the lines of:
    [('Mick LaSalle', 0.9244734516419049),
     ('Claudia Puig', 0.8934051474415647),
     ('Gene Seymour', 0.38124642583151164),
     ('Lisa Rose', 0.9912407071619299),
     ('Toby', 1.0),
     ('Jack Matthews', 0.66284898035987),
     ('Michael Phillips', -1.0)]

In [10]:
# Your answer
toby_sim = [(name, sim.pearson(users, 'Toby', name)) for name in users.keys()]
toby_sim

[('Claudia Puig', 0.8934051474415647),
 ('Gene Seymour', 0.38124642583151164),
 ('Jack Matthews', 0.66284898035987),
 ('Lisa Rose', 0.9912407071619299),
 ('Michael Phillips', -1.0),
 ('Mick LaSalle', 0.9244734516419049),
 ('Toby', 1.0)]

---
# User Based Recommendation
---


## most_similar
Returns the best matches, i.e., the most similar people, for `person` from the `users` dictionary. 
The number of results (`n`) and the similarity function (`similarity`) are optional parameters.

In [11]:
def most_similar( users, person, n=5, similarity=sim.pearson ):
    """Return the entries of `users` most similar to `person`, best first.

    Parameters
    ----------
    users : dict
        Mapping whose keys are the candidates to compare; it is passed
        through unchanged as the first argument of `similarity`.
    person : key of `users`
        The entry every other key is compared against; it is excluded
        from the result.
    n : int, optional
        Maximum number of matches returned (default 5).
    similarity : callable, optional
        Called as ``similarity(users, person, other)``; defaults to
        ``sim.pearson``.

    Returns
    -------
    list of (score, other) tuples in descending tuple order, at most `n` long.
    """
    scored = []
    for other in users:
        if other != person:
            scored.append((similarity(users, person, other), other))
    # Ascending sort followed by a full reversal, then keep the top n.
    return sorted(scored)[::-1][:n]

##### [6] Who has the most similar taste in movies to Toby?

In [12]:
most_similar( users, 'Toby' , n=4 )

[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig'),
 (0.66284898035987, 'Jack Matthews')]

## user_based_recommendations

Recommendations for a person, computed as a similarity-weighted average
of the ratings given by every other user whose similarity to that person is positive

In [13]:
def user_based_recommendations( prefs, person, similarity=sim.pearson ):
    """Predict scores for items `person` has not rated, best first.

    Each candidate item's score is the similarity-weighted average of the
    ratings given to it by the other entries of `prefs`.

    Parameters
    ----------
    prefs : dict
        Mapping ``name -> {item: rating}``.
    person : key of `prefs`
        The entry to produce recommendations for.
    similarity : callable, optional
        Called as ``similarity(prefs, person, other)``; defaults to
        ``sim.pearson``.

    Returns
    -------
    list of (predicted_score, item) tuples in descending tuple order.
    """
    weighted_totals = {}
    weight_sums = {}
    for other, their_ratings in prefs.items():
        # never compare the person with themselves
        if other == person:
            continue
        weight = similarity(prefs, person, other)

        # neighbours with zero or negative similarity contribute nothing
        if weight <= 0:
            continue
        for item, rating in their_ratings.items():
            # skip anything the person already rated with a non-zero score
            if item in prefs[person] and prefs[person][item] != 0:
                continue
            # running sum of rating * similarity
            weighted_totals[item] = weighted_totals.get(item, 0) + rating * weight
            # running sum of the similarities that contributed
            weight_sums[item] = weight_sums.get(item, 0) + weight

    # Normalise each total by the similarity mass behind it
    rankings = [(total / weight_sums[item], item)
                for item, total in weighted_totals.items()]

    # Highest predicted score first
    return sorted(rankings)[::-1]

##### [7] Based on Toby's preferences and the other critics' movie ratings, which movies are recommended for Toby?

In [14]:
user_based_recommendations( users,'Toby' )

[(3.3477895267131013, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.530980703765565, 'Just My Luck')]

---
## An Aside: Transpose
---
Do the equivalent of a matrix transpose on a hash-of-hashes.  Associate movies with various critics that reviewed them.

In [15]:
def transpose( users ):
    """Swap the two key levels of a dict of dicts.

    Given ``{person: {item: value}}`` this returns ``{item: {person: value}}``.
    The input mapping is not modified.
    """
    flipped = {}
    for person, ratings in users.items():
        for item, value in ratings.items():
            # Create the inner dict on first sight of the item, then file
            # the value under the person who supplied it.
            flipped.setdefault(item, {})[person] = value
    return flipped

#### [8] Switch movies and users

In [17]:
transpose( users )

{'Just My Luck': {'Claudia Puig': 3.0,
  'Gene Seymour': 1.5,
  'Lisa Rose': 3.0,
  'Mick LaSalle': 2.0},
 'Snakes on a Plane': {'Claudia Puig': 3.5,
  'Gene Seymour': 3.5,
  'Jack Matthews': 4.0,
  'Lisa Rose': 3.5,
  'Michael Phillips': 3.0,
  'Mick LaSalle': 4.0,
  'Toby': 4.5},
 'Superman Returns': {'Claudia Puig': 4.0,
  'Gene Seymour': 5.0,
  'Jack Matthews': 5.0,
  'Lisa Rose': 3.5,
  'Michael Phillips': 3.5,
  'Mick LaSalle': 3.0,
  'Toby': 4.0},
 'The Night Listener': {'Claudia Puig': 4.5,
  'Gene Seymour': 3.0,
  'Jack Matthews': 3.0,
  'Lisa Rose': 3.0,
  'Michael Phillips': 4.0,
  'Mick LaSalle': 3.0},
 'You, Me and Dupree': {'Claudia Puig': 2.5,
  'Gene Seymour': 3.5,
  'Jack Matthews': 3.5,
  'Lisa Rose': 2.5,
  'Mick LaSalle': 2.0,
  'Toby': 1.0},
 'Lady in the Water': {'Gene Seymour': 3.0,
  'Jack Matthews': 3.0,
  'Lisa Rose': 2.5,
  'Michael Phillips': 2.5,
  'Mick LaSalle': 3.0}}

#### [9] Once transposed, how should we interpret the below?  Think about the meaning before executing the code

In [18]:
movies=transpose(users)
pt( "most_similar( movies,'Superman Returns' )" )
pt( "user_based_recommendations( movies,'Just My Luck' )" )


=== most_similar( movies,'Superman Returns' )
[(0.6579516949597695, 'You, Me and Dupree'),
 (0.4879500364742689, 'Lady in the Water'),
 (0.11180339887498941, 'Snakes on a Plane'),
 (-0.1798471947990544, 'The Night Listener'),
 (-0.42289003161103106, 'Just My Luck')]

=== user_based_recommendations( movies,'Just My Luck' )
[(4.0, 'Michael Phillips'), (3.0, 'Jack Matthews')]


[(4.0, 'Michael Phillips'), (3.0, 'Jack Matthews')]

---
# Item Based Recommendation
---

## similarity_between_items
Note: For variety, the below routine uses Euclidean distance by default

In [19]:
def similarity_between_items( prefs, n=10, similarity=sim.euclidean_distance ):
    """Build a lookup of the most similar items for every item in `prefs`.

    Parameters
    ----------
    prefs : dict
        Mapping ``user -> {item: rating}``; it is transposed internally so
        that items become the outer keys.
    n : int, optional
        Maximum number of neighbours kept per item (default 10).
    similarity : callable, optional
        Forwarded to ``most_similar``; defaults to ``sim.euclidean_distance``.

    Returns
    -------
    dict mapping each item to the list returned by ``most_similar`` for it.
    """
    # Invert the preference matrix so it is keyed by item
    item_prefs = transpose( prefs )
    total = len(item_prefs)
    neighbours = {}
    for count, item in enumerate(item_prefs, start=1):
        # Progress line every 100 items, useful on large datasets
        if count % 100 == 0:
            print( "%d / %d" % (count, total) )
        # Top-n items most similar to this one
        neighbours[item] = most_similar( item_prefs, item, n, similarity )
    return neighbours

##### [10] Create a dictionary of items showing which other items they are most similar to.  Note that the result will be symmetrical

In [20]:
# cache the similarity matrix for later use
itemsim = similarity_between_items( users )
itemsim

{'Just My Luck': [(0.2222222222222222, 'Lady in the Water'),
  (0.18181818181818182, 'You, Me and Dupree'),
  (0.15384615384615385, 'The Night Listener'),
  (0.10526315789473684, 'Snakes on a Plane'),
  (0.06451612903225806, 'Superman Returns')],
 'Snakes on a Plane': [(0.2222222222222222, 'Lady in the Water'),
  (0.18181818181818182, 'The Night Listener'),
  (0.16666666666666666, 'Superman Returns'),
  (0.10526315789473684, 'Just My Luck'),
  (0.05128205128205128, 'You, Me and Dupree')],
 'Superman Returns': [(0.16666666666666666, 'Snakes on a Plane'),
  (0.10256410256410256, 'The Night Listener'),
  (0.09090909090909091, 'Lady in the Water'),
  (0.06451612903225806, 'Just My Luck'),
  (0.05333333333333334, 'You, Me and Dupree')],
 'The Night Listener': [(0.2857142857142857, 'Lady in the Water'),
  (0.18181818181818182, 'Snakes on a Plane'),
  (0.15384615384615385, 'Just My Luck'),
  (0.14814814814814814, 'You, Me and Dupree'),
  (0.10256410256410256, 'Superman Returns')],
 'You, Me a

## item_based_recommendations

In [21]:
def item_based_recommendations( prefs, itemMatch, user ):
    """Recommend unrated items for `user` from a precomputed item-similarity table.

    Each candidate's score is the similarity-weighted average of the user's
    own ratings for the items it is listed as similar to.

    Parameters
    ----------
    prefs : dict
        Mapping ``user -> {item: rating}``.
    itemMatch : dict
        Mapping ``item -> [(similarity, other_item), ...]``.  It must contain
        an entry for every item `user` has rated.
    user : key of `prefs`
        The user to produce recommendations for.

    Returns
    -------
    list of (predicted_score, item) tuples, highest score first.  Candidates
    whose similarities sum to exactly zero are omitted because their weighted
    average is undefined (this previously raised ZeroDivisionError).

    Raises
    ------
    KeyError
        If `user` is not in `prefs`, or a rated item is missing from `itemMatch`.
    """
    userRatings = prefs[user]
    scores = {}
    totalSim = {}
    # Loop over items rated by this user
    for item, rating in userRatings.items():
        # Loop over items similar to this one
        for similarity, item2 in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings:
                continue
            # Weighted sum of rating times similarity
            scores[item2] = scores.get(item2, 0) + similarity * rating
            # Sum of all the similarities
            totalSim[item2] = totalSim.get(item2, 0) + similarity

    # Divide each total score by total weighting to get an average.
    # Signed similarity metrics (e.g. Pearson) can cancel out to zero, so
    # skip those candidates instead of dividing by zero.
    rankings = [
        (score / totalSim[item], item)
        for item, score in scores.items()
        if totalSim[item] != 0
    ]

    # Return the rankings from highest to lowest
    rankings.sort(reverse=True)
    return rankings

##### [11] Item based recommendations

In [22]:
# Rebuild the item-similarity cache only if it is currently None.
# `is None` tests identity with the None singleton; `== None` goes through
# __eq__ and is the non-idiomatic form.
if itemsim is None: itemsim = similarity_between_items( users )
pt( "item_based_recommendations( users, itemsim, 'Toby' )" )


=== item_based_recommendations( users, itemsim, 'Toby' )
[(3.182634730538922, 'The Night Listener'),
 (2.5983318700614575, 'Just My Luck'),
 (2.4730878186968837, 'Lady in the Water')]


[(3.182634730538922, 'The Night Listener'),
 (2.5983318700614575, 'Just My Luck'),
 (2.4730878186968837, 'Lady in the Water')]

# MovieLens data set

In [None]:
# http://grouplens.org/datasets/movielens/

def load_movielens_data( path='./ml-100k' ):
    """Load movie titles and user ratings from a MovieLens directory.

    Reads ``<path>/u.item`` (``|``-separated; the first two fields are used
    as movie id and title) and ``<path>/u.data`` (tab-separated
    ``user, movie id, rating, timestamp``).

    Parameters
    ----------
    path : str, optional
        Directory containing ``u.item`` and ``u.data`` (default ``./ml-100k``).

    Returns
    -------
    (ml_movies, ml_recs)
        ``ml_movies`` maps movie id -> title; ``ml_recs`` maps
        user id -> {title: rating as float}.  Ids are kept as strings.

    Raises
    ------
    KeyError
        If ``u.data`` references a movie id that is absent from ``u.item``.
    """
    # Get movie titles
    ml_movies = {}
    # `with` guarantees the file handle is closed even if parsing fails
    with open( path+'/u.item', encoding="ISO-8859-1" ) as item_file:
        for line in item_file:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            movie_id, title = line.split('|')[0:2]
            ml_movies[movie_id] = title

    # Load data
    ml_recs = {}
    with open( path+'/u.data' ) as data_file:
        for line in data_file:
            if not line.strip():
                continue
            # note that both user and movieid are retained as strings -not- numbers
            user, movieid, rating, _timestamp = line.split('\t')
            # ratings are keyed by title, not by movie id
            ml_recs.setdefault( user, {} )[ml_movies[movieid]] = float(rating)
    return ml_movies, ml_recs

ml_movies = None
ml_recs = None
ml_itemsim = None

In [None]:
if ml_movies is None:
    ml_movies, ml_recs = load_movielens_data( "./ml-100k" )

'loaded'

##### [12] user based recommendation on the movie lens data set
Note: by default UBR (`user_based_recommendations`) uses Pearson similarity

In [None]:
ml_userid='87'
user_based_recommendations( ml_recs, ml_userid )[0:10]

##### [13] UBR using Euclidean Distance

In [None]:
user_based_recommendations( ml_recs, ml_userid, similarity=sim.euclidean_distance )[0:10]

In [None]:
if ml_itemsim is None:
    ml_itemsim = similarity_between_items( ml_recs, n=50 )

'similarity matrix calculated'

##### [14] Item based recommendation for user
Note: In the above code IBR uses Euclidean distance

In [None]:
ml_userid='87'
item_based_recommendations( ml_recs, ml_itemsim, ml_userid )[0:30]