# Pycon 2019 Workshop: Build an Intelligent Recommendation System

## Objectives
* Have a general idea of how recommendation systems operate.

## Outcomes
* Provide attendees with the necessary vocabulary and concepts of recommendation systems.
* To design and implement recommendation systems using Python.

# Collaborative Filtering
Making recommendations based on other people.

In [None]:
from math import sqrt

# Sample ratings data: maps each user name to a dict of {artist name: rating}.
# Users have not all rated the same artists, so the distance and similarity
# functions in this file compare only the artists two users have in common.
users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},
         "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
         "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},
         "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0},
         "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0},
         "Jordyn":  {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0},
         "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},
         "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0},
         "Clara": {"Blues Traveler": 4.75, "Norah Jones": 4.5, "Phoenix": 5.0, "The Strokes": 4.25, "Weird Al": 4.0},
         "Robert": {"Blues Traveler": 4.0, "Norah Jones": 3.0, "Phoenix": 5.0, "The Strokes": 2, "Weird Al": 1.0}
        }

# Investigate 2 dimensional relationships

Find similar users by calculating their 2-dimensional distance from each other based on their overall ratings, using only items both users reviewed.  These techniques work best when there are no missing values.  The Minkowski Distance Metric generalizes these measures through its parameter r (r = 1 gives the Manhattan Distance, r = 2 the Euclidean Distance, and r = ∞ the Supremum Distance); it reminds us that the greater the value of r, the more a large difference in one dimension will influence the total difference.  Never assume that all users have rated all products!

### Manhattan Distance
* Benefit is that it is fast to compute.

### Euclidean Distance
* More steps to calculate but is a more accurate measurement than Manhattan Distance.

In [None]:
# Look up Angelica's {artist: rating} dictionary.
users['Angelica']

In [None]:
# Look up Veronica's {artist: rating} dictionary for comparison with Angelica's.
users['Veronica']

In [None]:
def manhattan(rating1, rating2):
    """Return the Manhattan (city-block) distance between two sets of ratings.

    Both arguments are dictionaries of the form
    {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}.  Only items that appear in
    both dictionaries are compared; the result is the sum of
    |rating1[item] - rating2[item]| over those shared items.

    Returns -1 when the two dictionaries have no items in common.
    """
    shared = [item for item in rating1 if item in rating2]
    if not shared:
        return -1  # sentinel: no ratings in common
    return sum(abs(rating1[item] - rating2[item]) for item in shared)

In [None]:
# Manhattan distance between Angelica and Veronica over the artists both rated.
manhattan(users['Angelica'], users['Veronica'])

In [None]:
# Manhattan distance between Dan and Sam over the artists both rated.
manhattan(users['Dan'], users['Sam'])

In [None]:
from scipy.spatial.distance import cdist

# Cross-check against SciPy: cdist takes two collections of points (one row
# per point) and returns the matrix of pairwise distances between them.
# A and B each contain a single 2-D point here.
A = [[3.5, 2.5]]

B = [[3, 3]]
# Alternative inputs built from the ratings data, left disabled.
# NOTE(review): the two users' value lists differ in length and are not
# aligned by artist, so these would need aligning first - confirm before enabling.
#A = [list(users['Angelica'].values())]
#B = [list(users['Dan'].values())]
# 'cityblock' is SciPy's name for the Manhattan metric.
out = cdist(A, B, metric='cityblock')
print(out)

In [None]:
import matplotlib.pyplot as plt

# A and B each hold ONE 2-D point as a single row (the layout cdist expects).
# Passing them straight to scatter(A, B) would use A as the x-values and B as
# the y-values and draw (3.5, 3) and (2.5, 3).  Split the stacked points into
# x and y columns so the two actual points, (3.5, 2.5) and (3, 3), are drawn.
xs, ys = zip(*(A + B))
plt.scatter(xs, ys)
plt.show()

In [None]:
def euclidean(rating1, rating2):
    """Return the Euclidean distance between two sets of ratings.

    Both arguments are dictionaries of the form
    {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}.  Only items that appear in
    both dictionaries are compared; the result is the square root of the sum
    of squared rating differences over those shared items.

    Returns -1 when the two dictionaries have no items in common.
    """
    squares = [abs(rating1[item] - rating2[item]) ** 2
               for item in rating1 if item in rating2]
    if not squares:
        return -1  # sentinel: no ratings in common
    return sqrt(sum(squares))

In [None]:
# Euclidean distance between Angelica and Veronica over the artists both rated.
euclidean(users['Angelica'], users['Veronica'])

In [None]:
# Euclidean distance between Dan and Sam over the artists both rated.
euclidean(users['Dan'], users['Sam'])

In [None]:
def computeNearestNeighbor(username, users, dist='Manhattan'):
    """Return every other user as (distance, name), closest first.

    username -- the user to compare everyone else against
    users    -- dictionary of {user name: {item: rating}}
    dist     -- 'Manhattan' (default) or 'Euclidean'

    Users who share no rated items with `username` (distance sentinel -1)
    stay in the list but are placed after all users with a real distance, so
    they can never be mistaken for the nearest neighbour.

    Raises ValueError for an unrecognised `dist`, rather than silently
    returning an empty list.
    """
    if dist == 'Euclidean':
        metric = euclidean
    elif dist == 'Manhattan':
        metric = manhattan
    else:
        raise ValueError(
            "dist must be 'Manhattan' or 'Euclidean', got {!r}".format(dist))

    distances = [(metric(users[user], users[username]), user)
                 for user in users if user != username]
    # Closest first.  A negative distance is the "nothing in common" sentinel;
    # the leading boolean pushes those entries to the end of the ranking.
    distances.sort(key=lambda pair: (pair[0] < 0, pair))
    return distances

In [None]:
# Rank the other users by Manhattan distance (the default) from Dan.
computeNearestNeighbor('Dan', users)

In [None]:
# Same ranking for Dan, this time using Euclidean distance.
computeNearestNeighbor('Dan', users, 'Euclidean')

In [None]:
def recommend(username, users, dist='Manhattan'):
    """Recommend items to `username` taken from their single nearest neighbour.

    The nearest neighbour is the first entry returned by
    computeNearestNeighbor(username, users, dist).  Every item that neighbour
    rated and `username` did not is returned as an (item, neighbour's rating)
    tuple, highest rating first.
    """
    ranked = computeNearestNeighbor(username, users, dist)
    _, closest = ranked[0]
    alreadyRated = users[username]
    candidates = [(artist, score)
                  for artist, score in users[closest].items()
                  if artist not in alreadyRated]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates

In [None]:
# Recommendations for Hailey from her nearest neighbour by Manhattan distance.
recommend('Hailey', users)

In [None]:
# Same request for Hailey using Euclidean distance to pick the neighbour.
recommend('Hailey', users, 'Euclidean') # This particular recommendation would change with the amount of products and users.

In [None]:
# Recommendations for Dan; only artists Dan has not rated can appear.
recommend('Dan', users) # Is this because this is the only product left for Dan or because he would actually like it or both?

In [None]:
# Recommendations for Sam, each paired with the nearest neighbour's own rating.
recommend('Sam', users) # Will Sam like this product?

In [None]:
# The result depends entirely on which single user ranks nearest and on what
# that user rated that Angelica did not.
recommend('Angelica', users, 'Euclidean') # What happened here?

In [None]:
# Full Euclidean ranking for Angelica; the first entry is the neighbour used by recommend().
computeNearestNeighbor('Angelica', users, 'Euclidean') # Take a look at that person's nearest neighbor

In [None]:
import pandas as pd

# Build a DataFrame from the nested dict: one column per user, one row per
# artist; artists a user has not rated become missing values.
df = pd.DataFrame(users)

In [None]:
# Select just the two users' columns to view their ratings side by side.
df[['Angelica', 'Veronica']] # Compare the product ratings of these two users.

In [None]:
def minkowski(rating1, rating2, r):
    """Return the Minkowski distance of order r between two sets of ratings.

    Both rating1 and rating2 are dictionaries of the form
    {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}.  Only items present in both
    are compared.  Minkowski distance is typically used with r being
    1 (Manhattan distance) or 2 (Euclidean distance).

    Returns float('inf') when the two dictionaries share no items.  The
    previous value, 0, was indistinguishable from "identical ratings" and
    ranked such users as the closest possible match; infinity sorts after
    every real distance.
    """
    common = [key for key in rating1 if key in rating2]
    if not common:
        return float('inf')  # no ratings in common: infinitely far apart
    total = sum(pow(abs(rating1[key] - rating2[key]), r) for key in common)
    return pow(total, 1 / r)

In [None]:
# r=2 makes this the Euclidean distance between Angelica and Veronica.
minkowski(users['Angelica'], users['Veronica'], 2)

In [None]:
def computeNearestNeighbor(username, users, r):
    """Return every other user as (distance, name), smallest distance first.

    Distances are Minkowski distances of order r, computed by minkowski():
    r=1 is the Manhattan distance and r=2 the Euclidean distance; larger
    values of r give increasing weight to the largest single difference.
    """
    return sorted(
        (minkowski(users[other], users[username], r), other)
        for other in users
        if other != username
    )

In [None]:
# Rank the other users by Minkowski distance of order 2 from Angelica.
computeNearestNeighbor('Angelica', users, 2)

In [None]:
def recommend(username, users, r):
    """Recommend items to `username` taken from their single nearest neighbour.

    The nearest neighbour is the first entry returned by
    computeNearestNeighbor(username, users, r), i.e. the closest user by
    Minkowski distance of order r.  Each item that neighbour rated and
    `username` did not is returned as an (item, neighbour's rating) tuple,
    highest rating first.
    """
    neighbours = computeNearestNeighbor(username, users, r)
    nearest = neighbours[0][1]
    alreadyRated = users[username]
    unseen = []
    for artist, rating in users[nearest].items():
        if artist in alreadyRated:
            continue
        unseen.append((artist, rating))
    unseen.sort(key=lambda pair: pair[1], reverse=True)
    return unseen

In [None]:
# Recommendations for Angelica from her nearest neighbour at r=2 (Euclidean).
recommend('Angelica', users, 2) # What happened here?

In [None]:
# Summary statistics per user: describe() works column-wise (one column per
# user) and .T transposes the result so each user becomes a row.
df.describe().T # Closer inspection of user ratings show that users behave differently when rating products.

Look at Jordyn's min and max values: her ratings all fall in a narrow band near the top of the scale.  This is known as "grade inflation".

This variability can create problems with a recommendation system.  One way to overcome this variability is to use Pearson's Correlation Coefficient.

In [None]:
# Two-user subset (same values as the Clara and Robert entries in `users`),
# used below to plot one user's ratings against the other's.
users2 = {"Clara": {"Blues Traveler": 4.75, "Norah Jones": 4.5, "Phoenix": 5.0, "The Strokes": 4.25, "Weird Al": 4.0},
          "Robert": {"Blues Traveler": 4.0, "Norah Jones": 3.0, "Phoenix": 5.0, "The Strokes": 2, "Weird Al": 1.0} }

In [None]:
# One column per user (Clara, Robert), one row per artist.
df2 = pd.DataFrame(users2)

In [None]:
import matplotlib.pyplot as plt

# Scatter Clara's rating (x) against Robert's rating (y), one point per artist.
fig, ax = plt.subplots()
df2.plot('Clara', 'Robert', kind='scatter', ax=ax)
# Label each point with its artist: k is the row label (artist name) and v is
# that row's (Clara, Robert) pair, used as the annotation position.
for k, v in df2.iterrows():
    ax.annotate(k, v)
fig.canvas.draw() # Appears to be in perfect agreement, but look at the scaling!

In [None]:
# Plot both users' ratings against the artist names on a shared axis, so the
# difference in the range of scores each user gives is visible.
plt.scatter(df2["Clara"].index, df2["Clara"])
plt.scatter(df2["Robert"].index, df2["Robert"])
plt.show()

Use Pearson Correlation Coefficient to find the individual who is most similar to the person we are interested in finding.

In [None]:
def pearson(rating1, rating2):
    """Return the Pearson Correlation Coefficient of two sets of ratings.

    The coefficient measures the correlation between two variables; use it to
    find the individual who is most similar to a particular user.  It ranges
    between -1 and 1 inclusive:
     1 indicates perfect agreement.
    -1 indicates perfect disagreement.

    Both arguments are {item: rating} dictionaries; only items present in
    both are used.  Returns 0 when there are no shared items, or when either
    user's shared ratings are all identical (the coefficient is undefined
    there because the denominator is zero).
    """
    common = [key for key in rating1 if key in rating2]
    n = len(common)
    if n == 0:
        # Nothing to correlate; without this guard the divisions by n below
        # raise ZeroDivisionError.
        return 0

    sum_xy = 0
    sum_x = 0
    sum_y = 0
    sum_x2 = 0
    sum_y2 = 0
    for key in common:
        x = rating1[key]
        y = rating2[key]
        sum_xy += x * y
        sum_x += x
        sum_y += y
        sum_x2 += pow(x, 2)
        sum_y2 += pow(y, 2)

    # Mathematically these are never negative, but floating-point rounding
    # can leave a tiny negative value for (near-)constant ratings, which
    # would make sqrt() raise ValueError.  Clamp at zero.
    spread_x = max(sum_x2 - pow(sum_x, 2) / n, 0)
    spread_y = max(sum_y2 - pow(sum_y, 2) / n, 0)
    denominator = sqrt(spread_x) * sqrt(spread_y)
    if denominator == 0:
        return 0
    return (sum_xy - (sum_x * sum_y) / n) / denominator

In [None]:
# Back to the full users table, including Clara and Robert
# Correlation between Clara's and Robert's ratings over the artists both rated.
pearson(users['Clara'], users['Robert']) # Only these 2 users are compared.

In [None]:
# Correlation between Angelica's and Bill's ratings over the artists both rated.
pearson(users['Angelica'], users['Bill'])

# Cosine Similarity
Chances are good that you do not have any ratings for products that I have, and the reverse is also true.

Cosine Similarity is used to find similarity without being concerned about the number of zeros, as any measure of similarity should not depend on the shared-zero values.

This is one of the reasons why replacing null values with zeros and using a distance measure is a bad idea.  Can you explain what would happen in this situation?

In [None]:
def computeUserAverages(users):
    """Return {user name: mean of that user's ratings}.

    `users` maps each user name to an {item: rating} dictionary.
    """
    return {
        name: float(sum(scores.values())) / len(scores.values())
        for name, scores in users.items()
    }

In [None]:
# Average rating given by each user in the sample data.
computeUserAverages(users)

In [None]:
# Use Pandas to find the mean of each user.
# describe() summarises each column (one per user); take its 'mean' row.
df.describe(include='all').loc['mean']
# Alternative, left disabled: column-wise mean computed directly.
#df.mean(axis=0)

In [None]:
# Use Pandas to find the mean of each product.
# axis=1 averages across the user columns, giving one value per artist row.
df.mean(axis=1)

Use adjusted cosine similarity to compensate for grade inflation by subtracting a user's average rating from each of their ratings.

In [None]:
def computeSimilarity(band1, band2, userRatings):
    """Return the adjusted cosine similarity between two items.

    band1, band2 -- the two item names to compare
    userRatings  -- dictionary of {user name: {item: rating}}

    Each user's average rating (over everything that user rated) is
    subtracted from their ratings, and only users who rated both items
    contribute.  Returns None when the denominator is zero, for example when
    no user rated both items.
    """
    userMeans = {
        name: float(sum(scores.values())) / len(scores.values())
        for name, scores in userRatings.items()
    }

    dot = 0      # numerator
    norm1 = 0    # squared length of band1's adjusted ratings
    norm2 = 0    # squared length of band2's adjusted ratings
    for name, scores in userRatings.items():
        if band1 not in scores or band2 not in scores:
            continue
        offset1 = scores[band1] - userMeans[name]
        offset2 = scores[band2] - userMeans[name]
        dot += offset1 * offset2
        norm1 += offset1 ** 2
        norm2 += offset2 ** 2

    denominator = sqrt(norm1) * sqrt(norm2)
    if denominator == 0:
        return None
    return dot / denominator

In [None]:
# NOTE(review): 'Deadmau5' is rated in `users` but does not appear in this
# list - confirm whether the omission is intentional.
bands = ['Blues Traveler', 'Broken Bells', 'Norah Jones', 'Phoenix', 'Slightly Stoopid', 'The Strokes', 'Vampire Weekend', 'Weird Al']

# Print the adjusted cosine similarity for every ordered pair of bands.
# computeSimilarity returns None when the similarity is undefined; None cannot
# be formatted with %f, so those rows are printed with the string format.
for b in bands:
    for x in bands:
        similarity = computeSimilarity(b, x, users)
        if similarity is None:
            print("%20s%20s%10.5s" % (b, x, similarity))
        else:
            print("%20s%20s%10.5f" % (b, x, similarity))

# Using K Nearest Neighbor to base recommendations on more than one person who is similar to our user.

This helps to iron out the influence of any 'quirky' recommendations of any single person.

The k-nearest neighbor approach to collaborative filtering uses k most similar people to 
determine recommendations. The best value for k is application specific so some experimentation is required.

In [None]:
class recommender:
    """Recommendation class using K Nearest Neighbor (KNN).

    Ratings are held as {user: {item: rating}}.  Users are compared with the
    Pearson correlation coefficient, and a user's recommendations are the
    similarity-weighted ratings that their k most similar users gave to items
    the user has not rated.
    """

    def __init__(self, data, k=1, metric='pearson', n=5):
        """Initialize the recommender.

        data   -- ratings dictionary.  It is stored as self.data only when it
                  is a dict (or dict subclass); for any other type no data is
                  stored.
        k      -- the k value for k nearest neighbor
        metric -- which similarity formula to use.  Only 'pearson' is wired
                  up; any other name leaves self.fn unset.
        n      -- the maximum number of recommendations to make
        """
        self.k = k
        self.n = n
        self.username2id = {}
        self.userid2name = {}
        self.productid2name = {}
        # The metric name is kept alongside the function it selects.
        self.metric = metric
        if self.metric == 'pearson':
            self.fn = self.pearson
        # isinstance (rather than comparing the type's name) also accepts
        # dict subclasses such as OrderedDict and defaultdict.
        if isinstance(data, dict):
            self.data = data

    def convertProductID2name(self, id):
        """Given product id number return product name.

        Ids with no entry in self.productid2name are returned unchanged.
        """
        return self.productid2name.get(id, id)

    def userRatings(self, id, n):
        """Print the n top ratings for the user with the given id.

        Prints a header, the number of items the user rated, then one
        "name<TAB>rating" line per item, highest rating first.  Ratings are
        printed with %i, i.e. as integers.
        """
        # Fall back to the id itself when no display name is registered, the
        # same way convertProductID2name does for products.
        print("Ratings for " + str(self.userid2name.get(id, id)))
        ratings = self.data[id]
        print(len(ratings))
        ranked = [(self.convertProductID2name(k), v)
                  for (k, v) in ratings.items()]
        ranked.sort(key=lambda artistTuple: artistTuple[1], reverse=True)
        for rating in ranked[:n]:
            print("%s\t%i" % (rating[0], rating[1]))

    def pearson(self, rating1, rating2):
        """Return the Pearson correlation of two {item: rating} dictionaries.

        Determines how much influence each person should have, adjusting for
        grade inflation.  Only items present in both dictionaries are used.
        Returns 0 when there are no shared items or when the denominator is
        zero.
        """
        sum_xy = 0
        sum_x = 0
        sum_y = 0
        sum_x2 = 0
        sum_y2 = 0
        n = 0
        for key in rating1:
            if key in rating2:
                n += 1
                x = rating1[key]
                y = rating2[key]
                sum_xy += x * y
                sum_x += x
                sum_y += y
                sum_x2 += pow(x, 2)
                sum_y2 += pow(y, 2)
        if n == 0:
            return 0
        # Clamp at zero: rounding can leave a tiny negative value for
        # (near-)constant ratings, which would make sqrt() raise ValueError.
        spread_x = max(sum_x2 - pow(sum_x, 2) / n, 0)
        spread_y = max(sum_y2 - pow(sum_y, 2) / n, 0)
        denominator = sqrt(spread_x) * sqrt(spread_y)
        if denominator == 0:
            return 0
        return (sum_xy - (sum_x * sum_y) / n) / denominator

    def computeNearestNeighbor(self, username):
        """Return every other user as (name, similarity), most similar first."""
        distances = []
        for instance in self.data:
            if instance != username:
                distance = self.fn(self.data[username],
                                   self.data[instance])
                distances.append((instance, distance))
        # self.fn is a similarity, so the largest value is the closest user.
        distances.sort(key=lambda artistTuple: artistTuple[1],
                       reverse=True)
        return distances

    def recommend(self, user):
        """Return up to self.n (item, score) recommendations for `user`.

        The k most similar users are combined; each neighbour's rating is
        weighted by that neighbour's share of the total similarity.  When
        fewer than k other users exist, all of them are used.  Returns an
        empty list when the neighbours' similarities sum to zero, because no
        weights can be formed.
        """
        recommendations = {}
        # Slicing (instead of indexing 0..k-1) tolerates k larger than the
        # number of other users.
        nearest = self.computeNearestNeighbor(user)[:self.k]
        userRatings = self.data[user]

        totalDistance = 0.0
        for (name, similarity) in nearest:
            totalDistance += similarity
        if totalDistance == 0:
            return []

        for (name, similarity) in nearest:
            # this neighbour's slice of the pie
            weight = similarity / totalDistance
            neighborRatings = self.data[name]
            # accumulate items the neighbour rated that the user didn't
            for artist in neighborRatings:
                if artist not in userRatings:
                    recommendations[artist] = (
                        recommendations.get(artist, 0)
                        + neighborRatings[artist] * weight)

        ranked = [(self.convertProductID2name(k), v)
                  for (k, v) in recommendations.items()]
        ranked.sort(key=lambda artistTuple: artistTuple[1], reverse=True)
        # Return the first n items
        return ranked[:self.n]



In [None]:
# Build a recommender over the sample data with the default settings
# (k=1, Pearson similarity, at most 5 recommendations) and print the
# recommendations for two users.
r = recommender(users)
user = 'Jordyn'
print('Recommendation for {}: {}'.format(user, r.recommend(user)))
user = 'Hailey'
print('Recommendation for {}: {}'.format(user, r.recommend(user)))

# Using Slope One to make item-based recommendations
Slope One has two steps:
1. Calculate deviations between all pairs of items.
2. Use deviations to make predictions.

There are several algorithms for Slope One.  This is an example of a Weighted Slope One.

Thanks to Bryan O’Sullivan's blog "teideal glic deisbhéalach" for providing a Python version of Slope One:

http://www.serpentine.com/blog/2006/12/12/collaborative-filtering-made-easy/

In [None]:
def buildAverageDiffs(items, users, averages, writeToCache=True):
    """Fill `averages` with the mean rating difference for each item pair.

    items        -- iterable of item ids (only the ids themselves are used)
    users        -- dictionary of {user id: {item id: rating}}
    averages     -- dictionary updated in place: averages[(a, b)] becomes the
                    mean of rating(a) - rating(b) over users who rated both
    writeToCache -- accepted for interface compatibility; has no effect

    Pairs that no user rated together get no entry, because there is nothing
    to average.  Returns None; the result is delivered through `averages`.
    """
    for itemId in items:
        for otherItemId in items:
            if itemId == otherItemId:
                continue
            diffs = [ratings[itemId] - ratings[otherItemId]
                     for ratings in users.values()
                     if itemId in ratings and otherItemId in ratings]
            if diffs:
                averages[(itemId, otherItemId)] = sum(diffs) / len(diffs)

def suggestedRating(users, items, averages, targetUserId, targetItemId):
    """Predict the rating `targetUserId` would give `targetItemId`.

    Weighted Slope One: for every other item the target user rated, add the
    average difference averages[(targetItemId, item)] to the user's rating of
    that item, and weight the result by the number of users who rated both
    items.

    users    -- dictionary of {user id: {item id: rating}}
    items    -- not used by the calculation; kept for interface compatibility
    averages -- dictionary of {(item a, item b): mean of rating(a) - rating(b)}

    Returns the weighted mean of those estimates, or None when the user has
    rated no item that was co-rated with the target item.
    """
    runningRatingCount = 0
    weightedRatingTotal = 0.0
    for itemId, rating in users[targetUserId].items():
        if itemId == targetItemId:
            # The user's own rating of the target is not evidence, and
            # averages has no (item, item) entries.
            continue
        ratingCount = usersWhoRatedBoth(users, itemId, targetItemId)
        if ratingCount == 0:
            # Nobody rated both items: the pair carries zero weight and may
            # have no entry in averages.
            continue
        weightedRatingTotal += (rating + averages[(targetItemId, itemId)]) * ratingCount
        runningRatingCount += ratingCount
    if runningRatingCount == 0:
        return None
    return weightedRatingTotal / runningRatingCount

def usersWhoRatedBoth(users, itemId1, itemId2):
    """Return how many users in `users` have a rating for both items.

    `users` maps each user id to an {item id: rating} dictionary.
    """
    return sum(
        1
        for ratings in users.values()
        if itemId1 in ratings and itemId2 in ratings
    )

#-----------------------------------MAIN---------------------------------------
# Small Slope One example.  This rebinds the name users2, replacing the
# Clara/Robert dictionary defined earlier in the file.
users2 = {"Amy": {"Taylor Swift": 4, "PSY": 3, "Whitney Houston": 4},
          "Ben": {"Taylor Swift": 5, "PSY": 2}
          ,"Clara": {"PSY": 3.5, "Whitney Houston": 4},
          "Daisy": {"Taylor Swift": 5, "Whitney Houston": 3}}
# The same ratings keyed by item.  buildAverageDiffs only iterates over the
# keys of this dictionary; the ratings themselves are read from users2.
# NOTE(review): 'PSY' omits Amy's rating of 3 that appears in users2 - confirm.
items = {'Taylor Swift': {"Amy":4, "Ben":5, "Daisy":5},
         'PSY': {"Ben":2, "Clara":3.5},
         'Whitney Houston': {"Amy":4, "Clara":4, "Daisy":3}}

# Filled in place by buildAverageDiffs with one entry per ordered item pair.
averages = {}

buildAverageDiffs(items, users2, averages)

print({'ItemCount': len(items), 'UserCount': len(users2), 'AverageDiffsCount': len(averages)} )
print(averages)

# Predict ratings for items these users have not rated.
print("\n\rGuess of rating that user 'Ben' will give 'Whitney Houston'= " + str(suggestedRating(users2,items, averages, 'Ben', 'Whitney Houston')))
print("\n\rGuess of rating that user 'Clara' will give 'Taylor Swift'= " + str(suggestedRating(users2,items, averages, 'Clara', 'Taylor Swift')))
print("\n\rNotice Clara's score!")

In [None]:
# Alternate version of the buildAverageDiffs() function.
def computeDeviations(users):
    """Print the average rating deviation between every pair of co-rated items.

    `users` maps each user to an {item: rating} dictionary.  For every ordered
    pair of distinct items rated by the same user, the differences
    rating(item) - rating(item2) are summed and then divided by the number of
    users who rated both.  The result, a nested dictionary
    {item: {item2: average deviation}}, is printed.

    Note: the table is printed, not returned; the function returns None.
    """
    counts = {}
    diffs = {}
    for userRatings in users.values():
        for item, rating in userRatings.items():
            itemCounts = counts.setdefault(item, {})
            itemDiffs = diffs.setdefault(item, {})
            for other, otherRating in userRatings.items():
                if item == other:
                    continue
                # running count and running sum for this ordered pair
                itemCounts[other] = itemCounts.get(other, 0) + 1
                itemDiffs[other] = itemDiffs.get(other, 0.0) + (rating - otherRating)

    # turn each running sum into an average
    for item, itemDiffs in diffs.items():
        for other in itemDiffs:
            itemDiffs[other] = itemDiffs[other] / counts[item][other]
    print(diffs)

In [None]:
# How is this output different than the output of the buildAverageDiffs() function?
# Prints a nested {item: {item2: deviation}} table for the Slope One sample data.
computeDeviations(users2)

In [None]:
def _slopeOneTables(users):
    """Return (deviations, frequencies) for every ordered pair of co-rated items."""
    frequencies = {}
    deviations = {}
    for ratings in users.values():
        for (item, rating) in ratings.items():
            itemFrequencies = frequencies.setdefault(item, {})
            itemDeviations = deviations.setdefault(item, {})
            for (item2, rating2) in ratings.items():
                if item != item2:
                    itemFrequencies[item2] = itemFrequencies.get(item2, 0) + 1
                    itemDeviations[item2] = (itemDeviations.get(item2, 0.0)
                                             + rating - rating2)
    for (item, itemDeviations) in deviations.items():
        for item2 in itemDeviations:
            itemDeviations[item2] /= frequencies[item][item2]
    return deviations, frequencies


def slopeOneRecommendations(userRatings):
    """Return Weighted Slope One predictions for every user in the data.

    userRatings -- the full ratings table, {user: {item: rating}}

    For each user, every item the user has not rated gets a predicted rating:
    the frequency-weighted mean of (deviation(item, rated item) + the user's
    rating of the rated item) over the user's rated items that were co-rated
    with it.

    Returns {user: [(item, predicted rating), ...]}, each list sorted from the
    highest prediction down and limited to 50 entries.

    The deviation and frequency tables are built here, because this function
    has no recommender instance to read them from and computeDeviations()
    prints its table instead of returning it.
    """
    deviations, frequencies = _slopeOneTables(userRatings)
    results = {}
    for (user, ratings) in userRatings.items():
        totals = {}    # running numerator of the formula, per candidate item
        weights = {}   # running sum of frequencies, per candidate item
        for (userItem, userRating) in ratings.items():
            # for every item in the dataset that this user didn't rate
            for (diffItem, diffRatings) in deviations.items():
                if diffItem in ratings or userItem not in diffRatings:
                    continue
                freq = frequencies[diffItem][userItem]
                totals[diffItem] = (totals.get(diffItem, 0.0)
                                    + (diffRatings[userItem] + userRating) * freq)
                weights[diffItem] = weights.get(diffItem, 0) + freq
        ranked = [(item, total / weights[item])
                  for (item, total) in totals.items()]
        ranked.sort(key=lambda artistTuple: artistTuple[1], reverse=True)
        # Keep the first 50 recommendations
        results[user] = ranked[:50]
    return results

In [None]:
# Print the deviation table for the sample data again for reference.
computeDeviations(users2)

In [None]:
# Run Slope One, passing the whole users2 ratings table as the argument.
slopeOneRecommendations(users2)

# Incorporate Slope One into the Recommender class.

In [None]:
class recommender:
    """K Nearest Neighbor and Slope One recommender over a ratings dictionary.

    Ratings are held as {user: {item: rating}}.

    Sample data set available at https://grouplens.org/datasets/movielens/
    """

    def __init__(self, data, k=1, metric='pearson', n=5):
        """Initialize the recommender.

        data   -- ratings dictionary.  It is stored as self.data only when it
                  is a dict (or dict subclass); for any other type no data is
                  stored.
        k      -- the k value for k nearest neighbor
        metric -- which similarity formula to use.  Only 'pearson' is wired
                  up; any other name leaves self.fn unset.
        n      -- the maximum number of recommendations to make
        """
        self.k = k
        self.n = n
        self.username2id = {}
        self.userid2name = {}
        self.productid2name = {}
        # The following two tables are used for Slope One and are filled by
        # computeDeviations().
        self.frequencies = {}
        self.deviations = {}
        # The metric name is kept alongside the function it selects.
        self.metric = metric
        if self.metric == 'pearson':
            self.fn = self.pearson
        # isinstance (rather than comparing the type's name) also accepts
        # dict subclasses such as OrderedDict and defaultdict.
        if isinstance(data, dict):
            self.data = data

    def convertProductID2name(self, id):
        """Given product id number return product name.

        Ids with no entry in self.productid2name are returned unchanged.
        """
        return self.productid2name.get(id, id)

    def userRatings(self, id, n):
        """Print the n top ratings for the user with the given id.

        Prints a header, the number of items the user rated, then one
        "name<TAB>rating" line per item, highest rating first.  Ratings are
        printed with %i, i.e. as integers.
        """
        # Fall back to the id itself when no display name is registered, the
        # same way convertProductID2name does for products.
        print("Ratings for " + str(self.userid2name.get(id, id)))
        ratings = self.data[id]
        print(len(ratings))
        ranked = [(self.convertProductID2name(k), v)
                  for (k, v) in ratings.items()]
        # Sort BEFORE truncating so the n highest ratings are shown rather
        # than the first n in dictionary order.
        ranked.sort(key=lambda artistTuple: artistTuple[1], reverse=True)
        for rating in ranked[:n]:
            print("%s\t%i" % (rating[0], rating[1]))

    def showUserTopItems(self, user, n):
        """Print the top n items for user, highest rating first.

        Prints every item when the user has rated fewer than n.
        """
        items = list(self.data[user].items())
        items.sort(key=lambda itemTuple: itemTuple[1], reverse=True)
        for (item, rating) in items[:n]:
            print("%s\t%i" % (self.convertProductID2name(item), rating))

    def computeDeviations(self):
        """Fill self.deviations and self.frequencies for Slope One.

        For every ordered pair of distinct items rated by the same user,
        self.frequencies[item][item2] counts the users who rated both and
        self.deviations[item][item2] holds the mean of
        rating(item) - rating(item2) over those users.
        """
        # Start from empty tables: accumulating onto already-averaged values
        # from an earlier call would corrupt both tables.
        self.frequencies = {}
        self.deviations = {}
        for ratings in self.data.values():
            for (item, rating) in ratings.items():
                self.frequencies.setdefault(item, {})
                self.deviations.setdefault(item, {})
                for (item2, rating2) in ratings.items():
                    if item != item2:
                        # add the difference between the ratings to our
                        # computation
                        self.frequencies[item].setdefault(item2, 0)
                        self.deviations[item].setdefault(item2, 0.0)
                        self.frequencies[item][item2] += 1
                        self.deviations[item][item2] += rating - rating2

        for (item, ratings) in self.deviations.items():
            for item2 in ratings:
                ratings[item2] /= self.frequencies[item][item2]

    def slopeOneRecommendations(self, userRatings):
        """Return Weighted Slope One predictions for one user's ratings.

        userRatings -- {item: rating} dictionary for the user

        Uses the tables built by computeDeviations(); call that first,
        otherwise the tables are empty and the result is an empty list.
        Returns up to 50 (item, predicted rating) tuples for items the user
        has not rated, highest prediction first.
        """
        recommendations = {}
        frequencies = {}
        # for every item and rating in the user's ratings
        for (userItem, userRating) in userRatings.items():
            # for every item in our dataset that the user didn't rate
            for (diffItem, diffRatings) in self.deviations.items():
                if diffItem not in userRatings and userItem in diffRatings:
                    freq = self.frequencies[diffItem][userItem]
                    recommendations.setdefault(diffItem, 0.0)
                    frequencies.setdefault(diffItem, 0)
                    # add to the running sum representing the numerator
                    # of the formula
                    recommendations[diffItem] += (diffRatings[userItem] +
                                                  userRating) * freq
                    # keep a running sum of the frequency of diffitem
                    frequencies[diffItem] += freq
        ranked = [(self.convertProductID2name(k), v / frequencies[k])
                  for (k, v) in recommendations.items()]
        ranked.sort(key=lambda artistTuple: artistTuple[1], reverse=True)
        # Return only the first 50 recommendations
        return ranked[:50]

    def pearson(self, rating1, rating2):
        """Return the Pearson correlation of two {item: rating} dictionaries.

        Determines how much influence each person should have, adjusting for
        grade inflation.  Only items present in both dictionaries are used.
        Returns 0 when there are no shared items or when the denominator is
        zero.
        """
        sum_xy = 0
        sum_x = 0
        sum_y = 0
        sum_x2 = 0
        sum_y2 = 0
        n = 0
        for key in rating1:
            if key in rating2:
                n += 1
                x = rating1[key]
                y = rating2[key]
                sum_xy += x * y
                sum_x += x
                sum_y += y
                sum_x2 += pow(x, 2)
                sum_y2 += pow(y, 2)
        if n == 0:
            return 0
        # Clamp at zero: rounding can leave a tiny negative value for
        # (near-)constant ratings, which would make sqrt() raise ValueError.
        spread_x = max(sum_x2 - pow(sum_x, 2) / n, 0)
        spread_y = max(sum_y2 - pow(sum_y, 2) / n, 0)
        denominator = sqrt(spread_x) * sqrt(spread_y)
        if denominator == 0:
            return 0
        return (sum_xy - (sum_x * sum_y) / n) / denominator

    def computeNearestNeighbor(self, username):
        """Return every other user as (name, similarity), most similar first.

        Returns an empty list when username is not in the data.
        """
        # Explicit membership test instead of a blanket `except KeyError`,
        # which would also hide unrelated KeyErrors raised mid-loop.
        if username not in self.data:
            return []
        target = self.data[username]
        distances = [(instance, self.fn(target, ratings))
                     for (instance, ratings) in self.data.items()
                     if instance != username]
        # self.fn is a similarity, so the largest value is the closest user.
        distances.sort(key=lambda artistTuple: artistTuple[1],
                       reverse=True)
        return distances

    def recommend(self, user):
        """Return up to self.n (item, score) recommendations for `user`.

        The k most similar users are combined; each neighbour's rating is
        weighted by that neighbour's share of the total similarity.  When
        fewer than k other users exist, all of them are used.

        Returns an empty list when the neighbours' similarities sum to zero
        (no weights can be formed).  For an unknown user a message is printed
        and None is returned.
        """
        if user not in self.data:
            print('There is no user named {}'.format(user))
            return None
        userRatings = self.data[user]
        # Slicing (instead of indexing 0..k-1) tolerates k larger than the
        # number of other users.
        nearest = self.computeNearestNeighbor(user)[:self.k]

        totalDistance = 0.0
        for (name, similarity) in nearest:
            totalDistance += similarity
        if totalDistance == 0:
            return []

        recommendations = {}
        for (name, similarity) in nearest:
            # this neighbour's slice of the pie
            weight = similarity / totalDistance
            neighborRatings = self.data[name]
            # accumulate items the neighbour rated that the user didn't
            for artist in neighborRatings:
                if artist not in userRatings:
                    recommendations[artist] = (
                        recommendations.get(artist, 0)
                        + neighborRatings[artist] * weight)

        ranked = [(self.convertProductID2name(k), v)
                  for (k, v) in recommendations.items()]
        # Sort BEFORE truncating so the n best-scoring items are returned
        # rather than the first n in dictionary order.
        ranked.sort(key=lambda artistTuple: artistTuple[1], reverse=True)
        return ranked[:self.n]

In [None]:
# Recommender over the sample data with default settings (k=1, Pearson, n=5).
r = recommender(users)

In [None]:
# Recommendations for Jordyn from her most similar user.
r.recommend('Jordyn') 

In [None]:
# Recommendations for Hailey from her most similar user.
r.recommend('Hailey')

In [None]:
# Recommendations for Clara from her most similar user.
r.recommend('Clara')

In [None]:
# Every other user ranked by Pearson similarity to Dan, most similar first.
# (The method lives on the recommender itself; `r.r` does not exist.)
r.computeNearestNeighbor('Dan')

In [None]:
# 'Robet' is not a key in users (the data has 'Robert'), so this exercises the
# recommender's unknown-user path.
r.recommend('Robet')

# Scenarios to keep in mind (when to use these methods).
* Use Pearson Correlation when data are subject to grade-inflation.
* Use Distance measures when data are dense.
* Use Cosine Similarity when data are sparse.

In [None]:
# Count how many users rated each of two bands, list each user's ratings for
# them, and report the share of users who rated each band.
bandname1 = 'Blues Traveler'
bandname2 = 'The Strokes'
votes1 = 0
votes2 = 0
user1 = 'Angelica'
user2 = 'Veronica'
for user, item in users.items():
    print(user)
    if bandname1 in item:
        votes1 += 1
        print('  {}: {}'.format(bandname1, item[bandname1]))
    if bandname2 in item:
        votes2 += 1
        print('  {}: {}'.format(bandname2, item[bandname2]))
# Each band has its own counter (there is no plain `votes` variable), and the
# fraction is scaled by 100 because the message labels the number with "%".
print('\n\r{}% users have a rated the band {}.'.format((100 * votes1 / len(users)), bandname1))
print('\n\r{}% users have a rated the band {}.'.format((100 * votes2 / len(users)), bandname2))
