# Patch header carried over from the original diff:
# diff --git a/code/18_recommenders_class.py b/code/18_recommenders_class.py
# new file mode 100644
# index 0000000..f5028d0
# --- /dev/null
# +++ b/code/18_recommenders_class.py
# @@ -0,0 +1,227 @@

"""
Class 18: Recommendation Engines
Jaccard Similarity
Modified KNN Algorithm
"""

import pandas as pd
from collections import Counter

# Load the user/brand table (columns used below: ID and Store).
user_brands = pd.read_csv('../data/user_brand.csv')
# Alternative source kept from the original:
# user_brands = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT4/master/data/user_brand.csv')

# How many rows mention each store
user_brands['Store'].value_counts()

# The ID column as a Series; the same ID shows up on several rows
user_ids = user_brands['ID']

# Per-user breakdown of the stores each one likes
user_brands.groupby('ID')['Store'].value_counts()

# Convert the data frame into a plain dictionary:
#   key   -> user ID (as a string)
#   value -> list of stores that the user "likes"
brandsfor = {
    str(uid): list(stores)
    for uid, stores in user_brands.groupby("ID")["Store"]
}

# Quick check: user 83065 likes Kohl's and Target
brandsfor['83065']

# ...while user 82983 likes many more
brandsfor['82983']



########################
## Jaccard Similarity ##
########################
'''
The Jaccard Similarity allows us to compare two sets
If we regard people as merely being a set of brands they prefer
the Jaccard Similarity allows us to compare people

Example. the jaccard similarty between user 82983 and 83065 is .125
    because
    brandsfor['83065'] == ["Kohl's", 'Target']
    brandsfor['82983'] == ['Hanky Panky', 'Betsey Johnson', 'Converse', 'Steve Madden', 'Old Navy', 'Target', 'Nordstrom']

the intersection of these two sets is just set("Target")
the union of the two sets is set(['Target', 'Hanky Panky', 'Betsey Johnson', 'Converse', 'Steve Madden', 'Old Navy', 'Target', 'Nordstrom'])
so the len(intersection) / len(union) = 1 / 8 == .125

EXERCISE: what is the Jaccard Similarity
          between user 82956 and user 82963?

'''
# NOTE: in the text above the intersection is the one-element set
# containing 'Target', and the union has eight members: the list shown
# repeats 'Target' where it should name "Kohl's".
brandsfor['82956'] # == ['Diesel', 'Old Navy', 'Crate & Barrel', 'Target']

brandsfor['82963'] # == ['Puma', 'New Balance', 'Old Navy', 'Target']


'''
EXERCISE: Complete the jaccard method below.
          It should take in a list of brands, and output the
          jaccard similarity between them

This should work with anything in the set, for example
jaccard([1,2,3], [2,3,4,5,6]) == .3333333

HINT: set1 & set2 is the intersection
      set1 | set2 is the union

'''

def jaccard(first, second):
    # Exercise stub: both inputs are turned into sets, but the result is
    # still a placeholder until the student fills it in.
    first, second = set(first), set(second)
    # EXERCISE: replace the placeholder return value below
    return 0

# Try the function on the two users inspected earlier
brandsfor['83065'] # brands for user 83065
brandsfor['82983'] # brands for user 82983
jaccard(brandsfor['83065'], brandsfor['82983'])


jaccard(brandsfor['82956'], brandsfor['82963'])




#######################
### Our Recommender ###
#######################

'''
Our recommender will be a modified KNN collaborative algorithm.
Input:  A given user's brands that they like
Output: A set (no repeats) of brand recommendations based on
        similar users preferences

1. When a user's brands are given to us, we will calculate the input user's
jaccard similarity with every person in our brandsfor dictionary

2. We will pick the K most similar users and recommend
the brands that they like that the given user doesn't know about

EXAMPLE:
Given User likes ['Target', 'Old Navy', 'Banana Republic', 'H&M']
Outputs: ['Forever 21', 'Gap', 'Steve Madden']
'''


given_user = ['Target', 'Old Navy', 'Banana Republic', 'H&M']

# Similarity between user 83065 and the given user
brandsfor['83065']
jaccard(brandsfor['83065'], given_user)
# expected 0.2 once jaccard is implemented (1 shared brand out of 5)

'''
EXERCISE
    Find the similarty between given_user and ALL of our users
    output should be a dictionary where
    the key is a user id and the value is the jaccard similarity
{...
 '83055': 0.25,
 '83056': 0.0,
 '83058': 0.1111111111111111,
 '83060': 0.07894736842105263,
 '83061': 0.4,
 '83064': 0.25,
 '83065': 0.2,
 ...}
'''
# ANSWER (left empty for the exercise)
similarities = {}

similarities

K = 5 #number of similar users to look at


# Rank the users for the top-K step.  Iterating a dict yields its keys
# (user IDs); the sort key maps each ID to its similarity score, and
# reverse=True puts the most similar users first.  Only the first K are kept.
most_similar_users = sorted(
    similarities,
    key=lambda uid: similarities[uid],
    reverse=True,
)[:K]

# The K most similar user IDs
most_similar_users

# Peek at what a couple of those users like
brandsfor[most_similar_users[0]]

brandsfor[most_similar_users[3]]

# Pool every brand liked by the K most similar users into one set
brands_to_recommend = set()
for user in most_similar_users:
    # fold this user's brands into the running set
    brands_to_recommend |= set(brandsfor[user])


brands_to_recommend
# Problem: the pool still includes brands given_user already likes
# (the original notes Banana Republic, Old Navy and Target as repeats).


# EXERCISE: use a set difference so brands_to_recommend only has
#           brands that given_user hasn't seen yet

# ANSWER HERE (placeholder: currently a no-op assignment)
brands_to_recommend = brands_to_recommend

# after the exercise this should no longer contain given_user's brands
brands_to_recommend


####################
##One Step Further##
####################

# Going further: give each recommendation a "score", defined as the
# number of the top-K users who like that brand.
brands_to_recommend = []
for user in most_similar_users:
    brands_to_recommend.extend(set(brandsfor[user]) - set(given_user))

# Counter tallies how many times each brand was collected
recommend_with_scores = Counter(brands_to_recommend)

# Inspect the scores next (the original notes Gap scores highest)
recommend_with_scores

###################
#### Item based ###
###################

'''
We can also define a similary between items using jaccard similarity.
We can say that the similarity between two items is the jaccard similarity
between the sets of people who like the two brands.

Example: similarity of Gap to Target is:
'''
# NOTE: the first comparison below is Gap vs. Old Navy, not Gap vs. Target.
# Collect the set of user IDs that like each brand
gap_lovers = set(user_brands.loc[user_brands['Store'] == 'Gap', 'ID'])
old_navy_lovers = set(user_brands.loc[user_brands['Store'] == 'Old Navy', 'ID'])

# Item similarity: Gap vs. Old Navy
jaccard(gap_lovers, old_navy_lovers)


guess_lovers = set(user_brands.loc[user_brands['Store'] == 'Guess', 'ID'])
# Item similarity: Gap vs. Guess
jaccard(guess_lovers, gap_lovers)


calvin_lovers = set(user_brands.loc[user_brands['Store'] == 'Calvin Klein', 'ID'])
# Item similarity: Gap vs. Calvin Klein
jaccard(calvin_lovers, gap_lovers)


# Patch header carried over from the original diff:
# diff --git a/code/18_recommenders_soutions.py b/code/18_recommenders_soutions.py
# new file mode 100644
# index 0000000..e0ed2bb
# --- /dev/null
# +++ b/code/18_recommenders_soutions.py
# @@ -0,0 +1,229 @@

"""
Class 18: Recommendation Engines
Jaccard Similarity
Modified KNN Algorithm
"""

import pandas as pd
from collections import Counter

# Load the user/brand table (columns used below: ID and Store).
user_brands = pd.read_csv('../data/user_brand.csv')
# Alternative source kept from the original:
# user_brands = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT4/master/data/user_brand.csv')

# How many rows mention each store
user_brands['Store'].value_counts()

# The ID column as a Series; the same ID shows up on several rows
user_ids = user_brands['ID']

# Per-user breakdown of the stores each one likes
user_brands.groupby('ID')['Store'].value_counts()

# Convert the data frame into a plain dictionary:
#   key   -> user ID (as a string)
#   value -> list of stores that the user "likes"
brandsfor = {
    str(uid): list(stores)
    for uid, stores in user_brands.groupby("ID")["Store"]
}

# Quick check: user 83065 likes Kohl's and Target
brandsfor['83065']

# ...while user 82983 (looked up next) likes many more
brandsfor['82983']



########################
## Jaccard Similarity ##
########################
'''
The Jaccard Similarity allows us to compare two sets
If we regard people as merely being a set of brands they prefer
the Jaccard Similarity allows us to compare people

Example. the jaccard similarty between user 82983 and 83065 is .125
    because
    brandsfor['83065'] == ["Kohl's", 'Target']
    brandsfor['82983'] == ['Hanky Panky', 'Betsey Johnson', 'Converse', 'Steve Madden', 'Old Navy', 'Target', 'Nordstrom']

the intersection of these two sets is just set(['Target'])
the union of the two sets is set(["Kohl's", 'Target', 'Hanky Panky', 'Betsey Johnson', 'Converse', 'Steve Madden', 'Old Navy', 'Nordstrom'])
so the len(intersection) / len(union) = 1 / 8 == .125

EXERCISE: what is the Jaccard Similarity
          between user 82956 and user 82963?
# ANSWER == 0.3333333333

'''
brandsfor['82956'] # == ['Diesel', 'Old Navy', 'Crate & Barrel', 'Target']

brandsfor['82963'] # == ['Puma', 'New Balance', 'Old Navy', 'Target']


'''
EXERCISE: Complete the jaccard method below.
          It should take in a list of brands, and output the
          jaccard similarity between them

This should work with anything in the set, for example
jaccard([1,2,3], [2,3,4,5,6]) == .3333333

HINT: set1 & set2 is the intersection
      set1 | set2 is the union

'''

def jaccard(first, second):
    """Return the Jaccard similarity of two collections.

    Both arguments are converted to sets, so duplicates are ignored and any
    iterable of hashable items works.

    Args:
        first: iterable of hashable items (e.g. a list of brand names).
        second: iterable of hashable items to compare against ``first``.

    Returns:
        float: ``len(first & second) / len(first | second)``, between 0.0
        and 1.0.  Returns 0.0 when both inputs are empty.
    """
    first = set(first)
    second = set(second)
    union = first | second
    if not union:
        # Two empty collections have no defined ratio; report "no overlap"
        # instead of raising ZeroDivisionError.
        return 0.0
    # ANSWER
    # float() keeps this a true division on Python 2 as well as Python 3.
    return float(len(first & second)) / len(union)

# try it out!
brandsfor['83065'] # brands for user 83065
brandsfor['82983'] # brands for user 82983
jaccard(brandsfor['83065'], brandsfor['82983'])


jaccard(brandsfor['82956'], brandsfor['82963'])




#######################
### Our Recommender ###
#######################

'''
Our recommender will be a modified KNN collaborative algorithm.
Input:  A given user's brands that they like
Output: A set (no repeats) of brand recommendations based on
        similar users preferences

1. When a user's brands are given to us, we will calculate the input user's
jaccard similarity with every person in our brandsfor dictionary

2. We will pick the K most similar users and recommend
the brands that they like that the given user doesn't know about

EXAMPLE:
Given User likes ['Target', 'Old Navy', 'Banana Republic', 'H&M']
Outputs: ['Forever 21', 'Gap', 'Steve Madden']
'''


given_user = ['Target', 'Old Navy', 'Banana Republic', 'H&M']

# similarity between user 83065 and given user
brandsfor['83065']
jaccard(brandsfor['83065'], given_user)
# should be 0.2 (1 shared brand, 'Target', out of 5 distinct brands)

'''
EXERCISE
    Find the similarty between given_user and ALL of our users
    output should be a dictionary where
    the key is a user id and the value is the jaccard similarity
{...
 '83055': 0.25,
 '83056': 0.0,
 '83058': 0.1111111111111111,
 '83060': 0.07894736842105263,
 '83061': 0.4,
 '83064': 0.25,
 '83065': 0.2,
 ...}
'''
# ANSWER
# dict.items() is used instead of iteritems(): iteritems() was removed in
# Python 3, while items() works on both Python 2 and Python 3.
similarities = {k: jaccard(given_user, v) for k, v in brandsfor.items()}

similarities

K = 5 # number of similar users to look at


# Now for the top K most similar users, let's aggregate the brands they like.
# Ranking step: iterating a dict yields its keys (user IDs); the sort key
# maps each ID to its similarity score and reverse=True puts the most
# similar users first.  Only the first K are kept.
most_similar_users = sorted(
    similarities,
    key=lambda uid: similarities[uid],
    reverse=True,
)[:K]

# The K most similar user IDs
most_similar_users

# Peek at what a couple of those users like
brandsfor[most_similar_users[0]]

brandsfor[most_similar_users[3]]

# Pool every brand liked by the K most similar users into one set
brands_to_recommend = set()
for user in most_similar_users:
    # fold this user's brands into the running set
    brands_to_recommend |= set(brandsfor[user])


brands_to_recommend
# Problem: the pool still includes brands given_user already likes
# (the original notes Banana Republic, Old Navy and Target as repeats).


# EXERCISE: use a set difference so brands_to_recommend only has
#           brands that given_user hasn't seen yet

# ANSWER
brands_to_recommend = brands_to_recommend - set(given_user)

# now limited to brands given_user does not already like
brands_to_recommend


####################
##One Step Further##
####################

# Going further: give each recommendation a "score", defined as the
# number of the top-K users who like that brand.
brands_to_recommend = []
for user in most_similar_users:
    brands_to_recommend.extend(set(brandsfor[user]) - set(given_user))

# Counter tallies how many times each brand was collected
recommend_with_scores = Counter(brands_to_recommend)

# Inspect the scores (the original notes Gap scores highest)
recommend_with_scores

###################
#### Item based ###
###################

'''
We can also define a similary between items using jaccard similarity.
We can say that the similarity between two items is the jaccard similarity
between the sets of people who like the two brands.

Example: similarity of Gap to Target is:
'''
# NOTE: the first comparison below is Gap vs. Old Navy, not Gap vs. Target.
# Collect the set of user IDs that like each brand
gap_lovers = set(user_brands.loc[user_brands['Store'] == 'Gap', 'ID'])
old_navy_lovers = set(user_brands.loc[user_brands['Store'] == 'Old Navy', 'ID'])

# Item similarity: Gap vs. Old Navy
jaccard(gap_lovers, old_navy_lovers)


guess_lovers = set(user_brands.loc[user_brands['Store'] == 'Guess', 'ID'])
# Item similarity: Gap vs. Guess
jaccard(guess_lovers, gap_lovers)


calvin_lovers = set(user_brands.loc[user_brands['Store'] == 'Calvin Klein', 'ID'])
# Item similarity: Gap vs. Calvin Klein
jaccard(calvin_lovers, gap_lovers)


# Remaining patch entries carried over from the original diff (binary files):
# diff --git a/slides/18_recommendation_engines.pdf b/slides/18_recommendation_engines.pdf
# new file mode 100644
# index 0000000..877d068
# Binary files /dev/null and b/slides/18_recommendation_engines.pdf differ
# diff --git a/slides/18_recommendation_engines.pptx b/slides/18_recommendation_engines.pptx
# new file mode 100644
# index 0000000..26f1999
# Binary files /dev/null and b/slides/18_recommendation_engines.pptx differ