added code, slides, data for class 18

justmarkham · Mar 2, 2015 · cf70d56 · cf70d56
1 parent cdde150
commit cf70d56
Show file tree

Hide file tree

Showing 4 changed files with 456 additions and 0 deletions.
diff --git a/code/18_recommenders_class.py b/code/18_recommenders_class.py
@@ -0,0 +1,227 @@
+
+"""
+Class 18: Recommendation Engines
+Jaccard Similarity
+Modified KNN Algorithm
+"""
+
+import pandas as pd
+from collections import Counter
+#read in brands data
+user_brands = pd.read_csv('../data/user_brand.csv')
+# user_brands = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT4/master/data/user_brand.csv')
+
+#look at count of stores
+user_brands.Store.value_counts()
+
+# Series of user IDs, note the duplicates
+user_ids = user_brands.ID
+
+# groupby ID to see what each user likes!
+user_brands.groupby('ID').Store.value_counts()
+
+# turns my data frame into a dictionary
+# where the key is a user ID, and the value is a 
+# list of stores that the user "likes"
+brandsfor = {str(k): list(v) for k,v in user_brands.groupby("ID")["Store"]}
+
+# try it out. User 83065 likes Kohl's and Target
+brandsfor['83065']
+
+# User 82983 likes many more!
+brandsfor['82983']
+
+
+
+########################
+## Jaccard Similarity ##
+########################
+'''
+The Jaccard Similarity allows us to compare two sets
+If we regard people as merely being a set of brands they prefer
+the Jaccard Similarity allows us to compare people
+
+Example: the Jaccard similarity between user 82983 and 83065 is .125
+            because
+            brandsfor['83065'] == ["Kohl's", 'Target']
+            brandsfor['82983'] == ['Hanky Panky', 'Betsey Johnson', 'Converse', 'Steve Madden', 'Old Navy', 'Target', 'Nordstrom']
+
+the intersection of these two sets is just set(['Target'])
+the union of the two sets is set(["Kohl's", 'Target', 'Hanky Panky', 'Betsey Johnson', 'Converse', 'Steve Madden', 'Old Navy', 'Nordstrom'])
+so the len(intersection) / len(union) = 1 / 8 == .125
+
+EXERCISE: what is the Jaccard Similarity 
+          between user 82956 and user 82963?
+
+'''
+# Brand lists for the two users named in the exercise above;
+# the trailing comments show the expected contents.
+brandsfor['82956'] # == ['Diesel', 'Old Navy', 'Crate & Barrel', 'Target']
+
+brandsfor['82963'] # == ['Puma', 'New Balance', 'Old Navy', 'Target']
+
+
+'''
+EXERCISE: Complete the jaccard method below.
+          It should take in two lists of brands, and output the
+          Jaccard similarity between them
+
+This should work with anything in the set, for example
+jaccard([1,2,3], [2,3,4,5,6])  == .3333333
+
+HINT: set1 & set2 is the intersection
+      set1 | set2 is the union
+
+'''
+
+def jaccard(first, second):
+  """Return the Jaccard similarity of two collections (exercise stub).
+
+  Both arguments are converted to sets. The finished function is meant to
+  return len(intersection) / len(union); as written it is a placeholder
+  that always returns 0, whatever the inputs are.
+  """
+  first = set(first)
+  second = set(second)
+  # the line below should be changed
+  # (placeholder result, to be replaced with the value computed from the two sets)
+  return 0
+
+# try it out!
+brandsfor['83065'] # brands for user 83065
+brandsfor['82983'] # brands for user 82983
+# expected: 0.125 (1 shared brand out of 8) once jaccard is completed
+jaccard(brandsfor['83065'], brandsfor['82983'])
+
+
+# the pair of users from the exercise above
+jaccard(brandsfor['82956'], brandsfor['82963'])
+
+
+
+
+#######################
+### Our Recommender ###
+#######################
+
+'''
+Our recommender will be a modified KNN collaborative algorithm.
+Input: A given user's brands that they like
+Output: A set (no repeats) of brand recommendations based on
+        similar users' preferences
+
+1. When a user's brands are given to us, we will calculate the input user's
+jaccard similarity with every person in our brandsfor dictionary
+
+2. We will pick the K most similar users and recommend
+the brands that they like that the given user doesn't know about
+
+EXAMPLE:
+Given User likes ['Target', 'Old Navy', 'Banana Republic', 'H&M']
+Outputs: ['Forever 21', 'Gap', 'Steve Madden']
+'''
+
+
+# the brands liked by the user we want recommendations for
+given_user = ['Target', 'Old Navy', 'Banana Republic', 'H&M']
+
+# similarity between user 83065 and given user
+brandsfor['83065']
+jaccard(brandsfor['83065'], given_user) 
+# should be 0.2 (once jaccard is completed)
+
+'''
+EXERCISE
+    Find the similarity between given_user and ALL of our users
+    output should be a dictionary where
+    the key is a user id and the value is the jaccard similarity
+{...
+ '83055': 0.25,
+ '83056': 0.0,
+ '83058': 0.1111111111111111,
+ '83060': 0.07894736842105263,
+ '83061': 0.4,
+ '83064': 0.25,
+ '83065': 0.2,
+ ...}
+'''
+# ANSWER
+# (placeholder: left empty for the exercise; it should end up mapping each
+# user ID in brandsfor to its jaccard similarity with given_user)
+similarities = {}
+
+similarities
+
+K = 5 #number of similar users to look at
+
+
+# Now for the top K most similar users, let's aggregate the brands they like.
+# I sort by the jaccard similarty so most similar users are first
+# I use the sorted method, but because I'm dorting dictionaries
+# I specify the "key" as the value of the dictionary
+# the key is what the list should sort on
+# so the most similar users end up being on top
+most_similar_users = sorted(similarities, key=similarities.get, reverse=True)[:K]
+
+# list of K similar users' IDs
+most_similar_users
+
+# let's see what some of the most similar users likes
+brandsfor[most_similar_users[0]]
+
+brandsfor[most_similar_users[3]]
+
+# Aggregate all brands liked by the K most similar users into a single set
+brands_to_recommend = set()
+for user in most_similar_users:
+    # for each user
+    brands_to_recommend.update(set(brandsfor[user]))
+    # add to the set of brands_to_recommend
+
+
+brands_to_recommend
+# UH OH WE HAVE DUPLICATES. Banana Republic, Old Navy, Target are all repeats.
+
+
+# EXERCISE: use a set difference so brands_to_recommend only has
+# brands that given_user hasn't seen yet
+
+# ANSWER HERE
+# (placeholder: this self-assignment leaves brands_to_recommend unchanged)
+brands_to_recommend = brands_to_recommend
+
+# after the exercise: only brands that given_user does not already like
+brands_to_recommend
+
+
+####################
+##One Step Further##
+####################
+
+# We can take this one step further and caculate a "score" of recommendation
+# We will define the score as being the number of times
+# a brand appears within the first K users
+brands_to_recommend = []
+for user in most_similar_users:
+    brands_to_recommend += list(set(brandsfor[user]) - set(given_user))
+
+# Use a counter to count the number of times a brand appears
+recommend_with_scores = Counter(brands_to_recommend)
+
+# Now we see Gap has the highest score!
+recommend_with_scores
+
+###################
+#### Item based ###
+###################
+
+'''
+We can also define a similarity between items using Jaccard similarity.
+We can say that the similarity between two items is the jaccard similarity
+between the sets of people who like the two brands.
+
+Example: similarity of Gap to Old Navy is:
+'''
+# filter users by liking Gap
+gap_lovers = set(user_brands['Gap' == user_brands.Store].ID)
+old_navy_lovers = set(user_brands['Old Navy' == user_brands.Store].ID)
+
+# similarty between Gap and Old Navy
+jaccard(gap_lovers, old_navy_lovers)
+
+
+guess_lovers = set(user_brands['Guess' == user_brands.Store].ID)
+# similarty between Gap andGuess
+jaccard(guess_lovers, gap_lovers)
+
+
+calvin_lovers = set(user_brands['Calvin Klein' == user_brands.Store].ID)
+# similarty between Gap and Calvin Klein
+jaccard(calvin_lovers, gap_lovers)
+
+