# Jonathan Halverson
# Wednesday, May 24, 2017
# User-based and item-based collaborative filtering

This notebook borrows heavily from the recommender systems chapter of Joel Grus's book *Data Science from Scratch*.

In [1]:
from __future__ import division
import math, random
from collections import defaultdict, Counter

In [2]:
# Toy dataset: one inner list per user, holding that user's interests as
# strings. A user's id is the position of their list in this outer list.
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

In [3]:
# Sanity check: number of users in the dataset.
len(users_interests)

15

### The simplest method is to recommend the most popular interests that the user does not already have

In [4]:
# Count how many users list each interest, then rank the interests from most
# to least common as (interest, count) pairs.
popular_interests = Counter(
    topic
    for topics in users_interests
    for topic in topics
).most_common()

In [5]:
# Show the ranking: (interest, number of users listing it), most common first.
popular_interests

[('Python', 4),
 ('R', 4),
 ('Java', 3),
 ('regression', 3),
 ('statistics', 3),
 ('probability', 3),
 ('HBase', 3),
 ('Big Data', 3),
 ('neural networks', 2),
 ('Hadoop', 2),
 ('deep learning', 2),
 ('pandas', 2),
 ('artificial intelligence', 2),
 ('libsvm', 2),
 ('C++', 2),
 ('Postgres', 2),
 ('MongoDB', 2),
 ('scikit-learn', 2),
 ('machine learning', 2),
 ('statsmodels', 2),
 ('Cassandra', 2),
 ('NoSQL', 1),
 ('Mahout', 1),
 ('Storm', 1),
 ('MySQL', 1),
 ('programming languages', 1),
 ('Haskell', 1),
 ('mathematics', 1),
 ('Spark', 1),
 ('numpy', 1),
 ('theory', 1),
 ('decision trees', 1),
 ('MapReduce', 1),
 ('scipy', 1),
 ('databases', 1),
 ('support vector machines', 1)]

In [6]:
def most_popular_new_interests(user_interests, max_results=5):
    """Suggest the globally most popular interests the user does not have yet.

    Walks ``popular_interests`` in its existing order and keeps every
    ``(interest, frequency)`` pair whose interest is absent from
    ``user_interests``. The kept pairs are then cut down to the first
    ``max_results`` entries.
    """
    candidates = []
    for name, count in popular_interests:
        if name in user_interests:
            continue  # the user already has this one
        candidates.append((name, count))
    return candidates[:max_results]

In [7]:
# Demo of the popularity-based recommender for two sample interest lists.
# print(...) is called with a single argument throughout so the cell produces
# the same text on Python 2 and Python 3 (the bare print statement is a
# SyntaxError on Python 3).
print("Most Popular New Interests")
for liked_interests in (["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
                        ["R", "Python", "statistics", "regression", "probability"]):
    print("already like: %s" % (liked_interests,))
    print(most_popular_new_interests(liked_interests))
    print("")

Most Popular New Interests
already like: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
[('Python', 4), ('R', 4), ('Java', 3), ('regression', 3), ('statistics', 3)]

already like: ['R', 'Python', 'statistics', 'regression', 'probability']
[('Java', 3), ('HBase', 3), ('Big Data', 3), ('neural networks', 2), ('Hadoop', 2)]



### User-based collaborative filtering: Compute the cosine similarity between users

In [8]:
def dot(a, b):
    """Return the dot product of two numeric sequences.

    Components are paired positionally with zip, so if the lengths differ
    the surplus components of the longer sequence are ignored.
    """
    total = 0
    for left, right in zip(a, b):
        total = total + left * right
    return total

In [9]:
def cosine_similarity(v, w):
    """Return the cosine of the angle between vectors ``v`` and ``w``.

    Computed as dot(v, w) / sqrt(dot(v, v) * dot(w, w)).

    If either vector has zero magnitude (e.g. a user with no interests) the
    angle is undefined; 0.0 is returned instead of raising
    ZeroDivisionError, so such a vector simply counts as "not similar".
    """
    norm_product = math.sqrt(dot(v, v) * dot(w, w))
    if norm_product == 0:
        return 0.0
    return dot(v, w) / norm_product

In [10]:
# Sorted list of every distinct interest across all users. The position of an
# interest in this list is the index used for it in the interest vectors.
unique_interests = sorted(set(topic
                              for topics in users_interests
                              for topic in topics))
unique_interests

['Big Data',
 'C++',
 'Cassandra',
 'HBase',
 'Hadoop',
 'Haskell',
 'Java',
 'Mahout',
 'MapReduce',
 'MongoDB',
 'MySQL',
 'NoSQL',
 'Postgres',
 'Python',
 'R',
 'Spark',
 'Storm',
 'artificial intelligence',
 'databases',
 'decision trees',
 'deep learning',
 'libsvm',
 'machine learning',
 'mathematics',
 'neural networks',
 'numpy',
 'pandas',
 'probability',
 'programming languages',
 'regression',
 'scikit-learn',
 'scipy',
 'statistics',
 'statsmodels',
 'support vector machines',
 'theory']

In [11]:
def make_user_interest_vector(user_interests):
    """Encode a list of interests as a 0/1 vector aligned with unique_interests.

    Entry i of the result is 1 when unique_interests[i] appears in
    ``user_interests`` and 0 otherwise.
    """
    vector = []
    for known_interest in unique_interests:
        vector.append(1 if known_interest in user_interests else 0)
    return vector

In [13]:
# One 0/1 interest vector per user, in user-id order. Wrapped in list()
# because the matrix is iterated twice just below and indexed by user id
# later on; on Python 3 a bare map() is a one-shot iterator and would be
# exhausted after a single pass.
user_interest_matrix = list(map(make_user_interest_vector, users_interests))

# user_similarities[i][j] is the cosine similarity between users i and j.
user_similarities = [[cosine_similarity(interest_vector_i, interest_vector_j)
                      for interest_vector_j in user_interest_matrix]
                     for interest_vector_i in user_interest_matrix]

In [14]:
def most_similar_users_to(user_id):
    """Rank the other users by their similarity to ``user_id``.

    Reads row ``user_id`` of ``user_similarities`` and returns a list of
    ``(other_user_id, similarity)`` pairs sorted from most to least similar.
    The user themself and any user with a similarity of zero (or less) are
    left out.
    """
    pairs = [(other_user_id, similarity)
             for other_user_id, similarity in enumerate(user_similarities[user_id])
             if user_id != other_user_id and similarity > 0]

    # Sort on the similarity (second element of each pair). The pair is
    # indexed rather than unpacked in the lambda signature, because tuple
    # parameters are a SyntaxError on Python 3.
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)


def user_based_suggestions(user_id, include_current_interests=False):
    """Recommend interests to ``user_id`` based on what similar users like.

    Every interest of every similar user (as returned by
    ``most_similar_users_to``) is credited with that user's similarity, and
    the credits are summed per interest.

    Returns a list of ``(interest, weight)`` pairs sorted by descending
    weight. Unless ``include_current_interests`` is true, interests the user
    already has are dropped from the result.
    """
    # Sum up the similarities per interest.
    weights = defaultdict(float)
    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[other_user_id]:
            weights[interest] += similarity

    # Rank by accumulated weight. The pair is indexed rather than unpacked in
    # the lambda signature, because tuple parameters are a SyntaxError on
    # Python 3.
    ranked = sorted(weights.items(), key=lambda pair: pair[1], reverse=True)

    if include_current_interests:
        return ranked

    # Interests are already used as dict keys above, so they are hashable and
    # a set gives constant-time membership checks.
    already_has = set(users_interests[user_id])
    return [(suggestion, weight)
            for suggestion, weight in ranked
            if suggestion not in already_has]

In [15]:
# Demo of user-based collaborative filtering for user 0.
# print(...) is called with a single argument throughout so the cell produces
# the same text on Python 2 and Python 3 (the bare print statement is a
# SyntaxError on Python 3).
print("User based similarity")
print("most similar to 0")
print(most_similar_users_to(0))

print("Suggestions for 0")
print(user_based_suggestions(0))
print("")

User based similarity
most similar to 0
[(9, 0.5669467095138409), (1, 0.3380617018914066), (8, 0.1889822365046136), (13, 0.1690308509457033), (5, 0.1543033499620919)]
Suggestions for 0
[('MapReduce', 0.5669467095138409), ('MongoDB', 0.50709255283711), ('Postgres', 0.50709255283711), ('NoSQL', 0.3380617018914066), ('neural networks', 0.1889822365046136), ('deep learning', 0.1889822365046136), ('artificial intelligence', 0.1889822365046136), ('databases', 0.1690308509457033), ('MySQL', 0.1690308509457033), ('programming languages', 0.1543033499620919), ('Python', 0.1543033499620919), ('Haskell', 0.1543033499620919), ('C++', 0.1543033499620919), ('R', 0.1543033499620919)]



### Item-based collaborative filtering

In [22]:
# Transpose the user-by-interest matrix: row j holds, for interest j, one
# 0/1 flag per user saying whether that user has the interest.
interest_user_matrix = [[user_row[column] for user_row in user_interest_matrix]
                        for column in range(len(unique_interests))]

# interest_similarities[i][j] is the cosine similarity between the user
# columns of interests i and j.
interest_similarities = [[cosine_similarity(users_of_a, users_of_b)
                          for users_of_b in interest_user_matrix]
                         for users_of_a in interest_user_matrix]

In [23]:
def most_similar_interests_to(interest_id):
    """Rank the other interests by their similarity to ``interest_id``.

    ``interest_id`` is an index into ``unique_interests``. Returns a list of
    ``(interest_name, similarity)`` pairs sorted from most to least similar,
    leaving out the interest itself and any interest whose similarity is
    zero (or less).
    """
    similarities = interest_similarities[interest_id]
    pairs = [(unique_interests[other_interest_id], similarity)
             for other_interest_id, similarity in enumerate(similarities)
             if interest_id != other_interest_id and similarity > 0]

    # The pair is indexed rather than unpacked in the lambda signature,
    # because tuple parameters are a SyntaxError on Python 3.
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)

def item_based_suggestions(user_id, include_current_interests=False):
    """Recommend interests to ``user_id`` based on interest-to-interest similarity.

    For each interest the user has (a 1 in their row of
    ``user_interest_matrix``), every interest similar to it is credited with
    the corresponding similarity, and the credits are summed per interest.

    Returns a list of ``(interest, weight)`` pairs sorted by descending
    weight. Unless ``include_current_interests`` is true, interests the user
    already has are dropped from the result.
    """
    # Add up the similarities of interests similar to the user's own.
    weights = defaultdict(float)
    user_interest_vector = user_interest_matrix[user_id]
    for interest_id, is_interested in enumerate(user_interest_vector):
        if is_interested == 1:
            for interest, similarity in most_similar_interests_to(interest_id):
                weights[interest] += similarity

    # Rank by accumulated weight. The pair is indexed rather than unpacked in
    # the lambda signature, because tuple parameters are a SyntaxError on
    # Python 3.
    ranked = sorted(weights.items(), key=lambda pair: pair[1], reverse=True)

    if include_current_interests:
        return ranked

    # Interests are already used as dict keys above, so they are hashable and
    # a set gives constant-time membership checks.
    already_has = set(users_interests[user_id])
    return [(suggestion, weight)
            for suggestion, weight in ranked
            if suggestion not in already_has]

In [24]:
# Demo of item-based collaborative filtering.
# print(...) is called with a single argument throughout so the cell produces
# the same text on Python 2 and Python 3 (the bare print statement is a
# SyntaxError on Python 3).
print("Item based similarity")
print("most similar to 'Big Data'")
# Look the interest up by name rather than relying on it sorting to index 0.
print(most_similar_interests_to(unique_interests.index("Big Data")))
print("")

print("suggestions for user 0")
print(item_based_suggestions(0))

Item based similarity
most similar to 'Big Data'
[('Hadoop', 0.8164965809277261), ('Java', 0.6666666666666666), ('MapReduce', 0.5773502691896258), ('Spark', 0.5773502691896258), ('Storm', 0.5773502691896258), ('Cassandra', 0.4082482904638631), ('artificial intelligence', 0.4082482904638631), ('deep learning', 0.4082482904638631), ('neural networks', 0.4082482904638631), ('HBase', 0.3333333333333333)]

suggestions for user 0
[('MapReduce', 1.861807319565799), ('Postgres', 1.3164965809277263), ('MongoDB', 1.3164965809277263), ('NoSQL', 1.2844570503761732), ('programming languages', 0.5773502691896258), ('MySQL', 0.5773502691896258), ('Haskell', 0.5773502691896258), ('databases', 0.5773502691896258), ('neural networks', 0.4082482904638631), ('deep learning', 0.4082482904638631), ('C++', 0.4082482904638631), ('artificial intelligence', 0.4082482904638631), ('Python', 0.2886751345948129), ('R', 0.2886751345948129)]
