<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_23.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Only do this once per VM, otherwise you'll get multiple clones and nested directories
# Clone the repository and make it the working directory for the rest of
# the notebook.
!git clone https://github.com/jamestheengineer/data-science-from-scratch-Python.git
%cd data-science-from-scratch-Python/
# Install and import import-ipynb. A later cell does
# `from Chapter_21 import cosine_similarity`; presumably Chapter_21 is a
# notebook in the cloned repo that import_ipynb makes importable -- TODO confirm.
!pip install import-ipynb
import import_ipynb

In [None]:
# Sample data: one list of interest names per user. A user's id is the
# position of their list in the outer list (other cells index this as
# users_interests[user_id]).
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

In [None]:
# One simple approach is to recommend what is popular
from collections import Counter

# Count every occurrence of each interest across all users' lists.
popular_interests = Counter()
for interests_of_one_user in users_interests:
    popular_interests.update(interests_of_one_user)
print(popular_interests)

In [None]:
# We can then just suggest the most popular interests that are not already in a user's list
from typing import List, Tuple

def most_popular_new_interests(
        user_interests : List[str],
        max_results: int = 5) -> List[Tuple[str, int]]:
    """
    Return up to max_results (interest, frequency) pairs taken from
    popular_interests, most common first, skipping every interest that
    is already present in user_interests.
    """
    suggestions = []
    for interest, frequency in popular_interests.most_common():
        # Anything the user already lists is not a *new* interest.
        if interest in user_interests:
            continue
        suggestions.append((interest, frequency))
    return suggestions[:max_results]

# Try it out: popularity-based suggestions for the users at index 1 and index 0.
print(most_popular_new_interests(users_interests[1]))
print(most_popular_new_interests(users_interests[0]))

In [None]:
# User-based collaborative filtering: suggest new interests to a user based on
# the interests of other users whose interests are similar.
# First step: the sorted list of every distinct interest. An interest's
# position in this list is its index in each user's interest vector.
unique_interests = sorted(set().union(*users_interests))
assert unique_interests[:6] == ['Big Data', 'C++', 'Cassandra',
                                'HBase', 'Hadoop', 'Haskell']
print(unique_interests)

In [None]:
# Next we produce an interest vector of 0s and 1s for each user
def make_user_interest_vector(user_interests: List[str]) -> List[int]:
  """
  Encode a user's interests as a 0/1 vector aligned with unique_interests:
  position i holds 1 when unique_interests[i] appears in user_interests
  and 0 when it does not.
  """
  vector = []
  for candidate in unique_interests:
    # Membership tests yield a bool; int() maps True/False to 1/0.
    vector.append(int(candidate in user_interests))
  return vector

# One 0/1 interest vector per user, in the same order as users_interests.
user_interest_vectors = list(map(make_user_interest_vector, users_interests))
print(user_interest_vectors)



In [9]:
# Because we have a small dataset, it's no problem to compute the pairwise similarities
from Chapter_21 import cosine_similarity

# user_similarities[i][j] holds the value cosine_similarity returns for the
# interest vectors of user i and user j (every ordered pair, including i == j).
user_similarities = [
    [cosine_similarity(vector_i, vector_j) for vector_j in user_interest_vectors]
    for vector_i in user_interest_vectors
]

# Sanity checks on two known pairs.
assert 0.56 < user_similarities[0][9] < 0.58, "several shared interests"
assert 0.18 < user_similarities[0][8] < 0.20, "only one shared interest"

In [14]:
def most_similar_users_to(user_id: int) -> List[Tuple[int, float]]:
  """
  Return (other_user_id, similarity) pairs for every other user whose
  entry in user_similarities[user_id] is greater than zero, ordered from
  most similar to least similar.
  """
  ranked = []
  for other_user_id, similarity in enumerate(user_similarities[user_id]):
    # Leave out the user themselves and anyone with no positive similarity.
    if other_user_id != user_id and similarity > 0:
      ranked.append((other_user_id, similarity))
  # Stable in-place sort on the similarity component, largest first.
  ranked.sort(key=lambda entry: entry[1], reverse=True)
  return ranked
  
# Example: users with a positive similarity to user 0, most similar first.
most_similar_users_to(0)

[(9, 0.5669467095138409),
 (1, 0.3380617018914066),
 (8, 0.1889822365046136),
 (13, 0.1690308509457033),
 (5, 0.1543033499620919)]

In [19]:
from collections import defaultdict
from typing import Dict, List, Tuple

def user_based_suggestions(user_id: int,
                           include_current_interests: bool = False
                           ) -> List[Tuple[str, float]]:
  """
  Suggest interests for user_id based on the interests of similar users.

  Every interest of every user returned by most_similar_users_to(user_id)
  is credited with that user's similarity score; scores credited to the
  same interest are summed.

  Returns a list of (interest, total_weight) pairs sorted by weight,
  highest first. Unless include_current_interests is True, interests
  already present in users_interests[user_id] are left out.
  """
  # Sum up the similarities
  weights: Dict[str, float] = defaultdict(float)
  for other_user_id, similarity in most_similar_users_to(user_id):
    for interest in users_interests[other_user_id]:
      weights[interest] += similarity

  # Convert them to a sorted list (kept under a separate name so the
  # dict and the list of pairs do not share one variable)
  suggestions = sorted(weights.items(),
                       key=lambda pair: pair[-1],
                       reverse=True)

  # And (maybe) exclude interests the user already has
  if include_current_interests:
    return suggestions

  current_interests = users_interests[user_id]
  return [(suggestion, weight)
          for suggestion, weight in suggestions
          if suggestion not in current_interests]

# Example: suggestions for user 0, leaving out interests user 0 already has
# (include_current_interests defaults to False).
user_based_suggestions(0)


[('MapReduce', 0.5669467095138409),
 ('MongoDB', 0.50709255283711),
 ('Postgres', 0.50709255283711),
 ('NoSQL', 0.3380617018914066),
 ('neural networks', 0.1889822365046136),
 ('deep learning', 0.1889822365046136),
 ('artificial intelligence', 0.1889822365046136),
 ('databases', 0.1690308509457033),
 ('MySQL', 0.1690308509457033),
 ('Python', 0.1543033499620919),
 ('R', 0.1543033499620919),
 ('C++', 0.1543033499620919),
 ('Haskell', 0.1543033499620919),
 ('programming languages', 0.1543033499620919)]