<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_23.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Only do this once per VM, otherwise you'll get multiple clones and nested directories
!git clone https://github.com/jamestheengineer/data-science-from-scratch-Python.git
%cd data-science-from-scratch-Python/
!pip install import-ipynb
import import_ipynb

In [None]:
# Toy dataset: one list of interests per user. A user's id is simply their
# position in this list (the later cells index it as users_interests[user_id]).
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

In [None]:
# The simplest recommender: just suggest whatever is popular overall
from collections import Counter

# Tally how many users list each interest
popular_interests = Counter(
    topic
    for interests in users_interests
    for topic in interests
)
print(popular_interests)

In [None]:
# We can then just suggest the most popular interests that are not already in a user's list
from typing import List, Tuple

def most_popular_new_interests(
        user_interests: List[str],
        max_results: int = 5) -> List[Tuple[str, int]]:
    """
    Suggest globally popular interests that the user does not have yet.

    Walks popular_interests from most to least common, drops every interest
    already present in user_interests, and returns the first max_results
    remaining (interest, frequency) pairs.
    """
    ranked = popular_interests.most_common()
    unseen = [pair for pair in ranked if pair[0] not in user_interests]
    return unseen[:max_results]

# Try it on users 1 and 0; with the default max_results each call prints at most five suggestions
print(most_popular_new_interests(users_interests[1]))
print(most_popular_new_interests(users_interests[0]))

In [None]:
# Collaborative filtering: suggest new interests based on users whose interests resemble yours.
# First collect every distinct interest, in sorted order, so each one gets a fixed index.
unique_interests = sorted(set().union(*users_interests))

# Sanity check on the first few entries
assert unique_interests[:6] == [
    'Big Data', 'C++', 'Cassandra', 'HBase', 'Hadoop', 'Haskell',
]
print(unique_interests)

In [None]:
# Next we produce an interest vector of 0s and 1s for each user
def make_user_interest_vector(user_interests: List[str]) -> List[int]:
  """
  Encode a user's interests as a 0/1 vector aligned with unique_interests:
  position i holds 1 when unique_interests[i] appears in user_interests,
  and 0 when it does not.
  """
  return [int(candidate in user_interests) for candidate in unique_interests]

# One 0/1 interest vector per user, in the same order as users_interests
user_interest_vectors = list(map(make_user_interest_vector, users_interests))
print(user_interest_vectors)



In [None]:
# The dataset is tiny, so computing every pairwise user similarity is affordable
from Chapter_21 import cosine_similarity

# user_similarities[i][j] is the cosine similarity between users i and j
user_similarities = [
    [cosine_similarity(vector_i, vector_j) for vector_j in user_interest_vectors]
    for vector_i in user_interest_vectors
]

# Spot checks against known pairs
assert 0.56 < user_similarities[0][9] < 0.58, "several shared interests"
assert 0.18 < user_similarities[0][8] < 0.20, "only one shared interest"

In [None]:
def most_similar_users_to(user_id: int) -> List[Tuple[int, float]]:
  """
  Rank the other users by their similarity to user_id, most similar first.

  Reads the row user_similarities[user_id]. The user themself is left out,
  and so is anyone whose similarity is not greater than zero.
  """
  ranked = []
  for other_user_id, similarity in enumerate(user_similarities[user_id]):
    if other_user_id != user_id and similarity > 0:
      ranked.append((other_user_id, similarity))

  # Stable in-place sort on the similarity, highest first
  ranked.sort(key=lambda entry: entry[1], reverse=True)
  return ranked
  
# Show the users most similar to user 0 (as (other_user_id, similarity) pairs)
most_similar_users_to(0)

In [None]:
from collections import defaultdict

def user_based_suggestions(user_id: int,
                           include_current_interests: bool = False
                           ) -> List[Tuple[str, float]]:
  """
  Suggest interests for user_id based on what similar users are interested in.

  Every interest held by a user returned from most_similar_users_to(user_id)
  accumulates that user's similarity, so an interest shared by several
  similar users ends up with a larger weight.

  Args:
    user_id: index of the user in users_interests.
    include_current_interests: when True, interests the user already has
      stay in the result; by default they are filtered out.

  Returns:
    (interest, weight) pairs sorted by weight, highest first.
  """
  # Sum up the similarities, keyed by interest
  weights = defaultdict(float)
  for other_user_id, similarity in most_similar_users_to(user_id):
    for interest in users_interests[other_user_id]:
      weights[interest] += similarity

  # Convert them to a sorted list (kept under a separate name so the
  # mapping and the list never share one variable)
  suggestions = sorted(weights.items(),
                       key=lambda pair: pair[-1],
                       reverse=True)

  # And (maybe) exclude interests the user already has
  if include_current_interests:
    return suggestions

  # Look the user's interests up once instead of once per candidate
  current_interests = set(users_interests[user_id])
  return [(suggestion, weight)
          for suggestion, weight in suggestions
          if suggestion not in current_interests]

# Show the user-based suggestions for user 0 (interests they already have are excluded by default)
user_based_suggestions(0)


In [None]:
# Item-based suggestions

# Transpose the user vectors: row j holds, user by user, the 0/1 flag for
# unique_interests[j]
interest_user_matrix = [list(column) for column in zip(*user_interest_vectors)]

# interest_similarities[i][j] is the cosine similarity between interests i and j
interest_similarities = [
    [cosine_similarity(row_i, row_j) for row_j in interest_user_matrix]
    for row_i in interest_user_matrix
]

def most_similar_interests_to(interest_id: int):
  """
  Rank the other interests by their similarity to the one at interest_id.

  Returns (interest name, similarity) pairs, most similar first. The
  interest itself and any interest whose similarity is not greater than
  zero are left out.
  """
  ranked = [
      (unique_interests[other_interest_id], similarity)
      for other_interest_id, similarity
      in enumerate(interest_similarities[interest_id])
      if other_interest_id != interest_id and similarity > 0
  ]
  # Stable in-place sort on the similarity, highest first
  ranked.sort(key=lambda entry: entry[1], reverse=True)
  return ranked

In [None]:
# Show the interests most similar to unique_interests[0]
most_similar_interests_to(0)

In [None]:
def item_based_suggestions(user_id: int,
                           include_current_interests: bool = False):
  """
  Suggest interests for user_id based on interests similar to their own.

  For each interest flagged 1 in the user's interest vector, every similar
  interest (from most_similar_interests_to) accumulates its similarity.
  The result is a list of (interest, weight) pairs, heaviest first. Unless
  include_current_interests is True, interests the user already has are
  removed from it.
  """
  totals = defaultdict(float)
  for interest_id, is_interested in enumerate(user_interest_vectors[user_id]):
    if is_interested != 1:
      continue
    for interest, similarity in most_similar_interests_to(interest_id):
      totals[interest] += similarity

  # Order by accumulated weight, largest first
  ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)

  if include_current_interests:
    return ranked
  return [(suggestion, weight)
          for suggestion, weight in ranked
          if suggestion not in users_interests[user_id]]

# Show the item-based suggestions for user 0 (interests they already have are excluded by default)
item_based_suggestions(0)

In [None]:
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip

In [None]:
!unzip ml-100k.zip

In [None]:
# These paths are relative to the current directory; modify them if your files are elsewhere
MOVIES = "ml-100k/u.item"
RATINGS = "ml-100k/u.data"
from typing import NamedTuple

class Rating(NamedTuple):
  """A single rating: which user rated which movie, and the score given."""
  user_id: str    # id of the rating user, kept as a string
  movie_id: str   # id of the rated movie, kept as a string
  rating: float   # the score the user gave the movie

import csv

# We specify this encoding to avoid a UnicodeDecodeError
# See https://stackoverflow.com/a/53136168/1076346
# newline="" leaves newline handling to the csv module, as its documentation recommends.
with open(MOVIES, encoding="iso-8859-1", newline="") as f:
  reader = csv.reader(f, delimiter="|")
  # Only the first two fields of each row (id and title) are used
  movies = {movie_id: title for movie_id, title, *_ in reader}

# Create a list of [Rating]
with open(RATINGS, encoding="iso-8859-1", newline="") as f:
  reader = csv.reader(f, delimiter="\t")
  # Each row has four tab-separated fields; the fourth is ignored
  ratings = [Rating(user_id, movie_id, float(rating))
              for user_id, movie_id, rating, _ in reader]

# Sanity checks on the expected dataset size
assert len(movies) == 1682
assert len({rating.user_id for rating in ratings}) == 943

In [None]:
# Average ratings for Star Wars movies
import re

# Data structure for accumulating ratings by movie_id
star_wars_ratings = {movie_id: []
                     for movie_id, title in movies.items()
                     if re.search("Star Wars|Empire Strikes|Jedi", title)}

# Iterate over ratings, accumulating the Star Wars ones
for rating in ratings:
  if rating.movie_id in star_wars_ratings:
    star_wars_ratings[rating.movie_id].append(rating.rating)

# Compute the average rating for each movie. A matching title that nobody
# rated is skipped, since averaging an empty list would raise ZeroDivisionError.
avg_ratings = [(sum(title_ratings) / len(title_ratings), movie_id)
                for movie_id, title_ratings in star_wars_ratings.items()
                if title_ratings]

# And then print them in order, highest average first
for avg_rating, movie_id in sorted(avg_ratings, reverse=True):
  print(f"{avg_rating:.2f} {movies[movie_id]}")

In [None]:
# Let's come up with a model to predict these ratings.
# Seed and shuffle first so the split below is reproducible.
import random

random.seed(0)
random.shuffle(ratings)

# Cut points at 70% and 85% of the shuffled ratings
split1, split2 = (int(len(ratings) * fraction) for fraction in (0.7, 0.85))

train, validation, test = (
    ratings[:split1],          # 70% of the data
    ratings[split1:split2],    # 15% of the data
    ratings[split2:],          # 15% of the data
)

# Baseline model to make sure we do better: always predict the average training rating
avg_rating = sum(rating.rating for rating in train) / len(train)
baseline_error = sum((rating.rating - avg_rating) ** 2
                     for rating in test) / len(test)

# This is what we hope to do better than
assert 1.26 < baseline_error < 1.27

In [None]:
from Chapter_19 import random_tensor

EMBEDDING_DIM = 2

# Collect the distinct ids (a Rating unpacks as user_id, movie_id, rating)
user_ids = {user_id for user_id, _, _ in ratings}
movie_ids = {movie_id for _, movie_id, _ in ratings}

# Give every id its own random_tensor(EMBEDDING_DIM) as a starting vector
user_vectors = {uid: random_tensor(EMBEDDING_DIM) for uid in user_ids}
movie_vectors = {mid: random_tensor(EMBEDDING_DIM) for mid in movie_ids}

from typing import List, Optional
import tqdm
from Chapter_04 import dot

def loop(dataset: List[Rating],
         learning_rate: Optional[float] = None) -> None:
    """
    Make one pass over dataset, showing the running average squared error.

    For each rating, the prediction is the dot product of the user's vector
    and the movie's vector (looked up in user_vectors / movie_vectors).
    When learning_rate is given, both vectors are updated in place with one
    gradient step per rating. When it is None, nothing is modified and the
    pass only measures the loss.

    Args:
      dataset: the ratings to iterate over.
      learning_rate: step size for the gradient updates, or None for an
        evaluation-only pass.
    """
    with tqdm.tqdm(dataset) as t:
      loss = 0.0
      for i, rating in enumerate(t):
        movie_vector = movie_vectors[rating.movie_id]
        user_vector = user_vectors[rating.user_id]
        predicted = dot(user_vector, movie_vector)
        error = predicted - rating.rating
        loss += error ** 2

        if learning_rate is not None:
          # Both gradients are computed before either vector changes, so
          # each one is based on the other's pre-step values.
          user_gradient = [error * m_j for m_j in movie_vector]
          movie_gradient = [error * u_j for u_j in user_vector]

          # Take gradient steps
          for j in range(EMBEDDING_DIM):
            user_vector[j] -= learning_rate * user_gradient[j]
            movie_vector[j] -= learning_rate * movie_gradient[j]

        # Progress bar shows the mean squared error over the ratings seen so far
        t.set_description(f"avg loss: {loss / (i + 1)}")

# Now let's train: 20 epochs, each a training pass followed by a validation pass
learning_rate = 0.05
for epoch in range(20): 
  # Shrink the step size by 10% before every epoch (including the first)
  learning_rate *= 0.9
  print(epoch, learning_rate)
  # Training pass: a learning rate is supplied, so the vectors are updated
  loop(train, learning_rate=learning_rate)
  # Validation pass: no learning rate, so this only reports the loss
  loop(validation)
# Finally, report the loss on the test split (again without updating anything)
loop(test)

In [None]:
from Chapter_10 import pca, transform

# Find the two principal components of the learned movie vectors
original_vectors = list(movie_vectors.values())
components = pca(original_vectors, 2)

# Group every rating by movie so each movie's average can be reported
ratings_by_movie = defaultdict(list)
for rating in ratings:
  ratings_by_movie[rating.movie_id].append(rating.rating)

# One (movie_id, average rating, title, transformed vector) tuple per movie.
# movie_vectors.keys() iterates in the same order as the .values() used above;
# this assumes transform returns its results in input order.
vectors = [
    (movie_id,
     sum(ratings_by_movie[movie_id]) / len(ratings_by_movie[movie_id]),
     movies[movie_id],
     vector)
    for movie_id, vector in zip(movie_vectors.keys(),
                                transform(original_vectors, components))
]

# Print the 25 movies at each end of the first principal component.
# Sort once and slice both ends; note the slice [-25:] (a bare [-25] would
# print a single entry rather than the last 25).
by_first_component = sorted(vectors, key=lambda v: v[-1][0])
print(by_first_component[:25])
print(by_first_component[-25:])