# Browsing Books - A Recommendation Engine for Books
For CMPT3520 Machine Learning II <br/>
Annabell Rodriguez, Laura Brin, Sandra Alex

## Introduction

### Business Problem

### Evaluation Metrics

## Browsing Books

### Loading data

In [None]:
#Loading Libraries
from __future__ import print_function

import numpy as np
import pandas as pd
import collections
from mpl_toolkits.mplot3d import Axes3D
from IPython import display
from matplotlib import pyplot as plt
import sklearn
import sklearn.manifold
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
tf.logging.set_verbosity(tf.logging.ERROR)

# Add some convenience functions to Pandas DataFrame.
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.3f}'.format
def mask(df, key, function):
  """Filter `df` using the result of `function` applied to column `key`.

  Args:
    df: DataFrame to filter.
    key: label of the column handed to `function`.
    function: callable that receives `df[key]` and returns an indexer
      (typically a boolean Series) used to subset `df`.
  Returns:
    The result of indexing `df` with that indexer.
  """
  column = df[key]
  selector = function(column)
  return df[selector]

def flatten_cols(df):
  """Collapse multi-part column labels into single space-joined strings.

  Every column label is joined with one space and stripped of surrounding
  whitespace. The columns are replaced on `df` itself, and that same
  DataFrame is returned.
  """
  flat_names = []
  for parts in df.columns.values:
    flat_names.append(' '.join(parts).strip())
  df.columns = flat_names
  return df

# Attach the helpers as DataFrame methods so they can be used in method chains
# (e.g. `.flatten_cols()` at the end of a groupby/agg expression).
# NOTE(review): pandas already ships a `DataFrame.mask` method; this assignment
# replaces it for the whole session - confirm nothing relies on the built-in.
pd.DataFrame.mask = mask
pd.DataFrame.flatten_cols = flatten_cols

# Install Altair and activate its colab renderer.
#print("Installing Altair...")
#!pip install git+git://github.com/altair-viz/altair.git
#!pip install altair vega_datasets
import altair as alt
alt.data_transformers.enable('default', max_rows=None)
alt.renderers.enable('colab')
#print("Done installing Altair.")
from sklearn.metrics.pairwise import cosine_similarity 

In [None]:
#Loading Dataset
from ast import literal_eval
 
# Forward slashes are accepted on Windows as well as Linux/macOS, so the
# notebook no longer depends on Windows-only path separators.
users = pd.read_csv("Datasets/Users.csv")
ratings = pd.read_csv("Datasets/Ratings.csv")
# ISBN is forced to a string (object) column instead of letting pandas infer a type.
books = pd.read_csv("Datasets/Books.csv", dtype={"ISBN": object}, low_memory=False)


### Books

In [None]:
books.head()

In [None]:
books.shape

In [None]:
books.isnull().sum()

In [None]:
books.duplicated().sum()

### Users

In [None]:
users.head()

In [None]:
# Split "city, state, country" into separate columns. At most two splits are
# made, so any further commas stay inside the third part.
# The guard lets the cell be re-run after `Location` has already been dropped.
if "Location" in users.columns:
    # `n` is passed by keyword and the parts are read with `.str[i]`; newer
    # pandas no longer supports unpacking the `.str` accessor directly.
    location_parts = users["Location"].str.split(",", n=2)
    # `.str[i]` yields NaN where a location has fewer than i + 1 parts.
    users['city'] = location_parts.str[0]
    users['state'] = location_parts.str[1]
    users['country'] = location_parts.str[2]
    users.drop(['Location'], axis=1, inplace=True, errors='ignore')
users.head()

In [None]:
# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: that is chained assignment, which newer pandas warns about
# and which does not update the parent frame under copy-on-write.
location_columns = ["country", "state", "city"]
users[location_columns] = users[location_columns].fillna("Unknown")

In [None]:
# Number of users per country; countries with 7500 users or fewer have their
# count replaced by the label "Other".
country_counts = users["country"].value_counts()
is_common_country = country_counts > 7500
country_list = country_counts.where(is_common_country, other="Other")
print(country_list)


In [None]:
def _country_bucket(raw_country):
    """Return the space-stripped country, or "Other" if `country_list` marks it so."""
    if country_list[raw_country] == "Other":
        return "Other"
    return raw_country.strip(" ")


users["new_country"] = users["country"].apply(_country_bucket)
users["new_country"].value_counts()


In [None]:
# Same bucketing for states: states with 5000 users or fewer become "Other".
state_counts = users["state"].value_counts()
state_list = state_counts.where(state_counts > 5000, other="Other")
users["new_state"] = users["state"].map(
    lambda raw: "Other" if state_list[raw] == "Other" else raw.strip(" "))
users["new_state"].value_counts()

In [None]:
# Same bucketing for cities: cities with 1500 users or fewer become "Other".
city_counts = users["city"].value_counts()
city_list = city_counts.where(city_counts > 1500, other="Other")
users["new_city"] = users["city"].map(
    lambda raw: "Other" if city_list[raw] == "Other" else raw.strip(" "))
users["new_city"].value_counts()

In [None]:
users.describe()

In [None]:
import random
random.seed(42)


def _impute_age(age):
    """Keep ages below 100; replace anything else with a random age in [24, 44].

    NaN also fails the `age < 100` comparison, so missing ages are replaced
    as well. `random.randint` already returns an int, so no rounding is needed.
    """
    if age < 100:
        return age
    return random.randint(24, 44)


users['Age'] = users['Age'].apply(_impute_age)
users.describe()

In [None]:
users['Age'].value_counts()

In [None]:
users.describe(include=[object])

In [None]:
users.drop(['city','state','country'],axis=1,inplace=True, errors='ignore')

### Ratings

In [None]:
ratings.head()

In [None]:
ratings.shape

In [None]:
ratings_title=ratings.merge(books,on='ISBN')
ratings_title

In [None]:
# Drop the columns that later cells do not read. `columns=` already implies the
# axis, and `errors='ignore'` keeps the cell re-runnable like the other drop
# calls in this notebook.
ratings_title.drop(columns=["ISBN", "Image-URL-S", "Image-URL-M"], inplace=True, errors='ignore')

In [None]:
users.head()

In [None]:
df = ratings_title.merge(users.drop("Age", axis=1), on="User-ID")
df.head(10)

In [None]:
df['new_country'].unique()

In [None]:
df[df['new_country']=='canada'].head()

### Visuals

To visualize genres, books are assigned a random genre from their genre list

In [None]:
df

In [None]:
books.reset_index(drop=True, inplace=True)

In [None]:
# df remove other columns rather than book title and user id
df[['User-ID', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'new_country', 'new_state', 'new_city']]

In [None]:
df

In [None]:
df[['Book-Title', 'Year-Of-Publication', 'Publisher', 'Image-URL-L', 'new_country']]

In [None]:
# Rating count and mean rating per (book title, country). The two-level
# aggregate labels are flattened to 'Book-Rating count' / 'Book-Rating mean'.
rating_stats = (
    df[['Book-Title', 'new_country', 'Book-Rating']]
    .groupby(["Book-Title", 'new_country'], as_index=False)
    .agg({'Book-Rating': ['count', 'mean']})
    .flatten_cols()
)
# Attach the book metadata to every (title, country) row.
book_ratings = books.merge(rating_stats, on='Book-Title')

# Multi-selection over countries, used to colour the bars below.
country_filter = alt.selection_multi(fields=['new_country'])

# Selected countries keep their own colour; the rest are drawn light gray.
country_colour = alt.condition(
    country_filter,
    alt.Color("new_country:N"),
    alt.value('lightgray'))

# Horizontal bar chart of row counts per country that owns the selection.
country_chart = (
    alt.Chart()
    .mark_bar()
    .encode(x="count()", y=alt.Y('new_country'), color=country_colour)
    .properties(height=600, selection=country_filter)
)

book_ratings

In [None]:
(book_ratings[['Book-Title', 'new_country', 'Book-Rating count', 'Book-Rating mean']]
 .sort_values('Book-Rating count', ascending=False)
 .head(10))

In [None]:
def filtered_hist(field, label, filter):
  """Builds a two-layer histogram chart for `field`.

  One layer shows the histogram of the rows passing `filter`; the other shows
  the histogram of all rows in light gray at 70% opacity. The layers use
  independent y scales.
  Args:
    field: the field for which to generate the histogram.
    label: String used as the x-axis title.
    filter: an alt.Selection object to be used to filter the data.
  Returns:
    The layered Altair chart.
  """
  x_axis = alt.X(field, bin=alt.Bin(maxbins=10), title=label)
  histogram = alt.Chart().mark_bar().encode(x=x_axis, y="count()")
  histogram = histogram.properties(width=300)

  selected_layer = histogram.transform_filter(filter)
  background_layer = histogram.encode(
      color=alt.value('lightgray'), opacity=alt.value(.7))

  layered = alt.layer(selected_layer, background_layer)
  return layered.resolve_scale(y='independent')

In [None]:
book_ratings

In [None]:
# Display the number of ratings and average rating per book.
alt.hconcat(
    filtered_hist('Book-Rating count', '# ratings / book', country_filter),
    filtered_hist('Book-Rating mean', 'mean rating', country_filter),
    country_chart,
    data=book_ratings)

## Content-Based Filtering

## Collaborative Filtering Based Recommender System

In [None]:
# Users with more than 200 ratings. Selecting 'Book-Rating' before counting
# avoids computing counts for every other column of df.
ratings_per_user = df.groupby('User-ID')['Book-Rating'].count()
knowledgable_users = ratings_per_user[ratings_per_user > 200].index

In [None]:
filtered_rating = df[df['User-ID'].isin(knowledgable_users)]

In [None]:
# Titles with at least 50 ratings among the retained users. Selecting
# 'Book-Rating' before counting avoids counting every other column.
ratings_per_title = filtered_rating.groupby('Book-Title')['Book-Rating'].count()
famous_books = ratings_per_title[ratings_per_title >= 50].index

In [None]:
final_ratings =  filtered_rating[filtered_rating['Book-Title'].isin(famous_books)]

In [None]:
pt = final_ratings.pivot_table(index='Book-Title',columns='User-ID'
                          ,values='Book-Rating')
pt

In [None]:
pt.fillna(0,inplace=True)
pt

In [None]:
similarity_score = cosine_similarity(pt)

In [None]:
similarity_score.shape

In [None]:
def recommend(book_name, top_n=5):
    """Return the books most similar to `book_name`.

    Similarities are read from the module-level `similarity_score` matrix, whose
    rows are addressed by position in `pt.index`; book details come from `books`.

    Args:
        book_name: title to look up in `pt.index`.
        top_n: number of recommendations to return. Defaults to 5, the value
            that used to be hard-coded.
    Returns:
        A NumPy array with one row per recommended book holding its title,
        author and medium image URL.
    Raises:
        IndexError: if `book_name` is not present in `pt.index`.
    """
    matches = np.where(pt.index == book_name)[0]
    if len(matches) == 0:
        # Same exception type the bare `[0][0]` lookup raised, now with a usable message.
        raise IndexError("%r is not among the titles in pt.index" % (book_name,))
    index = matches[0]

    # Exclude the query book explicitly instead of assuming it sorts first:
    # with tied scores (or an all-zero rating vector) another book can take
    # position 0 and the query book would be returned as a recommendation.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_score[index])
        if position != index
    ]
    similar_books = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:top_n]

    data = []
    for position, _ in similar_books:
        # drop_duplicates keeps a single row when `books` lists the title more than once.
        book_row = books[books['Book-Title'] == pt.index[position]].drop_duplicates('Book-Title')
        item = (list(book_row['Book-Title'].values)
                + list(book_row['Book-Author'].values)
                + list(book_row['Image-URL-M'].values))
        data.append(item)
    return np.array(data)

In [None]:
title = "Message in a Bottle"

In [None]:
recommendations = recommend(title)
recommendations

In [None]:
titles = recommendations[:,:1]

In [None]:
# Rows of df for the query title and every recommended title. `isin` does the
# membership test in one vectorised pass; np.ravel accepts `titles` whether it
# is a (k, 1) column slice or a flat array.
titles_to_show = [str(t) for t in np.ravel(titles)] + [title]
df[df['Book-Title'].isin(titles_to_show)]

## Collaborative Filtering-Matrix Factorization

In [None]:
# Create a dictionary of user and movie ids
#user_ids = dict(zip(ratings['User-ID'].unique(), range(len(ratings['User-ID'].unique()))))
#book_ids = dict(zip(ratings['ISBN'].unique(), range(len(ratings['ISBN'].unique()))))

### Sparse Representation of the Ratings Matrix

The next 5 code blocks contain functions taken from Google's Machine Learning Recommendation Systems Colab document. <br/>
* The first function can be called to split the dataframe into 2 subsets, one for train and one for test. This uses the sample function rather than a train_test_split, and the fraction can be selected when called.
* The second function creates the sparse tensor from the ratings dataframe. Because the rated book list contains almost 8000 entries, most books are unrated by most users, so a sparse tensor is used to keep the matrix small. <br/>
* The third function calculates the mean squared error when given the sparse matrix A and two embedding matrices (U,V). Predictions are made by matrix multiplying the embedding tensors and mapping them to the sparse matrix space. The gather_nd function for TensorFlow combines slices from the new matrix and saves them to match the shape specified by the indices (in this example the size of the sparse matrix). A loss calculation is then performed to evaluate the prediction against the true values recorded in the sparse matrix.<br/>
* The fourth code block defines a helper class for creating the Collaborative Filtering Model. <br/>
* The fifth block builds the Collaborative Filtering Model

In [None]:
# Utility to split the data into training and test sets.
def split_dataframe(df, holdout_fraction=0.1, random_state=None):
  """Splits a DataFrame into training and test sets.

  Rows are assigned by position rather than by index label, so the split is
  also correct when `df` has repeated index labels (a label-based mask would
  remove every row sharing a label with a sampled row from the training set).

  Args:
    df: a dataframe.
    holdout_fraction: fraction of dataframe rows to use in the test set.
    random_state: optional seed (or random state object) forwarded to
      `Series.sample` to make the split reproducible. The default, None,
      keeps the previous unseeded behaviour.
  Returns:
    train: dataframe for training
    test: dataframe for testing
  """
  num_rows = len(df)
  # Sample row positions, not labels.
  row_positions = pd.Series(np.arange(num_rows))
  test_positions = row_positions.sample(
      frac=holdout_fraction, replace=False, random_state=random_state).values
  in_test = np.zeros(num_rows, dtype=bool)
  in_test[test_positions] = True
  test = df.iloc[test_positions]
  train = df.iloc[~in_test]
  return train, test

In [None]:

def build_rating_sparse_tensor(ratings_df, dense_shape=None):
  """Builds the sparse ratings matrix from a ratings DataFrame.

  Args:
    ratings_df: a pd.DataFrame with `User-ID`, `Book_id` and `Book-Rating`
      columns. `User-ID` and `Book_id` are used directly as the row and column
      indices of the matrix.
    dense_shape: optional [num_users, num_books] shape of the matrix. When
      omitted, the shape is taken from the module-level `users` and
      `all_books_final` DataFrames, as before.
  Returns:
    a tf.SparseTensor representing the ratings matrix.
  """
  # NOTE(review): no visible cell creates a `Book_id` column or the
  # `all_books_final` frame - confirm both exist before this is called.
  # NOTE(review): every index must be smaller than the matching `dense_shape`
  # entry; raw ids may need remapping to 0..n-1 - TODO confirm.
  if dense_shape is None:
    dense_shape = [users.shape[0], all_books_final.shape[0]]
  indices = ratings_df[['User-ID', 'Book_id']].values
  values = ratings_df['Book-Rating'].values
  return tf.SparseTensor(
      indices=indices,
      values=values,
      dense_shape=dense_shape)

In [None]:
def sparse_mean_square_error(sparse_ratings, user_embeddings, book_embeddings):
  """Computes the MSE between observed ratings and embedding dot products.

  Args:
    sparse_ratings: A SparseTensor rating matrix, of dense_shape [N, M]
    user_embeddings: A dense Tensor U of shape [N, k] where k is the embedding
      dimension, such that U_i is the embedding of user i.
    book_embeddings: A dense Tensor V of shape [M, k] where k is the embedding
      dimension, such that V_j is the embedding of book j.
  Returns:
    A scalar Tensor representing the MSE between the true ratings and the
      model's predictions.
  """
  # Gather only the embedding rows of the rated (user, book) pairs and take
  # their dot products. This yields the same predictions as gathering from
  # U V^T without materialising the full [N, M] product.
  user_rows = tf.gather(user_embeddings, sparse_ratings.indices[:, 0])
  book_rows = tf.gather(book_embeddings, sparse_ratings.indices[:, 1])
  predictions = tf.reduce_sum(user_rows * book_rows, axis=1)
  loss = tf.losses.mean_squared_error(sparse_ratings.values, predictions)
  return loss

The CFModel helper class is initialized with 3 arguments:
* Embeddings which is a dictionary of the User_ID and Book ISBNs and the related Tensors
* A loss calculation, in this case MSE on the sparse tensor and embedding spaces of the training dataset
* A dictionary of metrics calculated for the train and test datasets converted to a list

In [None]:
class CFModel(object):
  """Simple class that represents a collaborative filtering model"""
  def __init__(self, embedding_vars, loss, metrics=None):
    """Initializes a CFModel.
    Args:
      embedding_vars: A dictionary of tf.Variables.
      loss: A float Tensor. The loss to optimize.
      metrics: optional list of dictionaries of Tensors. The metrics in each
        dictionary will be plotted in a separate figure during training.
    """
    self._embedding_vars = embedding_vars
    self._loss = loss
    self._metrics = metrics
    # Filled with the evaluated embedding values at the end of train().
    self._embeddings = {k: None for k in embedding_vars}
    # Created on the first call to train() and reused afterwards.
    self._session = None

  @property
  def embeddings(self):
    """The embeddings dictionary."""
    return self._embeddings

  def train(self, num_iterations=100, learning_rate=1.0, plot_results=True,
            optimizer=tf.train.GradientDescentOptimizer):
    """Trains the model.
    Args:
      num_iterations: number of iterations to run.
      learning_rate: optimizer learning rate.
      plot_results: whether to plot the results at the end of training.
      optimizer: the optimizer to use. Default to GradientDescentOptimizer.
    Returns:
      The metrics evaluated at the last iteration, in the same structure as
      the `metrics` passed to the constructor.
    """
    with self._loss.graph.as_default():
      opt = optimizer(learning_rate)
      train_op = opt.minimize(self._loss)
      local_init_op = tf.group(
          tf.variables_initializer(opt.variables()),
          tf.local_variables_initializer())
      if self._session is None:
        self._session = tf.Session()
        with self._session.as_default():
          self._session.run(tf.global_variables_initializer())
          self._session.run(tf.tables_initializer())
          tf.train.start_queue_runners()

    with self._session.as_default():
      local_init_op.run()
      iterations = []
      metrics = self._metrics or ({},)
      # Built from `metrics` (not self._metrics) so a model created without
      # metrics does not fail here.
      metrics_vals = [collections.defaultdict(list) for _ in metrics]

      # Train and append results.
      for i in range(num_iterations + 1):
        _, results = self._session.run((train_op, metrics))
        if (i % 10 == 0) or i == num_iterations:
          print("\r iteration %d: " % i + ", ".join(
                ["%s=%f" % (k, v) for r in results for k, v in r.items()]),
                end='')
          iterations.append(i)
          for metric_val, result in zip(metrics_vals, results):
            for k, v in result.items():
              metric_val[k].append(v)

      # Snapshot the trained embedding values so they are available through
      # the `embeddings` property.
      for k, v in self._embedding_vars.items():
        self._embeddings[k] = v.eval()

      if plot_results:
        # Plot the metrics, one subplot per metrics dictionary.
        num_subplots = len(metrics)+1
        fig = plt.figure()
        fig.set_size_inches(num_subplots*10, 8)
        for i, metric_vals in enumerate(metrics_vals):
          ax = fig.add_subplot(1, num_subplots, i+1)
          for k, v in metric_vals.items():
            ax.plot(iterations, v, label=k)
          ax.set_xlim([1, num_iterations])
          ax.legend()
      return results

The `build_model` function takes as input the ratings dataframe, the dimension of the embedding vectors, and the standard deviation of the random initial embeddings <br/>
The code first creates 2 subsets of data for the test and train splits <br/>
It then creates the sparse tensor for each split <br/>
The embedding spaces U and V are created using the provided dimensions and a normal distribution from the train data split <br/>
Loss is calculated for MSE on the sparse tensor and embedding spaces and saved to a metrics dictionary
Finally the embeddings are labelled and saved to a dictionary. 
The build_model function returns the model created by the helper function

In [None]:
def build_model(ratings, embedding_dim=3, init_stddev=1.):
  """Builds a matrix-factorization CFModel from a ratings DataFrame.

  The ratings are split into train and test sets, each is converted to a
  sparse tensor, and user/book embedding variables are created from a normal
  distribution. The model optimizes the train MSE and tracks both errors.
  Args:
    ratings: a DataFrame of the ratings
    embedding_dim: the dimension of the embedding vectors.
    init_stddev: float, the standard deviation of the random initial embeddings.
  Returns:
    model: a CFModel.
  """
  train_df, test_df = split_dataframe(ratings)

  # Sparse ratings matrices for both splits.
  sparse_train = build_rating_sparse_tensor(train_df)
  sparse_test = build_rating_sparse_tensor(test_df)

  # Randomly initialised embeddings, sized from the train tensor's dense shape.
  num_users = sparse_train.dense_shape[0]
  user_factors = tf.Variable(
      tf.random_normal([num_users, embedding_dim], stddev=init_stddev))
  num_books = sparse_train.dense_shape[1]
  book_factors = tf.Variable(
      tf.random_normal([num_books, embedding_dim], stddev=init_stddev))

  train_error = sparse_mean_square_error(sparse_train, user_factors, book_factors)
  test_error = sparse_mean_square_error(sparse_test, user_factors, book_factors)

  error_metrics = {'train_error': train_error, 'test_error': test_error}
  embedding_vars = {"User-ID": user_factors, "Book_id": book_factors}
  return CFModel(embedding_vars, train_error, [error_metrics])

In [None]:
ratings

Building the Collaborative Filtering Matrix Model

In [None]:
matrix_model=build_model(ratings, embedding_dim=3, init_stddev=0.5)
matrix_model.train(num_iterations=100, learning_rate=0.1, plot_results=True)

## Collaborative Filtering-autoencoder Deep Neural Network

## Recommendation System

## Performance Evaluation

## Conclusion