# Recommend destinations based on collaborative filtering 

## Step 1: Import the Dependencies

In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

## Step 2: Load the Data

In [2]:
# Read the destination metadata table from a CSV in the working directory
# and preview its first rows.
destinations = pd.read_csv("sample_destinations.csv")
destinations.head()

Unnamed: 0,destination_id,title,genre,history,art_and_architecture,nature,adventure,entertainment,health_and_lifestyle,food,industries,religious
0,0,Boudhanath Stupa,['Religious Sites'],True,True,False,False,False,False,False,False,True
1,1,Phewa Tal (Fewa Lake),['Bodies of Water'],False,False,True,False,False,False,False,False,False
2,2,Sarangkot,['Mountains'],False,False,True,False,False,False,False,False,False
3,3,Swayambhunath Temple,['Religious Sites'],True,True,False,False,False,False,False,False,True
4,4,Poon Hill,['Mountains'],False,False,True,False,False,False,False,False,False


In [3]:
# Read the (user_id, destination_id, rating) table from a CSV in the working
# directory and preview its first rows.
ratings=pd.read_csv("sample_user_ratings.csv")
ratings.head()

Unnamed: 0,user_id,destination_id,rating
0,0,62,5
1,0,120,4
2,0,28,4
3,0,4,5
4,0,100,3


## Step 3: Exploratory Data Analysis


In [4]:
# Headline counts for the ratings table: one row per rating.
n_ratings = ratings.shape[0]
n_users = ratings['user_id'].nunique()
n_destinations = ratings['destination_id'].nunique()  # destinations with at least one rating

print(f"Number of ratings: {n_ratings}")
print(f"Number of unique rated destination_id's: {n_destinations}")
print(f"Number of unique users: {n_users}")
print(f"Average number of ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average number of ratings per destination: {round(n_ratings/n_destinations, 2)}")

Number of ratings: 2000
Number of unique rated destination_id's: 200
Number of unique users: 101
Average number of ratings per user: 19.8
Average number of ratings per destination: 10.0


Now, let's take a look at users' rating counts. We can do this using pandas' `groupby()` and `count()`, which group the data by `user_id` and count the number of ratings for each `user_id`.

In [5]:
# Count the rated destinations per user; reset_index(name=...) turns the
# grouped Series into a two-column frame: user_id, n_ratings.
user_freq = (
    ratings
    .groupby('user_id')['destination_id']
    .count()
    .reset_index(name='n_ratings')
)
user_freq.head()

Unnamed: 0,user_id,n_ratings
0,0,24
1,1,14
2,2,20
3,3,26
4,4,22


In [6]:
# Average of the per-user rating counts, formatted to two decimals.
print(f"Mean number of ratings for a given user: {user_freq['n_ratings'].mean():.2f}.")

Mean number of ratings for a given user: 19.80.


### Destinations with the lowest and highest average rating

In [7]:
# Plain (unweighted) mean rating per destination, kept as a one-column frame.
mean_rating = ratings[['destination_id', 'rating']].groupby('destination_id').mean()

# Id of the destination with the smallest mean rating, then its metadata row.
lowest_rated = mean_rating['rating'].idxmin()
destinations.loc[destinations['destination_id'].eq(lowest_rated)]

Unnamed: 0,destination_id,title,genre,history,art_and_architecture,nature,adventure,entertainment,health_and_lifestyle,food,industries,religious
131,131,Mera Peak,['Mountains'],False,False,True,False,False,False,False,False,False


In [8]:
# Id of the destination with the largest mean rating, then its metadata row.
highest_rated = mean_rating['rating'].idxmax()
destinations.loc[destinations['destination_id'].eq(highest_rated)]

Unnamed: 0,destination_id,title,genre,history,art_and_architecture,nature,adventure,entertainment,health_and_lifestyle,food,industries,religious
11,11,Golden Temple (Hiranya Varna Mahavihar),['Religious Sites'],True,True,False,False,False,False,False,False,True


In [9]:
# Every individual rating given to the top-mean destination.
ratings.loc[ratings['destination_id'].eq(highest_rated)]

Unnamed: 0,user_id,destination_id,rating
846,41,11,5
931,46,11,5
1047,52,11,5
1655,82,11,5
1941,97,11,5


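The highest-rated destination has a perfect average, but that average comes from only 5 ratings, so the plain mean is unreliable for destinations with few ratings. A better approach is to look at the [Bayesian average](https://en.wikipedia.org/wiki/Bayesian_average).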

#### Bayesian Average

Bayesian Average is defined as:

$r_{i} = \frac{C \times m + \Sigma{\text{reviews}}}{C+N}$

where $C$ represents our confidence, $m$ represents our prior, and $N$ is the total number of reviews for destination $i$. In this case, our prior will be the average rating across all destinations. By definition, $C$ represents "the typical dataset size". Let's make $C$ be the average number of ratings for a given destination.

In [10]:
# Per-destination rating count and mean. Aggregating the 'rating' Series
# directly yields flat 'count' / 'mean' columns, indexed by destination_id.
destination_stats = ratings.groupby('destination_id')['rating'].agg(['count', 'mean'])
destination_stats

Unnamed: 0_level_0,count,mean
destination_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9,4.555556
1,9,4.333333
2,15,4.466667
3,5,4.600000
4,8,4.750000
...,...,...
195,6,2.833333
196,11,3.000000
197,12,3.000000
198,9,3.222222


In [11]:
# C: confidence weight, taken as the mean number of ratings per destination.
C = destination_stats['count'].mean()
# m: prior, taken as the mean of the per-destination mean ratings.
m = destination_stats['mean'].mean()

def bayesian_avg(ratings):
    """
    Bayesian average of one destination's ratings.

    Computes (C*m + sum of ratings) / (C + number of ratings), where C and m
    are the module-level confidence and prior defined alongside this function.

    Args:
        ratings: the ratings of a single destination (must support
            .sum() and .count(), e.g. a pandas Series)

    Returns:
        the Bayesian-averaged rating
    """
    weighted_total = C * m + ratings.sum()
    effective_count = C + ratings.count()
    return weighted_total / effective_count

# Apply bayesian_avg to each destination's ratings; reset_index(name=...)
# yields a two-column frame: destination_id, bayesian_avg.
bayesian_avg_ratings = (
    ratings
    .groupby('destination_id')['rating']
    .agg(bayesian_avg)
    .reset_index(name='bayesian_avg')
)
# NOTE: destination_stats is overwritten with the merged frame, so this cell
# is meant to be run once after destination_stats is (re)built.
destination_stats = destination_stats.merge(bayesian_avg_ratings, on='destination_id')

In [12]:
# Attach each destination's title. No `on=` is given, so merge joins on the
# columns the two frames have in common.
destination_stats = destination_stats.merge(destinations[['destination_id', 'title']])
# Five destinations with the highest Bayesian average.
destination_stats.sort_values('bayesian_avg', ascending=False).head()

Unnamed: 0,destination_id,count,mean,bayesian_avg,title
58,58,18,4.722222,4.379142,Langtang Valley
30,30,16,4.6875,4.331384,Muktinath Temple
74,74,13,4.769231,4.331129,Babar Mahal Revisted
67,67,13,4.769231,4.331129,Tushita-Nepal
86,86,11,4.818182,4.315047,Makwanpur Gadhi


### Destinations with the least Bayesian averages 

In [13]:
# Five destinations with the lowest Bayesian average.
destination_stats.sort_values('bayesian_avg', ascending=True).head()

Unnamed: 0,destination_id,count,mean,bayesian_avg,title
184,184,12,2.416667,3.027999,Image Ark
132,132,9,2.333333,3.085051,Akash Bhairav Temple
139,139,18,2.777778,3.129142,Sauraha Tharu Culture House
180,180,13,2.692308,3.157216,Dattatreya Temple
111,111,12,2.666667,3.164363,Rara Lake


### Destinations with the highest Bayesian averages

In [14]:
# Five destinations with the highest Bayesian average.
destination_stats.sort_values('bayesian_avg', ascending=False).head()

Unnamed: 0,destination_id,count,mean,bayesian_avg,title
58,58,18,4.722222,4.379142,Langtang Valley
30,30,16,4.6875,4.331384,Muktinath Temple
74,74,13,4.769231,4.331129,Babar Mahal Revisted
67,67,13,4.769231,4.331129,Tushita-Nepal
86,86,11,4.818182,4.315047,Makwanpur Gadhi


## Step 4: Transforming the data

We will be using a technique called [collaborative filtering](https://en.wikipedia.org/wiki/Collaborative_filtering) to generate user recommendations. This technique is based on the assumption of "homophily" - similar users like similar things. Collaborative filtering is a type of unsupervised learning that makes predictions about the interests of a user by learning from the interests of a larger population.

The first step of collaborative filtering is to transform our data into a `user-item matrix` - also known as a "utility" matrix. In this matrix, rows represent users and columns represent items. The beauty of collaborative filtering is that it doesn't require any information about the users or items to generate recommendations. 



The `create_X()` function outputs a sparse matrix X along with four mapper dictionaries:
- **user_mapper:** maps user id to user index
- **destination_mapper:** maps destination id to destination index
- **user_inv_mapper:** maps user index to user id
- **destination_inv_mapper:** maps destination index to destination id

We need these dictionaries because they tell us which row and column of the utility matrix correspond to which destination ID and user ID.

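The **X** matrix is a [scipy.sparse.csr_matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html) which stores the data sparsely. Note that `create_X()` builds it with destinations as rows and users as columns, so each row is one destination's vector of ratings.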

In [15]:
from scipy.sparse import csr_matrix

def create_X(df):
    """
    Generates a sparse destination-by-user ratings matrix from a ratings dataframe.

    Args:
        df: pandas dataframe with 'user_id', 'destination_id' and 'rating' columns

    Returns (in this order):
        X: scipy.sparse.csr_matrix of shape (n_destinations, n_users); rows are
            destinations and columns are users. Ratings that share the same
            (destination, user) pair are summed by the sparse constructor.
        user_mapper: dict that maps user id's to user (column) indices
        destination_mapper: dict that maps destination id's to destination (row) indices
        user_inv_mapper: dict that maps user indices to user id's
        destination_inv_mapper: dict that maps destination indices to destination id's
    """
    # np.unique returns the sorted distinct ids; compute each array once and
    # derive both the sizes and all four mappers from it so they always agree.
    user_ids = np.unique(df["user_id"])
    destination_ids = np.unique(df["destination_id"])
    N = len(user_ids)
    M = len(destination_ids)

    user_mapper = {user_id: index for index, user_id in enumerate(user_ids)}
    destination_mapper = {dest_id: index for index, dest_id in enumerate(destination_ids)}

    user_inv_mapper = dict(enumerate(user_ids))
    destination_inv_mapper = dict(enumerate(destination_ids))

    # Translate every rating row into its (row, column) position in X.
    user_index = [user_mapper[i] for i in df['user_id']]
    destination_index = [destination_mapper[i] for i in df['destination_id']]

    X = csr_matrix((df["rating"], (destination_index, user_index)), shape=(M, N))

    return X, user_mapper, destination_mapper, user_inv_mapper, destination_inv_mapper

In [16]:
# Build the sparse ratings matrix and the id <-> index lookup dicts from the ratings table.
X, user_mapper, destination_mapper, user_inv_mapper, destination_inv_mapper = create_X(ratings)

Let's check out the sparsity of our X matrix.

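Here, we calculate sparsity as the fraction of cells that are filled: we divide the number of non-zero elements by the total number of elements, as described in the equation below. (Strictly speaking, this ratio is the matrix *density*; the lower it is, the sparser the matrix.)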

$$S=\frac{\text{# non-zero elements}}{\text{total elements}}$$

In [17]:
# Fraction of cells in X that hold a non-zero value (non-zero cells / all cells).
sparsity = X.count_nonzero()/(X.shape[0]*X.shape[1])

# Report the fraction as a percentage rounded to two decimals.
print(f"Matrix sparsity: {round(sparsity*100,2)}%")

Matrix sparsity: 9.44%


Only 9.44% of cells in our user-item matrix are populated with ratings. User-item matrices are typically very sparse. A general rule of thumb is that your matrix sparsity should be no lower than 0.5% to generate decent results.

### Writing your matrix to a file

We're going to save our user-item matrix for the next part of this tutorial series. Since our matrix is represented as a scipy sparse matrix, we can use the [scipy.sparse.save_npz](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.save_npz.html) method to write the matrix to a file.

In [18]:
from scipy.sparse import save_npz

# Write X to 'user_item_matrix.npz' (relative path, so it lands in the working directory).
save_npz('user_item_matrix.npz', X)

## Step 5: Finding similar destinations using k-Nearest Neighbours

This approach looks for the $k$ nearest neighbours of a given destination by identifying $k$ points in the dataset that are closest to destination $m$. kNN makes use of distance metrics such as:

1. Cosine similarity
2. Euclidean distance
3. Manhattan distance
4. Pearson correlation 

Although difficult to visualize, we are working in an N-dimensional space where N represents the number of users in our X matrix: each destination is a point whose coordinates are the ratings it received from each user.

In [19]:
from sklearn.neighbors import NearestNeighbors

def find_similar_destinations(destination_id, X, k, metric='cosine', show_distance=False):
    """
    Finds k-nearest neighbours for a given destination id.

    Relies on the module-level `destination_mapper` / `destination_inv_mapper`
    dicts (as returned by `create_X`) to translate between destination ids and
    row indices of X.

    Args:
        destination_id: id of the destination of interest
        X: utility matrix whose rows are destinations (sparse or dense)
        k: number of similar destinations to retrieve
        metric: distance metric for kNN calculations
        show_distance: if True, also return the distance to each neighbour

    Returns:
        list of up to k similar destination IDs, nearest first (fewer than k
        when X has fewer than k other rows). If show_distance is True, a tuple
        (ids, distances) where distances[i] is the distance to ids[i].

    Raises:
        KeyError: if destination_id is not present in destination_mapper
    """
    destination_ind = destination_mapper[destination_id]
    destination_vec = X[destination_ind]
    # A dense row is 1-D, while kneighbors expects a 2-D (1, n_features) query.
    if isinstance(destination_vec, np.ndarray):
        destination_vec = destination_vec.reshape(1, -1)

    # The query row is part of X, so ask for one extra neighbour; never ask
    # for more rows than X has, which would make kneighbors raise.
    n_neighbors = min(k + 1, X.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)
    # Always fetch distances: kneighbors then returns a (distances, indices)
    # pair, each of shape (1, n_neighbors).
    distances, indices = kNN.kneighbors(destination_vec, return_distance=True)

    neighbour_ids = []
    neighbour_distances = []
    for distance, index in zip(distances[0], indices[0]):
        # Skip the query itself by index rather than by position: on distance
        # ties it is not guaranteed to be ranked first.
        if index == destination_ind:
            continue
        neighbour_ids.append(destination_inv_mapper[int(index)])
        neighbour_distances.append(float(distance))

    # If the query row was not among the results, one surplus neighbour remains.
    neighbour_ids = neighbour_ids[:k]
    neighbour_distances = neighbour_distances[:k]

    if show_distance:
        return neighbour_ids, neighbour_distances
    return neighbour_ids

`find_similar_destinations()` takes in a `destination_id` and the X matrix, and outputs a list of $k$ destinations that are similar to the `destination_id` of interest.

Let's see how it works in action. We will first create another mapper that maps `destination_id` to `title` so that our results are interpretable. 

In [20]:
# Lookup from destination_id to title so the neighbour ids can be shown as names.
destination_titles = dict(zip(destinations['destination_id'], destinations['title']))

# Destination to find neighbours for.
destination_id = 1

# Five nearest destinations under the default (cosine) metric.
similar_ids = find_similar_destinations(destination_id, X, k=5)
destination_title = destination_titles[destination_id]

print(f"Because you watched {destination_title}")
for i in similar_ids:
    print(destination_titles[i])

Because you watched Phewa Tal (Fewa Lake)
Budhanilkantha
Pokhara Museum
Garden of Dreams
Gurunghe Hill
Kala Patthar


The results above show the 5 most similar destinations to Phewa Tal.

<b>Note that these recommendations are based solely on user-item ratings. Destination features such as genres are not taken into consideration in this approach. </b>

You can also play around with the kNN distance metric and see what results you would get if you use "manhattan" or "euclidean" instead of "cosine".

In [22]:
# Rebuilds the same destination_id -> title lookup as the previous demo cell.
destination_titles = dict(zip(destinations['destination_id'], destinations['title']))

destination_id = 1
# Ten nearest destinations, this time measured with Euclidean distance.
similar_ids = find_similar_destinations(destination_id, X, k=10, metric="euclidean")

destination_title = destination_titles[destination_id]
print(f"Because you watched {destination_title}:")
# Show the raw neighbour ids first, then their titles.
print(similar_ids)
for i in similar_ids:
    print(destination_titles[i])

Because you watched Phewa Tal (Fewa Lake):
[143, 168, 191, 124, 165, 131, 117, 189, 150, 155]
Gosainkunda
Valle del Naar
Casino Royale
Sun Kosi
Taleju Bell
Mera Peak
Gurunghe Hill
Datankali Temple
Patan Gate
Dolakha Bhimsen Mandir
