In [96]:
# import
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import random
np.seterr(divide='raise', over='raise', under='raise', invalid='raise')

from data import CITIES, BUSINESSES, USERS, REVIEWS, TIPS, CHECKINS

In [97]:
def ratings(city):
    reviews = REVIEWS[city]
    ratings = pd.DataFrame(reviews, columns=['user_id', 'business_id', 'stars'])
    # onthoud de user en business ids die meer dan 9 keer zijn gereviewd
    aantal_reviews_user = ratings.groupby('user_id').size()
    voldoende_reviews_user = aantal_reviews_user[aantal_reviews_user > 9].index
    
    # maak een dataframe met alleen de gebruikers die meer dan 9 keer een rating hebben gegeven
    ratings = ratings[ratings['user_id'].isin(voldoende_reviews_user)]
    return ratings

# Helper functies

In [98]:
def get_rating(ratings, user_id, business_id):
    """Given a userId and movieId, this function returns the corresponding rating.
       Should return NaN if no rating exists."""
    # select the rating where the userId and the businessId match the given Ids
    rating = ratings['stars'][(ratings['user_id'] == user_id) & (ratings['business_id'] == business_id)]
    
    # if the user has not rated the movie, return NaN
    if len(rating) == 0:
        return np.nan
    
    return rating

In [99]:
def get_city(user_id):
    """returned de stad waarin de gebruiker de meeste ratings heeft gegeven aan bedrijven."""
    ratings_cities = pd.DataFrame(columns=['business_id', 'user_id', 'city'])
    for city in CITIES:
        reviews = REVIEWS[city]
        df = pd.DataFrame(reviews, columns=['business_id', 'user_id'])
        df['city'] = city
        ratings_cities = ratings_cities.append(df)
    ratings_cities = ratings_cities[ratings_cities['user_id'] == user_id]
    
    # als de gebruiker nog geen ratings heeft gegeven, kies een random city
    if ratings_cities.empty:
        return random.choice(CITIES)
    
    ratings_cities = ratings_cities.groupby('city').size().sort_values(ascending=False)
    city = ratings_cities.index[0]
    
    return city

In [100]:
def mean_center_columns(matrix):
    """de matrix - het gemiddelde van de columns, om bias te verkomen"""
    return matrix - matrix.mean()

In [101]:
def cosine_similarity(matrix, id1, id2):
    """"Compute the cosine similarity between two rows."""
    # only take the features that have values for both id1 and id2
    selected_features = matrix.loc[id1].notna() & matrix.loc[id2].notna()
    
    # if no matching features, return NaN
    if not selected_features.any():
        return 0.0
    
    # get the features from the matrix
    features1 = matrix.loc[id1][selected_features]
    features2 = matrix.loc[id2][selected_features]
    
    # if the id is compered with itself, return maximum similarity (1)
    if id1 == id2:
        return 1
    
    # calculate the counter and the caller based on the formula
    counter = (features1 * features2).sum()
    caller = np.sqrt(np.square(features1).sum()) * np.sqrt(np.square(features2).sum())
    
    # if the caller is 0, return 0
    if caller == 0:
        return 0.0
    
    # else return the cosine similarity
    return counter / caller

In [102]:
def create_similarity_matrix_cosine(matrix):
    """ creates the similarity matrix based on cosine similarity """
    similarity_matrix = pd.DataFrame(0, index=matrix.index, columns=matrix.index, dtype=float)
    
    # calculate the cosine similarity for all posible combinations and put in dataframe
    ids = matrix.index
    for id1 in ids:
        for id2 in ids:
            similarity = cosine_similarity(matrix, id1, id2)
            similarity_matrix[id1][id2] = similarity
            
    return similarity_matrix

In [103]:
def select_neighborhood(similarity_matrix, utility_matrix, target_user, target_business):
    """selects all items with similarity > 0"""
    # check if target user is in utility matrix
    if not target_user in utility_matrix.columns:
        return np.nan
    
    # check if target business is in utility matrix
    if not target_business in utility_matrix.index:
        return np.nan
    
    # select the movies the target user has seen
    selected_business = list(utility_matrix.index[utility_matrix.loc[:, target_user].notna()])
    
    # select the movies from selected_films with a similarity bigger than 0
    comparable_business = similarity_matrix[target_business][similarity_matrix.index.isin(selected_business)]
    comparable_business = comparable_business[comparable_business > 0]
    return comparable_business

In [104]:
def weighted_mean(neighborhood, utility_matrix, user_id):
    """"Compute the weighted mean of the selected movies."""
    # if the movie has no neighbors, return NaN
    if neighborhood is np.nan:
        return np.nan
    
    # calculate the counter and the caller given the formula
    business_ids = list(neighborhood.index)
    ratings = utility_matrix[user_id][utility_matrix.index.isin(business_ids)]
    counter = (neighborhood * ratings).sum()
    caller = neighborhood.sum()
    
    # if the caller is 0, return 0
    if caller == 0:
        return np.nan
    
    # else return the weighted mean
    return counter / caller

In [105]:
def split_data(data,d = 0.8):
    """ split data in a training and test set 
       `d` is the fraction of data in the training set"""
    np.random.seed(seed=5)
    mask_test = np.random.rand(data.shape[0]) < d
    return data[mask_test], data[~mask_test]

In [106]:
def number_of_ratings(ratings):
    """ count the number of ratings of a dataset """
    return ratings.shape[0]

In [107]:
def mse(predicted_ratings):
    # calculate based on the rating and the predicted rating the mse using the given formula
    diff = predicted_ratings['stars'] - predicted_ratings['predicted_rating']
    return (diff**2).mean()

# User based

In [108]:
def utility_user(ratings):
    """ takes a rating table as input and computes the utility matrix of user based """
    # get business and user id's
    business_ids = ratings['business_id'].unique()
    user_ids = ratings['user_id'].unique()

    # create empty data frame
    pivot_data = pd.DataFrame(np.nan, columns=business_ids, index=user_ids, dtype=float)
    
    for index, row in ratings.iterrows():
        pivot_data[row['business_id']][row['user_id']] = row['stars']
        
    
    return pivot_data

In [109]:
def predict_ratings_user_based(similarity, utility, test_data):
    # make a copy of the test data
    copy_test_data = test_data.copy()
    
    # iterrate over the rows of the test data to calculate the predicted data for every row in the test data
    for x, y in copy_test_data.iterrows():
        # use the made functions item based
        neighborhood = select_neighborhood(similarity, utility, y['business_id'], y['user_id'])
        predicted_rating = weighted_mean(neighborhood, utility, y['business_id'])
        copy_test_data.loc[x, 'predicted_rating'] = predicted_rating
    return copy_test_data

# Item based

In [110]:
def predict_ratings_item_based(similarity, utility, test_data):
    # make a copy of the test data
    copy_test_data = test_data.copy()
    
    # iterrate over the rows of the test data to calculate the predicted data for every row in the test data
    for x, y in copy_test_data.iterrows():
        # use the made functions item based
        neighborhood = select_neighborhood(similarity, utility, y['user_id'], y['business_id'])
        predicted_rating = weighted_mean(neighborhood, utility, y['user_id'])
        copy_test_data.loc[x, 'predicted_rating'] = predicted_rating
    return copy_test_data

# Content based

In [111]:
def df_businesses_categorie(city):
    """Maakt dataframe met alle bedrijven in de stad en de bijbehorende categoriën."""
    reviews = BUSINESSES[city]
    business_categories = pd.DataFrame(reviews, columns=['business_id', 'categories'])
    return business_categories.dropna()

In [112]:
def extract_categories(businesses):
    """Create an unfolded genre dataframe. Unpacks genres seprated by a ',' into seperate rows.

    Arguments:
    businesses -- a dataFrame containing at least the columns 'business_id' and 'categories' 
              where genres are seprated by ','
    """
    categories_m = businesses.apply(lambda row: pd.Series([row['business_id']] + row['categories'].lower().split(",")), axis=1)
    stack_categories = categories_m.set_index(0).stack()
    df_stack_categories = stack_categories.to_frame()
    df_stack_categories['business_id'] = stack_categories.index.droplevel(1)
    df_stack_categories.columns = ['categories', 'business_id']
    return df_stack_categories.reset_index()[['business_id', 'categories']]

In [113]:
def pivot_categories(df):
    """Create a one-hot encoded matrix for categories.
    
    Arguments:
    df -- a dataFrame containing at least the columns 'business_id' and 'categories'
    
    Output:
    a matrix containing '0' or '1' in each cell.
    1: the movie has the genre
    0: the movie does not have the genre
    """
    return df.pivot_table(index = 'business_id', columns = 'categories', aggfunc = 'size', fill_value=0)

In [114]:
def create_similarity_matrix_categories(matrix):
    """Create a similarity matrix for the categories."""
    npu = matrix.values
    m1 = npu @ npu.T
    diag = np.diag(m1)
    m2 = m1 / diag
    m3 = np.minimum(m2, m2.T)
    return pd.DataFrame(m3, index = matrix.index, columns = matrix.index)

In [115]:
def predict_ids(similarity, utility, userId, itemId):
    # select right series from matrices and compute
    if userId in utility.columns and itemId in similarity.index:
        return predict_vectors(utility.loc[:,userId], similarity[itemId])
    return 0

In [117]:
def predict_vectors(user_ratings, similarities):
    # select only movies actually rated by user
    relevant_ratings = user_ratings.dropna()
    
    # select corresponding similairties
    similarities_s = similarities[relevant_ratings.index]
    
    # select neighborhood
    similarities_s = similarities_s[similarities_s > 0.0]
    relevant_ratings = relevant_ratings[similarities_s.index]
    
    # if there's nothing left return a prediction of 0
    norm = similarities_s.sum()
    if(norm == 0):
        return 0
    
    # compute a weighted average (i.e. neighborhood is all) 
    return np.dot(relevant_ratings, similarities_s)/norm

In [118]:
def predict_ratings_content(similarity, utility, to_predict):
    """Predicts the predicted rating for the input test data.
    
    Arguments:
    similarity -- a dataFrame that describes the similarity between items
    utility    -- a dataFrame that contains a rating for each user (columns) and each movie (rows). 
                  If a user did not rate an item the value np.nan is assumed. 
    to_predict -- A dataFrame containing at least the columns movieId and userId for which to do the predictions
    """
    # copy input (don't overwrite)
    ratings_test_c = to_predict.copy()
    # apply prediction to each row
    ratings_test_c['predicted_rating'] = to_predict.apply(lambda row: predict_ids(similarity, utility, row['user_id'], row['business_id']), axis=1)
    return ratings_test_c

# Uitvoeren recommender system

In [None]:
cityname = 'ajax'

In [None]:
# splits de data in training en test set
ratings = ratings(city=cityname)
ratings, ratings_test = split_data(ratings)

## User based

In [None]:
utility_user = utility_user(ratings)
centered_utility_user = mean_center_columns(utility_user)
similarity_user = create_similarity_matrix_cosine(centered_utility_user)
predicted_user_based = predict_ratings_user_based(similarity_user, utility_user, ratings_test).dropna()

In [None]:
mse(predicted_user_based)

## Item based

In [None]:
utility_item = utility_user.T
centered_utility_item = mean_center_columns(utility_item)
similarity_item = create_similarity_matrix_cosine(centered_utility_item)
predicted_item_based = predict_ratings_item_based(similarity_item, utility_item, ratings_test).dropna()

In [None]:
mse(predicted_item_based)

## Content based

In [None]:
businesses = df_businesses_categorie(cityname)
df_categories = extract_categories(businesses)
utility_content = pivot_categories(df_categories)
similarity_content = create_similarity_matrix_categories(utility_content)
predicted_ratings_content = predict_ratings_content(similarity_content, utility_item, ratings_test)
predicted_ratings_content = predicted_ratings_content[predicted_ratings_content['predicted_rating'] > 0]

In [None]:
mse(predicted_ratings_content)

## Gecombineerd

In [None]:
ratings_test['predicted_rating'] = ((predicted_ratings_content['predicted_rating'] + predicted_item_based['predicted_rating'] + predicted_user_based['predicted_rating']) / 3)
ratings_test = ratings_test.dropna()
mse(ratings_test)

In [None]:
ratings_test['predicted_rating'] = ((predicted_ratings_content['predicted_rating'] + predicted_item_based['predicted_rating']) / 2)
ratings_test = ratings_test.dropna()
mse(ratings_test)

In [None]:
ratings_test['predicted_rating'] = ((predicted_ratings_content['predicted_rating'] + predicted_user_based['predicted_rating']) / 2)
ratings_test = ratings_test.dropna()
mse(ratings_test)

In [None]:
ratings_test['predicted_rating'] = ((predicted_item_based['predicted_rating'] + predicted_user_based['predicted_rating']) / 2)
ratings_test = ratings_test.dropna()
mse(ratings_test)

In [None]:
# use the mean rating of the trainingset as the predicted rating and calculate the mse of it
mean_ratings = ratings['stars'].mean()
ratings_test_e = ratings_test[['user_id', 'business_id', 'stars']].copy()
ratings_test_e['predicted_rating'] = [mean_ratings for index in ratings_test_e.index]
mse(ratings_test_e)

# Recommend functie

In [119]:
def recommend(user_id=None, business_id=None, city=None, n=10):
    """
    Returns n recommendations as a list of dicts.
    Optionally takes in a user_id, business_id and/or city.
    A recommendation is a dictionary in the form of:
        {
            business_id:str
            stars:str
            name:str
            city:str
            adress:str
        }
    """
    if not city:
        # selecteer alle steden waar een gebruiker een bedrijf gereviewd heeft
        city = get_city(user_id)
        
    ratings_ = ratings(city)
    
    # user based
    utility_user_ = utility_user(ratings_)
    centered_utility_user_ = mean_center_columns(utility_user_)
    similarity_user_ = create_similarity_matrix_cosine(centered_utility_user_)

    # item based
    utility_item_ = utility_user_.T
    centered_utility_item_ = mean_center_columns(utility_item_)
    similarity_item_ = create_similarity_matrix_cosine(centered_utility_item_)

    # conent based
    businesses_ = df_businesses_categorie(city)
    df_categories_ = extract_categories(businesses_)
    utility_content_ = pivot_categories(df_categories_)
    similarity_content_ = create_similarity_matrix_categories(utility_content_)
    
    # create DataFrame with the user and all cities you want the predicted rating from
    df_ratings_user = pd.DataFrame(BUSINESSES[city], columns=['business_id'])
    df_ratings_user['user_id'] = user_id
    
    predicted_user = predict_ratings_user_based(similarity_user_, utility_user_, df_ratings_user).dropna()
    predicted_item = predict_ratings_item_based(similarity_item_, utility_item_, df_ratings_user).dropna()
    predicted_content = predict_ratings_content(similarity_content_, utility_item_, df_ratings_user)
    predicted_content = predicted_content[predicted_content['predicted_rating'] > 0]

    df_ratings_user['predicted_rating'] = ((predicted_item['predicted_rating'] + predicted_user['predicted_rating'] + predicted_content['predicted_rating']) / 3)

    df_ratings_user = df_ratings_user.dropna().sort_values(by='predicted_rating', ascending=False)
    
    return df_ratings_user[:10]

recommend(user_id='DRlIsW15Zn2qwYdOuhHlsg', city='westlake')

In [120]:
recommend(user_id='7LCG3o2KW2jgKgbKN0DQOg')

Unnamed: 0,business_id,user_id,predicted_rating
107,lvMDy7xL-hpkLUpRJUnJwQ,7LCG3o2KW2jgKgbKN0DQOg,4.757576
77,hSporfb8IjTQaw_9ytHrFw,7LCG3o2KW2jgKgbKN0DQOg,4.333333
5,gQNeEQVB5aBmQM-K2aBxBQ,7LCG3o2KW2jgKgbKN0DQOg,4.166667
66,oUS-cKFK8ffdzyf4HplXpQ,7LCG3o2KW2jgKgbKN0DQOg,4.052218
403,-tSTLaafhkQ7iB5Bl5zgPg,7LCG3o2KW2jgKgbKN0DQOg,4.039903
29,Uyvu1gvRreo2e-p9T3HHxQ,7LCG3o2KW2jgKgbKN0DQOg,4.024602
41,X1RLcu527EkR6lDMffI2LA,7LCG3o2KW2jgKgbKN0DQOg,4.020202
73,KDpgTDtgqUqrFmUTrCWUtA,7LCG3o2KW2jgKgbKN0DQOg,4.009012
170,MRjbo3l3kY_AL4NlVIiZGg,7LCG3o2KW2jgKgbKN0DQOg,4.0
302,Qw45ZqhBR0VI5_cI60SgeQ,7LCG3o2KW2jgKgbKN0DQOg,3.970131
