# Coffee Shop Collaborative Filtering Recommendation Engine

User ratings are randomly generated and quite sparse so search results aren't too meaningful, however could be replaced with real data (with more users, shops, and ratings) for more interpretable results. Nevertheless, the results of the model seem quite good.

Search parameters include:
- Price range
- Coffee shop district
- Charger availability
- Wifi availability
- Newness (whether the user wants to go to a shop they've already rated, or a brand new one)

### Imports

In [1]:
from surprise import SVD, SVDpp, NMF
from surprise import Dataset
from surprise import BaselineOnly
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import KNNWithMeans
from surprise import accuracy
import pandas as pd
import numpy as np
import pprint

pd.set_option('display.max_rows', 200)

### Data Import & Preparation

In [2]:
# this is just a random set of coffee shops that i generated
shop_filepath = r'./iid_properties.csv'

# this is just a random set of user ratings that i generated
ratings_filepath = r'./uid_iid_ratings.csv'

shop_properties = pd.read_csv(shop_filepath, header=0)
shop_properties['iid'] = shop_properties['iid'].astype(str)

user_ratings = pd.read_csv(ratings_filepath, header=0)
user_ratings['iid'] = user_ratings['iid'].astype(str)

reader = Reader(rating_scale=(1, 3))

overall_data = Dataset.load_from_df(user_ratings[['uid', 'iid', 'score']], reader)
vibes_data = Dataset.load_from_df(user_ratings[['uid', 'iid', 'vibes']], reader)
coffee_data = Dataset.load_from_df(user_ratings[['uid', 'iid', 'coffee']], reader)

trainset_overall = overall_data.build_full_trainset()
trainset_vibes = vibes_data.build_full_trainset()
trainset_coffee = coffee_data.build_full_trainset()

average_ratings = user_ratings.groupby('iid')['score'].mean().reset_index().rename(columns={'score':'mean'})
average_ratings['mean'] = average_ratings['mean'].apply(lambda x: round(x, 2))

average_vibes = user_ratings.groupby('iid')['vibes'].mean().reset_index()
average_vibes['vibes'] = average_vibes['vibes'].apply(lambda x: round(x, 2))

average_coffee = user_ratings.groupby('iid')['coffee'].mean().reset_index()
average_coffee['coffee'] = average_coffee['coffee'].apply(lambda x: round(x, 2))

print('Number of users: ', trainset_overall.n_users, '\n')
print('Number of shops: ', trainset_overall.n_items, '\n')

Number of users:  138 

Number of shops:  130 



In [3]:
shop_properties.head()

Unnamed: 0,iid,district,price,charger,wifi
0,135085,1,3,0,1
1,135038,3,1,0,2
2,132825,5,1,0,0
3,135060,3,1,2,1
4,135104,4,3,2,2


In [4]:
user_ratings.head()

Unnamed: 0,uid,iid,vibes,coffee,score
0,U1077,135085,5,5,5.0
1,U1077,135038,3,2,2.5
2,U1077,132825,4,1,2.5
3,U1077,135060,2,2,2.0
4,U1068,135104,5,1,3.0


# Algorithms

In [5]:
cosine_shop_sim= {'name':'cosine', 'user_based':False, 'min_support':3}
cosine_user_sim= {'name':'cosine', 'user_based':True, 'min_support':3}
pearson_baseline_shop_sim = {'name':'pearson_baseline', 'user_based':True, 'min_support':3, 'shrinkage':100}
pearson_baseline_user_sim = {'name':'pearson_baseline', 'user_based':True, 'min_support':3, 'shrinkage':100}

bsl_options = {'method': 'sgd'}

my_k = 15
my_min_k = 5

knn_pearson_baseline_shop_algo = KNNWithMeans(k = my_k, min_k = my_min_k, sim_options = pearson_baseline_shop_sim, bsl_options = bsl_options)
knn_pearson_baseline_user_algo = KNNWithMeans(k = my_k, min_k = my_min_k, sim_options = pearson_baseline_user_sim, bsl_options = bsl_options)
svd_algo = SVD()
nmf_algo = NMF()

vibes_knn_cosine_shop_algo = KNNWithMeans(k = my_k, min_k = my_min_k, sim_options = cosine_shop_sim)
vibes_knn_cosine_user_algo = KNNWithMeans(k = my_k, min_k = my_min_k, sim_options = cosine_user_sim)
vibes_nmf_algo = NMF()

coffee_knn_cosine_shop_algo = KNNWithMeans(k = my_k, min_k = my_min_k, sim_options = cosine_shop_sim)
coffee_knn_cosine_user_algo = KNNWithMeans(k = my_k, min_k = my_min_k, sim_options = cosine_user_sim)
coffee_nmf_algo = NMF()

# Model Fitting

In [6]:
knn_pearson_baseline_shop_algo.fit(trainset_overall)
knn_pearson_baseline_user_algo.fit(trainset_overall)
svd_algo.fit(trainset_overall)
nmf_algo.fit(trainset_overall)

coffee_knn_cosine_shop_algo.fit(trainset_coffee)
coffee_knn_cosine_user_algo.fit(trainset_coffee)
coffee_nmf_algo.fit(trainset_coffee)

vibes_knn_cosine_shop_algo.fit(trainset_vibes)
vibes_knn_cosine_user_algo.fit(trainset_vibes)
vibes_nmf_algo.fit(trainset_vibes)

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.matrix_factorization.NMF at 0x1180212d0>

In [7]:
shop_properties

Unnamed: 0,iid,district,price,charger,wifi
0,135085,1,3,0,1
1,135038,3,1,0,2
2,132825,5,1,0,0
3,135060,3,1,2,1
4,135104,4,3,2,2
5,132740,3,2,0,0
6,132663,1,1,2,2
7,132732,2,2,1,1
8,132630,4,1,2,1
9,132584,3,1,0,1


# Prediction Function

In [8]:
def get_prediction(uid, district, price, charger, wifi, newness):
    
    shops_filtered = list(shop_properties[(shop_properties.district.isin(district)) & \
                                          (shop_properties.price.isin(price)) & \
                                          (shop_properties.charger >= charger) & \
                                          (shop_properties.wifi >= wifi)]['iid'])

    if len(shops_filtered) == 0:
        return "No Coffee Shops Meet Your Criteria"
    
    user_list = [uid]*len(shops_filtered)
    
    df = pd.DataFrame(list(zip(user_list, shops_filtered)), columns=['uid','iid'])
    
    df = df.merge(shop_properties, left_on='iid', right_on='iid')
    df = df.merge(user_ratings, how='left', on=['uid','iid'])
    df = df.merge(average_ratings, how='left', on=['iid'])
    
    if newness == 1:
        df = df[df.score != df.score]
    if newness == -1:
        df = df[df.score == df.score]
    
    if df.empty == True:
        return "No Coffee Shops Meet Your Criteria"
    
    def get_knn_pearson_baseline_shop(row):
        return knn_pearson_baseline_shop_algo.predict(uid=row['uid'],iid=str(row['iid'])).est
    def get_knn_pearson_baseline_user(row):
        return knn_pearson_baseline_user_algo.predict(uid=row['uid'],iid=str(row['iid'])).est
    def get_svd(row):
        return svd_algo.predict(uid=row['uid'],iid=str(row['iid'])).est
    def get_nmf(row):
        return nmf_algo.predict(uid=row['uid'],iid=str(row['iid'])).est
    def get_vibes_mean(row):
        return(user_ratings[user_ratings['iid']==row['iid']]['vibes'].mean())
    def get_vibes_knn_cosine_shop(row):
        return vibes_knn_cosine_shop_algo.predict(uid=row['uid'],iid=str(row['iid'])).est
    def get_vibes_knn_cosine_user(row):
        return vibes_knn_cosine_user_algo.predict(uid=row['uid'],iid=str(row['iid'])).est
    def get_vibes_nmf(row):
        return vibes_nmf_algo.predict(uid=row['uid'],iid=str(row['iid'])).est
    def get_coffee_mean(row):
        return(user_ratings[user_ratings['iid']==row['iid']]['coffee'].mean())
    def get_coffee_knn_cosine_shop(row):
        return coffee_knn_cosine_shop_algo.predict(uid=row['uid'],iid=str(row['iid'])).est
    def get_coffee_knn_cosine_user(row):
        return coffee_knn_cosine_user_algo.predict(uid=row['uid'],iid=str(row['iid'])).est
    def get_coffee_nmf(row):
        return coffee_nmf_algo.predict(uid=row['uid'],iid=str(row['iid'])).est
    def get_rating_adjust(row):
        if row['score'] == row['score']:
            return row['score'] - row['mean']
        else:
            return 0

    def get_house_blend(row):
    #25 points allocated to mean rating
        mean = 0.25 * row['mean']

    #40 points allocated to overall rating
        knn_pearson_baseline_shop = 0.1 * row['knn_pearson_baseline_shop']
        knn_pearson_baseline_user = 0.1 * row['knn_pearson_baseline_user']
        svd = 0.1 * row['svd']
        nmf = 0.1 * row['nmf']
        overall = knn_pearson_baseline_shop + knn_pearson_baseline_user + svd + nmf

    #40 points allocated to coffee shop vibes
        vibes_knn_cosine_shop = 0.15 * row['vibes_knn_cosine_shop']
        vibes_knn_cosine_user = 0.15 * row['vibes_knn_cosine_user']
        vibes_nmf = 0.1 * row['vibes_nmf']
        vibes = vibes_knn_cosine_shop + vibes_knn_cosine_user + vibes_nmf

    #40 points allocated to coffee shop coffee
        coffee_knn_cosine_shop = 0.15 * row['coffee_knn_cosine_shop']
        coffee_knn_cosine_user = 0.15 * row['coffee_knn_cosine_user']
        coffee_nmf = 0.1 * row['coffee_nmf']
        coffee = coffee_knn_cosine_shop + coffee_knn_cosine_user + coffee_nmf
        
        rating_adjust = row['rating_adjust']
                
        return round((mean + overall + vibes + coffee + rating_adjust/10)*20,2)
        
    df['rating_adjust'] = df.apply(get_rating_adjust, axis=1)

    df['knn_pearson_baseline_shop'] = df.apply(get_knn_pearson_baseline_shop, axis=1)
    df['knn_pearson_baseline_user'] = df.apply(get_knn_pearson_baseline_user, axis=1)
    df['svd'] = df.apply(get_svd, axis=1)
    df['nmf'] = df.apply(get_nmf, axis=1)

    df['vibes_knn_cosine_shop'] = df.apply(get_vibes_knn_cosine_shop, axis=1)
    df['vibes_knn_cosine_user'] = df.apply(get_vibes_knn_cosine_user, axis=1)
    df['vibes_nmf'] = df.apply(get_nmf, axis=1)

    df['coffee_knn_cosine_shop'] = df.apply(get_coffee_knn_cosine_shop, axis=1)
    df['coffee_knn_cosine_user'] = df.apply(get_coffee_knn_cosine_user, axis=1)
    df['coffee_nmf'] = df.apply(get_nmf, axis=1)

    df['house_blend'] = df.apply(get_house_blend, axis=1)
    
    df = df.rename(columns={'iid':'shop_id','mean':'avg_rating'})
        
    return df[['shop_id','district','price','charger','wifi','avg_rating','house_blend','score']].sort_values(by=['house_blend'],ascending=False)

# Testing

In [9]:
user_id = 'U1077'

### Full Coffee Shop Ranking Recommendations

shop_id = coffee shop id

district = coffee shop district

price = coffee shop price range

charger = coffee shop chargers available

wifi = coffee shop wifi available

avg_rating = coffee shop average rating

house_blend = user-specific coffee shop recommendation system score

score = user rating of coffee shop

In [10]:
a = get_prediction(user_id, district=[1,2,3,4,5], price=[1,2,3], charger=0, wifi=0, newness=0)
pprint.pprint(a.head())

    shop_id  district  price  charger  wifi  avg_rating  house_blend  score
51   134975         5      2        0     0        3.90        91.17    NaN
0    135085         1      3        0     1        3.05        91.15    5.0
106  132847         4      3        1     0        3.86        90.97    NaN
83   135027         2      1        2     2        3.47        90.41    4.0
6    132663         1      1        2     2        3.56        89.47    NaN


### Previously Rated Shops (newness=-1)

In [11]:
a = get_prediction(user_id, district=[1,2,3,4,5], price=[1,2,3], charger=0, wifi=0, newness=-1)
pprint.pprint(a.head())

   shop_id  district  price  charger  wifi  avg_rating  house_blend  score
0   135085         1      3        0     1        3.05        91.15    5.0
83  135027         2      1        2     2        3.47        90.41    4.0
10  132733         4      1        1     0        3.12        87.85    3.5
70  132754         3      2        2     1        3.23        86.76    3.5
14  132660         4      2        0     0        2.85        85.02    3.0


### Never Visited Shops (newness=1)

In [12]:
a = get_prediction(user_id, district=[1,2,3,4,5], price=[1,2,3], charger=0, wifi=0, newness=1)
pprint.pprint(a.head())

    shop_id  district  price  charger  wifi  avg_rating  house_blend  score
51   134975         5      2        0     0        3.90        91.17    NaN
106  132847         4      3        1     0        3.86        90.97    NaN
6    132663         1      1        2     2        3.56        89.47    NaN
96   135055         5      1        1     2        3.46        88.97    NaN
44   135018         4      2        1     1        3.44        88.87    NaN


### Changing District

In [13]:
a = get_prediction(user_id, district=[1], price=[1,2,3], charger=0, wifi=0, newness=1)
pprint.pprint(a.head())

   shop_id  district  price  charger  wifi  avg_rating  house_blend  score
1   132663         1      1        2     2        3.56        89.47    NaN
2   132626         1      3        1     2        3.43        88.82    NaN
7   135021         1      3        0     1        3.31        88.22    NaN
5   135013         1      2        0     1        3.29        88.12    NaN
21  132845         1      3        2     0        3.30        87.74    NaN


In [14]:
a = get_prediction(user_id, district=[2], price=[1,2,3], charger=0, wifi=0, newness=0)
pprint.pprint(a.head())

   shop_id  district  price  charger  wifi  avg_rating  house_blend  score
25  135027         2      1        2     2        3.47        90.41    4.0
32  132870         2      3        0     0        3.33        88.32    NaN
6   132715         2      1        2     1        3.25        87.92    NaN
30  132858         2      2        1     2        3.30        87.82    NaN
0   132732         2      2        1     1        3.19        87.62    NaN


In [15]:
a = get_prediction(user_id, district=[3], price=[1,2,3], charger=0, wifi=0, newness=0)
pprint.pprint(a.head())

   shop_id  district  price  charger  wifi  avg_rating  house_blend  score
2   132740         3      2        0     0        3.16        87.47    NaN
15  132754         3      2        2     1        3.23        86.76    3.5
17  135082         3      1        1     2        3.00        86.67    NaN
19  135033         3      2        1     2        3.06        86.22    NaN
8   135058         3      2        2     1        3.17        86.00    NaN


In [16]:
a = get_prediction(user_id, district=[4], price=[1,2,3], charger=0, wifi=0, newness=0)
pprint.pprint(a.head())

   shop_id  district  price  charger  wifi  avg_rating  house_blend  score
19  132847         4      3        1     0        3.86        90.97    NaN
10  135018         4      2        1     1        3.44        88.87    NaN
22  132885         4      2        2     2        3.30        87.93    NaN
12  135081         4      2        2     0        3.18        87.90    NaN
2   132733         4      1        1     0        3.12        87.85    3.5


In [17]:
a = get_prediction(user_id, district=[5], price=[1,2,3], charger=0, wifi=0, newness=0)
pprint.pprint(a.head())

   shop_id  district  price  charger  wifi  avg_rating  house_blend  score
8   134975         5      2        0     0        3.90        91.17    NaN
14  135055         5      1        1     2        3.46        88.97    NaN
3   135074         5      2        1     0        3.38        88.57    NaN
12  135042         5      2        0     2        3.11        87.36    NaN
11  135048         5      2        2     2        3.25        87.18    NaN


### Changing Price Range

In [18]:
a = get_prediction(user_id, district=[1,2,3,4,5], price=[1], charger=0, wifi=1, newness=0)
pprint.pprint(a.head())

   shop_id  district  price  charger  wifi  avg_rating  house_blend  score
16  135027         2      1        2     2        3.47        90.41    4.0
2   132663         1      1        2     2        3.56        89.47    NaN
21  135055         5      1        1     2        3.46        88.97    NaN
6   132715         2      1        2     1        3.25        87.92    NaN
13  135062         4      1        1     1        3.13        87.65    NaN


In [19]:
a = get_prediction(user_id, district=[1,2,3,4,5], price=[2], charger=0, wifi=1, newness=0)
pprint.pprint(a.head())

   shop_id  district  price  charger  wifi  avg_rating  house_blend  score
8   135018         4      2        1     1        3.44        88.87    NaN
9   135013         1      2        0     1        3.29        88.12    NaN
28  132885         4      2        2     2        3.30        87.93    NaN
27  132858         2      2        1     2        3.30        87.82    NaN
21  135069         4      2        0     1        3.19        87.76    NaN


In [20]:
a = get_prediction(user_id, district=[1,2,3,4,5], price=[3], charger=0, wifi=1, newness=0)
pprint.pprint(a.head())

   shop_id  district  price  charger  wifi  avg_rating  house_blend  score
0   135085         1      3        0     1        3.05        91.15    5.0
2   132626         1      3        1     2        3.43        88.82    NaN
9   135021         1      3        0     1        3.31        88.22    NaN
15  135108         4      3        2     1        3.19        87.77    NaN
5   135019         2      3        0     2        3.15        86.82    NaN
