### Coverage, Serendipity, Ranking Metrics

In [1]:
import pandas as pd
from tqdm import tqdm
import json
import requests
pd.options.mode.chained_assignment = None
import numpy as np

In [3]:
#line_count = len(open("data/review.json").readlines())
line_count = len(open("data/business.json", encoding="utf8").readlines())
business_ids, cities, states, latitudes, longitudes, stars, review_counts = [], [], [], [], [], [], []
with open("data/business.json", encoding="utf8") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        business_ids += [blob["business_id"]]
        cities += [blob["city"]]
        states += [blob["state"]]
        latitudes += [blob["latitude"]]
        longitudes += [blob["longitude"]]
        stars += [blob["stars"]]
        review_counts += [blob["review_count"]]
        
businesses = pd.DataFrame(
    {"business_id": business_ids, "city": cities, "state": states, "latitude": latitudes, "longitude": longitudes, "business_average_stars": stars, "business_review_counts": review_counts }
)

100%|██████████| 192609/192609 [00:03<00:00, 52627.03it/s]


In [4]:
#line_count = len(open("data/review.json").readlines())
line_count = len(open("data/user.json", encoding="utf8").readlines())
users, review_counts, elites, average_stars, friends = [], [], [], [], []
with open("data/user.json", encoding="utf8") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        users += [blob["user_id"]]
        review_counts += [blob["review_count"]]
        elites += [blob["elite"]]
        average_stars += [blob["average_stars"]]
        friends += [blob["friends"]]
        
users = pd.DataFrame(
    {"user_id": users, "user_review_counts": review_counts,"elite": elites, "user_average_stars": average_stars, "friends": friends}
)

100%|██████████| 1637138/1637138 [00:28<00:00, 57974.90it/s]


In [5]:
ratings_holdout = pd.read_csv('data/ratings_sample_holdout_20.csv')

In [10]:
ratings_train = pd.read_csv('data/ratings_sample_train_20.csv')

In [8]:
ratings_val = pd.read_csv('data/ratings_sample_cv_20.csv')

In [11]:
def process(df):
    df = df.drop(df.columns[0], axis =1)
    df['date']  = pd.to_datetime(df['date'])
    df['week_day'] = df['date'].dt.weekday
    df['month'] = df['date'].dt.month
    df['hour'] = df['date'].dt.hour
    df = df.merge(users, on = 'user_id')
    df = df.merge(businesses, on = 'business_id')
    return df

In [12]:
ratings_train = process(ratings_train.copy())
ratings_holdout = process(ratings_holdout.copy())
ratings_val = process(ratings_val.copy())

We can calulate this after hyperparamter tuning. So we combine train + val

In [13]:
ratings_train_final = ratings_train.append(ratings_val)

First subsample a group of users that we will measure these metrics from

Methodology:
    We sample 5 users from each city where the user made the latest review.
    These cities must have at least 100 unique businesses
    These users must also have made a postive review(above their historical average)to those restaurants.
        1. We recommend 10 restaurants to each user
        2. We see if their latest restaurant makes it into the top 10 list (Ranking Metric)
        3. We see for those 10 x 5 recommendations, how many of them are distinct businesses (Coverage)
        4. We see for those top 10 recommendations, how many of them are restaurants they have not visited (Serendipity)
    
    Additionally, we measure what our ranking was for the latest restaurant that the user visited(Ranking Metric
    


Criteria: 

In [14]:
ratings_entire_df = ratings_train.append(ratings_val).append(ratings_holdout)

In [16]:
unique_city_businesses = ratings_entire_df[['city','business_id']].drop_duplicates()
unique_cities = unique_city_businesses.groupby('city').count()['business_id']
unique_cities = unique_cities[unique_cities > 100]
out = pd.DataFrame()
for city in unique_cities.index:
    tmp = ratings_holdout[(ratings_holdout['city'] ==city) &
                              (ratings_holdout['rating'] >ratings_holdout['user_average_stars'])]
    if len(tmp['user_id'].unique())>4:
        
        ###this weird sampling technique is to ensure we dont' sample the same user twice in a same city
        five_users = np.random.choice(tmp['user_id'].unique(),5, replace = False)
        row = tmp[tmp['user_id'].isin(five_users)].groupby('user_id', group_keys=False).apply(lambda df: df.sample(1))
        out = out.append(row)

This is our subsample of users we will measure

In [17]:
all(out.groupby('city').count()['user_id']==5)

True

we merge in every business that user could have been to

In [18]:
predict_df = out[['user_id','city','state']]
predict_df = predict_df.merge(unique_city_businesses, on = 'city')

In [19]:
all(predict_df.groupby('city')['user_id'].nunique()==5)

True

Export this dataset. Everyone should make predictions on this dataset

In [25]:
predict_df.to_csv('data/metric_sample.csv')

In [21]:
predict_df['predictions'] = 2.5

In [28]:
def get_all_metrics(predict_df, validation_subsample, ratings_train_final):
    top_10_recs = predict_df.groupby(['user_id','city'])['predictions'].nlargest(10).reset_index()
    out = validation_subsample
    cnt =0
    serendipity = 0
    
    
    for row in out.iterrows():
        row_values = row[1]
        top_10 = predict_df.loc[top_10_recs[top_10_recs['user_id'] == row_values['user_id']].level_2]['business_id']
        ###In top 10
        if row_values['business_id'] in top_10.values:
            cnt+=1
        user_history = ratings_train_final[ratings_train_final['user_id'] == row_values['user_id']]    
        been_there = [i for i in top_10.values if i in  user_history.business_id.values]
        serendipity += 1-len(been_there)/10
    
    top_10 = cnt/len(out)
    serendipity = serendipity/len(out)
    
    predict_df = predict_df.reset_index()
    
    analysis_df = predict_df.merge(top_10_recs, left_on = ['user_id','city','index'], right_on = ['user_id','city','level_2'])
    
    coverage = (analysis_df.groupby('city')['business_id'].nunique()/50).values.mean()
    
    predict_df['rankings']=predict_df.groupby(['city','user_id'])['predictions'].rank("first",ascending = False)
    running_rankings =0
    for row in out.iterrows():
        row_values = row[1]
        user_recs = predict_df[(predict_df['user_id']==row_values['user_id'])
                            &(predict_df['city']==row_values['city'])
                             & (predict_df['business_id']==row_values['business_id'])
                              ]
        assert len(user_recs)==1
        running_rankings += user_recs['rankings'].sum()

    avg_rank = running_rankings / len(out)
    print(top_10, coverage, serendipity, avg_rank)
    
    return top_10, coverage, serendipity, avg_rank
    

In [31]:
top_10, coverage, serendipity, avg_rank = get_all_metrics(predict_df, out, ratings_train_final)

0.06046511627906977 0.19999999999999996 0.9858139534883719 432.22558139534885
