In [0]:
# Import required packages:
import pandas as pd
import numpy as np
from google.colab import files, drive
import time

In [0]:
# Connect to Google Drive (to load raw data)
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

## Define Required Functions

In [0]:
def ranking(train_set, prediction_set):
  """
  Input: train set must have these 5 columns: 'review_id,''user_id','business_id','rating','date'
         prediction_set must have these 6 columns: 'review_id,,'user_id','business_id','rating','date' ,'prediction'

  Output: a dataframe of 10 columns:'review_id, 'user_id','business_id','rating','date' ,'prediction', prediction2, true_rank, pred_rank, number_reviews
  """
  #concat train and last_review dataframes
  train_set = train_set.append(prediction_set[['review_id','user_id','business_id','rating' ,'prediction']].copy(), sort=True)

  #create prediction2 column
  train_set['prediction2'] = np.where(train_set['prediction'].notna(), train_set['prediction'], train_set['rating'])

  #now by user, build user's ranking
  grouped = train_set.groupby("user_id")
  train_set['true_rank'] = grouped['rating'].rank(method='average')
  train_set['pred_rank'] = grouped['prediction2'].rank(method='average')
  train_set['number_reviews'] = grouped['rating'].transform('count')

  #return 
  ##keep only predictions ranking
  pred_ranking = train_set[train_set.prediction.notna()]

  return pred_ranking

def coverage(train_set, prediction_set):
  """
  Input:train set must have these 4 columns: 'user_id','business_id','rating','date'
        prediction_set must have these 5 columns: 'user_id','business_id','rating','date' ,'prediction'

  Output: -Ranking Dataframe (same from 'ranking' function) with an extra column named 'Coverage' indicating wheter the predictions from prediction set
          are part of the coverage or not.
  """
  model_ranking = ranking(train_set, prediction_set) #ranking function
  diff = (model_ranking['true_rank'] - model_ranking['pred_rank'])/model_ranking['number_reviews']

  model_ranking['coverage'] = np.where(np.abs(diff)<0.25, 1.0,0.0)  #1: good recom. 0: bad recom.
  global_coverage = np.round(model_ranking.coverage.agg(sum)/model_ranking.user_id.size, 4)
  
  #return dataframe and global user coverage value
  return (model_ranking, global_coverage*100)

def csv_loader(csv_key, csv_value, dict_of_df):
  csv_name = csv_key + '.csv'
  downloaded = drive.CreateFile({'id': csv_value})
  downloaded.GetContentFile(csv_name)
  dict_of_df[csv_key] = pd.read_csv(csv_name, low_memory=False)
  print("Downloaded: ", csv_key)

def upload_csv_to_drive(destination_folder, dataframe, csv_filename):
  dataframe.to_csv(csv_filename, index=False)
  tmp = drive.CreateFile({"parents": [{"kind": "drive#fileLink", "id": destination_folder}]})
  tmp.SetContentFile(csv_filename)
  tmp.Upload()
  print("Upload complete for: ", csv_filename)

# TODO: Create wrapper for methods where you calculate metrics by a DIMENSION

## Import All CSVs
1.  Test CSVs (the test results of each of your models)
2.  Two additional CSVs used for computing metics (training data and 'categorization')

In [4]:
# 1) Test data
test_csv_files = {
  'bias_baseline': '1whL3GlNUTGmvJSqgBcftgLro_Iyc4OMX',
  'cf_baseline': '1yx7CbvemO_KRIXJKrF0gV4ryWhZlws4G',
  'deep_learning': '17ncEfcXPV3srLyulf9lfwvvMyJ_ptjIz',
}

dataframes = {}
for key, value in test_csv_files.items():
  csv_loader(key, value, dataframes)

Downloaded:  bias_baseline
Downloaded:  cf_baseline
Downloaded:  deep_learning


In [0]:
# Quick debugging: new input CSVs (e.g. bias_baseline) have an additional
# column, which was causing problems. Drop it for the time being.
dataframes['bias_baseline'] = dataframes['bias_baseline'].drop(['rating'], axis = 1)
dataframes['deep_learning'] = dataframes['deep_learning'].drop(['rating'], axis = 1)

In [0]:
# Deep learning is actually 4 models. Split these into 4 Dataframes:
dataframes['deep_learning_m1f1'] = dataframes['deep_learning'][['review_id','user_id','business_id','prediction_m1f1']].rename(columns={'prediction_m1f1':'prediction'})
dataframes['deep_learning_m1f2'] = dataframes['deep_learning'][['review_id','user_id','business_id','prediction_m1f2']].rename(columns={'prediction_m1f2':'prediction'})
dataframes['deep_learning_m2f1'] = dataframes['deep_learning'][['review_id','user_id','business_id','prediction_m2f1']].rename(columns={'prediction_m2f1':'prediction'})
dataframes['deep_learning_m2f2'] = dataframes['deep_learning'][['review_id','user_id','business_id','prediction_m2f2']].rename(columns={'prediction_m2f2':'prediction'})
del dataframes['deep_learning']

In [8]:
# 2) Metadata
metadata_files = {
  # 'train_set_new_og': '1-1vfc5jxrCggpVYDkf8Z2rVLjIIOh6bu',
  'categorization': '1sBtQbasU-skHl2SZHI_qUi-Lf4Co1us5',
  'train_set_new': '1u-6hcK4B0EncGsu0dKmS9Uc3eykS2Ref',
}

metadata = {}
for key, value in metadata_files.items():
  csv_loader(key, value, metadata)

train_full = metadata['train_set_new']
categorization = metadata['categorization'].drop(['user_id', 'business_id'], axis=1).rename(columns={'stars': 'rating'})

Downloaded:  categorization
Downloaded:  train_set_new


## Build Table 1 for final report 
##### You can find it here: https://drive.google.com/drive/u/2/folders/12pKbs8ptxpPaqaMmArivSqXQ58uv3iV0

In [9]:
# This loop will calculate all metrics for all models and save it to a dataframe:

model_list, rmse_list, coverage_list = [], [], []

for row in dataframes:
  print("Working on", row)
  model_list.append(row)
  
  # Get the necessary DFs: 
    # 1) test results (test_df), 
    # 2) augmented test results with actual ratings and categories (test_df_cat), 
  test_df = dataframes[row]
  test_df_cat = pd.merge(test_df, categorization, on='review_id', how='inner')
  
  # Compute RMSE:
  test_df_cat['squared_error'] = (test_df_cat['rating'] - test_df_cat['prediction'])**2
  model_rmse = np.sqrt(test_df_cat['squared_error'].mean())
  rmse_list.append(model_rmse)
  
  # Compute Coverage:
  rank, cov = coverage(
      train_full[['review_id','user_id','business_id','rating']], 
      test_df_cat[['review_id','user_id','business_id','rating','prediction']])
  coverage_list.append(cov)

  # Compute Spearman: REPLACE THIS WITH NEW METRIC

  # Clean up:
  del test_df, test_df_cat, rank

# Build your dataframe for the report and export to CSV:
table_1 = pd.DataFrame({'Model': model_list, 'RMSE': rmse_list, 'Coverage': coverage_list})
upload_csv_to_drive("12pKbs8ptxpPaqaMmArivSqXQ58uv3iV0", table_1, "table_1.csv")

Working on bias_baseline
Working on cf_baseline
Working on deep_learning_m1f1
Working on deep_learning_m1f2
Working on deep_learning_m2f1
Working on deep_learning_m2f2
Upload complete for:  table_1.csv


In [10]:
table_1

Unnamed: 0,Model,RMSE,Coverage
0,bias_baseline,1.391587,48.0
1,cf_baseline,1.336046,49.96
2,deep_learning_m1f1,1.936393,55.4
3,deep_learning_m1f2,1.932867,55.43
4,deep_learning_m2f1,1.514599,41.64
5,deep_learning_m2f2,1.514133,41.53


## Build Table 2/3 for final report

#### Methods for Metrics:

In [0]:
def calculate_RMSE_by_group(input_mat, model_name, metric_name, grouper):
  """
  Inputs:
  1. Ratings matrix with columns 'rating' and 'prediction'
  2. A column by which to group your metrics (any categorical column in 'categorization' matrix)
  3. A model name (e.g. 'CF_Baseline')
  4. A metric name (e.g. 'RMSE')
  Output: A long-form dataframe reporting RMSE by model, metric, and variable group.
  """
  # Calculate RMSE
  input_mat['squared_error'] = (input_mat['rating'] - input_mat['prediction'])**2
  test_df_grouped = pd.DataFrame(input_mat.groupby(grouper)['squared_error'].agg('mean')).reset_index()
  test_df_grouped['metric_value'] = np.sqrt(test_df_grouped['squared_error'])
  # Append metric and model columns
  test_df_grouped = pd.merge(pd.DataFrame({'metric': [metric_name] * len(test_df_grouped)}), test_df_grouped, left_index=True, right_index=True)
  test_df_grouped = pd.merge(pd.DataFrame({'model': [model_name] * len(test_df_grouped)}), test_df_grouped, left_index=True, right_index=True)
  test_df_grouped = test_df_grouped.drop(['squared_error'], axis=1)
  
  return test_df_grouped

def calculate_coverage_by_group(input_mat, model_name, metric_name, grouper):
  """
  Inputs:
  1. As input, must use the output of the method "ranking" defined above.
  2. A column by which to group your metrics (any categorical column in 'categorization' matrix)
  3. A model name (e.g. 'CF_Baseline')
  4. A metric name (e.g. 'RMSE')
  Output: A long-form dataframe reporting coverage by model, metric, and variable group.
  """
  # Join it to categorization to get categories for grouping:
  model_ranking_cat = pd.merge(input_mat, categorization, on='review_id', how='inner')

  # Calculate coverage:
  diff = (model_ranking_cat['true_rank'] - model_ranking_cat['pred_rank']) / model_ranking_cat['number_reviews']
  model_ranking_cat['metric_value'] = np.where(np.abs(diff)<0.25, 1.0,0.0)  #1: good recom. 0: bad recom.
  test_df_grouped = pd.DataFrame(model_ranking_cat.groupby(grouper)['metric_value'].agg('mean')).reset_index()
  test_df_grouped['metric_value'] = round(100.0 * test_df_grouped['metric_value'], 2)

  # Append metric and model columns:
  test_df_grouped = pd.merge(pd.DataFrame({'metric': [metric_name] * len(test_df_grouped)}), test_df_grouped, left_index=True, right_index=True)
  test_df_grouped = pd.merge(pd.DataFrame({'model': [model_name] * len(test_df_grouped)}), test_df_grouped, left_index=True, right_index=True)
  
  return test_df_grouped 

In [12]:
# This loop will calculate all metrics for all models and save it to a dataframe:

grouper_values = ['is_open','business_popularity_bin','user_activity_bin'] # do a subset of cities later

for grouper_value in grouper_values:
  table_2 = pd.DataFrame()

  for row in dataframes:
    print("Working on", row)
    model_list.append(row)
    
    # Get the necessary DFs: 
      # 1) test results (test_df), 
      # 2) augmented test results with categories (test_df_cat), 
    test_df = dataframes[row]
    test_df_cat = pd.merge(test_df, categorization, on='review_id', how='inner')
    
    # Compute RMSE by grouping (TODO: iterate over these groupings):
    table_2 = table_2.append(calculate_RMSE_by_group(test_df_cat, row, 'RMSE', grouper_value))
    
    # Compute Coverage by grouping:
    # Take the ranking (this is the precursor to coverage): 
    model_ranking = ranking(
      train_full[['review_id','user_id','business_id','rating']], 
      test_df_cat[['review_id','user_id','business_id','rating','prediction']])
    table_2 = table_2.append(calculate_coverage_by_group(model_ranking, row, 'coverage', grouper_value))

    # Compute Spearman: REPLACE THIS WITH NEW METRIC
    # Clean up:
    del test_df, test_df_cat, model_ranking

  # Build your dataframe for the report:
  table_2 = table_2.pivot_table(
    values = 'metric_value',
    index = ['model', 'metric'],
    columns = grouper_value).reset_index()

  # Upload as named CSV
  file_name = grouper_value + '_table_2.csv'
  upload_csv_to_drive('12pKbs8ptxpPaqaMmArivSqXQ58uv3iV0', table_2, file_name)

Working on bias_baseline
Working on cf_baseline
Working on deep_learning_m1f1
Working on deep_learning_m1f2
Working on deep_learning_m2f1
Working on deep_learning_m2f2
Upload complete for:  is_open_table_2.csv
Working on bias_baseline
Working on cf_baseline
Working on deep_learning_m1f1
Working on deep_learning_m1f2
Working on deep_learning_m2f1
Working on deep_learning_m2f2
Upload complete for:  business_popularity_bin_table_2.csv
Working on bias_baseline
Working on cf_baseline
Working on deep_learning_m1f1
Working on deep_learning_m1f2
Working on deep_learning_m2f1
Working on deep_learning_m2f2
Upload complete for:  user_activity_bin_table_2.csv


In [13]:
table_2

user_activity_bin,model,metric,high,low,medium
0,bias_baseline,RMSE,1.282249,1.437495,1.356632
1,bias_baseline,coverage,51.88,47.64,46.31
2,cf_baseline,RMSE,1.272532,1.357752,1.327999
3,cf_baseline,coverage,52.11,50.25,47.93
4,deep_learning_m1f1,RMSE,1.794966,1.963291,1.964366
5,deep_learning_m1f1,coverage,49.23,57.83,54.05
6,deep_learning_m1f2,RMSE,1.787742,1.96202,1.958074
7,deep_learning_m1f2,coverage,49.2,57.86,54.1
8,deep_learning_m2f1,RMSE,1.443523,1.532555,1.519637
9,deep_learning_m2f1,coverage,39.57,43.45,39.01


# ----------------------------------------------------
## Practice, ignore:

In [0]:
# test_df = dataframes['cf_baseline']
# test_df_cat = pd.merge(test_df, categorization, on='review_id', how='inner')

In [0]:
# print(test_df.shape)
# print(train_full.shape)
# print(categorization.shape)
# test_df_cat.head()

In [0]:
# # Coverage
# def calculate_RMSE_by_group(input_mat, model_name, metric_name, grouper):
#   """
#   Inputs:
#   1. Ratings matrix with columns 'rating' and 'prediction'
#   2. A column by which to group your metrics (any categorical column in 'categorization' matrix)
#   3. A model name (e.g. 'CF_Baseline')
#   4. A metric name (e.g. 'RMSE')
#   """
#   # Calculate RMSE
#   input_mat['squared_error'] = (input_mat['rating'] - input_mat['prediction'])**2
#   test_df_grouped = pd.DataFrame(input_mat.groupby(grouper)['squared_error'].agg('mean')).reset_index()
#   test_df_grouped['metric_value'] = np.sqrt(test_df_grouped['squared_error'])
#   # Append metric and model columns
#   test_df_grouped = pd.merge(pd.DataFrame({'metric': [metric_name] * len(test_df_grouped)}), test_df_grouped, left_index=True, right_index=True)
#   test_df_grouped = pd.merge(pd.DataFrame({'model': [model_name] * len(test_df_grouped)}), test_df_grouped, left_index=True, right_index=True)
#   test_df_grouped = test_df_grouped.drop(['squared_error'], axis=1)
  
#   return test_df_grouped


# # Take the ranking (this is the precursor to coverage): 
# model_ranking = ranking(
#     train_full[['review_id','user_id','business_id','rating']], 
#     test_df_cat[['review_id','user_id','business_id','rating','prediction']])

# calculate_coverage_by_group(model_ranking, 'some_model', 'coverage', 'user_activity_bin')

In [0]:
# test_df_grouped

In [0]:
# csv_files = {
#   'train_set_new': '1-1vfc5jxrCggpVYDkf8Z2rVLjIIOh6bu'
# }

# dfs = {}

# for key, value in csv_files.items():
#   csv_name = key + '.csv'
#   downloaded = drive.CreateFile({'id': value})
#   downloaded.GetContentFile(csv_name)
#   dfs[key] = pd.read_csv(csv_name, low_memory=False)
#   print("Done with: ", key)

# train_full = dfs['train_set_new']

In [0]:
# full_df_cat.user_id.value_counts()

In [0]:
# # Try ranking: 
# r = ranking(
#     train_full[['review_id','user_id','business_id','rating']], 
#     test_df_cat[['review_id','user_id','business_id','rating','prediction']])

# r.head()

In [0]:
# # Try coverage:
# rank, cov = coverage(
#     train_full[['review_id','user_id','business_id','rating']], 
#     test_df_cat[['review_id','user_id','business_id','rating','prediction']])

# cov # 1.24 doesn't seem right; it should be between 0-1, right?

In [0]:
# # User rank to test spearman:
# print(rank.shape)
# print(len(rank['user_id'].unique()))
# rank.head()