# Baseline Collaborative Filtering Model

We choose a simple collaborative filtering approach (matrix-factorization SVD; a user-based KNN alternative is left commented out below) in order to get a baseline accuracy score (RMSE) for our prediction task. For this task, we take advantage of the excellent *Surprise* Python package: https://surprise.readthedocs.io/en/stable/index.html.

In [0]:
# Import required packages:
import pandas as pd
import numpy as np
from google.colab import files, drive
import time

# Collaborative filtering packages:
!pip install surprise
from surprise.prediction_algorithms import knns
from surprise.prediction_algorithms import baseline_only
from surprise import SVD
from surprise import Dataset
from surprise import Trainset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import validation
from surprise.model_selection import train_test_split

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/f5/da/b5700d96495fb4f092be497f02492768a3d96a3f4fa2ae7dea46d4081cfa/scikit-surprise-1.1.0.tar.gz (6.4MB)
[K     |████████████████████████████████| 6.5MB 20.7MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.0-cp36-cp36m-linux_x86_64.whl size=1678205 sha256=8e2f49d53d30fee9b4e4276ae6ed8509c3f75e845832bca6d31b43a87774b131
  Stored in directory: /root/.cache/pip/wheels/cc/fa/8c/16c93fccce688ae1bde7d979ff102f7bee980d9cfeb8641bcf
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.0 surprise-0.1


In [0]:
# Connect to Google Drive (to load raw data)
# PyDrive is installed/upgraded quietly first; the client built below is the
# object the later cells use to download the CSVs and upload the predictions.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand PyDrive the application-default credentials instead of running its own auth flow.
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds the name `drive`, which the import cell also brings in
# from google.colab; from here on `drive` is the PyDrive client, not that module.
drive = GoogleDrive(gauth)

### Import training and test sets

In [0]:
# Render floats with two decimal places whenever pandas displays them.
pd.set_option('display.float_format', '{:.2f}'.format)

# Google Drive file IDs of the CSVs to fetch, keyed by dataset name.
csv_files = {
  'train_set_new': '1-1vfc5jxrCggpVYDkf8Z2rVLjIIOh6bu',
  'last_review_test_set': '1GzOIGqDLDPuHqHxCIYo3euAijvw3h7aZ',
}

# Dataset name -> loaded dataframe.
dfs = {}

for dataset_name, drive_file_id in csv_files.items():
  local_csv = f'{dataset_name}.csv'
  # Save the Drive file's content under the local name, then parse it.
  drive.CreateFile({'id': drive_file_id}).GetContentFile(local_csv)
  dfs[dataset_name] = pd.read_csv(local_csv, low_memory=False)
  print("Done with: ", dataset_name)

# The 'date' column is dropped from both sets before any further use.
train = dfs['train_set_new'].drop(columns=['date'])
test_original = dfs['last_review_test_set'].drop(columns=['date'])

Done with:  train_set_new
Done with:  last_review_test_set


In [0]:
# Set the review IDs aside (they are re-attached to the predictions later);
# the frames used from here on carry no 'review_id' column.
train_review_id = train[['review_id']].copy()
train = train.drop(columns=['review_id'])

# `test_original` is left untouched so it still has 'review_id' for the re-join.
test_review_id = test_original[['review_id']].copy()
test = test_original.drop(columns=['review_id'])

In [0]:
# Sanity check: confirm this dataset has not changed relative to other notebooks.
# (Further quick checks, if needed: train.describe(), train.user_id.value_counts(),
#  test.user_id.value_counts().)
for frame in (train, test):
  print(frame.head())

# Print each frame's shape next to the shape of its set-aside review-ID frame.
for frame, review_ids in ((train, train_review_id), (test, test_review_id)):
  print(frame.shape, review_ids.shape)

                  user_id             business_id  rating
0  Spgm6HFWgc4YXJlbhg11Pg  x8O-Mll5ksDpeIgtA0XY-w    4.00
1  VAouiuvywiJid2hnMnw4hA  Noi53T0PWNEN9mQRS3-Ncg    1.00
2  q0zu-FkoAnsXk-th-WQxIw  iQ7tfyfw6lNMre4EIbXpSg    4.00
3  F30hW73J1qOctAv46kznGA  ZOmf-3NN4Z59b2Fw6VAM7g    2.00
4  HpzMECWPO4H1fJihiMG1Pg  i8x1ZPvmdYGu6yH_wvgNkg    4.00
                  user_id             business_id  rating
0  ---1lKK3aKOuomHnwAkAow  Hqs4YNST_ZHbshwyi4bnsQ    5.00
1  --0kuuLmuYBe3Rmu0Iycww  PYe_FDw6QTbTf66WcGE_tw    2.00
2  --2HUmLkcNHZp0xw6AMBPg  KW9RNyBPmc77f9FsO92qYw    5.00
3  --2vR0DIsmQ6WfcSzKWigw  BLIJ-p5wYuAhw6Pp6mh6mw    3.00
4  --3WaS23LcIXtxyFULJHTA  UKrfUw8quQiQM2N9i1nH0g    4.00
(3398090, 3) (3398090, 1)
(286130, 3) (286130, 1)


### Setup for SVD algorithm

In [0]:
# Seed the model's random initialisation so repeated runs report the same RMSE/MAE.
algo_SVD = SVD(random_state=42)
reader = Reader(rating_scale=(1, 5))

# Training data: every row of `train` (columns: user, item, rating, in that order).
train_data = Dataset.load_from_df(train, reader).build_full_trainset()

# Build the test set directly from the frame: one (user, item, rating) tuple per row,
# in row order. Going through build_full_trainset().build_testset() would emit the
# ratings grouped by user instead, and the predictions are later matched back to
# 'review_id' purely by row position, so the original row order must be kept.
test_data = [(uid, iid, float(rating))
             for uid, iid, rating in test.itertuples(index=False, name=None)]

### Setup for KNN algorithm

In [0]:
# Initialize models (neighborhood size is 40 by default for KNN):
# algo_KNN = knns.KNNBasic(sim_options = {'name' : 'cosine'}, verbose = False)

### Train SVD and make predictions on test set

In [0]:
# Time the whole fit-then-predict step.
start_time = time.time()

# SVD: fit on the training set, then score every entry of the test set.
algo_SVD.fit(train_data)
baseline_SVD_test_pred = algo_SVD.test(test_data)

elapsed_minutes = (time.time() - start_time) / 60.0
print("Time to run (minutes): ", elapsed_minutes)

Time to run (minutes):  3.3954473455746967


### Measure RMSE and MAE

In [0]:
# Score the test predictions; RMSE is computed (and reported) before MAE.
RMSE, MAE = [metric(baseline_SVD_test_pred) for metric in (accuracy.rmse, accuracy.mae)]

RMSE: 1.3360
MAE:  1.0846


### Save predictions to CSV (for further analysis in evaluation pipeline)

In [0]:
# Pull the user, item and estimated rating out of every prediction, keeping the
# order in which the predictions were produced.
user_id = [pred.uid for pred in baseline_SVD_test_pred]
business_id = [pred.iid for pred in baseline_SVD_test_pred]
prediction = [pred.est for pred in baseline_SVD_test_pred]

# One row per prediction.
test_results = pd.DataFrame({
  'user_id': user_id,
  'business_id': business_id,
  'prediction': prediction,
})

In [0]:
# Take the 'review_id' column from your original test set and merge it with your test results (which were stripped of 'review_id')

def add_review_id(original_test_dataframe, new_test_dataframe_with_predictions):
  """
  Re-attach 'review_id' to a dataframe of predictions by joining on the row index.

  Input:
    original_test_dataframe: a dataframe with a column named 'review_id'.
    new_test_dataframe_with_predictions: a dataframe containing your test results
      (make sure it has not been re-sorted and that it has the same number of rows
      as original_test_dataframe, because rows are paired purely by index label).
  Output:
    A dataframe holding 'review_id' followed by every column of the predictions
    dataframe. Index labels present in only one input are dropped (inner join);
    a warning is issued when the joined row count differs from the inputs.

  Side effect: prints the middle row of both inputs so the user_id / business_id
  pairing can be checked by eye. Nothing is printed when the result is empty.
  """
  import warnings

  test_review_id = original_test_dataframe[['review_id']]
  test_results_final = pd.merge(test_review_id, new_test_dataframe_with_predictions,
                                left_index=True, right_index=True)

  # An index join silently discards unmatched rows; surface that instead of hiding it.
  expected_rows = max(len(original_test_dataframe), len(new_test_dataframe_with_predictions))
  if len(test_results_final) != expected_rows:
    warnings.warn(
        "add_review_id: joined {} rows but the inputs have {} and {} rows; "
        "the two dataframes are not aligned row-for-row.".format(
            len(test_results_final),
            len(original_test_dataframe),
            len(new_test_dataframe_with_predictions)))

  # With no rows there is nothing to spot-check (and .iloc[0] would raise IndexError).
  if len(test_results_final) == 0:
    return test_results_final

  # Spot-check the row in the middle of the joined result.
  middle_row = len(test_results_final) // 2
  print("Make sure the user_id and business_id columns match: \n", 
        original_test_dataframe.iloc[middle_row], 
        new_test_dataframe_with_predictions.iloc[middle_row])
  return test_results_final

# Example: 'test_original' has column 'review_id'. 'test_results' has 'user_id', 'business_id' and 'prediction' but no 'review_id'.
last_review_pred_CF = add_review_id(test_original, test_results)

Make sure the user_id and business_id columns match: 
 review_id      viSbI71RlY4rb2rqpZPcmQ
user_id        UwaxXAKeecCqBw1bM-e9qw
business_id    HmF1uRRxucvELiPWXXsZPw
rating                           3.00
Name: 143065, dtype: object user_id        UwaxXAKeecCqBw1bM-e9qw
business_id    HmF1uRRxucvELiPWXXsZPw
prediction                       4.65
Name: 143065, dtype: object


In [0]:
# Optimization: to follow best practices, import this function from your other notebook instead of redefining it here:

def upload_csv_to_drive(destination_folder, dataframe, csv_filename):
  """
  Write `dataframe` to a local CSV file and upload that file to Google Drive.

  destination_folder: ID placed in the new Drive file's "parents" entry.
  dataframe: object whose `to_csv` is called (without the index column).
  csv_filename: local path to write; the same file is the upload's content.

  Uses the notebook-level PyDrive client `drive`. Prints a confirmation once the
  upload call returns.
  """
  dataframe.to_csv(csv_filename, index=False)

  parent_link = {"kind": "drive#fileLink", "id": destination_folder}
  drive_file = drive.CreateFile({"parents": [parent_link]})
  drive_file.SetContentFile(csv_filename)
  drive_file.Upload()

  print("Upload complete for: ", csv_filename)

# Save the predictions (with 'review_id' re-attached) as a CSV and upload it under the Drive folder with this ID.
upload_csv_to_drive("19QQWYzHPxr5MMM9iF1WIkF_Io5kYD-Bv", last_review_pred_CF, "cfbaseline_lastreview_predictions.csv")

Upload complete for:  cfbaseline_lastreview_predictions.csv
