# MMD 2024, Collaborative Filtering on Google Colab
This notebook sets up the enviroment and runs CF experiments on Google Colab.





In [1]:
# Clone the repository to local runtime
import numpy as np
import scipy.sparse as sp
private = False
if private:
    # Private repository, requires authentication
    from google.colab import userdata
    pat = userdata.get('github_pat')
    project = '24WS-mmd-code-priv'
else:
    pat = ''
    project = '24WS-mmd-code-public'

In [2]:
# !git clone https://{pat}@github.com/aip-hd-tea/{project}.git

In [3]:
# Import the repository code
import sys
sys.path.insert(0,f"/content/{project}")

import data_util as cfd

# After edits of cf_algorithms_to_complete.py:
# 1. Rename the file rec_sys.cf_algorithms_to_complete.py to rec_sys.cf_algorithms.py
# 2. Restart the runtime (Runtime -> Restart the session); possibly not needed
# 3. Swap the comments in the next two lines, so that cf_algorithms is imported as cfa
# import rec_sys.cf_algorithms_to_complete as cfa
import cf_algorithms as cfa
import cf_algorithms_ex3 as cf3
# 4. Re-run all cells
# 5. If your changes are correct, you will see a long
#    printout of recommendations for MovieLens dataset (last cell)

2024-10-28 17:28:47.988770: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-28 17:28:47.991944: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-28 17:28:48.001235: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730132928.017137   15338 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730132928.021739   15338 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-28 17:28:48.044108: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [4]:
# Load or set the configuration
#from rec_sys.cf_config import config

import dataclasses
@dataclasses.dataclass
class config:
    max_rows: int = int(1e5)
    dowload_url: str = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
    download_dir: str = "./content/"
    unzipped_dir: str = download_dir
    file_path: str = download_dir + "ratings.csv"


In [5]:
# Load the MovieLens and Lecture datasets
um_movielens = cfd.get_um_by_name(config, "movielens")
um_lecture = cfd.get_um_by_name(config, "lecture_1")

Dir './content/' already exists, skipping download

### Start reading data from './content/ratings.csv'
Loaded data from './content/ratings.csv', df shape: (100000, 3), size in MB: 1.1444091796875 
Pivoting the data
Utility matrix, df shape: (9786, 757), size in MB: 29.142929077148438 
Final utility matrix (numpy array as np.float32), df shape: (9786, 757), size in MB: 28.25928497314453 


In [6]:

# Rate all items for the lecture toy dataset
all_ratings = cfa.rate_all_items(um_lecture, 4, 2)
print ("all_ratings lecture toy dataset:", all_ratings)


>>> CF computation for UM w/ shape: (6, 6), user_index: 4, neighborhood_size: 2

item_idx: 0, neighbors: [2 5], rating: -0.4010036592543246
item_idx: 1, neighbors: [2 3], rating: 4.27917451131852
all_ratings lecture toy dataset: [-0.4010036592543246, 4.27917451131852, 2.0, 5.0, 4.0, 3.0]


In [7]:
# Rate all items in the toy data set with own function for sparse vectors
all_ratings2 = cf3.rate_all_items(um_lecture, 4, 2)
print ("all_ratings lecture toy dataset:", all_ratings)


>>> CF computation for UM w/ shape: (6, 6), user_index: 4, neighborhood_size: 2

item_idx: 0, neighbors: [2 5], rating: -4.77498019093577
item_idx: 1, neighbors: [2 3], rating: -4.279174511318519
all_ratings lecture toy dataset: [-0.4010036592543246, 4.27917451131852, 2.0, 5.0, 4.0, 3.0]


In [8]:
# Rate all items the MovieLens data
all_ratings_movielens = cfa.rate_all_items(um_movielens, 0, 2)
print("all_ratings_movielens:", all_ratings_movielens)


>>> CF computation for UM w/ shape: (9786, 757), user_index: 0, neighborhood_size: 2

item_idx: 70, neighbors: [755 756], rating: 3.0
item_idx: 71, neighbors: [740 755], rating: 1.6794219017028809
item_idx: 72, neighbors: [755 756], rating: 4.474548816680908
item_idx: 73, neighbors: [752 756], rating: 3.0707926750183105
item_idx: 74, neighbors: [676 747], rating: 4.0
item_idx: 75, neighbors: [693 747], rating: 3.0
item_idx: 76, neighbors: [755 756], rating: 4.0
item_idx: 77, neighbors: [719 747], rating: -1.044062852859497
item_idx: 78, neighbors: [723 756], rating: 2.086301803588867
item_idx: 79, neighbors: [755 756], rating: 4.949097633361816
item_idx: 80, neighbors: [696 704], rating: 3.848302125930786
item_idx: 81, neighbors: [750 753], rating: 3.0
item_idx: 82, neighbors: [755 756], rating: 3.974548816680908
item_idx: 83, neighbors: [753 755], rating: 3.3744473457336426
item_idx: 84, neighbors: [753 756], rating: 2.999999761581421
item_idx: 85, neighbors: [755 756], rating: 2.974

  um_normalized = utility_matrix / norms
  rating_of_item = np.sum(sims_best * orig_utility_matrix[item_index, best_among_who_rated]) / np.sum(np.abs(sims_best))


item_idx: 704, neighbors: [704 739], rating: -0.28693899512290955
item_idx: 705, neighbors: [646 708], rating: 2.890956401824951
item_idx: 706, neighbors: [627 708], rating: 1.4948139190673828
item_idx: 707, neighbors: [704 708], rating: 1.903896450996399
item_idx: 708, neighbors: [722 739], rating: 1.3710031509399414
item_idx: 709, neighbors: [708 739], rating: -4.0458502769470215
item_idx: 710, neighbors: [722 739], rating: 0.2473354935646057
item_idx: 711, neighbors: [739 743], rating: -3.4218456745147705
item_idx: 712, neighbors: [627 708], rating: 1.8944683074951172
item_idx: 713, neighbors: [722 739], rating: -0.5773980617523193
item_idx: 714, neighbors: [739 754], rating: -0.8574036359786987
item_idx: 715, neighbors: [459 547], rating: -0.2152843475341797
item_idx: 716, neighbors: [711 754], rating: 4.650521755218506
item_idx: 717, neighbors: [704 708], rating: 0.9890542030334473
item_idx: 718, neighbors: [627 722], rating: 3.546812057495117
item_idx: 719, neighbors: [708 722], 

In [9]:
# Very very slow. Takes 7 minutes lol
all_ratings_movielens2 = cf3.rate_all_items(um_movielens, 0, 2)
print("all_ratings_movielens:", all_ratings_movielens2)


>>> CF computation for UM w/ shape: (9786, 757), user_index: 0, neighborhood_size: 2



  utility_matrix = utility_matrix.multiply(1/matrix_norms)
  data = np.multiply(ret.data, other[:, ret.col].ravel())


item_idx: 70, neighbors: [755 756], rating: 0.2927297697966007
item_idx: 71, neighbors: [740 755], rating: 3.0820837870787767
item_idx: 72, neighbors: [755 756], rating: 0.664700507211851
item_idx: 73, neighbors: [752 756], rating: 0.09954949072866007
item_idx: 74, neighbors: [676 747], rating: 4.0
item_idx: 75, neighbors: [693 747], rating: 2.9999999999999907
item_idx: 76, neighbors: [755 756], rating: 0.3903063597288009
item_idx: 77, neighbors: [719 747], rating: -1.9440710725326347
item_idx: 78, neighbors: [723 756], rating: 0.7365256128989361
item_idx: 79, neighbors: [755 756], rating: 0.9390946546949008
item_idx: 80, neighbors: [696 704], rating: 2.358838375100903
item_idx: 81, neighbors: [750 753], rating: 3.0
item_idx: 82, neighbors: [755 756], rating: 0.6159122122457509
item_idx: 83, neighbors: [753 755], rating: 3.6525822864449156
item_idx: 84, neighbors: [753 756], rating: 1.173286628286692
item_idx: 85, neighbors: [755 756], rating: 0.5183356223135507
item_idx: 86, neighbors