# MMD 2024, Collaborative Filtering on Google Colab
This notebook sets up the environment and runs collaborative filtering (CF) experiments on Google Colab.





In [1]:
# Choose which repository to clone into the local runtime

private = False
if not private:
    # Public repository: no access token is used
    pat = ''
    project = '24WS-mmd-code-public'
else:
    # Private repository: read the access token stored under 'github_pat'
    # in the Colab user data
    from google.colab import userdata
    pat = userdata.get('github_pat')
    project = '24WS-mmd-code-priv'

In [2]:
# Clone the selected repository; IPython expands {pat} and {project} from the
# Python variables set in the previous cell (for the public repository `pat` is empty).
# NOTE(review): when `private` is True the token becomes part of the clone URL --
# confirm it does not end up in saved cell output before sharing the notebook.
!git clone https://{pat}@github.com/aip-hd-tea/{project}.git

Cloning into '24WS-mmd-code-public'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 30 (delta 8), reused 27 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (30/30), 8.12 KiB | 8.12 MiB/s, done.
Resolving deltas: 100% (8/8), done.


In [9]:
# Import the repository code.
# `project` must already be defined (it is set in the first cell).
import sys
# Put the cloned repository first on the module search path so that `rec_sys` is importable
sys.path.insert(0,f"/content/{project}")

# Used below as `cfd` to load the utility matrices
import rec_sys.data_util as cfd

# After editing cf_algorithms_to_complete.py:
# 1. Rename the file rec_sys/cf_algorithms_to_complete.py to rec_sys/cf_algorithms.py
# 2. Restart the runtime (Runtime -> Restart the session); possibly not needed
# 3. Make sure that, of the next two lines, only the cf_algorithms import is
#    uncommented, so that cf_algorithms is imported as cfa
# import rec_sys.cf_algorithms_to_complete as cfa
import rec_sys.cf_algorithms as cfa
# 4. Re-run all cells
# 5. If your changes are correct, you will see a long printout of
#    recommendations for the MovieLens dataset (in the cell that rates all items)

In [4]:
# Load or set the configuration
# (alternative: import the configuration shipped with the repository)
#from rec_sys.cf_config import config

import dataclasses


@dataclasses.dataclass
class config:
    """Settings handed to the data-loading helpers for the MovieLens data.

    The derived paths below are computed once, while the class body is
    evaluated, from the default value of `download_dir`.
    """

    # Row limit; presumably the number of rating rows to read -- confirm in rec_sys.data_util
    max_rows: int = 100_000
    # URL of the MovieLens 25M archive.
    # NOTE(review): the attribute name is spelled 'dowload_url'; it is kept as-is
    # because code outside this notebook may read it under this name -- verify before renaming.
    dowload_url: str = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
    # Directory on the Colab runtime that holds the downloaded data
    download_dir: str = "/content/"
    # Directory of the unpacked archive
    unzipped_dir: str = f"{download_dir}ml-25m/"
    # Ratings CSV inside the unpacked archive
    file_path: str = f"{download_dir}ml-25m/ratings.csv"


In [67]:

import importlib

# Re-import rec_sys.cf_algorithms so that edits made since the first import take effect
importlib.reload(cfa)

# Second and third positional arguments of cfa.rate_all_items; the log output
# of the function labels them user_index and neighborhood_size
NEIGHBORHOOD_SIZE = 2
LECTURE_USER_INDEX = 4
MOVIELENS_USER_INDEX = 0

# Utility matrices: MovieLens first, then the lecture toy example
um_movielens = cfd.get_um_by_name(config, "movielens")
um_lecture = cfd.get_um_by_name(config, "lecture_1")

# Lecture toy dataset: rate every item for the chosen user
all_ratings = cfa.rate_all_items(um_lecture, LECTURE_USER_INDEX, NEIGHBORHOOD_SIZE)
print("all_ratings lecture toy dataset:", all_ratings)

# MovieLens data: rate every item for the chosen user
all_ratings_movielens = cfa.rate_all_items(um_movielens, MOVIELENS_USER_INDEX, NEIGHBORHOOD_SIZE)
print("all_ratings_movielens:", all_ratings_movielens)

Dir '/content/ml-25m/' already exists, skipping download

### Start reading data from '/content/ml-25m/ratings.csv'
Loaded data from '/content/ml-25m/ratings.csv', df shape: (100000, 3), size in MB: 1.1444091796875 
Pivoting the data
Utility matrix, df shape: (9786, 757), size in MB: 29.142929077148438 
Final utility matrix (numpy array as np.float32), df shape: (9786, 757), size in MB: 28.25928497314453 

>>> CF computation for UM w/ shape: (6, 6), user_index: 4, neighborhood_size: 2

item_idx: 0, neighbors: [5 2], rating: 2.4024121798152467
item_idx: 1, neighbors: [2 3], rating: 4.796110065157531
all_ratings lecture toy dataset: [2.4024121798152467, 4.796110065157531, 2.0, 5.0, 4.0, 3.0]

>>> CF computation for UM w/ shape: (9786, 757), user_index: 0, neighborhood_size: 2



  um_normalized = utility_matrix / norms


item_idx: 70, neighbors: [645 420], rating: 3.444246530532837
item_idx: 71, neighbors: [439 755], rating: 2.795684576034546
item_idx: 72, neighbors: [ 37 678], rating: 3.358126640319824
item_idx: 73, neighbors: [201 355], rating: 4.143816947937012
item_idx: 74, neighbors: [612 216], rating: 3.310704231262207
item_idx: 75, neighbors: [105  11], rating: 3.9287185668945312
item_idx: 76, neighbors: [ 37 420], rating: 5.083348751068115
item_idx: 77, neighbors: [ 11 565], rating: 4.658747673034668
item_idx: 78, neighbors: [723 565], rating: 3.8134701251983643
item_idx: 79, neighbors: [551], rating: 4.595975875854492
item_idx: 80, neighbors: [565 420], rating: 2.331827163696289
item_idx: 81, neighbors: [456 355], rating: 2.9076056480407715
item_idx: 82, neighbors: [645 420], rating: 4.870625972747803
item_idx: 83, neighbors: [755 645], rating: 3.945624589920044
item_idx: 84, neighbors: [355 420], rating: 2.890001058578491
item_idx: 85, neighbors: [355], rating: 4.492856979370117
item_idx: 86,

  rating_of_item = (similarities[best_among_who_rated] @ unbiased_ratings) /np.abs(similarities[best_among_who_rated]).sum()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
item_idx: 4787, neighbors: [186 284], rating: 2.4844188690185547
item_idx: 4788, neighbors: [186 547], rating: 3.238947629928589
item_idx: 4789, neighbors: [547 476], rating: 3.1824939250946045
item_idx: 4790, neighbors: [186 225], rating: 1.984696626663208
item_idx: 4791, neighbors: [186 547], rating: 3.750455856323242
item_idx: 4792, neighbors: [547 476], rating: 2.846062660217285
item_idx: 4793, neighbors: [320 186], rating: 3.258434772491455
item_idx: 4794, neighbors: [713 331], rating: 3.23956036567688
item_idx: 4795, neighbors: [186 547], rating: 3.494701623916626
item_idx: 4796, neighbors: [186], rating: 3.9375734329223633
item_idx: 4797, neighbors: [186 755], rating: 4.185196876525879
item_idx: 4798, neighbors: [186], rating: 3.9375734329223633
item_idx: 4799, neighbors: [227 547], rating: 3.5584774017333984
item_idx: 4800, neighbors: [186 547], rating: 3.2619638442993164
item_idx: 4801, neighbors: [696 186], rati

In [78]:
!python /content/24WS-mmd-code-public/rec_sys/centered_cosine_sim.py

Test b.1 passed: Centered cosine similarity equals expected value of -1.0000000000000002
Test b.2 passed: Centered cosine similarity equals expected value of -0.8019070321811681


## Support for Sparse Utility Matrices

In [71]:
import rec_sys.cf_algorithms_sparse as cfas

In [99]:
# Re-import rec_sys.cf_algorithms_sparse so that recent edits take effect
importlib.reload(cfas)


def _load_sparse_um(dataset_name):
    """Load the named utility matrix and pass it to cfas.convert_dense_um_to_sparse."""
    dense_um = cfd.get_um_by_name(config, dataset_name)
    return cfas.convert_dense_um_to_sparse(dense_um, print_sts = True)


# Sparse versions of the MovieLens and Lecture datasets (loaded in this order)
um_movielens = _load_sparse_um("movielens")
um_lecture = _load_sparse_um("lecture_1")

# NOTE(review): the saved output of this cell ends in `ValueError: dimension mismatch`
# right after the toy-dataset computation starts; presumably raised inside
# cfas.rate_all_items -- investigate rec_sys/cf_algorithms_sparse.py.

# Lecture toy dataset: rate every item for user index 4 with neighborhood size 2
all_ratings = cfas.rate_all_items(um_lecture, 4, 2)
print("all_ratings lecture toy dataset:", all_ratings)

# MovieLens data: rate every item for user index 0 with neighborhood size 2
all_ratings_movielens = cfas.rate_all_items(um_movielens, 0, 2)
print("all_ratings_movielens:", all_ratings_movielens)

Dir '/content/ml-25m/' already exists, skipping download

### Start reading data from '/content/ml-25m/ratings.csv'
Loaded data from '/content/ml-25m/ratings.csv', df shape: (100000, 3), size in MB: 1.1444091796875 
Pivoting the data
Utility matrix, df shape: (9786, 757), size in MB: 29.142929077148438 
Final utility matrix (numpy array as np.float32), df shape: (9786, 757), size in MB: 28.25928497314453 
Final utility matrix (Sparse CSR Matrix) size in MB: 0.800274 MB
Final utility matrix (Sparse CSR Matrix) size in MB: 0.000244 MB

>>> CF computation for UM w/ shape: (6, 6), user_index: 4, neighborhood_size: 2

19


ValueError: dimension mismatch