In [1]:
from surprise import SVD, Dataset, Reader, KNNBasic
from surprise.model_selection import cross_validate
from collections import defaultdict
import pandas as pd

In [2]:
# Load at most the first 1,000,000 rows of the ratings file.
df = pd.read_csv("../ALS/data/ratings.csv", nrows=1000000)  # header row is detected automatically
# The column labels are overwritten positionally: this assumes the CSV has exactly
# three columns in (user, item, rating) order -- TODO confirm against the file header.
df.columns = ["user", "item", "rating"]  # rename if necessary

# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)  # select by column name instead of by index

In [11]:
# SVD is constructed without arguments, so no hyperparameters and no random_state
# are set here; scores may therefore differ between runs (TODO: pin a seed if
# reproducible numbers are needed).
algo = SVD()

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8713  0.8703  0.8715  0.8677  0.8687  0.8699  0.0015  
MAE (testset)     0.6815  0.6817  0.6812  0.6795  0.6795  0.6807  0.0010  
Fit time          3.70    4.34    3.92    4.47    4.13    4.11    0.28    
Test time         0.62    0.62    0.61    0.71    0.65    0.64    0.04    


{'test_rmse': array([0.87132088, 0.8703213 , 0.87145765, 0.86765762, 0.86874061]),
 'test_mae': array([0.68154045, 0.681741  , 0.68119256, 0.67949982, 0.67954632]),
 'fit_time': (3.701836109161377,
  4.341921091079712,
  3.924405097961426,
  4.467318773269653,
  4.125885963439941),
 'test_time': (0.6189332008361816,
  0.6157870292663574,
  0.6067540645599365,
  0.7122130393981934,
  0.6460320949554443)}

In [None]:
# KNNBasic is constructed without arguments (no similarity options are passed).
algo = KNNBasic()

# Run 3-fold cross-validation (cv=3) and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9093  0.9097  0.9120  0.9103  0.0012  
MAE (testset)     0.7170  0.7172  0.7185  0.7176  0.0006  
Fit time          8.05    8.25    8.02    8.11    0.10    
Test time         64.86   66.65   62.73   64.75   1.60    


{'test_rmse': array([0.90926767, 0.90971025, 0.91200533]),
 'test_mae': array([0.71704404, 0.7171998 , 0.71848078]),
 'fit_time': (8.053538084030151, 8.247802019119263, 8.016561031341553),
 'test_time': (64.85671186447144, 66.65344619750977, 62.732341051101685)}

In [6]:

def get_top_n(predictions, n=10):
    """Return the top-N recommendations for each user from a set of predictions.

    Args:
        predictions(iterable of Prediction objects): The predictions, as
            returned by the test method of an algorithm. Each element must
            unpack into five fields: (uid, iid, true rating, estimate,
            details). Only uid, iid and the estimate are used.
        n(int or None): The number of recommendations to output for each
            user. Default is 10. ``None`` keeps every prediction (fully
            ranked); ``0`` yields empty lists.

    Returns:
    A dict (a ``defaultdict(list)``) where keys are user (raw) ids and values
    are lists of tuples:
        [(raw item id, rating estimation), ...] of size at most n, ordered by
        decreasing estimation. Items with equal estimations keep the order in
        which they appear in ``predictions``.

    Raises:
        ValueError: If n is negative. A negative n would otherwise be
            interpreted by slicing as "drop the last |n| items", which is
            never a meaningful top-N.
    """
    if n is not None and n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        top_n[uid].append((iid, est))

    # Then rank each user's predictions by estimate and keep the n highest.
    # Only existing keys are reassigned, so the dict does not change size
    # while it is being iterated.
    for uid in top_n:
        ranked = sorted(top_n[uid], key=lambda pair: pair[1], reverse=True)
        top_n[uid] = ranked[:n]

    return top_n

In [3]:
# Load the book metadata table; the bare expression below previews its first rows.
books = pd.read_csv("../ALS/data/books.csv")
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [4]:
# Train an SVD on the trainset built from the whole dataset
# (build_full_trainset is used here, so no test split is held out).
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10585e3e0>

In [8]:
# Then predict ratings for all pairs (u, i) that are NOT in the training set
# and keep the 10 best-scored items per user.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)
top_n = get_top_n(predictions, n=10)

# Build a lookup table from book id to title
id_to_title = dict(zip(books['book_id'], books['title']))

# Print the recommended items for each user, one "id: title" line per book
for uid, recommendations in top_n.items():
    print(f"User {uid}:")
    for book_id, *_ in recommendations:
        # Fall back to a placeholder when the id has no entry in the books table
        title = id_to_title.get(book_id, f"Unknown ID: {book_id}")
        print(f"  {book_id}: {title}")
    print()  # blank line between users

User 1:
  3628: The Complete Calvin and Hobbes
  5580: The Calvin and Hobbes Lazy Sunday Book
  1010: The Essential Calvin and Hobbes: A Calvin and Hobbes Treasury
  8946: The Divan
  7401: The Brothers K
  4822: Complete Poems, 1904-1962
  8323: Homicide: A Year on the Killing Streets
  1833: These Is My Words: The Diary of Sarah Agnes Prine, 1881-1901, Arizona Territories (Sarah Agnes Prine, #1)
  7010: The Walking Dead, Vol. 07: The Calm Before
  5754: Collected Fictions

User 2:
  70: Ender's Game (Ender's Saga, #1)
  964: J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings
  7254: Homicidal Psycho Jungle Cat: A Calvin and Hobbes Collection
  2975: Kurt Vonnegut's Cat's Cradle (Modern Critical Interpretations)
  2209: The Complete Works
  1010: The Essential Calvin and Hobbes: A Calvin and Hobbes Treasury
  8946: The Divan
  5207: The Days Are Just Packed: A Calvin and Hobbes Collection
  1788: The Calvin and Hobbes Tenth Anniversary Book
  4483: It's a Magical Wo