# Item Item Scorer

In [2]:
import pandas as pd

from lenskit.basic import topn
from lenskit.knn import ItemKNNScorer
from lenskit.batch import recommend
from lenskit.data import from_interactions_df, ItemListCollection, UserIDKey
from lenskit.metrics import NDCG, RBP, RecipRank, RunAnalysis, Recall, Precision, RMSE, MAE
from lenskit.pipeline import topn_pipeline
from lenskit.splitting import SampleFrac, crossfold_users

Load data

In [3]:
df = pd.read_csv('../ALS/data/ratings.csv')
books_df = pd.read_csv('../ALS/data/books.csv')
dataset = from_interactions_df(df, user_col='user_id', item_col='book_id', rating_col='rating')

## Build ItemItem Model

Build model with default settings

In [3]:
model_ii = ItemKNNScorer(k=20)

In [4]:
pipe_ii = topn_pipeline(model_ii)

In [5]:
# test data is organized by user
all_test = ItemListCollection(UserIDKey)
# recommendations will be organized by model and user ID
all_recs = ItemListCollection(["model", "user_id"])

for split in crossfold_users(dataset, 5, SampleFrac(0.2)):
    # collect the test data
    all_test.add_from(split.test)

    # train the pipeline, cloning first so a fresh pipeline for each split
    fit_ii = pipe_ii.clone()
    fit_ii.train(split.train)
    ii_recs = recommend(fit_ii, split.test.keys(), 100)
    all_recs.add_from(ii_recs, model="II")

  return torch.sparse_csr_tensor(


In [6]:
ran = RunAnalysis()
ran.add_metric(NDCG())
ran.add_metric(RBP())
ran.add_metric(RecipRank())
results = ran.measure(all_recs, all_test)

In [8]:
results.list_metrics().mean()

NDCG         0.015468
RBP          0.007261
RecipRank    0.026249
dtype: float64

First Fine Tuning. Increase min neighbours from 1 to 3 and increase minimum similarity from 1e-06 to 0.1

In [9]:
model_ii = ItemKNNScorer(
    max_nbrs=20,              # 20 Nachbarn
    min_nbrs=3,            # Mindestens 3 Nachbarn
    min_sim=0.1             # Mindest-Ähnlichkeit
    )
pipe_ii = topn_pipeline(model_ii)

In [10]:
# test data is organized by user
all_test = ItemListCollection(UserIDKey)
# recommendations will be organized by model and user ID
all_recs = ItemListCollection(["model", "user_id"])

for split in crossfold_users(dataset, 5, SampleFrac(0.2)):
    # collect the test data
    all_test.add_from(split.test)

    # train the pipeline, cloning first so a fresh pipeline for each split
    fit_ii = pipe_ii.clone()
    fit_ii.train(split.train)
    ii_recs = recommend(fit_ii, split.test.keys(), 100)
    all_recs.add_from(ii_recs, model="II")

Tests mit k=10

In [19]:
ran = RunAnalysis()
ran.add_metric(NDCG(k=10))
ran.add_metric(Precision(k=10))
ran.add_metric(Recall(k=10))
ran.add_metric(RBP())
ran.add_metric(RecipRank())
ran.add_metric(RMSE(missing_scores='ignore', missing_truth='ignore'))
ran.add_metric(MAE(missing_scores='ignore', missing_truth='ignore'))
results = ran.measure(all_recs, all_test)

In [20]:
results.list_metrics().mean()

NDCG@10         0.179640
Precision@10    0.179530
Recall@10       0.175635
RBP             0.170473
RecipRank       0.350592
RMSE            0.187708
MAE             0.050603
dtype: float64

Grid Search

In [21]:
# Schnelle Parameter-Suche
configs = [
    {"max_nbrs": 20, "min_nbrs": 3, "min_sim": 0.1},   # Current
    {"max_nbrs": 30, "min_nbrs": 5, "min_sim": 0.2},   # Konservativer
    {"max_nbrs": 50, "min_nbrs": 3, "min_sim": 0.15},  # Mehr Nachbarn
    {"max_nbrs": 40, "min_nbrs": 2, "min_sim": 0.05},  # Liberal
]

for i, config in enumerate(configs):
    print(f"Testing config {i+1}: {config}")

    model = ItemKNNScorer(**config)
    pipe_ii = topn_pipeline(model_ii)

    # test data is organized by user
    all_test = ItemListCollection(UserIDKey)
    # recommendations will be organized by model and user ID
    all_recs = ItemListCollection(["model", "user_id"])

    for split in crossfold_users(dataset, 1, SampleFrac(0.2)):
        # collect the test data
        all_test.add_from(split.test)

        # do the same for item-item
        fit_ii = pipe_ii.clone()
        fit_ii.train(split.train)
        ii_recs = recommend(fit_ii, split.test.keys(), 100)
        all_recs.add_from(ii_recs, model="II")
    
    ran = RunAnalysis()
    ran.add_metric(NDCG(k=10))
    ran.add_metric(RBP(k=10))
    ran.add_metric(RecipRank(k=10))
    ran.add_metric(Precision(k=10))
    ran.add_metric(Recall(k=10))
    results = ran.measure(all_recs, all_test)

    print(results.list_metrics().groupby("model").mean())

Testing config 1: {'max_nbrs': 20, 'min_nbrs': 3, 'min_sim': 0.1}
        NDCG@10   RBP@10  RecipRank@10  Precision@10  Recall@10
model                                                          
II     0.213566  0.17138      0.386844      0.218707   0.205684
Testing config 2: {'max_nbrs': 30, 'min_nbrs': 5, 'min_sim': 0.2}
        NDCG@10   RBP@10  RecipRank@10  Precision@10  Recall@10
model                                                          
II     0.215602  0.17305      0.392086      0.220159   0.207205
Testing config 3: {'max_nbrs': 50, 'min_nbrs': 3, 'min_sim': 0.15}
       NDCG@10    RBP@10  RecipRank@10  Precision@10  Recall@10
model                                                          
II     0.21455  0.172377      0.387912      0.219233   0.206444
Testing config 4: {'max_nbrs': 40, 'min_nbrs': 2, 'min_sim': 0.05}
        NDCG@10    RBP@10  RecipRank@10  Precision@10  Recall@10
model                                                           
II     0.215199  0.172691   

Config 2 die beste

In [22]:
configs = [
    # Zusätzliche Tests:
    {"max_nbrs": 10, "min_nbrs": 2, "min_sim": 0.3},   # Sehr konservativ
    {"max_nbrs": 100, "min_nbrs": 5, "min_sim": 0.1},  # Viele Nachbarn
    {"max_nbrs": 15, "min_nbrs": 1, "min_sim": 0.4},   # Hohe Ähnlichkeit
    {"max_nbrs": 80, "min_nbrs": 1, "min_sim": 0.01},  # Sehr liberal
    {"max_nbrs": 25, "min_nbrs": 3, "min_sim": 0.25},  # Balanced+
    {"max_nbrs": 60, "min_nbrs": 8, "min_sim": 0.15},  # Viele min_nbrs
]

for i, config in enumerate(configs):
    print(f"Testing config {i+1}: {config}")

    model = ItemKNNScorer(**config)
    pipe_ii = topn_pipeline(model_ii)

    # test data is organized by user
    all_test = ItemListCollection(UserIDKey)
    # recommendations will be organized by model and user ID
    all_recs = ItemListCollection(["model", "user_id"])

    for split in crossfold_users(dataset, 1, SampleFrac(0.2)):
        # collect the test data
        all_test.add_from(split.test)

        # do the same for item-item
        fit_ii = pipe_ii.clone()
        fit_ii.train(split.train)
        ii_recs = recommend(fit_ii, split.test.keys(), 100)
        all_recs.add_from(ii_recs, model="II")
    
    ran = RunAnalysis()
    ran.add_metric(NDCG(k=10))
    ran.add_metric(RBP(k=10))
    ran.add_metric(RecipRank(k=10))
    ran.add_metric(Precision(k=10))
    ran.add_metric(Recall(k=10))
    results = ran.measure(all_recs, all_test)

    print(results.list_metrics().groupby("model").mean())

Testing config 1: {'max_nbrs': 10, 'min_nbrs': 2, 'min_sim': 0.3}
        NDCG@10    RBP@10  RecipRank@10  Precision@10  Recall@10
model                                                           
II     0.214725  0.172343      0.390142       0.22012   0.206539
Testing config 2: {'max_nbrs': 100, 'min_nbrs': 5, 'min_sim': 0.1}
        NDCG@10    RBP@10  RecipRank@10  Precision@10  Recall@10
model                                                           
II     0.213367  0.171244      0.386539      0.218304   0.205203
Testing config 3: {'max_nbrs': 15, 'min_nbrs': 1, 'min_sim': 0.4}
        NDCG@10    RBP@10  RecipRank@10  Precision@10  Recall@10
model                                                           
II     0.213885  0.171738      0.389135       0.21777   0.205199
Testing config 4: {'max_nbrs': 80, 'min_nbrs': 1, 'min_sim': 0.01}
        NDCG@10    RBP@10  RecipRank@10  Precision@10  Recall@10
model                                                           
II     0.214221  0.

Config 5 hat sich als die beste herausgestellt. Diese wird nun nochmal feinabgestimmt. \
`
{"max_nbrs": 25, "min_nbrs": 3, "min_sim": 0.25} 
NDCG@10: 0.2166 (+0.5% vs. bisherige beste) 
Precision@10: 0.2208 (beste) 
Recall@10: 0.2074 (beste) 
`

In [None]:
fine_tuning_configs = [
    # Variation von max_nbrs (±5-10)
    {"max_nbrs": 20, "min_nbrs": 3, "min_sim": 0.25},
    {"max_nbrs": 22, "min_nbrs": 3, "min_sim": 0.25},
    {"max_nbrs": 28, "min_nbrs": 3, "min_sim": 0.25},
    {"max_nbrs": 30, "min_nbrs": 3, "min_sim": 0.25},  # Vergleich zu Run 1 Config 2
    
    # Variation von min_nbrs
    {"max_nbrs": 25, "min_nbrs": 2, "min_sim": 0.25},  # lockerer
    {"max_nbrs": 25, "min_nbrs": 4, "min_sim": 0.25},  # strikter
    {"max_nbrs": 25, "min_nbrs": 5, "min_sim": 0.25},  # Vergleich zu Run 1 Config 2
    
    # Variation von min_sim (±0.05)
    {"max_nbrs": 25, "min_nbrs": 3, "min_sim": 0.20},  # weniger strikt
    {"max_nbrs": 25, "min_nbrs": 3, "min_sim": 0.22},
    {"max_nbrs": 25, "min_nbrs": 3, "min_sim": 0.28},
    {"max_nbrs": 25, "min_nbrs": 3, "min_sim": 0.30},  # strikter (Vergleich zu Run 2 Config 1)
    
    # Kombinierte Variationen
    {"max_nbrs": 22, "min_nbrs": 4, "min_sim": 0.22},  # moderat in allen Parametern
    {"max_nbrs": 28, "min_nbrs": 2, "min_sim": 0.28},  # mehr Nachbarn, weniger Filter
    
    # Baseline zum Vergleich
    {"max_nbrs": 25, "min_nbrs": 3, "min_sim": 0.25}  # Original-Bestconfig
]



In [24]:

for i, config in enumerate(fine_tuning_configs):
    print(f"Testing config {i+1}: {config}")

    model = ItemKNNScorer(**config)
    pipe_ii = topn_pipeline(model_ii)

    # test data is organized by user
    all_test = ItemListCollection(UserIDKey)
    # recommendations will be organized by model and user ID
    all_recs = ItemListCollection(["model", "user_id"])

    for split in crossfold_users(dataset, 1, SampleFrac(0.2)):
        # collect the test data
        all_test.add_from(split.test)

        # do the same for item-item
        fit_ii = pipe_ii.clone()
        fit_ii.train(split.train)
        ii_recs = recommend(fit_ii, split.test.keys(), 100)
        all_recs.add_from(ii_recs, model="II")
    
    ran = RunAnalysis()
    ran.add_metric(NDCG(k=10))
    ran.add_metric(RBP(k=10))
    ran.add_metric(RecipRank(k=10))
    ran.add_metric(Precision(k=10))
    ran.add_metric(Recall(k=10))
    results = ran.measure(all_recs, all_test)

    print(results.list_metrics().groupby("model").mean())

Testing config 1: {'max_nbrs': 20, 'min_nbrs': 3, 'min_sim': 0.25}
        NDCG@10    RBP@10  RecipRank@10  Precision@10  Recall@10
model                                                           
II     0.218115  0.174929      0.399141      0.220771   0.208244
Testing config 2: {'max_nbrs': 22, 'min_nbrs': 3, 'min_sim': 0.25}
        NDCG@10    RBP@10  RecipRank@10  Precision@10  Recall@10
model                                                           
II     0.214969  0.172552      0.391443      0.219038    0.20621
Testing config 3: {'max_nbrs': 28, 'min_nbrs': 3, 'min_sim': 0.25}
        NDCG@10    RBP@10  RecipRank@10  Precision@10  Recall@10
model                                                           
II     0.218662  0.175449      0.395808      0.223382   0.209604
Testing config 4: {'max_nbrs': 30, 'min_nbrs': 3, 'min_sim': 0.25}
        NDCG@10    RBP@10  RecipRank@10  Precision@10  Recall@10
model                                                           
II     0.214889  

**Klare Beste Konfiguration: Config 12**  
   (`max_nbrs=22`, `min_nbrs=4`, `min_sim=0.22`)  
   - **NDCG@10 (0.224785)**: +3.7% besser als der bisherige Bestwert  
   - **RBP@10 (0.180062)**: +3.7% besser als der bisherige Bestwert  
   - **RecipRank@10 (0.423903)**: +7.7% besser (stärkste Verbesserung)  
   - **Recall@10 (0.210559)**: Bester Wert  

**Config 12 als neue Baseline übernehmen** - Sie dominiert in 4/5 Metriken.

Neue COnfig gewählt, da COnfig 12 eine zu geringe Coverage besitzt. 

In [5]:
model_ii = ItemKNNScorer(
    max_nbrs=50,              # 22 Nachbarn
    min_nbrs=3,               # Mindestens 4 Nachbarn
    min_sim=0.15              # Mindest-Ähnlichkeit
    )
pipe_ii = topn_pipeline(model_ii)
pipe_ii.train(dataset)
recs = recommend(pipe_ii, [234], 100)

  return torch.sparse_csr_tensor(


In [46]:
# Einfachste Variante:
for user_key, item_list in recs:
    print(f"User: {user_key.user_id}")
    print(f"Anzahl Empfehlungen: {len(item_list)}")
    
    # Erste 5 Items
    for i in range(min(5, len(item_list))):
        item = item_list.iloc[i]
        print(f"  Item {item.name}: Score {item.score}")
    print()

User: 234
Anzahl Empfehlungen: 75


AttributeError: 'ItemList' object has no attribute 'iloc'

In [5]:
# test data is organized by user
all_test = ItemListCollection(UserIDKey)
# recommendations will be organized by model and user ID
all_recs = ItemListCollection(["model", "user_id"])

for split in crossfold_users(dataset, 5, SampleFrac(0.2)):
    # collect the test data
    all_test.add_from(split.test)

    # train the pipeline, cloning first so a fresh pipeline for each split
    fit_ii = pipe_ii.clone()
    fit_ii.train(split.train)
    ii_recs = recommend(fit_ii, split.test.keys(), 100)
    all_recs.add_from(ii_recs, model="II")

  return torch.sparse_csr_tensor(


In [57]:
ran = RunAnalysis()
ran.add_metric(NDCG(k=10))
ran.add_metric(Precision(k=10))
ran.add_metric(Recall(k=10))
ran.add_metric(RBP(k=10))
ran.add_metric(RecipRank(k=10))
results = ran.measure(all_recs, all_test)

print(results.list_metrics().groupby("model").mean())

        NDCG@10  Precision@10  Recall@10    RBP@10  RecipRank@10
model                                                           
II     0.210801      0.309379   0.169121  0.167914      0.426319


In [71]:
from lenskit.batch import recommend

def get_book_rec_simple(book_id, model, original_df, books_df=None, k=10):
    """
    Vereinfachte Version: Nimmt nur einen User der das Buch mochte
    """
    
    try:
        # Finde einen User der das Buch gut bewertet hat
        good_users = original_df[
            (original_df['book_id'] == book_id) & 
            (original_df['rating'] >= 4)
        ]['user_id'].unique()
        
        if len(good_users) == 0:
            return []
        
        # Nimm den ersten User
        user_id = good_users[0]
        
        # Generiere Empfehlungen für diesen User
        user_recs = recommend(model, [user_id], k+5)
        
        if user_recs is None or user_recs.empty:
            return []
        
        # Filtere das ursprüngliche Buch heraus
        filtered_recs = user_recs[user_recs['item'] != book_id]
        top_book_ids = filtered_recs['item'].head(k).tolist()
        
        # Konvertiere zu Titeln
        if books_df is not None:
            return [books_df[books_df['book_id'] == bid]['title'].iloc[0] 
                   for bid in top_book_ids 
                   if not books_df[books_df['book_id'] == bid].empty]
        else:
            return [f'Book ID: {bid}' for bid in top_book_ids]
            
    except Exception as e:
        print(f"Fehler: {e}")
        return []

In [73]:
get_book_rec_simple(11, fit_ii, df, books_df, k=10)

[]

In [6]:
recs = recommend(pipe_ii, [234], 10, n_jobs=1)
for user_key, item_list in recs:
    df = item_list.to_df()
    print("Spalten:", df.columns.tolist())
    print("Index name:", df.index.name)
    print(df.head())

Spalten: ['item_id', 'item_num', 'score', 'rank']
Index name: None
   item_id  item_num     score  rank
0      161       160  5.139027     1
1      155       154  5.047428     2
2     2687      2686  4.559760     3
3     2974      2973  4.554077     4
4     2282      2281  4.489484     5
