For example:

In [1]:
from lenskit.batch import MultiEval
from lenskit.crossfold import partition_users, SampleN
from lenskit.algorithms import basic, als
from lenskit.util import load_ml_ratings
from lenskit import topn
import pandas as pd

Generate the train-test pairs:

In [2]:
# Load the ratings first, then materialize the train-test pairs into a list:
# `pairs` is consumed more than once later (by the experiment and when
# building the truth frame), so it must not be a one-shot iterator.
ratings = load_ml_ratings()
pairs = list(partition_users(ratings, 5, SampleN(5)))

Set up and run the `MultiEval` experiment:

In [None]:
# Named `experiment` rather than `eval` so the Python builtin `eval` is not
# shadowed for the rest of the kernel session.
# NOTE(review): `recommend=20` is presumably the recommendation list length — confirm
# against the MultiEval documentation.
experiment = MultiEval('my-eval', recommend=20)
experiment.add_datasets(pairs, name='ML-Small')
experiment.add_algorithms(basic.Popular(), name='Pop')

# One BiasedMF configuration per feature count; the `features` attribute shows
# up as the `features` column of runs.csv.
feature_counts = [20, 30, 40, 50]
experiment.add_algorithms([als.BiasedMF(f) for f in feature_counts],
                          attrs=['features'], name='ALS')
experiment.run()

Now that the experiment has run, we can read its outputs.

First the run metadata:

In [4]:
# Read the run metadata and index it by RunId in a single chained expression
# (no in-place mutation), so it can be joined against per-run results later.
runs = pd.read_csv('my-eval/runs.csv').set_index('RunId')
runs.head()

Unnamed: 0_level_0,AlgoClass,AlgoStr,DataSet,Partition,PredTime,RecTime,TrainTime,features,name
RunId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Popular,Popular,ML-Small,1,,0.578916,0.278333,,Pop
2,BiasedMF,"als.BiasedMF(features=20, regularization=0.1)",ML-Small,1,0.377277,1.324478,5.42651,20.0,ALS
3,BiasedMF,"als.BiasedMF(features=30, regularization=0.1)",ML-Small,1,0.326613,1.566073,1.30049,30.0,ALS
4,BiasedMF,"als.BiasedMF(features=40, regularization=0.1)",ML-Small,1,0.408973,1.570634,1.904973,40.0,ALS
5,BiasedMF,"als.BiasedMF(features=50, regularization=0.1)",ML-Small,1,0.357133,1.700047,2.390314,50.0,ALS


Then the recommendations:

In [5]:
# Read the recommendation output written under the 'my-eval' directory and
# preview the first rows.
recs = pd.read_parquet('my-eval/recommendations.parquet')
recs.head()

  labels = getattr(columns, 'labels', None) or [
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels


Unnamed: 0,item,score,user,rank,RunId
0,356,335,6,1,1
1,296,323,6,2,1
2,318,305,6,3,1
3,593,302,6,4,1
4,260,284,6,5,1


In order to evaluate the recommendation list, we need to build a combined set of truth data. Since this is a disjoint partition of users over a single data set, we can just concatenate the individual test frames:

In [6]:
# Stack the test frame of every train-test pair into one truth frame,
# discarding the per-frame row indexes.
test_frames = [p.test for p in pairs]
truth = pd.concat(test_frames, ignore_index=True)

Now we can set up an analysis and compute the results.

In [None]:
# Set up a recommendation-list analysis with nDCG as its only metric.
rla = topn.RecListAnalysis()
rla.add_metric(topn.ndcg)
# Score the recommendations against the combined truth frame; the result is
# kept in `ndcg` so it can be joined with the run metadata afterwards.
ndcg = rla.compute(recs, truth)
ndcg.head()

Next, we need to combine this with our run data, so that we know what algorithms and configurations we are evaluating:

In [11]:
# Attach the algorithm class and feature count of each run to the metric rows,
# matching on RunId (the index of `runs`).
# NOTE(review): this rebinds `ndcg`, so re-running this cell on its own joins
# onto an already-joined frame; re-run the metric computation cell first.
run_attrs = runs[['AlgoClass', 'features']]
ndcg = ndcg.join(run_attrs, on='RunId')
ndcg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ndcg,AlgoClass,features
user,RunId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,11,0.0,Popular,
1,12,0.0,BiasedMF,20.0
1,13,0.0,BiasedMF,30.0
1,14,0.0,BiasedMF,40.0
1,15,0.0,BiasedMF,50.0


The Popular algorithm has a NaN feature count, and `groupby` drops rows whose group key is NaN by default, so Popular would vanish from the summary; let's fill those in.

In [15]:
# Popular rows carry NaN in `features`; give them an explicit 0 so they form
# their own group in the aggregation that follows.
is_popular = ndcg['AlgoClass'] == 'Popular'
ndcg.loc[is_popular, 'features'] = 0

And finally, we can compute the overall average performance for each algorithm configuration:

In [16]:
# Mean nDCG for each (algorithm class, feature count) configuration.
config_keys = ['AlgoClass', 'features']
ndcg.groupby(config_keys)['ndcg'].mean()

AlgoClass  features
BiasedMF   20.0        0.015960
           30.0        0.022558
           40.0        0.025901
           50.0        0.028949
Popular    0.0         0.091814
Name: ndcg, dtype: float64