For example:

In [1]:
from lenskit.batch import MultiEval
from lenskit.crossfold import partition_users, SampleN
from lenskit.algorithms import basic, als
from lenskit.datasets import MovieLens
from lenskit import topn
import pandas as pd

Generate the train-test pairs:

In [2]:
# Wrap the 'ml-latest-small' MovieLens data; its `ratings` attribute supplies
# the rating data that gets split below.
mlsmall = MovieLens('ml-latest-small')
# Materialize the train-test splits as a list so they can be iterated more than
# once: the experiment consumes them, and the truth data is assembled from them later.
# NOTE(review): presumably 5 user partitions with 5 sampled test ratings per
# user — confirm against the lenskit.crossfold documentation.
pairs = list(partition_users(mlsmall.ratings, 5, SampleN(5)))

Set up and run the `MultiEval` experiment:

In [3]:
# Results are written under 'my-eval' (runs.csv and recommendations.parquet are
# read back from there in later cells).
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of
# the notebook; it is kept as-is in case other cells refer to it.
eval = MultiEval('my-eval', recommend=20)
eval.add_datasets(pairs, name='ML-Small')

# Baseline: a single Popular recommender.
eval.add_algorithms(basic.Popular(), name='Pop')

# Sweep: one BiasedMF model per feature count. `attrs=['features']` asks the
# experiment to record that attribute per run (it appears as the `features`
# column of the run metadata shown below).
als_models = [als.BiasedMF(n_features) for n_features in (20, 30, 40, 50)]
eval.add_algorithms(als_models, attrs=['features'], name='ALS')

eval.run()

  'start': level._start,
  'stop': level._stop,
  'step': level._step
  'start': level._start,
  'stop': level._stop,
  'step': level._step


Now that the experiment is run, we can read its outputs.

First the run metadata:

In [4]:
# Read the per-run metadata written by the experiment and key it by RunId so
# that later cells can join other results against it.
runs = pd.read_csv('my-eval/runs.csv').set_index('RunId')
runs.head()

Unnamed: 0_level_0,DataSet,Partition,AlgoClass,AlgoStr,name,TrainTime,PredTime,RecTime,features
RunId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,ML-Small,1,Popular,Popular,Pop,0.571511,,0.374972,
2,ML-Small,1,BiasedMF,"als.BiasedMF(features=20, regularization=0.1)",ALS,4.358745,0.23671,0.492864,20.0
3,ML-Small,1,BiasedMF,"als.BiasedMF(features=30, regularization=0.1)",ALS,0.298771,0.239025,0.565233,30.0
4,ML-Small,1,BiasedMF,"als.BiasedMF(features=40, regularization=0.1)",ALS,0.402154,0.237375,0.638256,40.0
5,ML-Small,1,BiasedMF,"als.BiasedMF(features=50, regularization=0.1)",ALS,0.489437,0.240399,0.63476,50.0


Then the recommendations:

In [5]:
# Load the recommendation lists written by the experiment run.
# Each row carries item, score, user, rank, and the RunId that links it back
# to the run metadata.
recs = pd.read_parquet('my-eval/recommendations.parquet')
recs.head()

Unnamed: 0,item,score,user,rank,RunId
0,296,321.0,5,1,1
1,318,311.0,5,2,1
2,593,300.0,5,3,1
3,260,288.0,5,4,1
4,480,269.0,5,5,1


To evaluate the recommendation lists, we need to build a combined set of truth data. Since this is a disjoint partition of users over a single data set, we can just concatenate the individual test frames:

In [6]:
# Collect the held-out test frame of every train-test pair, then stack them
# into a single frame with a fresh consecutive index.
test_frames = [split.test for split in pairs]
truth = pd.concat(test_frames, ignore_index=True)

Now we can set up an analysis and compute the results.

In [7]:
# Set up a recommendation-list analysis and register nDCG as its only metric.
rla = topn.RecListAnalysis()
rla.add_metric(topn.ndcg)
# Score the recommendation lists in `recs` against the held-out rows in `truth`.
# The displayed result is indexed by (user, RunId) with `nrecs` and `ndcg` columns.
ndcg = rla.compute(recs, truth)
ndcg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,ndcg
user,RunId,Unnamed: 2_level_1,Unnamed: 3_level_1
1,11,20.0,0.0
1,12,20.0,0.0
1,13,20.0,0.0
1,14,20.0,0.0
1,15,20.0,0.0


Next, we need to combine this with our run data, so that we know what algorithms and configurations we are evaluating:

In [8]:
# Attach each run's algorithm class and feature count to the metric rows.
# `runs` is indexed by RunId, so the join matches on the RunId index level.
# NOTE(review): this rebinds `ndcg`, so the cell is not safe to re-run on its
# own — re-run the metric computation cell first to get a fresh frame.
ndcg = ndcg.join(runs[['AlgoClass', 'features']], on='RunId')
ndcg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,nrecs,ndcg,AlgoClass,features
user,RunId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,11,20.0,0.0,Popular,
1,12,20.0,0.0,BiasedMF,20.0
1,13,20.0,0.0,BiasedMF,30.0
1,14,20.0,0.0,BiasedMF,40.0
1,15,20.0,0.0,BiasedMF,50.0


The Popular algorithm has a NaN feature count, and `groupby` drops rows whose group key is NaN by default, so it would vanish from the summary; let's fill those in with 0.

In [9]:
# Give the Popular rows a placeholder feature count of 0, since they have no
# `features` value of their own. This edits `ndcg` in place.
ndcg.loc[ndcg['AlgoClass'] == 'Popular', 'features'] = 0

And finally, we can compute the overall average performance for each algorithm configuration:

In [10]:
# Mean nDCG for each (algorithm class, feature count) configuration, averaged
# over every (user, RunId) row that falls in the group.
ndcg.groupby(['AlgoClass', 'features'])['ndcg'].mean()

AlgoClass  features
BiasedMF   20.0        0.011101
           30.0        0.015509
           40.0        0.015952
           50.0        0.018086
Popular    0.0         0.082958
Name: ndcg, dtype: float64