In [34]:
import pandas as pd
import numpy as np

import sys
sys.path.append('..')

from src.MultiViewCTR import MultiViewCTR
from sklearn.decomposition import NMF
from sklearn.decomposition import FastICA
from src.replayer import replayer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import PCA
from sklearn.decomposition import FactorAnalysis

import time
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [4]:
%load_ext autoreload
%autoreload 2

Import preprocessed Movie Lens 100K data:

In [5]:
data = pd.read_csv('../data/clean/move-lens-100k-all.csv').set_index(['user_id', 'item_id', 'timestamp'])

# assert not datapoint duplications
assert not data.reset_index()[['user_id', 'item_id']].duplicated().any()

data['rating'] = (data.rating >= 4).astype(int)
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rating,gender_F,gender_M,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,occupation_executive,...,genre_Western,genre_unknown,release_decade_1920.0,release_decade_1930.0,release_decade_1940.0,release_decade_1950.0,release_decade_1960.0,release_decade_1970.0,release_decade_1980.0,release_decade_1990.0
user_id,item_id,timestamp,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
259,286,874724727,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
259,185,874724781,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
259,173,874724843,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
259,288,874724905,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
259,117,874724988,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Split data into test and train sets:

In [6]:
# 50 / 50 split
n = 5 * len(data) // 10
X_train = data.drop('rating', axis=1)[:n]
X_test = data.drop('rating', axis=1)[n:]

y_train = data.rating[:n]
y_test = data.rating[n:]

Count the number of cold start datapoints in the test set:

In [7]:
# user ids in train and test
user_ids_train = X_train.index.get_level_values('user_id')
user_ids_test = X_test.index.get_level_values('user_id')

# item ids in train and test
item_ids_train = X_train.index.get_level_values('item_id')
item_ids_test = X_test.index.get_level_values('item_id')

# print pct of cold start users in test
print(
    "Pct. of cold-start users in test:",
    user_ids_test[user_ids_test.isin(user_ids_train)].nunique() \
         / user_ids_train.nunique()
)

# print pct of cold-start items in test
print(
    "Pct. of cold-items users in test:",
    item_ids_test[item_ids_test.isin(item_ids_train)].nunique() \
         / item_ids_train.nunique()
)

Pct. of cold-start users in test: 0.2721518987341772
Pct. of cold-items users in test: 1.0


Define and run replayer experiments:

In [35]:
def run_experiment(models):
    """
    Run replayer experiment using different MF models.
    """
    # init results list
    results = []
    for model in models:
        # unpack tuple
        name, model = model
        play = replayer(model)
        # fit data on train
        start_time = time.time()
        play.fit(X_train, y_train)
        tte = time.time() - start_time # time to execute
        # test out-of-sample
        ratings = play.test(X_test, y_test)
        # append results
        results.append(
            {
                "name": name,
                "model": str(model), 
                "avg_oos_rating": np.mean(ratings), 
                "n_recs": len(ratings),
                "tte": tte
            }
        )
    return pd.DataFrame(results)

# specify models to test
models = [
    ("LDA", LatentDirichletAllocation(n_components=2, random_state=0)),
    ("LDA", LatentDirichletAllocation(n_components=3, random_state=0)),
    ("LDA", LatentDirichletAllocation(n_components=4, random_state=0)),
    ("LDA", LatentDirichletAllocation(n_components=5, random_state=0)),
    ("LDA", LatentDirichletAllocation(n_components=6, random_state=0)),
    ("LDA", LatentDirichletAllocation(n_components=7, random_state=0)),
    ("NMF", NMF(n_components=2, init='random', random_state=0)),
    ("NMF", NMF(n_components=3, init='random', random_state=0)),
    ("NMF", NMF(n_components=4, init='random', random_state=0)),
    ("NMF", NMF(n_components=5, init='random', random_state=0)),
    ("NMF", NMF(n_components=6, init='random', random_state=0)),
    ("NMF", NMF(n_components=7, init='random', random_state=0)),
    ("FastICA", FastICA(n_components=2, random_state=0, whiten='unit-variance')),
    ("FastICA", FastICA(n_components=3, random_state=0, whiten='unit-variance')),
    ("FastICA", FastICA(n_components=4, random_state=0, whiten='unit-variance')),
    ("FastICA", FastICA(n_components=5, random_state=0, whiten='unit-variance')),
    ("FastICA", FastICA(n_components=6, random_state=0, whiten='unit-variance')),
    ("FastICA", FastICA(n_components=7, random_state=0, whiten='unit-variance')),
    ("MVCTR", MultiViewCTR(n_components=2, random_state=0)),
    ("MVCTR", MultiViewCTR(n_components=3, random_state=0)),
    ("MVCTR", MultiViewCTR(n_components=4, random_state=0)),
    ("MVCTR", MultiViewCTR(n_components=5, random_state=0)),
    ("MVCTR", MultiViewCTR(n_components=6, random_state=0)),
    ("MVCTR", MultiViewCTR(n_components=7, random_state=0)),
    ("PCA", PCA(n_components=2, random_state=0)),
    ("PCA", PCA(n_components=3, random_state=0)),
    ("PCA", PCA(n_components=4, random_state=0)),
    ("PCA", PCA(n_components=5, random_state=0)),
    ("PCA", PCA(n_components=6, random_state=0)),
    ("PCA", PCA(n_components=7, random_state=0))
]

# run experiment
results = run_experiment(models)
# show results
results

Unnamed: 0,name,model,avg_oos_rating,n_recs,tte
0,LDA,"LatentDirichletAllocation(n_components=2, rand...",0.842424,165,10.243471
1,LDA,"LatentDirichletAllocation(n_components=3, rand...",0.821429,196,11.386729
2,LDA,"LatentDirichletAllocation(n_components=4, rand...",0.822785,158,9.831526
3,LDA,"LatentDirichletAllocation(n_components=5, rand...",0.813559,177,9.1266
4,LDA,"LatentDirichletAllocation(n_components=6, rand...",0.788889,180,8.896375
5,LDA,"LatentDirichletAllocation(n_components=7, rand...",0.847458,177,8.157028
6,NMF,"NMF(init='random', n_components=2, random_stat...",0.86,200,0.184322
7,NMF,"NMF(init='random', n_components=3, random_stat...",0.84878,205,0.23293
8,NMF,"NMF(init='random', n_components=4, random_stat...",0.873786,206,0.453158
9,NMF,"NMF(init='random', n_components=5, random_stat...",0.841584,202,0.510306


Clean up results table:

In [52]:
results_ = results.copy()

# round numerical figures
results_['avg_oos_rating'] = results_['avg_oos_rating'].round(3)
results_['tte'] = results_['tte'].round(3)
results_['K'] = results_['model'].apply(lambda x: x.split('n_components=')[1][0])

# get best result for each model
best_result = results_.groupby('name').avg_oos_rating.transform(lambda x: x == x.max())

results_ = results_.loc[
    best_result, ['name', 'K', 'avg_oos_rating', 'n_recs', 'tte', ]
].rename(columns={
    'name': 'Model',
    'avg_oos_rating': 'Avg OOS Rating',
    'n_recs': 'Hit Count',
    'tte': 'TTE (s)'
})

results_

Unnamed: 0,Model,K,Avg OOS Rating,Hit Count,TTE (s)
5,LDA,7,0.847,177,8.157
8,NMF,4,0.874,206,0.453
15,FastICA,5,0.8,325,0.537
23,MVCTR,7,0.865,223,1.786
25,PCA,3,0.756,176,0.115


In [53]:
results_.to_csv('../data/results/replayer-experiment.csv', index=False)

Check for a stastical difference between NMF and MVCTR:

In [32]:
from scipy.stats import binomtest

# MVCTR vs NMF
binomtest(k=round(0.865*223), n=223, p=0.874, alternative='less')

BinomTestResult(k=193, n=223, alternative='less', statistic=0.8654708520179372, pvalue=0.3798283222305279)

In [28]:
# MVCTR vs LDA
binomtest(k=round(0.865*223), n=223, p=0.847, alternative='greater')

BinomTestResult(k=193, n=223, alternative='greater', statistic=0.8654708520179372, pvalue=0.2542258555700575)

In [27]:
# MVCTR vs FastICA
binomtest(k=round(0.865*223), n=223, p=0.800, alternative='greater')

BinomTestResult(k=193, n=223, alternative='greater', statistic=0.8654708520179372, pvalue=0.007095828130263902)

In [30]:
# MVCTR vs PCA
binomtest(k=round(0.865*223), n=223, p=0.756, alternative='greater')

BinomTestResult(k=193, n=223, alternative='greater', statistic=0.8654708520179372, pvalue=3.8517867521267934e-05)

In [33]:
0.865 - 0.847

0.018000000000000016

In [10]:
# model = MultiViewCTR(n_components=1, alpha=1, beta=1)

# play = replayer(model)
# # fit data on train
# play.fit(X_train, y_train)
# # test out-of-sample
# ratings = play.test(X_test, y_test)

# print(np.mean(ratings))