In [51]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from lenskit.algorithms import Recommender, als, item_knn as knn
from lenskit.topn import RecListAnalysis, ndcg, precision
from lenskit.metrics.predict import rmse
from lenskit import crossfold as xf, batch, topn, util
import numpy as np
from lenskit.algorithms.basic import Random
from collections import defaultdict
import pickle

Content-Boosted Collaborative Filtering:
This approach combines collaborative filtering with additional features derived from content-based methods.
For example, you can use TF-IDF or word embeddings of the review text as additional features when building a collaborative filtering model.

Factorization Machines (FM):
Factorization Machines can model interactions between features, making them suitable for combining collaborative and content-based features.
They can handle both sparse and dense input data.

Neural Collaborative Filtering (NCF):
NCF combines neural network architectures with collaborative filtering techniques.
The model can learn complex non-linear interactions between user-item pairs and can incorporate additional features such as text embeddings.

Deep Content-User Item Embedding Model:
This model combines deep learning architectures with content-based and collaborative features.
It uses neural networks to learn embeddings for users, items, and content features.

Autoencoders for Collaborative Filtering (AutoRec):
AutoRec is an autoencoder-based model designed for collaborative filtering.
You can extend it by incorporating content-based features into the input layer of the autoencoder.

LightFM:
LightFM is a hybrid recommendation model that incorporates both collaborative and content-based approaches.
It uses a latent factor model and can work with both implicit and explicit feedback.

Hybrid Matrix Factorization:
Combine matrix factorization techniques (such as Singular Value Decomposition or Alternating Least Squares) with content-based features.
Create separate latent factor matrices for users, items, and content features.

Ensemble Models:
Build separate models for collaborative filtering and content-based filtering and combine their predictions using an ensemble method.
Weighted averaging or stacking can be used to combine the predictions.

Hybrid with Feature Importance:
Train a model to predict user preferences using collaborative filtering and content-based features.
Use feature importance techniques to identify the most influential features for making recommendations.

Hybrid with Multi-Modal Embeddings:
If your dataset includes different types of information (e.g., text, images), you can use multi-modal embeddings to capture diverse features.

In [48]:
content_data = pd.read_json("mard/mard_metadata.json", lines=True)



In [2]:

reviews = pd.read_json("mard/mard_reviews.json" , lines=True)
# want to convert to form: user | item | rating | timestamp
reviews=reviews.drop(['helpful', 'reviewText', 'summary','reviewTime', 'reviewerName'],axis=1)
reviews=reviews[["reviewerID", 'amazon-id', "overall" ,"unixReviewTime"]]


new_column_names = {'reviewerID': 'user', 'amazon-id': 'item', 'overall': 'rating', "unixReviewTime": "timestamp"}
reviews.rename(columns=new_column_names, inplace=True)
reviews.head()

Unnamed: 0,user,item,rating,timestamp
0,A1OFY4ATO7D13W,26197898,5,1355702400
1,A2KH83L1F70QR8,26197898,5,1358121600
2,A1KGXC7IRLVJR3,615205399,5,1214438400
3,A1BT6LQ9NY6RO3,615205399,5,1214352000
4,A206OKO2FE2IPL,615205399,5,1229212800


In [4]:
def evaluate(aname, algo, train, test):
    '''
    Fit and generate test recommendations for algo object with name aname
    '''
    print(f"evaluating {aname}")
    fittable = util.clone(algo)
    fittable = Recommender.adapt(fittable)
    fittable.fit(train)
    print(f"fitted {aname}")
    users = test.user.unique()
    recs = batch.recommend(algo=fittable, users=users, n=50)
    recs['Algorithm'] = aname
    return recs

In [None]:
# code to partition the dataset into 5 train-test folds
print("Partitioning Data into 5 Folds...")
parts = xf.partition_users(reviews[['user', 'item', 'rating']], 5, xf.SampleFrac(0.2))

for i, (train, test) in enumerate(parts):
    print(i)
    train.to_parquet(f"mard_train-{i}.parquet")
    test.to_parquet(f"mard_test-{i}.parquet")
    


In [5]:
algo_als = als.BiasedMF(50)

all_recs = []
test_data = []

for i in range(5):
    print(i)
    trainset = pd.read_parquet(f"mard_train-{i}.parquet")
    testset = pd.read_parquet(f"mard_test-{i}.parquet")
    print("data loaded")
    
    test_data.append(testset)
    all_recs.append(evaluate('biasedmf', algo_als, trainset, testset))
    
    

0
data loaded
evaluating biasedmf


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
Numba is using threading layer omp - consider TBB
BLAS using multiple threads - can cause oversubscription
found 2 potential runtime problems - see https://boi.st/lkpy-perf


fitted biasedmf


2023-12-08 07:26:05.578537: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 07:26:05.586786: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 07:26:05.595875: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 07:26:05.600562: I tensorflow/core/platform/cpu_featu

1
data loaded
evaluating biasedmf
fitted biasedmf


2023-12-08 07:27:36.186942: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 07:27:36.186942: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 07:27:36.186944: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 07:27:36.186942: I tensorflow/core/platform/cpu_featu

2
data loaded
evaluating biasedmf
fitted biasedmf


2023-12-08 07:29:00.153883: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 07:29:00.159912: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 07:29:00.208896: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 07:29:00.487337: I tensorflow/core/platform/cpu_featu

3
data loaded
evaluating biasedmf
fitted biasedmf


2023-12-08 07:30:16.607548: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 07:30:16.626893: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 07:30:16.642233: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 07:30:16.949074: I tensorflow/core/platform/cpu_featu

4
data loaded
evaluating biasedmf
fitted biasedmf


2023-12-08 07:31:29.503125: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 07:31:29.503135: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 07:31:29.518858: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-08 07:31:29.536028: I tensorflow/core/platform/cpu_featu

In [14]:
test_data[0].head()

Unnamed: 0,user,item,rating
92018,A02039013W06XH9FVVFUZ,B000059O8H,5
114450,A10COVV4IM4POT,B00007M8R2,5
115409,A10I0WYTDVFWN7,B00008BRD2,5
52884,A10I0WYTDVFWN7,B000008UDN,5
243118,A10K94WC2ENHJF,B00925T99W,2


In [79]:
'''
preds:
item     score      user  rank  Algorithm

truth:
user        item  rating

trainset:
user        item  rating
'''


# same implementation as HW2
def get_entropy_table(reviews):
    
    entropy_table = defaultdict(float)
    
    items = reviews['item'].unique()
    num_items = len(items)
    
    review_counts = reviews['item'].value_counts()
    
    n = len(reviews)
    
    
    for i, item in enumerate(items):
        if i % 10 == 0:
            print(f"Calculating entropy of item {i} of {num_items}", end="\r")
            
        if item in entropy_table:
            continue

        times_rated = review_counts[item]
        
        frequency = (times_rated + 1) / n
        entropy = -1 * (frequency * np.log2(frequency))

        entropy_table[item] = entropy
    return entropy_table
    

def novelty(preds, truth, k, entropy_table):
    

    #convert df into dictionairy to improve performance
    return np.mean([entropy_table[item] for item in preds['item']])

    
    
    

In [83]:
def diversity(preds, truth, k):
    pass

In [62]:
et = get_entropy_table(reviews)
with open(f'entropy_table.pkl', 'wb') as file:
    pickle.dump(et, file)



Calculating entropy of item 0 of 64637Calculating entropy of item 10 of 64637Calculating entropy of item 20 of 64637Calculating entropy of item 30 of 64637Calculating entropy of item 40 of 64637Calculating entropy of item 50 of 64637Calculating entropy of item 60 of 64637Calculating entropy of item 70 of 64637Calculating entropy of item 80 of 64637Calculating entropy of item 90 of 64637Calculating entropy of item 100 of 64637Calculating entropy of item 110 of 64637Calculating entropy of item 120 of 64637Calculating entropy of item 130 of 64637Calculating entropy of item 140 of 64637Calculating entropy of item 150 of 64637Calculating entropy of item 160 of 64637Calculating entropy of item 170 of 64637Calculating entropy of item 180 of 64637Calculating entropy of item 190 of 64637Calculating entropy of item 200 of 64637Calculating entropy of item 210 of 64637Calculating entropy of item 220 of 64637Calculating entropy of item 230 of 64637Calculating entropy of item

Calculating entropy of item 15140 of 64637Calculating entropy of item 15150 of 64637Calculating entropy of item 15160 of 64637Calculating entropy of item 15170 of 64637Calculating entropy of item 15180 of 64637Calculating entropy of item 15190 of 64637Calculating entropy of item 15200 of 64637Calculating entropy of item 15210 of 64637Calculating entropy of item 15220 of 64637Calculating entropy of item 15230 of 64637Calculating entropy of item 15240 of 64637Calculating entropy of item 15250 of 64637Calculating entropy of item 15260 of 64637Calculating entropy of item 15270 of 64637Calculating entropy of item 15280 of 64637Calculating entropy of item 15290 of 64637Calculating entropy of item 15300 of 64637Calculating entropy of item 15310 of 64637Calculating entropy of item 15320 of 64637Calculating entropy of item 15330 of 64637Calculating entropy of item 15340 of 64637Calculating entropy of item 15350 of 64637Calculating entropy of item 15360 of 64637Calculating

Calculating entropy of item 26770 of 64637Calculating entropy of item 26780 of 64637Calculating entropy of item 26790 of 64637Calculating entropy of item 26800 of 64637Calculating entropy of item 26810 of 64637Calculating entropy of item 26820 of 64637Calculating entropy of item 26830 of 64637Calculating entropy of item 26840 of 64637Calculating entropy of item 26850 of 64637Calculating entropy of item 26860 of 64637Calculating entropy of item 26870 of 64637Calculating entropy of item 26880 of 64637Calculating entropy of item 26890 of 64637Calculating entropy of item 26900 of 64637Calculating entropy of item 26910 of 64637Calculating entropy of item 26920 of 64637Calculating entropy of item 26930 of 64637Calculating entropy of item 26940 of 64637Calculating entropy of item 26950 of 64637Calculating entropy of item 26960 of 64637Calculating entropy of item 26970 of 64637Calculating entropy of item 26980 of 64637Calculating entropy of item 26990 of 64637Calculating

Calculating entropy of item 40120 of 64637Calculating entropy of item 40130 of 64637Calculating entropy of item 40140 of 64637Calculating entropy of item 40150 of 64637Calculating entropy of item 40160 of 64637Calculating entropy of item 40170 of 64637Calculating entropy of item 40180 of 64637Calculating entropy of item 40190 of 64637Calculating entropy of item 40200 of 64637Calculating entropy of item 40210 of 64637Calculating entropy of item 40220 of 64637Calculating entropy of item 40230 of 64637Calculating entropy of item 40240 of 64637Calculating entropy of item 40250 of 64637Calculating entropy of item 40260 of 64637Calculating entropy of item 40270 of 64637Calculating entropy of item 40280 of 64637Calculating entropy of item 40290 of 64637Calculating entropy of item 40300 of 64637Calculating entropy of item 40310 of 64637Calculating entropy of item 40320 of 64637Calculating entropy of item 40330 of 64637Calculating entropy of item 40340 of 64637Calculating

Calculating entropy of item 55040 of 64637Calculating entropy of item 55050 of 64637Calculating entropy of item 55060 of 64637Calculating entropy of item 55070 of 64637Calculating entropy of item 55080 of 64637Calculating entropy of item 55090 of 64637Calculating entropy of item 55100 of 64637Calculating entropy of item 55110 of 64637Calculating entropy of item 55120 of 64637Calculating entropy of item 55130 of 64637Calculating entropy of item 55140 of 64637Calculating entropy of item 55150 of 64637Calculating entropy of item 55160 of 64637Calculating entropy of item 55170 of 64637Calculating entropy of item 55180 of 64637Calculating entropy of item 55190 of 64637Calculating entropy of item 55200 of 64637Calculating entropy of item 55210 of 64637Calculating entropy of item 55220 of 64637Calculating entropy of item 55230 of 64637Calculating entropy of item 55240 of 64637Calculating entropy of item 55250 of 64637Calculating entropy of item 55260 of 64637Calculating

In [69]:
def load_entropy_table():
    with open('entropy_table.pkl', 'rb') as f:
        return pickle.load(f)


In [80]:
entropy_table = load_entropy_table()

rla = RecListAnalysis()
rla.add_metric(ndcg, name="ndcg_50", k=50)
rla.add_metric(precision, name="precision_50",k=50)
rla.add_metric(novelty, name="novelty_50", k=50, entropy_table=entropy_table)



In [82]:
for i, (preds, truth) in enumerate(zip(all_recs, test_data)):
    preds = preds.reset_index(drop=True)
    truth = truth.reset_index(drop=True)
    
    
    results = rla.compute(preds, truth)
    print("preds, truth")
    print(preds)
    print(truth)
    
    print('---------')
    print(results)
    print(np.mean(results['ndcg_50']))
    print(np.mean(results['precision_50']))
    print(np.mean(results['novelty_50']))
    
    
    
    
    
    
    
    


truth index not unique: may have duplicate items
                      rating
LKTruthID item              
0         B000059O8H       5
1         B00007M8R2       5
2         B00008BRD2       5
          B000008UDN       5
3         B00925T99W       2
...                      ...
1975      B0000039Q3       5
1976      B0000025SB       5
1977      B00005NF46       4
          B000006CE7       5
1978      B0000C1YZI       3

[3130 rows x 1 columns]
truth index not unique: may have duplicate items
                      rating
LKTruthID item              
0         B00BIVN82M       5
1         B000002UAR       5
2         B00005M98K       4
3         B00003IQII       5
4         B00000FDK4       4
...                      ...
1968      B00008FHPH       5
1969      B000RT3QX2       5
1970      B000002UB6       5
1971      B000GELO9Q       5
1972      B0026P3G12       5

[3419 rows x 1 columns]


preds, truth
             item     score                   user  rank Algorithm
0      B008DCOVP2  5.056139  A02039013W06XH9FVVFUZ     1  biasedmf
1      B000G1SZN2  5.051163  A02039013W06XH9FVVFUZ     2  biasedmf
2      B00A6V2EMA  5.041079  A02039013W06XH9FVVFUZ     3  biasedmf
3      B007G9NE24  5.040811  A02039013W06XH9FVVFUZ     4  biasedmf
4      B00003GO0I  5.036407  A02039013W06XH9FVVFUZ     5  biasedmf
...           ...       ...                    ...   ...       ...
98945  B00000JB6H  4.768870          AZZWPNME0GQZ2    46  biasedmf
98946  B0094GW90U  4.768700          AZZWPNME0GQZ2    47  biasedmf
98947  B000002UB2  4.762956          AZZWPNME0GQZ2    48  biasedmf
98948  B000S6BMXU  4.762735          AZZWPNME0GQZ2    49  biasedmf
98949  B000056KWT  4.759977          AZZWPNME0GQZ2    50  biasedmf

[98950 rows x 5 columns]
                       user        item  rating
0     A02039013W06XH9FVVFUZ  B000059O8H       5
1            A10COVV4IM4POT  B00007M8R2       5
2            

truth index not unique: may have duplicate items
                      rating
LKTruthID item              
0         B000068UPU       5
1         B0006H2OTW       5
2         B00003Q5B1       2
3         B008UTV6DI       5
4         B0017QKVEQ       5
...                      ...
2004      B001BRA9SE       5
          B000KF2BS4       5
          B000YOLONO       5
2005      B000PFU9OM       5
2006      B00005TSRT       5

[3152 rows x 1 columns]


preds, truth
             item     score                   user  rank Algorithm
0      B00005ARDS  5.069901  A010397922UKJ9QFDYFIE     1  biasedmf
1      B00A6V2EMA  5.069174  A010397922UKJ9QFDYFIE     2  biasedmf
2      B000G1SZN2  5.053271  A010397922UKJ9QFDYFIE     3  biasedmf
3      B00003GO0I  5.038407  A010397922UKJ9QFDYFIE     4  biasedmf
4      B009XY1W5Q  5.035312  A010397922UKJ9QFDYFIE     5  biasedmf
...           ...       ...                    ...   ...       ...
98645  B00005O54G  4.981943          AZZ69W8AU7WTR    46  biasedmf
98646  B002CVQ7W0  4.981404          AZZ69W8AU7WTR    47  biasedmf
98647  B008FHAHSU  4.981007          AZZ69W8AU7WTR    48  biasedmf
98648  B000066I4I  4.980310          AZZ69W8AU7WTR    49  biasedmf
98649  B000A1OES8  4.979891          AZZ69W8AU7WTR    50  biasedmf

[98650 rows x 5 columns]
                       user        item  rating
0     A010397922UKJ9QFDYFIE  B00BIVN82M       5
1      A075187974975WXZYAJ0  B000002UAR       5
2            

truth index not unique: may have duplicate items
                      rating
LKTruthID item              
0         B004ZBIJE4       5
1         B000000SMI       5
2         B0009NDKUK       3
          B0000021FS       5
          B000E0VNXO       5
...                      ...
2020      B000QETLUU       5
2021      B00025YHTI       5
          B00006IKII       4
          B00004XT2D       5
2022      B000NX5GAI       5

[3225 rows x 1 columns]


preds, truth
              item     score            user  rank Algorithm
0       B000066I4I  5.137300  A104OKT9VCVMJ2     1  biasedmf
1       B00A6V2EMA  5.107660  A104OKT9VCVMJ2     2  biasedmf
2       B00000IWMO  5.100294  A104OKT9VCVMJ2     3  biasedmf
3       B0000262UV  5.098750  A104OKT9VCVMJ2     4  biasedmf
4       B000G1SZN2  5.098415  A104OKT9VCVMJ2     5  biasedmf
...            ...       ...             ...   ...       ...
100345  B0000799IF  5.048237   AZZ88P43O2E5L    46  biasedmf
100346  B000008KBD  5.048000   AZZ88P43O2E5L    47  biasedmf
100347  B0001Z4PVO  5.047058   AZZ88P43O2E5L    48  biasedmf
100348  B004562NLO  5.047015   AZZ88P43O2E5L    49  biasedmf
100349  B0000024T8  5.046205   AZZ88P43O2E5L    50  biasedmf

[100350 rows x 5 columns]
                user        item  rating
0     A104OKT9VCVMJ2  B000068UPU       5
1     A108H56BR3LPDT  B0006H2OTW       5
2      A109WIE49N0GV  B00003Q5B1       2
3     A10FAESLP6QCAM  B008UTV6DI       5
4     A10JL1ZHMP50UJ  B

truth index not unique: may have duplicate items
                      rating
LKTruthID item              
0         B002CZQ67M       4
1         B0000039Q3       5
2         B00FUABIGU       5
3         B0018QT94I       5
4         B001BL970G       5
...                      ...
2040      B00005HN2O       5
2041      B000068CZD       5
2042      B000003CSJ       3
2043      B001UXJIC2       4
          B000XS3WOU       3

[3175 rows x 1 columns]


preds, truth
              item     score                   user  rank Algorithm
0       B000002WB0  5.154309  A10127132IE1A73IN1HGO     1  biasedmf
1       B00EHQZUCG  5.140056  A10127132IE1A73IN1HGO     2  biasedmf
2       B000G1SZN2  5.132872  A10127132IE1A73IN1HGO     3  biasedmf
3       B00008NGHU  5.126511  A10127132IE1A73IN1HGO     4  biasedmf
4       B00003GO0I  5.119279  A10127132IE1A73IN1HGO     5  biasedmf
...            ...       ...                    ...   ...       ...
101145  B000001XSC  4.884279          AZT60HLHFL0V0    46  biasedmf
101146  B00DSAUMXO  4.883916          AZT60HLHFL0V0    47  biasedmf
101147  B00000IO60  4.882318          AZT60HLHFL0V0    48  biasedmf
101148  B00000AG67  4.881074          AZT60HLHFL0V0    49  biasedmf
101149  B0000AGWIJ  4.880675          AZT60HLHFL0V0    50  biasedmf

[101150 rows x 5 columns]
                       user        item  rating
0     A10127132IE1A73IN1HGO  B004ZBIJE4       5
1            A104KYSJJZLWB4  B000000SMI       5
