# Imports

In [1]:
import pickle
import numpy as np
from tqdm import tqdm
from IPython.display import display
from mae_score import evaluate, eval_stats
from data_utils import create_validationset, create_ensembleset

import ubcf # user-based collaborative filtering
import ibcf # item-based collaborative filtering
import gcn # link prediction with LightGCN

# Validation

## Creating Validation Set

In [3]:
# Ratio handed to create_validationset; presumably the share of the data kept
# for training -- TODO confirm against data_utils.
VALIDATION_RATIO = 0.8
create_validationset(ratio=VALIDATION_RATIO)

## Cosine Similarity

### User-Based Cosine Similarity

In [4]:
# User-based cosine similarity with VALIDATION=True, evaluated at k=33
# (the best k reported by the recorded sweep further down).
ubcf_cosine = ubcf.cosine_similarity(VALIDATION=True)
ubcf_cosine.predict(k=33)
validation_mae = ubcf_cosine.evaluate()
print(validation_mae)

0.8841309823677582


#### Hyperparameter Tuning w/ Validation

In [3]:
# Sweep k over 1..n for user-based cosine similarity, storing the score
# returned by evaluate() for k at position k - 1.
n = 40
scores = np.zeros(n)
for k in tqdm(range(1, n + 1)):
    # A fresh model is built for every k, exactly as a single run would do.
    model = ubcf.cosine_similarity(VALIDATION=True)
    model.predict(k=k)
    scores[k - 1] = model.evaluate()

# argmin() returns the first minimum, so ties resolve to the smallest k.
print('')
print('MAE scores=\n', scores)
print(f'\n\nBest MAE Score: {scores.min()} k={scores.argmin() + 1}')

100%|██████████| 40/40 [00:19<00:00,  2.10it/s]


MAE scores=
 [1.13026268 1.05397625 0.9611371  0.95106153 0.92515293 0.91615689
 0.91327816 0.90716085 0.90464196 0.913638   0.9006837  0.8963656
 0.89096797 0.89744512 0.89564592 0.89708528 0.89780497 0.89564592
 0.88808924 0.89096797 0.88880892 0.89024829 0.88629003 0.88664987
 0.88988845 0.88844908 0.88916877 0.88988845 0.88880892 0.88664987
 0.88449082 0.88629003 0.88413098 0.88449082 0.88593019 0.88521051
 0.88557035 0.88557035 0.88521051 0.88413098]


Best MAE Score: 0.8841309823677582 k=33





### Item-Based Cosine Similarity

In [5]:
# Item-based cosine similarity with VALIDATION=True, evaluated at k=5
# (the best k reported by the recorded sweep further down).
ibcf_cosine = ibcf.cosine_similarity(VALIDATION=True)
ibcf_cosine.predict(k=5)
validation_mae = ibcf_cosine.evaluate()
print(validation_mae)

0.9435048578625405


#### Hyperparameter Tuning w/ Validation

In [3]:
# Sweep k over 1..n for item-based cosine similarity, storing the score
# returned by evaluate() for k at position k - 1.
n = 40
scores = np.zeros(n)
for k in tqdm(range(1, n + 1)):
    model = ibcf.cosine_similarity(VALIDATION=True)
    model.predict(k=k)
    scores[k - 1] = model.evaluate()

# argmin() returns the first minimum; the recorded scores are flat from k=5
# onwards, so the reported k is the smallest of the tied values.
print('')
print('MAE scores=\n', scores)
print(f'\n\nBest MAE Score: {scores.min()} k={scores.argmin() + 1}')

100%|██████████| 40/40 [00:12<00:00,  3.15it/s]


MAE scores=
 [1.12234617 1.00503778 0.96293631 0.94566391 0.94350486 0.94350486
 0.94350486 0.94350486 0.94350486 0.94350486 0.94350486 0.94350486
 0.94350486 0.94350486 0.94350486 0.94350486 0.94350486 0.94350486
 0.94350486 0.94350486 0.94350486 0.94350486 0.94350486 0.94350486
 0.94350486 0.94350486 0.94350486 0.94350486 0.94350486 0.94350486
 0.94350486 0.94350486 0.94350486 0.94350486 0.94350486 0.94350486
 0.94350486 0.94350486 0.94350486 0.94350486]


Best MAE Score: 0.9435048578625405 k=5





## Pearson Correlation

### User-Based Pearson Correlation

In [6]:
# User-based Pearson correlation with VALIDATION=True, evaluated at k=34.
ubcf_pearson = ubcf.pearson_correlation(VALIDATION=True)
ubcf_pearson.predict(k=34)

# Kept as the final bare expression so the notebook echoes the score.
ubcf_pearson.evaluate()

0.8373515653112631

#### Hyperparameter Tuning w/ Validation

In [None]:
# Sweep k over 1..n for user-based Pearson correlation, storing the score
# returned by evaluate() for k at position k - 1.
n = 40
scores = np.zeros(n)
for k in tqdm(range(1, n + 1)):
    # A fresh model is built for every k, exactly as a single run would do.
    model = ubcf.pearson_correlation(VALIDATION=True)
    model.predict(k=k)
    scores[k - 1] = model.evaluate()

# Report in the same layout as the other sweep cells: a blank line to get
# clear of the progress bar, then the labelled score array.
# argmin() returns the first minimum, so ties resolve to the smallest k.
print('')
print('MAE scores=\n', scores)
print(f'\n\nBest MAE Score: {scores.min()} k={scores.argmin() + 1}')

### User-Based Pearson Correlation w/ Inverse User Frequency

In [7]:
# User-based Pearson correlation with IUF, VALIDATION=True, evaluated at k=33
# (the best k reported by the recorded sweep further down).
ubcf_pearson_iuf = ubcf.pearson_correlation_IUF(VALIDATION=True)
ubcf_pearson_iuf.predict(k=33)

# Kept as the final bare expression so the notebook echoes the score.
ubcf_pearson_iuf.evaluate()

0.8387909319899244

#### Hyperparameter Tuning w/ Validation

In [6]:
# Sweep k over 1..n for Pearson correlation with IUF, storing the score
# returned by evaluate() for k at position k - 1.
n = 40
scores = np.zeros(n)
for k in tqdm(range(1, n + 1)):
    model = ubcf.pearson_correlation_IUF(VALIDATION=True)
    model.predict(k=k)
    scores[k - 1] = model.evaluate()

# argmin() returns the first minimum; the recorded run ties at k=33 and k=34
# and therefore reports k=33.
print('')
print('MAE scores=\n', scores)
print(f'\n\nBest MAE Score: {scores.min()} k={scores.argmin() + 1}')

100%|██████████| 40/40 [01:09<00:00,  1.75s/it]


MAE scores=
 [1.08096438 0.98056855 0.94890248 0.92479309 0.90572148 0.89024829
 0.87765383 0.87333573 0.86901763 0.86505937 0.86721842 0.86505937
 0.85354444 0.86074127 0.85390428 0.84994602 0.85210507 0.85426412
 0.84922634 0.85210507 0.84994602 0.84850666 0.84598777 0.84418856
 0.84418856 0.84166967 0.84274919 0.84094998 0.8402303  0.84094998
 0.83951062 0.83987046 0.83879093 0.83879093 0.83915077 0.83987046
 0.83951062 0.84166967 0.84238935 0.8402303 ]


Best MAE Score: 0.8387909319899244 k=33





### User-Based Pearson Correlation w/ Case Modification

In [8]:
# User-based Pearson correlation with case modification, VALIDATION=True,
# evaluated at k=26 (the best k reported by the recorded sweep further down).
ubcf_pearson_casemod = ubcf.pearson_correlation_casemod(VALIDATION=True)
ubcf_pearson_casemod.predict(k=26)

# Kept as the final bare expression so the notebook echoes the score.
ubcf_pearson_casemod.evaluate()

0.8377114069809284

#### Hyperparameter Tuning w/ Validation

In [3]:
# Sweep k over 1..n for Pearson correlation with case modification, storing
# the score returned by evaluate() for k at position k - 1.
n = 40
scores = np.zeros(n)
for k in tqdm(range(1, n + 1)):
    model = ubcf.pearson_correlation_casemod(VALIDATION=True)
    model.predict(k=k)
    scores[k - 1] = model.evaluate()

# argmin() returns the first minimum, so ties resolve to the smallest k.
print('')
print('MAE scores=\n', scores)
print(f'\n\nBest MAE Score: {scores.min()} k={scores.argmin() + 1}')

100%|██████████| 40/40 [01:44<00:00,  2.62s/it]


MAE scores=
 [1.10255488 0.98812522 0.94602375 0.92911119 0.90392227 0.89168766
 0.87621447 0.87405542 0.86397985 0.86505937 0.86793811 0.86362001
 0.85462397 0.85642317 0.84778697 0.84742713 0.84958618 0.84742713
 0.84562792 0.84706729 0.84706729 0.84742713 0.84310903 0.84094998
 0.83951062 0.83771141 0.84059014 0.83807125 0.84059014 0.8402303
 0.83807125 0.83843109 0.83915077 0.83843109 0.83879093 0.83843109
 0.84202951 0.84166967 0.84130982 0.84059014]


Best MAE Score: 0.8377114069809284 k=26





## Custom Algorithms

### Ensemble

In [2]:
# Four-member ensemble with hand-picked weights. The k values and weights are
# passed positionally; presumably the i-th value belongs to the i-th member
# (the k values match the per-model validation cells) -- TODO confirm.
# NOTE(review): the recorded run of this cell ended in a ValueError about
# mismatched array sizes (160 vs 40). The cause is not visible from this
# notebook; confirm whether ibcf.cosine_similarity can be combined with the
# ubcf members inside ubcf.ensemble.
ensemble = ubcf.ensemble(
    ubcf.cosine_similarity,
    ubcf.pearson_correlation,
    ubcf.pearson_correlation_IUF,
    ibcf.cosine_similarity,
    VALIDATION=True,
)
ensemble.set_k(33, 34, 33, 5)

# NOTE(review): these weights sum to 0.91 rather than 1 -- confirm that 0.01
# is the intended weight for the item-based member.
ensemble.set_weights(0.2, 0.6, 0.1, 0.01)

print('w =', ensemble.weights)
ensemble.predict()

# Kept as the final bare expression so the notebook echoes the score.
ensemble.evaluate()

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 160 and the array at index 1 has size 40

In [None]:
# Eight-member ensemble: each user-based model appears twice, each time with a
# different k. Members and k values are listed side by side and then unpacked
# into the same positional calls as before.
# Pairing assumes set_k's i-th value applies to the i-th member -- TODO confirm.
members = [
    (ubcf.cosine_similarity, 33),
    (ubcf.cosine_similarity, 20),
    (ubcf.pearson_correlation, 34),
    (ubcf.pearson_correlation, 45),
    (ubcf.pearson_correlation_IUF, 33),
    (ubcf.pearson_correlation_IUF, 15),
    (ubcf.pearson_correlation_casemod, 26),
    (ubcf.pearson_correlation_casemod, 10),
]
member_models, member_ks = zip(*members)

ensemble = ubcf.ensemble(*member_models, VALIDATION=True)
ensemble.set_k(*member_ks)

# Print the weights on either side of train() so any change is visible.
print(ensemble.weights)
ensemble.train()
print(ensemble.weights)

ensemble.predict()

# Kept as the final bare expression so the notebook echoes the score.
ensemble.evaluate()

#### Hyperparameter Tuning w/ Validation

### User-Based Pearson Correlation w/ Link Prediction Punishment

In [6]:
# gcn.link_prediction with VALIDATION=True; predict() takes no k here.
link_model = gcn.link_prediction(VALIDATION=True)
link_model.predict()

# Kept as the final bare expression so the notebook echoes the score.
link_model.evaluate()

1.5444404462036705

[0.05824033 0.13008102 0.19028459 0.98177447 0.75652937 0.36334604
 0.70889785 0.21173074]
[ 0.32432359  0.09659688  0.64078264 -0.87686836  0.34938404  0.12123744
  0.12592728  0.19344294]


0.8344728319539403

# Testing

The predicted ratings are written to text files in `data/results/`:
<br>
- [results5](data/results/results5.txt)
- [results10](data/results/results10.txt)
- [results20](data/results/results20.txt)

#### User-based Cosine Similarity

In [2]:
# Run user-based cosine similarity once per test file (5, 10, 20) with k=33.
for file_num in (5, 10, 20):
    test_model = ubcf.cosine_similarity(file_num=file_num, VALIDATION=False)
    test_model.predict(k=33)

#### User-based Pearson Correlation

In [21]:
# Run user-based Pearson correlation once per test file. Unlike the sibling
# test cells, this one uses a different k for each file, so the
# (file_num, k) pairs are listed explicitly and iterated in order.
# NOTE(review): these k values differ from the k=34 used in the validation
# cell -- confirm where 45 and 41 come from.
PEARSON_K_BY_FILE = ((5, 45), (10, 41), (20, 33))

for file_num, k in PEARSON_K_BY_FILE:
    model = ubcf.pearson_correlation(file_num=file_num, VALIDATION=False)
    # Inside a loop the return value of predict() is not echoed by the
    # notebook; the recorded run of this cell showed no output either.
    model.predict(k=k)

#### User-based Pearson Correlation w/ IUF

In [None]:
# Run Pearson correlation with IUF once per test file (5, 10, 20) with k=33.
for file_num in (5, 10, 20):
    test_model = ubcf.pearson_correlation_IUF(file_num=file_num, VALIDATION=False)
    test_model.predict(k=33)

#### User-based Pearson Correlation w/ Case Modification

In [3]:
# Run Pearson correlation with case modification once per test file
# (5, 10, 20) with k=26.
for file_num in (5, 10, 20):
    test_model = ubcf.pearson_correlation_casemod(file_num=file_num, VALIDATION=False)
    test_model.predict(k=26)

#### Item-based Cosine Similarity

In [2]:
# Run item-based cosine similarity once per test file (5, 10, 20) with k=5.
for file_num in (5, 10, 20):
    test_model = ibcf.cosine_similarity(file_num=file_num, VALIDATION=False)
    test_model.predict(k=5)

In [7]:
# Three-member ensemble run once per test file (5, 10, 20).
# pearson_correlation_casemod (k=26) is deliberately left out of this run.
# NOTE(review): the three weights sum to ~0.971, not 1 -- confirm whether
# set_weights normalises them or whether they should be rescaled.
for file_num in (5, 10, 20):
    ensemble = ubcf.ensemble(
        ubcf.cosine_similarity,
        ubcf.pearson_correlation,
        ubcf.pearson_correlation_IUF,
        file_num=file_num,
        VALIDATION=False,
    )
    ensemble.set_k(33, 34, 33)
    ensemble.set_weights(0.40460079, 0.0729932, 0.49376735)
    ensemble.predict()

In [11]:
# Three-member ensemble for test file 20 with hand-set weights.
# Members, k values and weights are kept in parallel tuples and unpacked into
# the same positional calls as before.
member_models = (
    ubcf.cosine_similarity,
    ubcf.pearson_correlation,
    ubcf.pearson_correlation_IUF,
)
member_ks = (33, 34, 34)
member_weights = (0.2, 0.6, 0.2)

ensemble = ubcf.ensemble(*member_models, file_num=20, VALIDATION=False)
ensemble.set_k(*member_ks)
ensemble.set_weights(*member_weights)

# Kept as the final bare expression so the notebook echoes what predict() returns.
ensemble.predict()

array([[401,   2,   3],
       [401,   4,   3],
       [401,   8,   4],
       ...,
       [500, 811,   4],
       [500, 933,   2],
       [500, 958,   3]])