# Cornac

https://cornac.preferred.ai/

- Comparative Framwork for Multi-Modal Recommendation

- Focused on enabling convenient handling of side information (e.g., item description texts and images, social networks, etc.).

- Allows rapid experimentation and simple implementation of new models.

- Highly compatible with existing machine learning libraries such as TensorFlow and PyTorch.

- One of the frameworks recommended by ACM RecSys 2023 for evaluating and reproducing recommendation algorithms.

# Config

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np

# MS recommenders API 
from msr.cornac_utils import predict_ranking
from msr.python_splitters import python_stratified_split

# Cornac API 
import cornac
from cornac.eval_methods import BaseMethod, RatioSplit, StratifiedSplit, CrossValidation
from cornac.models import UserKNN, ItemKNN, MF, EASE, NeuMF, VAECF
from cornac.metrics import Precision, Recall, NDCG, AUC, MAP

In [2]:
# Data column definition
COL_USER = "userId"
COL_ITEM = "movieId"
COL_RATING = "rating"
COL_PREDICTION = "prediction"

# Top k items to recommend
TOP_K = 5

# Random seed, Verbose, etc.
SEED = 42
VERBOSE = True

# Data Preparation

In [3]:
data = pd.read_csv('./data/ratings.csv')

In [4]:
data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,0,4.0,964982703
1,1,2,4.0,964981247
2,1,5,4.0,964982224
3,1,43,5.0,964983815
4,1,46,5.0,964982931
...,...,...,...,...
100831,610,9416,4.0,1493848402
100832,610,9443,5.0,1493850091
100833,610,9444,5.0,1494273047
100834,610,9445,5.0,1493846352


In [5]:
train, test = python_stratified_split(
    data, 
    filter_by="user", 
    ratio=0.7,
    col_user=COL_USER,
    col_item=COL_ITEM,
    seed=SEED
)

print(
    f"ratings per train user:\t{train.groupby(COL_USER).size().mean()}", 
    f"ratings per test user:\t{test.groupby(COL_USER).size().mean()}",
    sep='\n'
)

ratings per train user:	115.70491803278688
ratings per test user:	49.6


# Modeling

In [6]:
# models['모델명'][0] => model 객체
# models['모델명'][1] => model 파라미터
models = {}

## User/Item K-Nearest-Neighbors (UserKNN/ItemKNN)

In [7]:
params = {
    'k': 20,
    # ['cosine', 'pearson']
    'similarity': 'cosine',
}

In [8]:
# U-to-U CF
model = UserKNN(**params, seed=SEED, verbose=VERBOSE)
models[model.name] = (model, params)

In [9]:
# I-to-I CF
model = ItemKNN(**params, seed=SEED, verbose=VERBOSE)
models[model.name] = (model, params)

## Matrix Factorization (MF)

In [10]:
params = {
    'k': 10,
    'max_iter': 25,
    'learning_rate': 0.01,
    'lambda_reg': 0.02,
    'use_bias': True,
    'early_stop': True,
}

In [11]:
model = MF(**params, seed=SEED, verbose=VERBOSE)
models[model.name] = (model, params)

## Embarrassingly Shallow Autoencoders for Sparse Data (EASE)

In [12]:
params = {
    'lamb': 500,
    'posB': True,
}

In [13]:
model = EASE(**params, seed=SEED, verbose=VERBOSE)
models[model.name] = (model, params)

## Neural Collaborative Filtering

In [14]:
params = {
    'num_factors': 8,
    'layers': [32, 16, 8],
    # ["tanh", "sigmoid", "relu", "leaky_relu"]
    'act_fn': 'tanh',
    'num_neg': 3,
    'lr': 0.001,
    'num_epochs': 10,
    'batch_size': 256,
    'backend': 'pytorch'
}

In [15]:
model = NeuMF(**params, seed=SEED, verbose=VERBOSE)
models[model.name] = (model, params)

## Variational Autoencoder for Collaborative Filtering (VAECF)

In [16]:
params = {
    'k': 20,
    'autoencoder_structure': [40],
    # ["tanh", "sigmoid", "relu", "leaky_relu"]
    'act_fn': "tanh",
    # ["bern", "mult", "gaus", "pois"]
    'likelihood': "mult",
    'n_epochs': 100,
    'batch_size': 100,
    'learning_rate': 0.005,
    'beta': 0.1,
}

In [17]:
model = VAECF(**params, seed=SEED, verbose=VERBOSE)
models[model.name] = (model, params)

# Experiment

In [18]:
# 평가방법 설정
eval_method = BaseMethod.from_splits(
    train_data=np.array(train), 
    test_data=np.array(test),
    # Unknown users and items will be ignored. 
    exclude_unknowns=True,
    verbose=True
)

# # Random split
# ratio_split = RatioSplit(
#     data=data,
#     test_size=0.2,
#     exclude_unknowns=True,
#     seed=SEED,
#     verbose=VERBOSE
# )

# # K-fold CV
# ratio_split = CrossValidation(
#     data=data,
#     n_folds=5,
#     exclude_unknowns=True,
#     seed=SEED,
#     verbose=VERBOSE
# )

rating_threshold = 1.0
exclude_unknowns = True
---
Training data:
Number of users = 610
Number of items = 8469
Number of ratings = 70580
Max rating = 5.0
Min rating = 0.5
Global mean = 3.5
---
Test data:
Number of users = 610
Number of items = 8469
Number of ratings = 28808
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 610
Total items = 8469


In [19]:
# 평가척도 설정
metrics = [Recall(k=TOP_K), NDCG(k=TOP_K)]

In [20]:
%%time

# 실험 수행
ex = cornac.Experiment(
    eval_method=eval_method,
    models=[model[0] for model in models.values()],
    metrics=metrics,
    save_dir=None
).run()


[UserKNN] Training started!


  0%|          | 0/610 [00:00<?, ?it/s]


[UserKNN] Evaluation started!


Ranking:   0%|          | 0/610 [00:00<?, ?it/s]


[ItemKNN] Training started!


  0%|          | 0/8469 [00:00<?, ?it/s]


[ItemKNN] Evaluation started!


Ranking:   0%|          | 0/610 [00:00<?, ?it/s]


[MF] Training started!


  0%|          | 0/25 [00:00<?, ?it/s]

Optimization finished!

[MF] Evaluation started!


Ranking:   0%|          | 0/610 [00:00<?, ?it/s]


[EASEᴿ] Training started!

[EASEᴿ] Evaluation started!


Ranking:   0%|          | 0/610 [00:00<?, ?it/s]


[NeuMF] Training started!


  0%|          | 0/10 [00:00<?, ?it/s]


[NeuMF] Evaluation started!


Ranking:   0%|          | 0/610 [00:00<?, ?it/s]


[VAECF] Training started!


  0%|          | 0/100 [00:00<?, ?it/s]


[VAECF] Evaluation started!


Ranking:   0%|          | 0/610 [00:00<?, ?it/s]


TEST:
...
        | NDCG@5 | Recall@5 | Train (s) | Test (s)
------- + ------ + -------- + --------- + --------
UserKNN | 0.0009 |   0.0000 |    0.0322 |   2.5112
ItemKNN | 0.0006 |   0.0000 |    1.2976 |  30.8166
MF      | 0.0332 |   0.0023 |    0.0493 |   0.2540
EASEᴿ   | 0.3669 |   0.0734 |    8.4738 |   0.4680
NeuMF   | 0.2544 |   0.0407 |   23.4055 |   0.7490
VAECF   | 0.2861 |   0.0569 |    5.8007 |   0.4000

CPU times: total: 3min 44s
Wall time: 1min 14s


# Prediction

In [21]:
# 전체 데이터 Cornac 데이터 형식으로 변환
full_data = cornac.data.Dataset.from_uir(
    data.itertuples(index=False),
    seed=SEED
    )

# 모델 선택
model = UserKNN

# 전체 데이터로 다시 학습
model = model(**models['UserKNN'][1], verbose=VERBOSE, seed=SEED)
model.fit(full_data)

  0%|          | 0/610 [00:00<?, ?it/s]

<cornac.models.knn.recom_knn.UserKNN at 0x227f4108250>

In [22]:
%%time

# All item에 대한 예측값 생성
all_pred = predict_ranking(
    model,
    data, 
    usercol=COL_USER,
    itemcol=COL_ITEM, 
    remove_seen=True
)

100%|██████████| 610/610 [00:03<00:00, 190.03it/s]


CPU times: total: 6.22 s
Wall time: 6.28 s


In [23]:
all_pred

Unnamed: 0,userId,movieId,prediction
1,1,1,4.131964
3,1,3,3.268977
4,1,4,3.973975
6,1,6,3.908557
7,1,7,3.734480
...,...,...,...
5931635,610,9719,3.983332
5931636,610,9720,3.483332
5931637,610,9721,3.483332
5931638,610,9722,3.483332


In [24]:
%%time

# Top-K item 생성
top_k = (
    all_pred
    .groupby(COL_USER)
    .apply(lambda x: x.nlargest(TOP_K, COL_PREDICTION))
    .reset_index(drop=True)
    .drop(COL_PREDICTION, axis=1)
    .sort_values(by=COL_USER)
)

# # submission 저장
# t = pd.Timestamp.now()
# fname = f"./submit/{model.name}_{t.month:02}{t.day:02}{t.hour:02}{t.minute:02}.csv"
# top_k.to_csv(fname, index=False)

CPU times: total: 453 ms
Wall time: 501 ms


In [25]:
top_k

Unnamed: 0,userId,movieId
0,1,6034
1,1,4039
2,1,4588
3,1,2661
4,1,9241
...,...,...
3048,610,2661
3045,610,6034
3046,610,4039
3047,610,4588
