In [1]:
import sys
import cornac
import tensorflow as tf

# Global run configuration shared by every cell below
SEED = 20240516
VERBOSE = True

# Record the runtime environment so results can be traced to exact versions
print(
    f"System version: {sys.version}",
    f"Cornac version: {cornac.__version__}",
    f"Tensorflow version: {tf.__version__}",
    sep="\n",
)

System version: 3.11.9 | packaged by Anaconda, Inc. | (main, Apr 19 2024, 16:40:41) [MSC v.1916 64 bit (AMD64)]
Cornac version: 2.1
Tensorflow version: 2.16.1


In [2]:
import cornac
import pandas as pd
from cornac.eval_methods import RatioSplit
from cornac.models import BPR
from cornac.metrics import Precision, Recall

# Evaluation settings. They are defined *before* the split and passed to it
# explicitly; previously they were assigned after the split and never used.
# NOTE(review): both values are believed to equal RatioSplit's documented
# defaults, so passing them should not change the split - confirm against the
# installed Cornac version.
rating_threshold = 1.0
exclude_unknowns = True

# Load the two interaction files.
df = pd.read_csv("./cs608_ip_train_v3.csv")
df1 = pd.read_csv("./cs608_ip_probe_v3.csv")

# Convert each row to a plain tuple. `itertuples` keeps every column's own
# dtype, whereas `DataFrame.values` coerces all columns to one common dtype
# (so integer IDs would turn into floats if the rating column is float).
data_train = list(df.itertuples(index=False, name=None))
data_val = list(df1.itertuples(index=False, name=None))

# Pool both files; the split below re-partitions the pooled interactions.
data = data_train + data_val

# Split the data into training, validation and testing sets (80/10/10)
rs = RatioSplit(
    data=data,
    test_size=0.1,
    val_size=0.1,
    rating_threshold=rating_threshold,
    exclude_unknowns=exclude_unknowns,
    seed=SEED,
    verbose=VERBOSE,
)

# Instantiate Recall@50 for evaluation
rec50 = Recall(k=50)

# Instantiate a matrix factorization model (e.g., BPR)
bpr = BPR(
    k=1000, max_iter=3000, learning_rate=0.05, lambda_reg=0.01, seed=SEED, verbose=VERBOSE
)
---
Training data:
Number of users = 21124
Number of items = 29781
Number of ratings = 225055
Max rating = 5.0
Min rating = 1.0
Global mean = 4.3
---
Test data:
Number of users = 21124
Number of items = 29781
Number of ratings = 27263
Number of unknown users = 0
Number of unknown items = 0
---
Validation data:
Number of users = 21124
Number of items = 29781
Number of ratings = 27295
---
Total users = 21124
Total items = 29781


In [4]:
from cornac.hyperopt import Discrete, Continuous
from cornac.hyperopt import GridSearch, RandomSearch

# Hyper-parameter grid for BPR: 3 x 3 x 1 x 3 = 27 combinations
bpr_search_space = [
    Discrete(name="k", values=[1000, 2000, 3000]),
    Discrete(name="learning_rate", values=[0.01, 0.05, 0.1]),
    Discrete(name="lambda_reg", values=[0.01]),
    Discrete(name="max_iter", values=[3000, 4000, 5000]),
]

# Exhaustive search over the grid, scored with rec50 on the rs split
gs_bpr = GridSearch(model=bpr, space=bpr_search_space, metric=rec50, eval_method=rs)

In [5]:
%%time
# Wrap the grid search in a Cornac experiment and execute it
experiment = cornac.Experiment(
    eval_method=rs,
    models=[gs_bpr],
    metrics=[rec50],
    user_based=False,
)
experiment.run()

# Report the hyper-parameter combination the search selected
print(gs_bpr.best_params)


[GridSearch_BPR] Training started!
Evaluating: {'k': 1000, 'lambda_reg': 0.01, 'learning_rate': 0.01, 'max_iter': 3000}


  0%|          | 0/3000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 1000, 'lambda_reg': 0.01, 'learning_rate': 0.01, 'max_iter': 4000}


  0%|          | 0/4000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 1000, 'lambda_reg': 0.01, 'learning_rate': 0.01, 'max_iter': 5000}


  0%|          | 0/5000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 1000, 'lambda_reg': 0.01, 'learning_rate': 0.05, 'max_iter': 3000}


  0%|          | 0/3000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 1000, 'lambda_reg': 0.01, 'learning_rate': 0.05, 'max_iter': 4000}


  0%|          | 0/4000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 1000, 'lambda_reg': 0.01, 'learning_rate': 0.05, 'max_iter': 5000}


  0%|          | 0/5000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 1000, 'lambda_reg': 0.01, 'learning_rate': 0.1, 'max_iter': 3000}


  0%|          | 0/3000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 1000, 'lambda_reg': 0.01, 'learning_rate': 0.1, 'max_iter': 4000}


  0%|          | 0/4000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 1000, 'lambda_reg': 0.01, 'learning_rate': 0.1, 'max_iter': 5000}


  0%|          | 0/5000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 2000, 'lambda_reg': 0.01, 'learning_rate': 0.01, 'max_iter': 3000}


  0%|          | 0/3000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 2000, 'lambda_reg': 0.01, 'learning_rate': 0.01, 'max_iter': 4000}


  0%|          | 0/4000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 2000, 'lambda_reg': 0.01, 'learning_rate': 0.01, 'max_iter': 5000}


  0%|          | 0/5000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 2000, 'lambda_reg': 0.01, 'learning_rate': 0.05, 'max_iter': 3000}


  0%|          | 0/3000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 2000, 'lambda_reg': 0.01, 'learning_rate': 0.05, 'max_iter': 4000}


  0%|          | 0/4000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 2000, 'lambda_reg': 0.01, 'learning_rate': 0.05, 'max_iter': 5000}


  0%|          | 0/5000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 2000, 'lambda_reg': 0.01, 'learning_rate': 0.1, 'max_iter': 3000}


  0%|          | 0/3000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 2000, 'lambda_reg': 0.01, 'learning_rate': 0.1, 'max_iter': 4000}


  0%|          | 0/4000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 2000, 'lambda_reg': 0.01, 'learning_rate': 0.1, 'max_iter': 5000}


  0%|          | 0/5000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 3000, 'lambda_reg': 0.01, 'learning_rate': 0.01, 'max_iter': 3000}


  0%|          | 0/3000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 3000, 'lambda_reg': 0.01, 'learning_rate': 0.01, 'max_iter': 4000}


  0%|          | 0/4000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 3000, 'lambda_reg': 0.01, 'learning_rate': 0.01, 'max_iter': 5000}


  0%|          | 0/5000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 3000, 'lambda_reg': 0.01, 'learning_rate': 0.05, 'max_iter': 3000}


  0%|          | 0/3000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 3000, 'lambda_reg': 0.01, 'learning_rate': 0.05, 'max_iter': 4000}


  0%|          | 0/4000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 3000, 'lambda_reg': 0.01, 'learning_rate': 0.05, 'max_iter': 5000}


  0%|          | 0/5000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 3000, 'lambda_reg': 0.01, 'learning_rate': 0.1, 'max_iter': 3000}


  0%|          | 0/3000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 3000, 'lambda_reg': 0.01, 'learning_rate': 0.1, 'max_iter': 4000}


  0%|          | 0/4000 [00:00<?, ?it/s]

Optimization finished!
Evaluating: {'k': 3000, 'lambda_reg': 0.01, 'learning_rate': 0.1, 'max_iter': 5000}


  0%|          | 0/5000 [00:00<?, ?it/s]

Optimization finished!
Best parameter settings: {'k': 2000, 'lambda_reg': 0.01, 'learning_rate': 0.05, 'max_iter': 4000}
Recall@50 = 0.1558

[GridSearch_BPR] Evaluation started!


Ranking:   0%|          | 0/14835 [00:00<?, ?it/s]

Ranking:   0%|          | 0/14900 [00:00<?, ?it/s]


VALIDATION:
...
               | Recall@50 | Time (s)
-------------- + --------- + --------
GridSearch_BPR |    0.1558 |  80.3390

TEST:
...
               | Recall@50 |   Train (s) | Test (s)
-------------- + --------- + ----------- + --------
GridSearch_BPR |    0.1550 | 112321.3334 |  80.3642

{'k': 2000, 'lambda_reg': 0.01, 'learning_rate': 0.05, 'max_iter': 4000}
CPU times: total: 1d 4h 57min 26s
Wall time: 1d 7h 14min 42s


In [6]:
# Persist the model selected by the grid search; the call's return value
# (the saved file path) is left as the cell output.
best_bpr = gs_bpr.best_model
best_bpr.save("bpr_best_model")

BPR model is saved to bpr_best_model\BPR\2024-05-19_16-09-58-232150.pkl


'bpr_best_model\\BPR\\2024-05-19_16-09-58-232150.pkl'

In [7]:
%%time
from tqdm.notebook import tqdm

import zipfile

def generate_submission(
    bpr,
    k=50,
    train_path="./cs608_ip_train_v3.csv",
    txt_path="submission.txt",
    zip_path="submission.zip",
):
    """Write top-k recommendations for every user in the training CSV and zip them.

    One line is written per user, in ascending ``user_id`` order; each line is
    the space-separated list returned by ``bpr.recommend`` for that user.

    Parameters
    ----------
    bpr : object
        Model exposing ``.name`` and ``.recommend(user_id=..., k=...)``.
    k : int, default 50
        Number of recommendations requested per user.
    train_path : str
        CSV file containing a ``user_id`` column that defines the users.
    txt_path : str
        Destination of the plain-text submission file.
    zip_path : str
        Destination of the zip archive that wraps ``txt_path``.

    Returns
    -------
    None
    """
    import os  # local import: only needed for the archive member name

    print(f"Generating recommendations using {bpr.name}...")

    # Only the user_id column is needed, so read and sort just that column.
    # sort_values() followed by unique() yields the distinct IDs in ascending order.
    user_ids = pd.read_csv(train_path, usecols=["user_id"])["user_id"].sort_values().unique()

    # Generate everything before opening the output file, so a failure while
    # recommending cannot leave a truncated submission on disk.
    top_k_recommendations = [
        bpr.recommend(user_id=user_id, k=k) for user_id in tqdm(user_ids)
    ]

    with open(txt_path, "w") as txt_file:
        for user_recommendations in top_k_recommendations:
            txt_file.write(" ".join(map(str, user_recommendations)) + "\n")

    # Zip the submission file. ZipFile stores members uncompressed by default,
    # so request DEFLATE explicitly; arcname keeps the member at the archive root.
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.write(txt_path, arcname=os.path.basename(txt_path))

CPU times: total: 0 ns
Wall time: 0 ns


In [8]:
# Build the submission files from the model selected by the grid search
generate_submission(bpr=gs_bpr.best_model)

Generating recommendations using BPR...


  0%|          | 0/21124 [00:00<?, ?it/s]