In [None]:
import pandas as pd
import scipy.sparse as sparse
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import train_test_split, precision_at_k

In [18]:
# Directory holding the raw MovieLens "ml-latest-small" CSV files, relative to
# the notebook's working directory. The trailing slash matters: file names are
# appended to this string directly.
path = '../data/raw/ml-latest-small/'

# Resolve both input files up front so the reads below are easy to scan.
ratings_csv = f'{path}ratings.csv'
movies_csv = f'{path}movies.csv'

# ratings: one row per rating event (userId / movieId / rating are used later).
ratings = pd.read_csv(ratings_csv)
# movies: the movie metadata table shipped alongside the ratings.
movies = pd.read_csv(movies_csv)

In [19]:
# Build the sparse interaction matrix used for training and evaluation.
#
# Steps:
#   1. Take the three columns we need from `ratings`.
#   2. Map raw userId / movieId values to dense 0-based indices.
#   3. Build a CSR matrix with one row per user and one column per movie.
#   4. Split it 80/20 into train and test matrices.

# Explicit copy: we add columns below, and assigning into a slice of `ratings`
# would otherwise raise pandas' SettingWithCopyWarning.
data = ratings[['userId', 'movieId', 'rating']].copy()

# Distinct raw ids, in order of first appearance.
user_ids = data['userId'].unique()
movie_ids = data['movieId'].unique()

# Encoders: raw id -> dense index
user_to_idx = {user_id: idx for idx, user_id in enumerate(user_ids)}
movie_to_idx = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}

# Decoders: dense index -> raw id
idx_to_user = dict(enumerate(user_ids))
idx_to_movie = dict(enumerate(movie_ids))

data['user_id_idx'] = data['userId'].map(user_to_idx)
data['movie_id_idx'] = data['movieId'].map(movie_to_idx)

# Create the CSR matrix.
# NOTE: despite the historical name `sparse_item_user`, rows are users and
# columns are movies (row index = user_id_idx, column index = movie_id_idx).
# The name is kept so that other cells referring to it keep working.
# The shape is passed explicitly so it always equals (n_users, n_movies)
# instead of being inferred from the largest index present.
sparse_item_user = sparse.csr_matrix(
    (
        data['rating'].astype(float).to_numpy(),
        (data['user_id_idx'].to_numpy(), data['movie_id_idx'].to_numpy()),
    ),
    shape=(len(user_ids), len(movie_ids)),
)

# 80% of the stored ratings go to train, the rest to test; fixed seed for reproducibility.
train_csr, test_csr = train_test_split(sparse_item_user.tocoo(), train_percentage=0.8, random_state=42)

# Quick look at the CSR internals (values, column indices, row pointers).
print(f"Matrix shape: {sparse_item_user.shape}")
print(f"1. data값:  {sparse_item_user.data[:10]}")
print(f"2. indices값: {sparse_item_user.indices[:10]}")
print(f"3. indptr값: {sparse_item_user.indptr[:10]}")

Matrix shape: (610, 9724)
1. data값:  [4. 4. 4. 5. 5. 3. 5. 4. 5. 5.]
2. indices값: [0 1 2 3 4 5 6 7 8 9]
3. indptr값: [   0  232  261  300  516  560  874 1026 1073 1119])


In [None]:
import mlflow
import pickle
from itertools import product
import os
import tempfile

# Grid search over ALS hyperparameters, one MLflow run per combination.
#
# For every (factors, regularization, iterations, alpha) combination this cell:
#   - logs the hyperparameters to MLflow,
#   - fits an AlternatingLeastSquares model on the alpha-scaled training matrix,
#   - logs Precision@10 computed against `test_csr`,
#   - pickles the fitted model and attaches it to the run as an artifact.
#
# NOTE(review): the metric is computed on the same held-out split that is used
# to compare configurations; consider a separate validation split if the best
# score is going to be reported as a final result.

# Search space
factors_list = [50, 100]
regularization_list = [0.01, 0.1]
iteration_list = [20, 30]
alpha_list = [20, 40]

# Generate every hyperparameter combination
params_combinations = list(product(factors_list, regularization_list, iteration_list, alpha_list))
mlflow.set_experiment("MovieLens_Baseline_MF_Grid_Search")

# The scaled training matrix depends only on alpha, so build it once per alpha
# value instead of once per combination.
train_conf_by_alpha = {
    alpha_value: (train_csr * alpha_value).astype('double')
    for alpha_value in alpha_list
}

# Run the grid search
for factors, regularization, iteration, alpha in params_combinations:
    run_name = f"factors_{factors}_reg_{regularization}_iter_{iteration}_alpha_{alpha}"
    with mlflow.start_run(run_name=run_name):
        # Log hyperparameters
        params = {
            "factors": factors,
            "regularization": regularization,
            "iteration": iteration,
            "alpha": alpha
        }
        mlflow.log_params(params)

        # Configure the ALS model (fixed seed so runs are comparable)
        model = AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            iterations=iteration,
            random_state=42
        )

        # Ratings multiplied by alpha, precomputed above
        train_conf = train_conf_by_alpha[alpha]

        # Train the model
        model.fit(train_conf)

        # Evaluate Precision@10 (unscaled train matrix and test matrix are passed)
        p_at_10 = precision_at_k(model, train_csr, test_csr, K=10, show_progress=False)

        mlflow.log_metric("precision_at_10", p_at_10)

        # Save the model: pickle to a temp dir, then upload while the file still exists
        with tempfile.TemporaryDirectory() as tmp_dir:
            model_path = os.path.join(tmp_dir, "model.pkl")
            with open(model_path, "wb") as f:
                pickle.dump(model, f)

            mlflow.log_artifact(model_path, artifact_path="model")

        print(f"Run: {run_name} - Precision@10: {p_at_10}")

print("MovieLens Baseline MF Grid Search Completed.")

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [00:01<00:00, 13.95it/s]


Run: factors_50_reg_0.01_iter_20_alpha_20 - Precision@10: 0.20273655810368085


100%|██████████| 20/20 [00:01<00:00, 14.19it/s]


Run: factors_50_reg_0.01_iter_20_alpha_40 - Precision@10: 0.1740219695509732


100%|██████████| 30/30 [00:02<00:00, 13.96it/s]


Run: factors_50_reg_0.01_iter_30_alpha_20 - Precision@10: 0.20871073424551936


100%|██████████| 30/30 [00:02<00:00, 14.01it/s]


Run: factors_50_reg_0.01_iter_30_alpha_40 - Precision@10: 0.18115243784929658


100%|██████████| 20/20 [00:01<00:00, 14.07it/s]


Run: factors_50_reg_0.1_iter_20_alpha_20 - Precision@10: 0.19811138947774137


100%|██████████| 20/20 [00:01<00:00, 13.92it/s]


Run: factors_50_reg_0.1_iter_20_alpha_40 - Precision@10: 0.17845442281749854


100%|██████████| 30/30 [00:02<00:00, 14.02it/s]


Run: factors_50_reg_0.1_iter_30_alpha_20 - Precision@10: 0.20678358065137792


100%|██████████| 30/30 [00:02<00:00, 13.85it/s]


Run: factors_50_reg_0.1_iter_30_alpha_40 - Precision@10: 0.1850067450375795


100%|██████████| 20/20 [00:02<00:00,  7.86it/s]


Run: factors_100_reg_0.01_iter_20_alpha_20 - Precision@10: 0.2343418770476007


100%|██████████| 20/20 [00:02<00:00,  7.87it/s]


Run: factors_100_reg_0.01_iter_20_alpha_40 - Precision@10: 0.21641934862208517


100%|██████████| 30/30 [00:03<00:00,  7.98it/s]


Run: factors_100_reg_0.01_iter_30_alpha_20 - Precision@10: 0.23703989207939874


100%|██████████| 30/30 [00:03<00:00,  7.79it/s]


Run: factors_100_reg_0.01_iter_30_alpha_40 - Precision@10: 0.2220080940450954


100%|██████████| 20/20 [00:02<00:00,  7.97it/s]


Run: factors_100_reg_0.1_iter_20_alpha_20 - Precision@10: 0.24012333783002504


100%|██████████| 20/20 [00:02<00:00,  7.98it/s]


Run: factors_100_reg_0.1_iter_20_alpha_40 - Precision@10: 0.22894584698400464


100%|██████████| 30/30 [00:03<00:00,  7.94it/s]


Run: factors_100_reg_0.1_iter_30_alpha_20 - Precision@10: 0.24012333783002504


100%|██████████| 30/30 [00:03<00:00,  7.92it/s]

Run: factors_100_reg_0.1_iter_30_alpha_40 - Precision@10: 0.23299286953170167
MovieLens Baseline MF Grid Search Completed.



