In [1]:
import pandas as pd
import scipy.sparse as sparse
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import train_test_split, precision_at_k

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
path = '../data/raw/ml-latest-small/'

ratings = pd.read_csv(f'{path}ratings.csv')
movies = pd.read_csv(f'{path}movies.csv')

In [5]:
data = ratings[['userId', 'movieId', 'rating']]

user_ids = data['userId'].unique()
movie_ids = data['movieId'].unique()

# 데이터 매핑
user_to_idx = {v: k for k, v in enumerate(user_ids)}
movie_to_idx = {v: k for k, v in enumerate(movie_ids)}

# 디코더용
idx_to_user = {k: v for k, v in enumerate(user_ids)}
idx_to_movie = {k: v for k, v in enumerate(movie_ids)}

data['user_id_idx'] = data['userId'].map(user_to_idx)
data['movie_id_idx'] = data['movieId'].map(movie_to_idx)

# CSR Matrix 생성
sparse_item_user = sparse.csr_matrix(
    (data['rating'].astype(float), (data['user_id_idx'], data['movie_id_idx']))
)

train_csr, test_csr = train_test_split(sparse_item_user.tocoo(), train_percentage=0.8, random_state=42)

print(f"Matrix shape: {sparse_item_user.shape}")
print(f"1. data값:  {sparse_item_user.data[:10]}")
print(f"2. indices값: {sparse_item_user.indices[:10]}")
print(f"3. indptr값: {sparse_item_user.indptr[:10]})")

Matrix shape: (610, 9724)
1. data값:  [4. 4. 4. 5. 5. 3. 5. 4. 5. 5.]
2. indices값: [0 1 2 3 4 5 6 7 8 9]
3. indptr값: [   0  232  261  300  516  560  874 1026 1073 1119])


In [6]:
import mlflow
import pickle
from itertools import product
import os
import tempfile

In [None]:
# 하이퍼파라미터 후보
factors_list = [50, 100]
regularization_list = [0.01, 0.1]
iteration_list = [20, 30]
alpha_list = [20, 40]

# 모든 하이퍼파라미터 조합 생성
params_combinations = list(product(factors_list, regularization_list, iteration_list, alpha_list))
mlflow.set_experiment("MovieLens_Baseline_MF_Grid_Search")

# 그리드 서치 실행
for factors, regularization, iteration, alpha in params_combinations:
    run_name = f"factors_{factors}_reg_{regularization}_iter_{iteration}_alpha_{alpha}"
    with mlflow.start_run(run_name=run_name):
        # 하이퍼파라미터 로깅
        params = {
            "factors": factors,
            "regularization": regularization,
            "iteration": iteration,
            "alpha": alpha
        }
        mlflow.log_params(params)

        # ALS 모델 설정
        model = AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            iterations=iteration,
            random_state=42
        )

        train_conf = (train_csr * alpha).astype('double')

        # 모델 학습
        model.fit(train_conf)

        # Precision@10 평가
        p_at_10 = precision_at_k(model, train_csr, test_csr, K=10, show_progress=False)

        mlflow.log_metric("precision_at_10", p_at_10)

        # 모델 저장
        with tempfile.TemporaryDirectory() as tmp_dir:
            model_path = os.path.join(tmp_dir, "model.pkl")
            with open(model_path, "wb") as f:
                pickle.dump(model, f)

            mlflow.log_artifact(model_path, artifact_path="model")
        
        print(f"Run: {run_name} - Precision@10: {p_at_10}") 
        
print("MovieLens Baseline MF Grid Search Completed.")

2026/01/07 23:00:42 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/07 23:00:42 INFO mlflow.store.db.utils: Updating database tables
2026/01/07 23:00:42 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/07 23:00:42 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/07 23:00:42 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2026/01/07 23:00:42 INFO alembic.runtime.migration: Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2026/01/07 23:00:42 INFO alembic.runtime.migration: Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
2026/01/07 23:00:42 INFO alembic.runtime.migration: Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
2026/01/07 23:00:42 INFO alembic.runtime.migration: Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
2026/01/07 23:00:42 INFO alembic.runtime.migration: Running 

Run: factors_50_reg_0.01_iter_20_alpha_20 - Precision@10: 0.20273655810368085


100%|██████████| 20/20 [00:01<00:00, 13.80it/s]


Run: factors_50_reg_0.01_iter_20_alpha_40 - Precision@10: 0.1740219695509732


100%|██████████| 30/30 [00:02<00:00, 13.92it/s]


Run: factors_50_reg_0.01_iter_30_alpha_20 - Precision@10: 0.20871073424551936


100%|██████████| 30/30 [00:02<00:00, 13.91it/s]


Run: factors_50_reg_0.01_iter_30_alpha_40 - Precision@10: 0.18115243784929658


100%|██████████| 20/20 [00:01<00:00, 13.83it/s]


Run: factors_50_reg_0.1_iter_20_alpha_20 - Precision@10: 0.19811138947774137


100%|██████████| 20/20 [00:01<00:00, 13.88it/s]


Run: factors_50_reg_0.1_iter_20_alpha_40 - Precision@10: 0.17845442281749854


100%|██████████| 30/30 [00:02<00:00, 13.80it/s]


Run: factors_50_reg_0.1_iter_30_alpha_20 - Precision@10: 0.20678358065137792


100%|██████████| 30/30 [00:02<00:00, 14.08it/s]


Run: factors_50_reg_0.1_iter_30_alpha_40 - Precision@10: 0.1850067450375795


100%|██████████| 20/20 [00:02<00:00,  7.88it/s]


Run: factors_100_reg_0.01_iter_20_alpha_20 - Precision@10: 0.2343418770476007


100%|██████████| 20/20 [00:02<00:00,  7.80it/s]


Run: factors_100_reg_0.01_iter_20_alpha_40 - Precision@10: 0.21641934862208517


100%|██████████| 30/30 [00:03<00:00,  7.84it/s]


Run: factors_100_reg_0.01_iter_30_alpha_20 - Precision@10: 0.23703989207939874


100%|██████████| 30/30 [00:03<00:00,  7.86it/s]


Run: factors_100_reg_0.01_iter_30_alpha_40 - Precision@10: 0.2220080940450954


100%|██████████| 20/20 [00:02<00:00,  7.85it/s]


Run: factors_100_reg_0.1_iter_20_alpha_20 - Precision@10: 0.24012333783002504


100%|██████████| 20/20 [00:02<00:00,  7.89it/s]


Run: factors_100_reg_0.1_iter_20_alpha_40 - Precision@10: 0.22894584698400464


100%|██████████| 30/30 [00:03<00:00,  7.88it/s]


Run: factors_100_reg_0.1_iter_30_alpha_20 - Precision@10: 0.24012333783002504


100%|██████████| 30/30 [00:03<00:00,  7.82it/s]

Run: factors_100_reg_0.1_iter_30_alpha_40 - Precision@10: 0.23299286953170167
MovieLens Baseline MF Grid Search Completed.





In [7]:
best_params = {
    'factors': 100, 
    'regularization': 0.1, 
    'iterations': 30, 
    'alpha': 40
}

best_model = AlternatingLeastSquares(
    factors=best_params['factors'],
    regularization=best_params['regularization'],
    iterations=best_params['iterations'],
    random_state=42
)

all_data_conf = (sparse_item_user * best_params['alpha']).astype('double')
best_model.fit(all_data_conf)

with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print("Best Model save completed: best_model.pkl")

  check_blas_config()
100%|██████████| 30/30 [00:04<00:00,  6.90it/s]

Best Model save completed: best_model.pkl





In [None]:
user_id = 1 # 테스트할 사용자 ID
user_idx = user_to_idx[user_id] # 사용자 인덱스
N = 5  # 추천할 영화 수

if user_idx is None:
    print(f"User ID {user_id} not found in the dataset.")
else:
    print(f"User {user_id}가 좋아하는 영화 (Input).") # 사용자가 평가한 영화 중 상위 N개 
    user_history = data[data['userId'] == user_id]

    for _, row in user_history.sort_values(by='rating', ascending=False).head(N).iterrows():
        title, genres = movies[movies['movieId'] == row['movieId']][['title', 'genres']].values[0]
        print(f"영화 제목: {title}, 장르: {genres} (평점: {row['rating']})")
    print()
    print("모델의 추천영화 (Top 5).")

    recommandations = best_model.recommend(user_idx, sparse_item_user[user_idx], N=N)
    for item_idx, score in zip(*recommandations):
        original_movie_id = idx_to_movie[item_idx]
        title, genres = movies[movies['movieId'] == original_movie_id][['title', 'genres']].values[0]
        print(f"영화 제목: {title}, 장르: {genres} (추천 점수: {score})")

""" 
    과연 잘 추천해 줬을까?

    User 1가 좋아하는 영화 (Input).
    영화 제목: M*A*S*H (a.k.a. MASH) (1970), 장르: Comedy|Drama|War (평점: 5.0)
    영화 제목: Excalibur (1981), 장르: Adventure|Fantasy (평점: 5.0)
    영화 제목: Indiana Jones and the Last Crusade (1989), 장르: Action|Adventure (평점: 5.0)
    영화 제목: Pink Floyd: The Wall (1982), 장르: Drama|Musical (평점: 5.0)
    영화 제목: From Russia with Love (1963), 장르: Action|Adventure|Thriller (평점: 5.0)

    모델의 추천영화 (Top 5).
    영화 제목: Die Hard 2 (1990), 장르: Action|Adventure|Thriller (추천 점수: 1.380651593208313)
    영화 제목: Backdraft (1991), 장르: Action|Drama (추천 점수: 1.2297831773757935)
    영화 제목: Top Gun (1986), 장르: Action|Romance (추천 점수: 1.2203583717346191)
    영화 제목: Robin Hood: Prince of Thieves (1991), 장르: Adventure|Drama (추천 점수: 1.1712291240692139)
    영화 제목: There's Something About Mary (1998), 장르: Comedy|Romance (추천 점수: 1.1405789852142334)
"""

User 1가 좋아하는 영화 (Input).
영화 제목: M*A*S*H (a.k.a. MASH) (1970), 장르: Comedy|Drama|War (평점: 5.0)
영화 제목: Excalibur (1981), 장르: Adventure|Fantasy (평점: 5.0)
영화 제목: Indiana Jones and the Last Crusade (1989), 장르: Action|Adventure (평점: 5.0)
영화 제목: Pink Floyd: The Wall (1982), 장르: Drama|Musical (평점: 5.0)
영화 제목: From Russia with Love (1963), 장르: Action|Adventure|Thriller (평점: 5.0)

모델의 추천영화 (Top 5).
영화 제목: Die Hard 2 (1990), 장르: Action|Adventure|Thriller (추천 점수: 1.380651593208313)
영화 제목: Backdraft (1991), 장르: Action|Drama (추천 점수: 1.2297831773757935)
영화 제목: Top Gun (1986), 장르: Action|Romance (추천 점수: 1.2203583717346191)
영화 제목: Robin Hood: Prince of Thieves (1991), 장르: Adventure|Drama (추천 점수: 1.1712291240692139)
영화 제목: There's Something About Mary (1998), 장르: Comedy|Romance (추천 점수: 1.1405789852142334)
