In [36]:
import pandas as pd
import numpy as np
import random
import implicit
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.utils import check_random_state
from scipy.sparse import csr_matrix

In [37]:
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)

In [38]:
df = pd.read_csv('../data/ratings.csv')

ALS implicit model

In [43]:
def precision_at_k(actual, predicted, k=10):
    predicted_k = predicted[:k]
    if len(predicted_k) == 0:
        return 0.0
    return len(set(predicted_k) & set(actual)) / k

def recall_at_k(actual, predicted, k=10):
    if len(actual) == 0:
        return 0.0
    predicted_k = predicted[:k]
    return len(set(predicted_k) & set(actual)) / len(actual)

def average_precision(actual, predicted, k=10):
    score = 0.0
    hits = 0
    for i, p in enumerate(predicted[:k]):
        if p in actual:
            hits += 1
            score += hits / (i + 1.0)
    if not actual:
        return 0.0
    return score / min(len(actual), k)

def ndcg_k(actual, predicted, k=10):
    import math

    def dcg(rel):
        return sum((1.0 / math.log2(i + 2)) for i, r in enumerate(rel) if r == 1)

    predicted_k = predicted[:k]
    rel = [1 if p in actual else 0 for p in predicted_k]
    idcg = dcg(sorted(rel, reverse=True))
    dcg_val = dcg(rel)
    return dcg_val / idcg if idcg > 0 else 0.0

def train_model(train_matrix, factors=50, regularization=0.01, iterations=15, random_state=42, alpha=40):
    matrix = train_matrix.tocsr().astype(np.float32)
    confidence_matrix = matrix.multiply(alpha).tocsr()

    model = implicit.als.AlternatingLeastSquares(
        factors=int(factors), 
        regularization=float(regularization),
        iterations=int(iterations),
        random_state=random_state
    )

    model.fit(confidence_matrix)
    return model

def create_compatible_matrix(original_matrix, target_rows, target_cols):
    new_matrix = csr_matrix((target_rows, target_cols), dtype=np.float32)
    rows_to_copy = min(original_matrix.shape[0], target_rows)
    cols_to_copy = min(original_matrix.shape[1], target_cols)
    new_matrix[:rows_to_copy, :cols_to_copy] = original_matrix[:rows_to_copy, :cols_to_copy]
    return new_matrix

def evaluate_model_topk(model, train_matrix, test_matrix, K=10):
    num_users_model = model.user_factors.shape[0]
    num_items_model = model.item_factors.shape[0]
    
    train_compatible = create_compatible_matrix(train_matrix, num_users_model, num_items_model)
    test_compatible = create_compatible_matrix(test_matrix, num_users_model, num_items_model)
    
    test_csr = test_compatible.tocsr()
    train_csr = train_compatible.tocsr()

    precisions = []
    recalls = []
    maps = []
    ndcgs = []

    for user in range(num_users_model):
        if user >= test_csr.shape[0]:
            continue
            
        actual_items = test_csr[user].indices.tolist()

        if len(actual_items) == 0:
            continue

        try:
            recommended, _ = model.recommend(
                user,
                train_csr,
                N=K,
                filter_already_liked_items=True
            )

            precisions.append(precision_at_k(actual_items, recommended, K))
            recalls.append(recall_at_k(actual_items, recommended, K))
            maps.append(average_precision(actual_items, recommended, K))
            ndcgs.append(ndcg_k(actual_items, recommended, K))
            
        except Exception as e:
            continue

    if not precisions:
        return {
            "precision@K": 0.0,
            "recall@K": 0.0,
            "map@K": 0.0,
            "ndcg@K": 0.0,
        }

    return {
        "precision@K": float(np.mean(precisions)),
        "recall@K": float(np.mean(recalls)),
        "map@K": float(np.mean(maps)),
        "ndcg@K": float(np.mean(ndcgs)),
    }

def model_grid_search(train_matrix, test_matrix, param_distributions, n_iter=10, random_state=42, K=10):
    best_score = float('inf')
    best_params = None
    best_model = None
    results = []

    rng = check_random_state(random_state)
    param_names = list(param_distributions.keys())

    for i in range(n_iter):
        params = {name: rng.choice(param_distributions[name]) for name in param_names}

        try:
            model = train_model(
                train_matrix,
                factors=int(params.get('factors', 50)),
                regularization=float(params.get('regularization', 0.01)), 
                iterations=int(params.get('iterations', 15)), 
                random_state=random_state
            )

            metrics = evaluate_model_topk(model, train_matrix, test_matrix, K)
            score = metrics["ndcg@K"]

            results.append({
                "params": params.copy(),
                "metrics": metrics,
                "iteration": i
            })

        except Exception as e:
            continue

        if score < best_score:
            best_score = score
            best_params = params.copy()
            best_model = model

    if results:
        results.sort(key=lambda x: x['metrics']['ndcg@K'], reverse=True)
    else:
        print("Не удалось выполнить ни одной итерации")

    return best_model, best_params, best_score, results

def recommend_for_user(model, train_matrix, user_id, user_to_idx, movie_to_idx, n=10, filter_liked=True):
    if user_id not in user_to_idx:
        raise ValueError(f"Пользователь {user_id} не найден в словаре")
    
    user_idx = user_to_idx[user_id]
    num_users_model = model.user_factors.shape[0]
    num_items_model = model.item_factors.shape[0]
    
    if user_idx >= num_users_model:
        raise ValueError(f"Индекс пользователя {user_idx} превышает размер модели")
    
    train_compatible = create_compatible_matrix(train_matrix, num_users_model, num_items_model)
    user_items = train_compatible.tocsr()

    if filter_liked:
        recommendations = model.recommend(
            user_idx,
            user_items,
            N=n,
            filter_already_liked_items=True
        )
    else:
        recommendations = model.recommend(user_idx, user_items, N=n, filter_already_liked_items=False)

    idx_to_movie = {idx: movie_id for movie_id, idx in movie_to_idx.items()}

    recommended_items = []
    for item_idx, score in zip(recommendations[0], recommendations[1]):
        if item_idx in idx_to_movie:
            movie_id = idx_to_movie[item_idx]
            recommended_items.append((movie_id, float(score)))

    return recommended_items

In [27]:
import sys

sys.path.append('..')
from src.data_preprocessing import train_test_split_by_user, create_rating_matrix

matrix, user_to_idx, movie_to_idx, user_ids, movie_ids= create_rating_matrix(df)

train_matrix, test_matrix = train_test_split_by_user(matrix)

In [45]:
param_distributions = {
    'factors': [50, 100, 150, 200], 
    'regularization': [0.001, 0.01, 0.1, 1.0],  
    'iterations': [20, 30, 50, 100], 
}
best_model, best_params, best_score, results = model_grid_search(train_matrix, test_matrix, param_distributions, n_iter=5)


100%|██████████| 20/20 [00:00<00:00, 27.94it/s]
100%|██████████| 100/100 [00:03<00:00, 27.22it/s]
100%|██████████| 50/50 [00:01<00:00, 31.95it/s]
100%|██████████| 50/50 [00:01<00:00, 29.61it/s]
100%|██████████| 100/100 [00:04<00:00, 23.44it/s]


In [42]:
print("ЛУЧШИЕ ПАРАМЕТРЫ:")
print(f"Параметры: {best_params}")
print(f"Лучший NDCG@K: {best_score:.4f}")

print("\nТОП-5 лучших комбинаций:")
for i, result in enumerate(results[:5]):
    print(f"{i+1}. Параметры: {result['params']}")
    print(f"   Precision@K: {result['metrics']['precision@K']:.4f}, Recall@K: {result['metrics']['recall@K']:.4f}")
    print(f"   MAP@K: {result['metrics']['map@K']:.4f}, NDCG@K: {result['metrics']['ndcg@K']:.4f}")

try:
    user_id = df['userId'].iloc[0]
    recommendations = recommend_for_user(best_model, train_matrix, user_id, user_to_idx, movie_to_idx, n=5, filter_liked=False)
    print(f"\nРекомендации для пользователя {user_id}:")
    for movie_id, score in recommendations:
        print(f"  Фильм {movie_id}: оценка {score:.4f}")
except Exception as e:
    print(f"\nОшибка рекомендаций: {e}")

ЛУЧШИЕ ПАРАМЕТРЫ:
Параметры: {'factors': np.int64(150), 'regularization': np.float64(1.0), 'iterations': np.int64(20)}
Лучший NDCG@K: 0.0000

ТОП-5 лучших комбинаций:
1. Параметры: {'factors': np.int64(150), 'regularization': np.float64(1.0), 'iterations': np.int64(20)}
   Precision@K: 0.0000, Recall@K: 0.0000
   MAP@K: 0.0000, NDCG@K: 0.0000
2. Параметры: {'factors': np.int64(150), 'regularization': np.float64(0.1), 'iterations': np.int64(100)}
   Precision@K: 0.0000, Recall@K: 0.0000
   MAP@K: 0.0000, NDCG@K: 0.0000
3. Параметры: {'factors': np.int64(50), 'regularization': np.float64(0.001), 'iterations': np.int64(50)}
   Precision@K: 0.0000, Recall@K: 0.0000
   MAP@K: 0.0000, NDCG@K: 0.0000
4. Параметры: {'factors': np.int64(100), 'regularization': np.float64(0.1), 'iterations': np.int64(50)}
   Precision@K: 0.0000, Recall@K: 0.0000
   MAP@K: 0.0000, NDCG@K: 0.0000
5. Параметры: {'factors': np.int64(150), 'regularization': np.float64(0.1), 'iterations': np.int64(100)}
   Precision@K