# UserCF、ItemCF、MX-SVD

In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import random

def load_dataset(path="./datas/hetrec2011-lastfm-2k"):
    # Load user-artist interactions
    user_artists = pd.read_csv(f"{path}/user_artists.dat", sep='\t')

    # Load artists data
    artists = pd.read_csv(f"{path}/artists.dat", sep='\t')

    return user_artists, artists

# Preprocess data
def preprocess_data(user_artists):
    # Create user-item matrix
    user_item_matrix_df = user_artists.pivot(index='userID', columns='artistID', values='weight').fillna(0)
    return user_item_matrix_df

# userCF核心原理

In [24]:
def user_based_cf(user_item_matrix_df, user_id, n_users=10, n_recommendations=10):
    # Calculate user similarity
    user_similarity = cosine_similarity(user_item_matrix_df)
    user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix_df.index, columns=user_item_matrix_df.index)

    # Find similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:n_users+1].index

    # Get recommendations
    recommendations = defaultdict(float)

    # 对于当前没听过的艺术家，以相似用户喜欢的艺术家频次为准,
    # recommendations[item] = 累加（某user和其相似用户的相似度 * weight）
    current_user_mean_weight = user_item_matrix_df.loc[user_id].mean()

    for similar_user in similar_users:
        similarity_between_user = user_similarity_df.loc[user_id, similar_user]
        similar_user_mean_weight = user_item_matrix_df.loc[similar_user].mean()

        for item in user_item_matrix_df.columns:
            if user_item_matrix_df.loc[user_id, item] == 0 and user_item_matrix_df.loc[similar_user, item] > 0:
                recommendations[item] += similarity_between_user * (user_item_matrix_df.loc[similar_user, item] - similar_user_mean_weight)

    recommendations = {item: score + current_user_mean_weight for item, score in recommendations.items()}
    # Sort recommendations
    recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]
    return recommendations

# itemCf核心原理

In [25]:
def item_based_cf(user_item_matrix_df, user_id, n_items=10, n_recommendations=10):
    # 1. Calculate item similarity between items
    item_similarity = cosine_similarity(user_item_matrix_df.T)
    item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix_df.columns, columns=user_item_matrix_df.columns)

    # 2. Get items the user has interacted with and weights > 0
    user_related_items = user_item_matrix_df.loc[user_id]
    related_items_id = user_related_items[user_related_items > 0].index.tolist()

    # Get recommendations
    recommendations = defaultdict(float)

    for item_id in related_items_id:
        item_weight = user_item_matrix_df.loc[user_id, item_id]

        # 2.1 Find n top items similar to current item, according to item_similarity_df
        similar_items = item_similarity_df[item_id].sort_values(ascending=False)[1:n_items+1]

        for similar_item, similarity_between_item in similar_items.items():
            # 2.2 only find the similar item is not in related_items_id, add it to recommendations
            if similar_item in related_items_id:
                continue
            # the below similar_item's weight must be 0
            recommendations[similar_item] += similarity_between_item * item_weight

    # Sort recommendations
    recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]
    return recommendations

In [31]:
def is_sparse_matrix(df, threshold=0.1):
    """
    判断一个DataFrame是否是稀疏矩阵

    参数:
    df (pd.DataFrame): 需要判断的DataFrame
    threshold (float): 稀疏矩阵的阈值，非零元素比例低于该值则认为是稀疏矩阵

    返回:
    bool: 如果是稀疏矩阵返回True，否则返回False
    """
    total_elements = df.size
    non_zero_elements = df.astype(bool).sum().sum()
    sparsity_ratio = non_zero_elements / total_elements
    return sparsity_ratio

# main运行

In [32]:
def main():
    # Load and preprocess data
    user_artists, artists = load_dataset()
    user_item_matrix_df = preprocess_data(user_artists)
    print("是否是稀疏矩阵:", is_sparse_matrix(user_item_matrix_df))
    # Example: Get recommendations for a specific user
    user_id = user_item_matrix_df.index[0]  # First user in the dataset
    # user_id = int(input("input the user id:"))

    print(f"User-based recommendations for user {user_id}:")
    user_recommendations = user_based_cf(user_item_matrix_df, user_id)
    for item_id, score in user_recommendations:
        artist_name = artists[artists['id'] == item_id]['name'].values[0] if item_id in artists['id'].values else "Unknown"
        print(f"Artist ID: {item_id}, Score: {score:.2f}, Name: {artist_name}")

    print(f"\nItem-based recommendations for user {user_id}:")
    item_recommendations = item_based_cf(user_item_matrix_df, user_id)
    for item_id, score in item_recommendations:
        artist_name = artists[artists['id'] == item_id]['name'].values[0] if item_id in artists['id'].values else "Unknown"
        print(f"Artist ID: {item_id}, Score: {score:.2f}, Name: {artist_name}")

main()

是否是稀疏矩阵: 0.002782815119924182
User-based recommendations for user 2:
Artist ID: 511, Score: 11031.56, Name: U2
Artist ID: 159, Score: 9437.26, Name: The Cure
Artist ID: 1001, Score: 8636.79, Name: Pet Shop Boys
Artist ID: 2562, Score: 6256.60, Name: Arcadia
Artist ID: 1014, Score: 5248.06, Name: Erasure
Artist ID: 993, Score: 5025.80, Name: Simple Minds
Artist ID: 187, Score: 4850.40, Name: a-ha
Artist ID: 4313, Score: 4472.78, Name: Nephew
Artist ID: 227, Score: 3799.39, Name: The Beatles
Artist ID: 6776, Score: 3573.63, Name: Book of Love

Item-based recommendations for user 2:
Artist ID: 2556, Score: 14102.14, Name: The Power Station
Artist ID: 8995, Score: 13904.49, Name: Andy Taylor
Artist ID: 1076, Score: 13796.74, Name: Wham!
Artist ID: 2562, Score: 13181.35, Name: Arcadia
Artist ID: 13161, Score: 12606.11, Name: Private
Artist ID: 4313, Score: 12600.67, Name: Nephew
Artist ID: 6350, Score: 12455.23, Name: TV-2
Artist ID: 996, Score: 11801.53, Name: Mike & The Mechanics
Artist I

# Long program

In [53]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import random
from tqdm import tqdm

# Load the dataset
def load_dataset(path="./datas/hetrec2011-lastfm-2k"):
    # Load user-artist interactions
    user_artists = pd.read_csv(f"{path}/user_artists.dat", sep='\t')

    # Load artists data
    artists = pd.read_csv(f"{path}/artists.dat", sep='\t')

    return user_artists, artists

# Preprocess data
def preprocess_data(user_artists):
    # Create user-item matrix
    user_item_matrix_df = user_artists.pivot(index='userID', columns='artistID', values='weight').fillna(0)
    return user_item_matrix_df

# User-based collaborative filtering
# 根据与user_id的相似用户，推荐n个item（artist）给它
def user_based_cf(user_item_matrix_df, user_id, n_users=10, n_recommendations=10):
    # Calculate user similarity
    user_similarity = cosine_similarity(user_item_matrix_df)
    user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix_df.index, columns=user_item_matrix_df.index)

    # Find similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:n_users+1].index

    # Get recommendations
    recommendations = defaultdict(float)

    # 对于当前没听过的艺术家，以相似用户喜欢的艺术家频次为准,
    # recommendations[item] = 累加（某user和其相似用户的相似度 * weight）
    for similar_user in similar_users:
        similarity_between_user = user_similarity_df.loc[user_id, similar_user]

        for item in user_item_matrix_df.columns:
            if user_item_matrix_df.loc[user_id, item] == 0 and user_item_matrix_df.loc[similar_user, item] > 0:
                recommendations[item] += similarity_between_user * user_item_matrix_df.loc[similar_user, item]

    # Sort recommendations
    recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]
    return recommendations

# Item-based collaborative filtering
def item_based_cf(user_item_matrix_df, user_id, n_items=10, n_recommendations=10):
    # 1. Calculate item similarity between items
    item_similarity = cosine_similarity(user_item_matrix_df.T)
    item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix_df.columns, columns=user_item_matrix_df.columns)

    # 2. Get items the user has interacted with and weights > 0
    user_related_items = user_item_matrix_df.loc[user_id]
    related_items_id = user_related_items[user_related_items > 0].index.tolist()

    # Get recommendations
    recommendations = defaultdict(float)

    for item_id in related_items_id:
        item_weight = user_item_matrix_df.loc[user_id, item_id]

        # 2.1 Find n top items similar to current item, according to item_similarity_df
        similar_items = item_similarity_df[item_id].sort_values(ascending=False)[1:n_items+1]

        for similar_item, similarity_between_item in similar_items.items():
            # 2.2 only find the similar item is not in related_items_id, add it to recommendations
            if similar_item in related_items_id:
                continue
            # the below similar_item's weight must be 0
            recommendations[similar_item] += similarity_between_item * item_weight

    # Sort recommendations
    recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]
    return recommendations

# Evaluate recommendations
def evaluate(user_item_matrix, test_ratio=0.1, n_users=10, n_items=10, n_recommendations=10):
    # Create a copy of the matrix to avoid modifying the original
    matrix = user_item_matrix.copy()

    # Metrics storage
    metrics = {
        'user_hr': [], 'user_ndcg': [], 'user_mrr': [],
        'item_hr': [], 'item_ndcg': [], 'item_mrr': []
    }

    # Add progress bar
    total_users = len(matrix.index)
    print(f"Evaluating recommendations for {total_users} users...")

    # For each user, hide some interactions as test data
    for user_id in tqdm(matrix.index, desc="Evaluating", ncols=80):
        # Get items this user has interacted with
        user_items = matrix.columns[matrix.loc[user_id] > 0].tolist()

        # Skip users with too few interactions
        if len(user_items) <= 2:
            continue

        # Randomly select items for testing
        n_test = max(1, int(len(user_items) * test_ratio))
        test_items = random.sample(user_items, n_test)

        # Create training matrix by setting test items to zero
        train_matrix = matrix.copy()
        for item in test_items:
            train_matrix.loc[user_id, item] = 0

        # Get recommendations from both methods
        user_recs = user_based_cf(train_matrix, user_id, n_users, n_recommendations)
        item_recs = item_based_cf(train_matrix, user_id, n_items, n_recommendations)

        # Extract just the item IDs
        user_rec_items = [item_id for item_id, _ in user_recs]
        item_rec_items = [item_id for item_id, _ in item_recs]

        # Calculate metrics for user-based CF
        metrics['user_hr'].append(hit_ratio(user_rec_items, test_items))
        metrics['user_ndcg'].append(ndcg(user_rec_items, test_items))
        metrics['user_mrr'].append(mrr(user_rec_items, test_items))

        # Calculate metrics for item-based CF
        metrics['item_hr'].append(hit_ratio(item_rec_items, test_items))
        metrics['item_ndcg'].append(ndcg(item_rec_items, test_items))
        metrics['item_mrr'].append(mrr(item_rec_items, test_items))
        # print(metrics)

    # Calculate average metrics
    avg_metrics = {k: np.mean(v) for k, v in metrics.items() if v}

    return avg_metrics

# Hit Ratio@K
def hit_ratio(recommended_items, test_items):
    hits = len(set(recommended_items) & set(test_items))
    return hits / len(test_items) if test_items else 0

# NDCG@K
def ndcg(recommended_items, test_items):
    dcg = 0
    idcg = 0

    # Calculate DCG
    for i, item in enumerate(recommended_items):
        if item in test_items:
            # Using binary relevance (1 if hit, 0 if miss)
            dcg += 1 / np.log2(i + 2)  # i+2 because i starts from 0

    # Calculate IDCG (ideal DCG - items are perfectly ranked)
    for i in range(min(len(test_items), len(recommended_items))):
        idcg += 1 / np.log2(i + 2)

    return dcg / idcg if idcg > 0 else 0

# MRR@K
def mrr(recommended_items, test_items):
    for i, item in enumerate(recommended_items):
        if item in test_items:
            return 1 / (i + 1)  # i+1 because i starts from 0
    return 0

# Main function
def main():
    # Load and preprocess data
    user_artists, artists = load_dataset()
    user_item_matrix_df = preprocess_data(user_artists)

    # Example: Get recommendations for a specific user
    user_id = user_item_matrix_df.index[0]  # First user in the dataset
    # user_id = int(input("input the user id:"))

    print(f"User-based recommendations for user {user_id}:")
    user_recommendations = user_based_cf(user_item_matrix_df, user_id)
    for item_id, score in user_recommendations:
        artist_name = artists[artists['id'] == item_id]['name'].values[0] if item_id in artists['id'].values else "Unknown"
        print(f"Artist ID: {item_id}, Score: {score:.2f}, Name: {artist_name}")

    print(f"\nItem-based recommendations for user {user_id}:")
    item_recommendations = item_based_cf(user_item_matrix_df, user_id)
    for item_id, score in item_recommendations:
        artist_name = artists[artists['id'] == item_id]['name'].values[0] if item_id in artists['id'].values else "Unknown"
        print(f"Artist ID: {item_id}, Score: {score:.2f}, Name: {artist_name}")

    # Evaluate the models
    metrics = evaluate(user_item_matrix_df)
    print(f"\nEvaluation results:")
    print(f"User-based CF - HR@K: {metrics['user_hr']:.4f}, NDCG@K: {metrics['user_ndcg']:.4f}, MRR@K: {metrics['user_mrr']:.4f}")
    print(f"Item-based CF - HR@K: {metrics['item_hr']:.4f}, NDCG@K: {metrics['item_ndcg']:.4f}, MRR@K: {metrics['item_mrr']:.4f}")

if __name__ == "__main__":
    main()

User-based recommendations for user 2:
Artist ID: 511, Score: 11033.60, Name: U2
Artist ID: 159, Score: 9444.82, Name: The Cure
Artist ID: 1001, Score: 8645.83, Name: Pet Shop Boys
Artist ID: 2562, Score: 6262.49, Name: Arcadia
Artist ID: 1014, Score: 5252.81, Name: Erasure
Artist ID: 993, Score: 5032.88, Name: Simple Minds
Artist ID: 187, Score: 4859.54, Name: a-ha
Artist ID: 4313, Score: 4470.64, Name: Nephew
Artist ID: 227, Score: 3800.56, Name: The Beatles
Artist ID: 6776, Score: 3569.32, Name: Book of Love

Item-based recommendations for user 2:
Artist ID: 2556, Score: 14102.14, Name: The Power Station
Artist ID: 8995, Score: 13904.49, Name: Andy Taylor
Artist ID: 1076, Score: 13796.74, Name: Wham!
Artist ID: 2562, Score: 13181.35, Name: Arcadia
Artist ID: 13161, Score: 12606.11, Name: Private
Artist ID: 4313, Score: 12600.67, Name: Nephew
Artist ID: 6350, Score: 12455.23, Name: TV-2
Artist ID: 996, Score: 11801.53, Name: Mike & The Mechanics
Artist ID: 4042, Score: 6047.56, Name:

Evaluating: 100%|█████████████████████████| 1892/1892 [2:59:03<00:00,  5.68s/it]


Evaluation results:
User-based CF - HR@K: 0.1279, NDCG@K: 0.1162, MRR@K: 0.2138
Item-based CF - HR@K: 0.0174, NDCG@K: 0.0159, MRR@K: 0.0337



