In [1]:
import pandas as pd
import time
import numpy as np
from scipy import sparse
import warnings
from implicit_markov_chain import ImplicitMarkovChain
from model_helper import save_model
warnings.filterwarnings('ignore')

# Load the data
print("Loading MovieLens 20M data...")
selected_users = pd.read_csv('data/users_selection.csv')
ratings = pd.read_csv('data/ml-20m/ratings.csv')

# print(f"Movies: {movies.shape}")
print(f"Ratings: {ratings.shape}")
print("\nFirst few rows of ratings:")
print(ratings.head())
# print("\nRating distribution:")
# print(ratings['rating'].value_counts().sort_index())

Loading MovieLens 20M data...
Ratings: (20000263, 4)

First few rows of ratings:
   userId  movieId  rating   timestamp
0       1        2     3.5  1112486027
1       1       29     3.5  1112484676
2       1       32     3.5  1112484819
3       1       47     3.5  1112484727
4       1       50     3.5  1112484580


In [None]:
# np.random.seed(14)

Selected 100 unique users


In [3]:
iterations = 100

for iter in range(iterations):
    print(f"{iter+1}/{iterations}")
    selected_user_ids = np.random.choice(selected_users['x'].unique(), 100, replace=False)
    # print(f"Selected {len(selected_user_ids)} unique users")

    filtered_ratings = ratings[ratings['userId'].isin(selected_user_ids)]
    # print(f"Original ratings shape: {ratings.shape}")
    # print(f"Filtered ratings shape: {filtered_ratings.shape}")

    filtered_ratings_sorted = filtered_ratings.sort_values(['userId', 'timestamp'])

    # Leave Last One Out
    train_ratings = filtered_ratings_sorted.groupby('userId').apply(lambda x: x.iloc[:-1]).reset_index(drop=True)
    test_ratings = filtered_ratings_sorted.groupby('userId').tail(1)

    print(f"Training set: {train_ratings.shape[0]:,} ratings")
    print(f"Test set: {test_ratings.shape[0]:,} ratings")

    print("Training implicit feedback Markov chain...")
    model = ImplicitMarkovChain(alpha=2.0)
    start = time.time()
    model.build_transition_matrix_efficient(train_ratings, sample_fraction=1)
    print(f"{time.time() - start:.4f} seconds")

    K = 20
    total_recall = 0
    total_precision = 0
    user_recommendations = {}
    train_ratings_sorted = train_ratings.sort_values(['userId', 'timestamp'])

    for user_id in selected_user_ids:
        user_data = train_ratings_sorted[train_ratings_sorted['userId'] == user_id]
        user_training_samples = len(user_data)
        
        candidate_scores = {}

        last_movie_rated = user_data['movieId'].tail(1)
        probabilities = model.transition_matrix[model.movie_to_idx[last_movie_rated.values[0]]]
        top_indices = np.argsort(probabilities)[-(K*15):][::-1]

        for i in top_indices:
            if len(candidate_scores) == K:
                break
            candidate_movie_id = model.idx_to_movie[i]
            candidate_prob = probabilities[i]
            
            if candidate_movie_id not in user_data['movieId'].values:
                # Keep the highest probability for each movie
                if candidate_movie_id not in candidate_scores or candidate_prob > candidate_scores[candidate_movie_id]:
                    candidate_scores[candidate_movie_id] = candidate_prob

        top_movies = sorted(candidate_scores.items(), key=lambda x: x[1], reverse=True)[:K]
        candidate_movies = [movie_id for movie_id, score in top_movies]
        
        # validation
        user_test_ratings = test_ratings[test_ratings['userId'] == user_id]
        relevant_movies = set(user_test_ratings['movieId'].tolist())
        recommended_movies = set(candidate_movies)
        
        if relevant_movies in recommended_movies:
            # its a hit
            total_recall += 1


    avg_recall = total_recall / len(test_ratings)
    avg_precision = total_precision / (len(test_ratings)*K)

    print("\n=== Final Results ===")
    print(f"Total users processed: {len(selected_user_ids)}")
    print(f"Recall@{K}: {avg_recall:.4f}")
    print(f"Precision@{K}: {avg_precision:.4f}")

1/100
Training set: 159,872 ratings
Test set: 100 ratings
Training implicit feedback Markov chain...
Processing user ratings...
User Groups: 100


100%|██████████| 100/100 [06:19<00:00,  3.79s/it]


448.4392 seconds

=== Final Results ===
Total users processed: 100
Recall@20: 0.0000
Precision@20: 0.0000
2/100
Training set: 144,705 ratings
Test set: 100 ratings
Training implicit feedback Markov chain...
Processing user ratings...
User Groups: 100


100%|██████████| 100/100 [02:53<00:00,  1.74s/it]


203.9896 seconds

=== Final Results ===
Total users processed: 100
Recall@20: 0.0000
Precision@20: 0.0000
3/100
Training set: 148,299 ratings
Test set: 100 ratings
Training implicit feedback Markov chain...
Processing user ratings...
User Groups: 100


100%|██████████| 100/100 [03:33<00:00,  2.13s/it]


274.2236 seconds

=== Final Results ===
Total users processed: 100
Recall@20: 0.0000
Precision@20: 0.0000
4/100
Training set: 148,204 ratings
Test set: 100 ratings
Training implicit feedback Markov chain...
Processing user ratings...
User Groups: 100


100%|██████████| 100/100 [03:56<00:00,  2.36s/it]


286.3692 seconds

=== Final Results ===
Total users processed: 100
Recall@20: 0.0000
Precision@20: 0.0000
5/100
Training set: 157,140 ratings
Test set: 100 ratings
Training implicit feedback Markov chain...
Processing user ratings...
User Groups: 100


100%|██████████| 100/100 [04:50<00:00,  2.91s/it]


372.8204 seconds

=== Final Results ===
Total users processed: 100
Recall@20: 0.0000
Precision@20: 0.0000
6/100
Training set: 144,934 ratings
Test set: 100 ratings
Training implicit feedback Markov chain...
Processing user ratings...
User Groups: 100


  4%|▍         | 4/100 [00:08<03:13,  2.02s/it]


KeyboardInterrupt: 

In [40]:
K = 20
total_recall = 0
total_precision = 0
user_recommendations = {}
train_ratings_sorted = train_ratings.sort_values(['userId', 'timestamp'])

for user_id in selected_user_ids:
    user_data = train_ratings_sorted[train_ratings_sorted['userId'] == user_id]
    user_training_samples = len(user_data)
    
    candidate_scores = {}

    last_movie_rated = user_data['movieId'].tail(1)
    probabilities = model.transition_matrix[model.movie_to_idx[last_movie_rated.values[0]]]
    top_indices = np.argsort(probabilities)[-(K*15):][::-1]

    for i in top_indices:
        if len(candidate_scores) == K:
            break
        candidate_movie_id = model.idx_to_movie[i]
        candidate_prob = probabilities[i]
        
        if candidate_movie_id not in user_data['movieId'].values:
            # Keep the highest probability for each movie
            if candidate_movie_id not in candidate_scores or candidate_prob > candidate_scores[candidate_movie_id]:
                candidate_scores[candidate_movie_id] = candidate_prob

    top_movies = sorted(candidate_scores.items(), key=lambda x: x[1], reverse=True)[:K]
    candidate_movies = [movie_id for movie_id, score in top_movies]
    
    # validation
    user_test_ratings = test_ratings[test_ratings['userId'] == user_id]
    relevant_movies = set(user_test_ratings['movieId'].tolist())
    recommended_movies = set(candidate_movies)
    
    if relevant_movies in recommended_movies:
        # its a hit
        total_recall += 1


avg_recall = total_recall / len(test_ratings)
avg_precision = total_precision / (len(test_ratings)*K)

print("\n=== Final Results ===")
print(f"Total users processed: {len(selected_user_ids)}")
print(f"Recall@{K}: {avg_recall:.4f}")
print(f"Precision@{K}: {avg_precision:.4f}")


=== Final Results ===
Total users processed: 100
Recall@20: 0.0000
Precision@20: 0.0000
