# Recommender Systems Code
Author: Juho Hotari

Here we test KNN, Popularity and Randomized algorithms and evaluate the performance based on the NDCG@5 value.

## Task 1-3.

### KNN User-Based

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler

In [2]:
class UserBasedKNNRecommender:
    def __init__(self, k_neighbors=5):
        self.k_neighbors = k_neighbors
        self.user_item_matrix = None
        self.user_item_matrix_scaled = None
        self.user_similarity_matrix = None
        self.nearest_neighbors_model = None

    def fit(self, train_file_path):
        # Load training data
        train_data = pd.read_csv(train_file_path)

        # Create user-item matrix
        self.user_item_matrix = train_data.pivot_table(index='user_id', columns='item_id', values='rating')

        # Fill missing values with 0
        self.user_item_matrix = self.user_item_matrix.fillna(0)

        # Transpose user-item matrix to get item-user matrix
        self.item_user_matrix = self.user_item_matrix.T

        # Normalize the data
        min_max_scaler = MinMaxScaler()
        self.user_item_matrix_scaled = min_max_scaler.fit_transform(self.user_item_matrix)

        # Compute cosine similarity between users
        self.user_similarity_matrix = cosine_similarity(self.user_item_matrix_scaled)

        # Fit Nearest Neighbors model
        self.nearest_neighbors_model = NearestNeighbors(n_neighbors=self.k_neighbors, metric='cosine', algorithm='brute')
        self.nearest_neighbors_model.fit(self.user_item_matrix_scaled)

    def predict(self, user_id, item_id):
        if user_id not in self.user_item_matrix.index:
            return 0

        user_index = self.user_item_matrix.index.get_loc(user_id)

        # Find k-nearest neighbors
        _, indices = self.nearest_neighbors_model.kneighbors([self.user_item_matrix_scaled[user_index]], n_neighbors=self.k_neighbors)

        # Predict rating based on weighted sum of ratings from neighbors
        weighted_sum = 0
        similarity_sum = 0
        for neighbor_index in indices[0]:
            if neighbor_index < self.user_item_matrix_scaled.shape[0]:
                similarity = self.user_similarity_matrix[user_index][neighbor_index]
                rating = self.user_item_matrix.iloc[neighbor_index][item_id]
                weighted_sum += similarity * rating
                similarity_sum += abs(similarity)

        if similarity_sum == 0:
            return 0
        else:
            predicted_rating = weighted_sum / similarity_sum
            return predicted_rating

    def evaluate_ndcg(self, test_data, k=5, negative_samples=100):
        ndcg_sum = 0
        common_users = set(test_data['user_id']).intersection(set(self.user_item_matrix.index))
    
        for user_id in common_users:
            try:
                group = test_data[test_data['user_id'] == user_id]
    
                # Select the top k true ratings
                true_ratings = group.sort_values(by='rating', ascending=False).head(k)['rating'].values
    
                # Predict ratings for all items in the group
                predicted_ratings = [self.predict(user_id, item_id) for item_id in group['item_id'].values]
    
                # Uniformly sample negative examples
                negative_samples_indices = np.random.choice(self.user_item_matrix.columns, size=negative_samples, replace=False)
                negative_samples_ratings = [self.predict(user_id, item_id) for item_id in negative_samples_indices]
    
                # Combine true ratings and predicted ratings (including negative samples)
                predicted_ratings = np.concatenate([predicted_ratings, negative_samples_ratings])
    
                # Sort the predicted ratings in descending order
                predicted_ranking = np.argsort(predicted_ratings)[::-1][:min(k, len(predicted_ratings))]
    
                # Calculate DCG and IDCG
                # Calculate DCG
                dcg = np.sum((2 ** true_ratings - 1) / np.log2(2 + np.arange(1, min(k, len(true_ratings)) + 1)))

                # Calculate IDCG
                sorted_true_ratings = np.sort(true_ratings)[::-1]
                idcg = np.sum((2 ** sorted_true_ratings - 1) / np.log2(2 + np.arange(1, min(k, len(true_ratings)) + 1)))

    
                # Update NDCG sum
                ndcg_sum += dcg / idcg
            except KeyError:
                continue
    
        # Calculate the average NDCG over all users
        average_ndcg = ndcg_sum / len(common_users)
        return average_ndcg

### k=5 neighbors

In [4]:
%%time
# Instantiate the recommender
recommender = UserBasedKNNRecommender(k_neighbors=5)

# Train the recommender on the training data
recommender.fit('train.csv')

# Evaluate performance on test data
test_data = pd.read_csv('test.csv')
ndcg_at_5 = recommender.evaluate_ndcg(test_data, k=5)

print(f"NDCG@5: {ndcg_at_5}")

NDCG@5: 0.8340807174887892
CPU times: user 40min 27s, sys: 2h 1min 40s, total: 2h 42min 7s
Wall time: 16min 16s


### k=3 neighbors

In [5]:
%%time
# Instantiate the recommender
recommender = UserBasedKNNRecommender(k_neighbors=3)

# Train the recommender on the training data
recommender.fit('train.csv')

# Evaluate performance on validation data
test_data = pd.read_csv('validation.csv')
ndcg_at_5 = recommender.evaluate_ndcg(test_data, k=5)

print(f"NDCG@5: {ndcg_at_5}")

NDCG@5: 0.8389261744966443
CPU times: user 41min 4s, sys: 2h 3min 17s, total: 2h 44min 21s
Wall time: 16min 30s


### k=4 neighbors

In [6]:
%%time
# Instantiate the recommender
recommender = UserBasedKNNRecommender(k_neighbors=4)

# Train the recommender on the training data
recommender.fit('train.csv')

# Evaluate performance on validation data
test_data = pd.read_csv('validation.csv')
ndcg_at_5 = recommender.evaluate_ndcg(test_data, k=5)

print(f"NDCG@5: {ndcg_at_5}")

NDCG@5: 0.8389261744966443
CPU times: user 41min 31s, sys: 2h 5min 13s, total: 2h 46min 45s
Wall time: 16min 44s


### Popularity algorithm

In [13]:
import random
class PopularityAlgorithm:
    def __init__(self, train_dataset):
        self.train_dataset = train_dataset
        self.item_popularity = self.calculate_item_popularity()

    def calculate_item_popularity(self):
        item_popularity = self.train_dataset['item_id'].value_counts().reset_index()
        item_popularity.columns = ['item_id', 'popularity']
        return item_popularity.sort_values(by='popularity', ascending=False)

    def generate_negative_samples(self, user_id, test_items, n_samples=100):
        all_items = set(self.train_dataset['item_id'].unique()) - set(test_items)
        negative_samples = random.sample(all_items, n_samples)
        return [(user_id, item, 0) for item in negative_samples]

    def evaluate(self, test_dataset):
        result_table = []

        for user_id in test_dataset['user_id'].unique():
            user_test_set = test_dataset[test_dataset['user_id'] == user_id]
            test_items = user_test_set['item_id'].tolist()

            # Generate negative samples
            negative_samples = self.generate_negative_samples(user_id, test_items)

            # Combine test items and negative samples
            all_items = test_items + [item[1] for item in negative_samples]

            # Calculate scores for all items
            scores = self.calculate_scores(user_id, all_items)

            # Create the result table
            user_results = pd.DataFrame({
                'user_id': [user_id] * len(all_items),
                'item_id': all_items,
                'rating': [user_test_set[user_test_set['item_id'] == item]['rating'].values[0] if item in test_items else 0 for item in all_items],
                'score': scores
            })

            result_table.append(user_results)

        return pd.concat(result_table, ignore_index=True)

    def calculate_scores(self, user_id, items):
        # Example: Using item popularity as scores
        return [self.item_popularity[self.item_popularity['item_id'] == item]['popularity'].values[0] if item in self.item_popularity['item_id'].values else 0 for item in items]

    def calculate_dcg(self, user_id, sorted_items, k):
        return np.sum((2 ** sorted_items['rating'].values - 1) / np.log2(np.arange(2, k + 2)))

    def calculate_idcg(self, user_id, sorted_items_true_rating, k):
        return np.sum((2 ** sorted_items_true_rating['rating'].values - 1) / np.log2(np.arange(2, k + 2)))

    def calculate_ndcg(self, result_table, k=5):
        ndcg_scores = []

        unique_users = result_table['user_id'].unique()

        for user_id in unique_users:
            user_results = result_table[result_table['user_id'] == user_id]

            # Sort items by score in descending order (SORT BY SCORE)
            sorted_items = user_results.sort_values(by='score', ascending=False).head(k)

            # Calculate DCG
            dcg = self.calculate_dcg(user_id, sorted_items, k)

            # Sort items by true rating in descending order (SORT BY USER RATING)
            sorted_items_true_rating = user_results.sort_values(by='rating', ascending=False).head(k)

            # Calculate IDCG
            idcg = self.calculate_idcg(user_id, sorted_items_true_rating, k)

            # Calculate NDCG
            if idcg > 0:
                ndcg = dcg / idcg
                ndcg_scores.append(ndcg)

        # Calculate average NDCG
        average_ndcg = np.mean(ndcg_scores)
        return average_ndcg

In [14]:
%%time
train_set = pd.read_csv('train.csv')
test_set = pd.read_csv('test.csv')

# Create an instance of the PopularityAlgorithm class
popularity_algorithm = PopularityAlgorithm(train_set)

# Evaluate the Popularity algorithm on the test set
result_table_popularity = popularity_algorithm.evaluate(test_set)

# Calculate NDCG@5 for the Popularity algorithm
ndcg_popularity = popularity_algorithm.calculate_ndcg(result_table_popularity, k=5)

# Print the average NDCG@5 for the Popularity algorithm
print(f"Average NDCG@5 for Popularity Algorithm: {ndcg_popularity:.4f}")

since Python 3.9 and will be removed in a subsequent version.
  negative_samples = random.sample(all_items, n_samples)


Average NDCG@5 for Popularity Algorithm: 0.5485
CPU times: user 22.8 s, sys: 10.4 ms, total: 22.8 s
Wall time: 22.9 s


### Randomized algorithm

In [15]:
class RandomizedAlgorithm:
    def __init__(self, train_dataset):
        self.train_dataset = train_dataset

    def generate_negative_samples(self, user_id, test_items, n_samples=100):
        all_items = set(self.train_dataset['item_id'].unique()) - set(test_items)
        negative_samples = random.sample(all_items, n_samples)
        return [(user_id, item, 0) for item in negative_samples]

    def evaluate(self, test_dataset):
        result_table = []

        for user_id in test_dataset['user_id'].unique():
            user_test_set = test_dataset[test_dataset['user_id'] == user_id]
            test_items = user_test_set['item_id'].tolist()

            # Generate negative samples
            negative_samples = self.generate_negative_samples(user_id, test_items)

            # Combine test items and negative samples
            all_items = test_items + [item[1] for item in negative_samples]

            # Calculate scores for all items
            scores = self.calculate_scores(user_id, all_items)

            # Create the result table
            user_results = pd.DataFrame({
                'user_id': [user_id] * len(all_items),
                'item_id': all_items,
                'rating': [user_test_set[user_test_set['item_id'] == item]['rating'].values[0] if item in test_items else 0 for item in all_items],
                'score': scores
            })

            result_table.append(user_results)

        return pd.concat(result_table, ignore_index=True)

    def calculate_scores(self, user_id, items):
        # Example: Using random scores
        return np.random.rand(len(items))

    def calculate_dcg(self, user_id, sorted_items, k):
        return np.sum((2 ** sorted_items['rating'].values - 1) / np.log2(np.arange(2, k + 2)))

    def calculate_idcg(self, user_id, sorted_items_true_rating, k):
        return np.sum((2 ** sorted_items_true_rating['rating'].values - 1) / np.log2(np.arange(2, k + 2)))

    def calculate_ndcg(self, result_table, k=5):
        ndcg_scores = []

        unique_users = result_table['user_id'].unique()

        for user_id in unique_users:
            user_results = result_table[result_table['user_id'] == user_id]

            # Sort items by score in descending order (SORT BY SCORE)
            sorted_items = user_results.sort_values(by='score', ascending=False).head(k)

            # Calculate DCG
            dcg = self.calculate_dcg(user_id, sorted_items, k)

            # Sort items by true rating in descending order (SORT BY RATING)
            sorted_items_true_rating = user_results.sort_values(by='rating', ascending=False).head(k)

            # Calculate IDCG
            idcg = self.calculate_idcg(user_id, sorted_items_true_rating, k)

            # Calculate NDCG
            if idcg > 0:
                ndcg = dcg / idcg
                ndcg_scores.append(ndcg)

        # Calculate average NDCG
        average_ndcg = np.mean(ndcg_scores)
        return average_ndcg

In [16]:
%%time
# Create an instance of the RandomizedAlgorithm class
randomized_algorithm = RandomizedAlgorithm(train_set)

# Evaluate the Randomized algorithm on the test set
result_table_randomized = randomized_algorithm.evaluate(test_set)

# Calculate NDCG@5 for the Randomized algorithm
ndcg_randomized = randomized_algorithm.calculate_ndcg(result_table_randomized, k=5)

# Print the average NDCG@5 for the Randomized algorithm
print(f"Average NDCG@5 for Randomized Algorithm: {ndcg_randomized:.4f}")

since Python 3.9 and will be removed in a subsequent version.
  negative_samples = random.sample(all_items, n_samples)


Average NDCG@5 for Randomized Algorithm: 0.1041
CPU times: user 6.07 s, sys: 3.87 ms, total: 6.08 s
Wall time: 6.1 s


## Task 4.

In [None]:
# Load your dataset and metadata
data = pd.read_csv('ratings.csv')  
metadata = pd.read_csv('metadata.csv')   

# Pick a random user
random_user = np.random.choice(data['user_id'].unique())

# Get the movies watched by the random user
movies_watched = data[data['user_id'] == random_user]['item_id']

# Get recommendations for the user using kNN collaborative filtering
recommendations = []
for item_id in data['item_id'].unique():
    # Check if the user has not watched the movie
    if item_id not in movies_watched.values:
        # Predict the rating for the movie
        predicted_rating = recommender.predict(random_user, item_id)
        recommendations.append((item_id, predicted_rating))

# Sort recommendations by predicted rating in descending order
recommendations.sort(key=lambda x: x[1], reverse=True)

# Print the top 5 recommendations with movie titles
print(f"User {random_user} - Movies Watched:")
for movie_id in movies_watched.values:
    movie_title = metadata[metadata['item_id'] == movie_id]['title'].values
    if len(movie_title) > 0:
        print(f"- {movie_title[0]}")
    else:
        print(f"- Movie {movie_id} (Title not found in metadata)")

print("\nTop 5 Recommendations:")
for i, (item_id, predicted_rating) in enumerate(recommendations[:5], 1):
    movie_title = metadata[metadata['item_id'] == item_id]['title'].values
    if len(movie_title) > 0:
        print(f"{i}. {movie_title[0]} - Predicted Rating: {predicted_rating}")
    else:
        print(f"{i}. Movie {item_id} (Title not found in metadata) - Predicted Rating: {predicted_rating}")

In [None]:
# Choose a few users for demonstration
selected_users = ratings['user_id'].sample(3).values

# Prepare user-item matrix for kNN collaborative filtering
user_item_matrix = ratings.pivot_table(index='user_id', columns='item_id', values='rating', fill_value=0)

# Fit kNN model
recommender = UserBasedKNNRecommender(k_neighbors=4)

# Train the recommender on the training data
recommender.fit('train.csv')

# Display top similar users for each selected user
for user_id in selected_users:
    user_index = user_item_matrix.index.get_loc(user_id)
    
    # Find top similar users
    similar_users = recommender.kneighbors(user_item_matrix.iloc[user_index, :].values.reshape(1, -1), n_neighbors=5 + 1, return_distance=False)[0][1:]
    
    print(f"\nTop 5 Similar Users for User {user_id}:")
    print(user_item_matrix.index[similar_users])