# Building a User-User Recommendation Engine

In [11]:
import os
import requests
import zipfile
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import numpy as np

In [12]:
def download_movielens(url, path='movielens.zip'):
    response = requests.get(url)
    with open(path, 'wb') as file:
        file.write(response.content)
    with zipfile.ZipFile(path, 'r') as zip_ref:
        zip_ref.extractall("movielens")

dataset_url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
download_movielens(dataset_url)

# Load the dataset
ratings = pd.read_csv('movielens/ml-latest-small/ratings.csv', usecols=['userId', 'movieId', 'rating'])


In [13]:
# Create a user-item matrix
user_item_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)

# Convert the user-item matrix into a sparse matrix
sparse_user_item = csr_matrix(user_item_matrix)

# Compute cosine similarity between users
user_similarity = cosine_similarity(sparse_user_item)

In [14]:
# Recommendation function
def recommend_movies(user_id, num_recommendations=5):
    similarity_scores = user_similarity[user_id - 1]
    sorted_indices = np.argsort(similarity_scores)[::-1]
    top_users = sorted_indices[1:num_recommendations+1]  # skip the first user as it will be the target user itself

    recommended_movies = set()
    for similar_user in top_users:
        similar_user_movies = user_item_matrix.columns[np.argsort(user_item_matrix.iloc[similar_user])][::-1]
        recommended_movies.update(similar_user_movies[:num_recommendations])

    return list(recommended_movies)[:num_recommendations]



In [15]:
# Example
user_id = 1
recommendations = recommend_movies(user_id)
print(f"Recommended Movies for User {user_id}:", recommendations)

Recommended Movies for User 1: [1, 899, 904, 3081, 2959]


# Doing the same thing but using Matrix Factorization

In [16]:
import requests
import zipfile
import pandas as pd
from scipy.sparse.linalg import svds
import numpy as np

In [17]:
def download_movielens(url, path='movielens.zip'):
    response = requests.get(url)
    with open(path, 'wb') as file:
        file.write(response.content)
    with zipfile.ZipFile(path, 'r') as zip_ref:
        zip_ref.extractall("movielens")

dataset_url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
download_movielens(dataset_url)

ratings = pd.read_csv('movielens/ml-latest-small/ratings.csv', usecols=['userId', 'movieId', 'rating'])


In [18]:
user_item_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)

# Normalize the matrix
user_ratings_mean = np.mean(user_item_matrix, axis=1)
user_item_matrix_demeaned = user_item_matrix.sub(user_ratings_mean, axis=0)

# Convert to a numpy array
user_item_matrix_demeaned_np = user_item_matrix_demeaned.values

In [20]:
# Perform SVD, k is the number of factors
U, sigma, Vt = svds(user_item_matrix_demeaned_np, k=50)

# Convert sigma to a diagonal matrix
sigma = np.diag(sigma)

# Predict ratings
predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.values.reshape(-1, 1)

# Convert predicted ratings to DataFrame
predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=user_item_matrix.columns)


In [21]:
def recommend_items(user_id, num_recommendations=5):
    user_row_number = user_id - 1
    sorted_user_ratings = predicted_ratings_df.iloc[user_row_number].sort_values(ascending=False)

    # Get the user's data and merge in the movie information
    user_data = ratings[ratings.userId == user_id]
    sorted_user_predictions = sorted_user_ratings.drop(user_data.movieId.tolist()).head(num_recommendations)
    recommendations = pd.DataFrame({'movieId': sorted_user_predictions.index, 'predictedRating': sorted_user_predictions.values})

    return recommendations


In [22]:
# Test
user_id = 1
recommendations = recommend_items(user_id)
print("Recommended Movies for User ID", user_id)
print(recommendations)

Recommended Movies for User ID 1
   movieId  predictedRating
0     1036         4.024307
1     1221         3.324815
2     1387         3.304728
3      858         2.891690
4     1968         2.870832


# Using Surprise for training a model with Matrix Factorization

In [23]:
!pip install scikit-surprise



In [25]:
from surprise import Dataset, Reader, SVD, accuracy, BaselineOnly
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')
reader = Reader(line_format='user item rating timestamp', sep='\t')
trainset, testset = train_test_split(data, test_size=0.2)

# Use the SVD algorithm
algo = SVD()

# Train
algo.fit(trainset)

# Test
predictions = algo.test(testset)

# Calculate RMSE
rmse_after_training = accuracy.rmse(predictions)

# Do the same for baseline
baseline_algo = BaselineOnly()
baseline_algo.fit(trainset)
baseline_predictions = baseline_algo.test(testset)
baseline_rmse = accuracy.rmse(baseline_predictions)

print(f"RMSE after Training: {rmse_after_training}")
print(f"Baseline RMSE: {baseline_rmse}")

# Check if the model improved
if rmse_after_training < baseline_rmse:
    print("Training improved the model.")
else:
    print("Training did not improve the model.")


RMSE: 0.9518
Estimating biases using als...
RMSE: 0.9554
RMSE after Training: 0.9518366013143656
Baseline RMSE: 0.9553704400771427
Training improved the model.
