In [None]:
import pandas as pd
import os
import numpy as np
# Dataset: MovieLens 100k.
# Nothing in this cell downloads `movielens_url`; the archive must already be
# extracted under `movielens_dir_path` (presumably a mounted Google Drive
# folder, judging by the path -- adjust for other environments).
movielens_url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
movielens_dir_path = "/content/drive/MyDrive/ml-100k/ml-100k"

# u.data is parsed as tab-separated with no header row, so the column names
# are supplied explicitly.
ratings_df = pd.read_csv(
    os.path.join(movielens_dir_path, 'u.data'),
    sep='\t',
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp']
)

display(ratings_df.head())

# DataFrame.info() prints its summary itself and returns None; wrapping it in
# display() only adds a stray "None" to the cell output.
ratings_df.info()

In [None]:
from sklearn.model_selection import train_test_split

# Features are the (user, item) pair; the target is the rating given.
X = ratings_df.loc[:, ['user_id', 'item_id']]
y = ratings_df.loc[:, 'rating']

# Hold out 20% of the ratings for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each piece of the split.
for split_label, split_part in (
    ("Training set shape (features):", X_train),
    ("Testing set shape (features):", X_test),
    ("Training set shape (target):", y_train),
    ("Testing set shape (target):", y_test),
):
    print(split_label, split_part.shape)

In [None]:
from sklearn.metrics.pairwise import pairwise_distances


# Re-attach the ratings to the training pairs (aligned on the row index).
train_data = X_train.assign(rating=y_train)

# Rows are users, columns are items; pairs absent from the training set are NaN.
user_item_matrix = train_data.pivot(
    index='user_id',
    columns='item_id',
    values='rating',
)

# Cosine distance cannot be computed on NaN, so unrated cells become 0.
user_item_matrix_filled = user_item_matrix.fillna(0)

# Cosine similarity = 1 - cosine distance, computed between every pair of users.
user_similarity = 1 - pairwise_distances(user_item_matrix_filled, metric='cosine')

def predict_user_based(user_id, item_id, user_item_matrix, user_similarity):
    """Predict a rating as the similarity-weighted mean of other users' ratings.

    Parameters
    ----------
    user_id : label
        Row label of the target user in ``user_item_matrix``.
    item_id : label
        Column label of the target item in ``user_item_matrix``.
    user_item_matrix : pandas.DataFrame
        Users as rows, items as columns. A cell counts as "rated" only when its
        value is greater than 0, so both NaN and 0 are treated as unrated.
    user_similarity : array-like, 2-D
        Square similarity matrix whose rows and columns follow the row order
        of ``user_item_matrix``.

    Returns
    -------
    float or None
        The weighted-average rating, or ``None`` when the user or item is not
        in the matrix, or when the similarities of the users who rated the
        item sum to zero (including the case where nobody rated it).
    """
    if user_id not in user_item_matrix.index or item_id not in user_item_matrix.columns:
        return None

    item_ratings = user_item_matrix[item_id]
    # Positional mask of the users that rated this item (NaN > 0 is False).
    rated_mask = (item_ratings > 0).to_numpy()

    user_index = user_item_matrix.index.get_loc(user_id)
    similarities_of_rated_users = np.asarray(user_similarity)[user_index, rated_mask]
    ratings_of_similar_users = item_ratings.to_numpy()[rated_mask]

    # Without this guard a zero denominator yields NaN (0/0) plus a
    # RuntimeWarning; report "no prediction" the same way the other
    # early-exit paths do.
    sum_of_similarities = similarities_of_rated_users.sum()
    if sum_of_similarities == 0:
        return None

    return np.dot(similarities_of_rated_users, ratings_of_similar_users) / sum_of_similarities


# Preview the unfilled user-item matrix and the top-left 5x5 corner of the
# user-user similarity matrix.
display(user_item_matrix.head(), user_similarity[:5, :5])

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Transpose so that items are the rows being compared.
item_user_matrix_filled = user_item_matrix_filled.transpose()

# Pairwise cosine similarity between item rating vectors.
item_similarity = cosine_similarity(item_user_matrix_filled)

# Label both axes with the item ids so similarities can be looked up by label.
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=item_user_matrix_filled.index,
    columns=item_user_matrix_filled.index,
)

def predict_item_based(user_id, item_id, user_item_matrix, item_similarity_df):
    """Predict a rating as the similarity-weighted mean of the user's own ratings.

    The weights are the similarities between ``item_id`` and each item the
    user has rated (a cell counts as rated when its value is greater than 0).

    Returns ``None`` when the user or the item is missing from
    ``user_item_matrix``, when the item is missing from
    ``item_similarity_df``, or when the weights sum to zero.
    """
    known_user = user_id in user_item_matrix.index
    known_item = item_id in user_item_matrix.columns
    if not (known_user and known_item):
        return None
    if item_id not in item_similarity_df.index:
        return None

    # Items this user has rated, together with the ratings themselves.
    row = user_item_matrix.loc[user_id]
    rated_labels = row[row > 0].index
    ratings = row[rated_labels]

    # Similarity of the target item to each of those rated items.
    weights = item_similarity_df.loc[item_id][rated_labels]

    total_weight = weights.sum()
    if total_weight == 0:
        return None

    return np.dot(weights.values, ratings.values) / total_weight

# Preview the first five rows of the item-item similarity table.
display(item_similarity_df.head(n=5))

In [None]:

# Predict every held-out (user, item) pair with both models. Iterating the two
# columns directly avoids building a Series per row as iterrows() does.
user_based_predictions_list = []
item_based_predictions_list = []
for user_id, item_id in zip(X_test['user_id'], X_test['item_id']):
    user_pred = predict_user_based(user_id, item_id, user_item_matrix, user_similarity)
    item_pred = predict_item_based(user_id, item_id, user_item_matrix, item_similarity_df)

    user_based_predictions_list.append(user_pred)
    item_based_predictions_list.append(item_pred)

# Convert lists to NumPy float arrays; None entries become NaN.
user_based_predictions = np.array(user_based_predictions_list, dtype=np.float64)
item_based_predictions = np.array(item_based_predictions_list, dtype=np.float64)

# Record which test rows received a usable prediction BEFORE any filling, so a
# missing prediction (NaN) or a division artefact (inf) is never confused with
# a genuine numeric value.
valid_user_predictions_indices = np.flatnonzero(np.isfinite(user_based_predictions))
valid_item_predictions_indices = np.flatnonzero(np.isfinite(item_based_predictions))

# Replace NaN values with 0 (kept so the full-length arrays stay NaN-free for
# any later use; validity is decided by the indices above, not by this 0).
user_based_predictions = np.nan_to_num(user_based_predictions, nan=0)
item_based_predictions = np.nan_to_num(item_based_predictions, nan=0)

# Keep only the usable predictions and the matching ground-truth ratings
# (positional selection, since y_test keeps its original row labels).
valid_user_predictions = user_based_predictions[valid_user_predictions_indices]
valid_y_test_user = y_test.iloc[valid_user_predictions_indices]

valid_item_predictions = item_based_predictions[valid_item_predictions_indices]
valid_y_test_item = y_test.iloc[valid_item_predictions_indices]

def mean_squared_error(y_true, y_pred):
    """Return the mean of the squared element-wise differences.

    Both inputs are converted to NumPy float arrays first, so the subtraction
    is always positional. Subtracting two pandas Series directly would align
    them by index label and silently produce NaN for labels that do not
    match; plain Python lists would raise instead.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Ground-truth and predicted values, in matching order.

    Returns
    -------
    numpy.float64
        The mean squared error (NaN for empty input).
    """
    true_values = np.asarray(y_true, dtype=np.float64)
    predicted_values = np.asarray(y_pred, dtype=np.float64)
    return np.mean((true_values - predicted_values) ** 2)

# Mean squared error of each model over its valid_* prediction subset.
user_based_mse = mean_squared_error(valid_y_test_user, valid_user_predictions)
item_based_mse = mean_squared_error(valid_y_test_item, valid_item_predictions)

# RMSE is the square root of the MSE, i.e. the error on the rating scale.
user_based_rmse = np.sqrt(user_based_mse)
item_based_rmse = np.sqrt(item_based_mse)

print(f"User-based collaborative filtering RMSE: {user_based_rmse}")
print(f"Item-based collaborative filtering RMSE: {item_based_rmse}")