# FGV Summer Program 2025
## Recommender Systems class 2 - Dataset split
Diego Galeano, Ph.D.

$\color{green}{\text{Before starting}}$, run the following cell to import all the libraries required for this notebook and to mount Google Drive.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from google.colab import drive
import os
import matplotlib.pyplot as plt

# Mount Google Drive at /content/drive; the data-loading cell further down
# reads the MovieLens files from a folder under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


## The Movielens dataset

In [None]:
# Folder inside the mounted Google Drive that holds the MovieLens 100k files.
# Adjust this path to wherever your own copy of ml-100k lives.
base_path = '/content/drive/My Drive/5. Teaching/FGV 2025/programming exercises/ml-100k/'
file_name = 'u.data'

# Construct the full file path. os.path.join produces a valid path whether or
# not base_path ends with a trailing slash.
file_path = os.path.join(base_path, file_name)

# u.data is read as tab-separated with no header row, so the column names are
# supplied explicitly.
column_names = ['user_id', 'item_id', 'rating', 'time_stamp']
data = pd.read_csv(file_path, sep='\t', header=None, names=column_names)

# Read the item data as well (pipe-separated, latin-1 encoded, no header row)
# and keep only its first three columns.
item_df = pd.read_csv(os.path.join(base_path, 'u.item'), sep='|', encoding='latin-1', header=None)[[0, 1, 2]]
item_df.columns = ['item_id', 'movie_name', 'release_date']

# Attach the movie name and release date to every rating row.
data = data.merge(item_df, on='item_id')

In [None]:
# Preview the first rows of the ratings table after merging in the item columns.
data.head()

Unnamed: 0,user_id,item_id,rating,time_stamp,movie_name,release_date
0,196,242,3,881250949,Kolya (1996),24-Jan-1997
1,186,302,3,891717742,L.A. Confidential (1997),01-Jan-1997
2,22,377,1,878887116,Heavyweights (1994),01-Jan-1994
3,244,51,2,880606923,Legends of the Fall (1994),01-Jan-1994
4,166,346,1,886397596,Jackie Brown (1997),01-Jan-1997


# Split the dataset into training, validation and test sets

In [None]:
valid_frac = 0.005  # fraction of rows (0.5%) held out for validation
test_frac = 0.005  # fraction of rows (0.5%) held out for testing
train_frac = 1 - (valid_frac + test_frac)  # remaining fraction used for training

# Randomly shuffle the DataFrame.
# sample(frac=1) returns every row in random order; the fixed random_state makes
# the split reproducible, and reset_index(drop=True) renumbers the rows 0..n-1.
data_shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the data: row positions where the training and validation parts end
train_end = int(len(data) * train_frac)
valid_end = train_end + int(len(data) * valid_frac)

# .copy() gives each subset its own DataFrame, so adding columns to a subset
# later does not trigger pandas' SettingWithCopyWarning.
train_data = data_shuffled.iloc[:train_end].copy()  # first train_end rows
valid_data = data_shuffled.iloc[train_end:valid_end].copy()  # next block of rows
test_data = data_shuffled.iloc[valid_end:].copy()  # all remaining rows

# The three subsets must partition the data: no row lost, none duplicated.
assert len(train_data) + len(valid_data) + len(test_data) == len(data)

# Print the sizes of each subset
print(f"Training Set: {len(train_data)} rows")
print(f"Validation Set: {len(valid_data)} rows")
print(f"Test Set: {len(test_data)} rows")

Training Set: 99000 rows
Validation Set: 500 rows
Test Set: 500 rows


### Building the rating matrix with the training data

In [None]:
# Build the rating matrix using only train_data.
# pivot() puts users on the rows, items on the columns and the rating in each
# cell; (user, item) pairs that do not occur in train_data come out as NaN.
# Note: pivot() raises ValueError if a (user, item) pair appears more than once.
rating_matrix = train_data.pivot(index='user_id', columns='item_id', values='rating')
# Fill missing values with 0, so a 0 in the matrix means "no rating in the
# training data" rather than an actual rating.
rating_matrix = rating_matrix.fillna(0)

In [None]:
# Display the user x item rating matrix.
rating_matrix

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Random recommender system: predicting ratings on the validation and test data

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def generate_random_predictions(df, min_rating=1, max_rating=5, seed=42):
    """
    Generate one uniformly random rating per row of ``df``.

    df: any sized object (e.g. a DataFrame); only ``len(df)`` is used.
    min_rating: lower bound of the generated ratings (inclusive).
    max_rating: upper bound of the generated ratings (exclusive).
    seed: value used to seed NumPy's global random generator, so the same
        seed and length always produce the same predictions.

    Returns a NumPy array of ``len(df)`` floats in [min_rating, max_rating).
    """
    np.random.seed(seed)  # Ensure reproducibility
    # One independent draw per row, spread uniformly over the rating range.
    random_scores = np.random.uniform(min_rating, max_rating, size=len(df))
    return random_scores

# Generate random predictions for validation and test sets.
# assign() returns a new DataFrame carrying the extra column instead of writing
# into a frame that may be a slice of another one, which avoids pandas'
# SettingWithCopyWarning.
# Both calls use the function's default seed, so the two sets draw from the
# same random sequence.
valid_data = valid_data.assign(predicted_rating=generate_random_predictions(valid_data))
test_data = test_data.assign(predicted_rating=generate_random_predictions(test_data))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_data['predicted_rating'] = generate_random_predictions(valid_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_rating'] = generate_random_predictions(test_data)


In [None]:
# Pull out the true and predicted rating columns of each held-out set once
valid_true, valid_pred = valid_data['rating'], valid_data['predicted_rating']
test_true, test_pred = test_data['rating'], test_data['predicted_rating']

# Validation set: RMSE is the square root of scikit-learn's MSE
valid_mse = mean_squared_error(valid_true, valid_pred)
valid_rmse = np.sqrt(valid_mse)
valid_mae = mean_absolute_error(valid_true, valid_pred)

# Test set: same two metrics
test_mse = mean_squared_error(test_true, test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(test_true, test_pred)

# Report all four metrics, rounded to four decimals
print(f"Validation RMSE: {valid_rmse:.4f}")
print(f"Validation MAE: {valid_mae:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test MAE: {test_mae:.4f}")

Validation RMSE: 1.6695
Validation MAE: 1.3511
Test RMSE: 1.7727
Test MAE: 1.4406


**Task: Implement the RMSE and MAE from scratch**

In [None]:
# Define a function to calculate RMSE
# Hint: RMSE is the square root of the average of the squared differences between actual and predicted ratings.
def calculate_rmse(actual, predicted):
    """
    Function to calculate Root Mean Squared Error (RMSE).
    actual: array-like, actual ratings.
    predicted: array-like, predicted ratings.

    Returns the RMSE as a float.
    Raises ValueError if the inputs differ in shape or are empty.
    """
    # Convert to plain float arrays so values are compared position by
    # position (two pandas Series would otherwise be aligned by index label).
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError("actual and predicted must have the same shape")
    if actual.size == 0:
        raise ValueError("actual and predicted must not be empty")

    # Step 1: Calculate the squared differences
    squared_differences = (actual - predicted) ** 2

    # Step 2: Compute the mean of squared differences
    mean_squared_difference = squared_differences.mean()

    # Step 3: Take the square root of the mean
    rmse = float(np.sqrt(mean_squared_difference))

    return rmse

# Define a function to calculate MAE
# Hint: MAE is the average of the absolute differences between actual and predicted ratings.
def calculate_mae(actual, predicted):
    """
    Function to calculate Mean Absolute Error (MAE).
    actual: array-like, actual ratings.
    predicted: array-like, predicted ratings.

    Returns the MAE as a float.
    Raises ValueError if the inputs differ in shape or are empty.
    """
    # Convert to plain float arrays so values are compared position by
    # position (two pandas Series would otherwise be aligned by index label).
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    if actual.shape != predicted.shape:
        raise ValueError("actual and predicted must have the same shape")
    if actual.size == 0:
        raise ValueError("actual and predicted must not be empty")

    # Step 1: Calculate the absolute differences
    absolute_differences = np.abs(actual - predicted)

    # Step 2: Compute the mean of absolute differences
    mae = float(absolute_differences.mean())

    return mae

In [None]:
# Validation-set ratings: ground truth and the model's predictions
valid_actual = valid_data['rating']
valid_predicted = valid_data['predicted_rating']

# Evaluate with the from-scratch metric functions
rmse = calculate_rmse(valid_actual, valid_predicted)
mae = calculate_mae(valid_actual, valid_predicted)

# Show both metrics at full precision
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

## Split per user for top-N Recommender Systems
For each user, hold out exactly one item (selected at random) and place it in a test set DataFrame; the user's remaining ratings go into the training set.

In [None]:
# Collect the per-user pieces in lists and concatenate once after the loop:
# calling pd.concat inside the loop re-copies the growing frames on every
# iteration.
train_parts = []
test_parts = []

# One seeded generator shared by all users keeps the split reproducible while
# still giving every user a different random draw.
split_rng = np.random.RandomState(42)

# groupby yields (user id, rows of that user) for every user in the data
for i, df_user_i in data.groupby('user_id'):
  # sample one item at random for user i
  df_user_selected = df_user_i.sample(n=1, random_state=split_rng)

  # save testing (the sampled row) and training (rows for every other item)
  test_parts.append(df_user_selected)
  train_parts.append(df_user_i[df_user_i['item_id'] != df_user_selected['item_id'].values[0]])

# pd.concat rejects an empty list, so fall back to empty frames when there are no users
df_train = pd.concat(train_parts) if train_parts else pd.DataFrame()
df_test = pd.concat(test_parts) if test_parts else pd.DataFrame()

In [None]:
# Inspect the rows of the last user handled by the split loop
# (df_user_i keeps the value from the loop's final executed iteration).
df_user_i

Unnamed: 0,user_id,item_id,rating,time_stamp,movie_name,release_date
202,1,61,4,878542420,Three Colors: White (1994),01-Jan-1994
305,1,189,3,888732928,"Grand Day Out, A (1992)",01-Jan-1992
333,1,33,4,878542699,Desperado (1995),01-Jan-1995
334,1,160,4,875072547,Glengarry Glen Ross (1992),01-Jan-1992
478,1,20,4,887431883,Angels and Insects (1995),01-Jan-1995
...,...,...,...,...,...,...
92049,1,28,4,875072173,Apollo 13 (1995),01-Jan-1995
92487,1,172,5,874965478,"Empire Strikes Back, The (1980)",01-Jan-1980
94019,1,122,3,875241498,"Cable Guy, The (1996)",14-Jun-1996
96699,1,152,5,878542589,Sleeper (1973),01-Jan-1973


In [None]:
# Display the held-out test set produced by the per-user split.
df_test

Unnamed: 0,user_id,item_id,rating,time_stamp,movie_name,release_date
8737,1,209,4,888732908,This Is Spinal Tap (1984),01-Jan-1984
