In [None]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV
import pandas as pd
import numpy as np

# Seed Python's and NumPy's global RNGs before anything stochastic runs so the
# cross-validation split and the SVD factor initialisation are repeatable under
# "Restart Kernel -> Run All" (recipe from the Surprise FAQ on reproducibility).
import random

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Load the development ratings and wrap them in a Surprise dataset.
# load_from_df expects the columns in (user, item, rating) order.
ratings_all_path = "data_movie_lens_100k/ratings_all_development_set.csv"
ratings_all_df = pd.read_csv(ratings_all_path)

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_all_df[['user_id', 'item_id', 'rating']], reader)

# Define the parameter grid: 6 x 6 x 6 = 216 combinations.
# .tolist() turns the numpy scalars into plain Python ints/floats so that
# gs.best_params prints cleanly and can be serialised without surprises.
param_grid = {
    'n_factors': np.linspace(100, 200, 6, dtype=int).tolist(),   # 100, 120, 140, 160, 180, 200
    'lr_all': np.logspace(-2, np.log10(0.05), 6).tolist(),       # 0.01 ... 0.05, log-spaced
    'reg_all': np.logspace(-2, np.log10(0.5), 6).tolist(),       # 0.01 ... 0.5, log-spaced
}

# Perform an exhaustive grid search, scoring each combination by 5-fold MAE.
gs = GridSearchCV(SVD, param_grid, measures=['mae'], cv=5)
gs.fit(data)

# Print best MAE score and corresponding parameters.
# NOTE(review): the output currently saved under this cell reports
# n_factors=152, lr_all~0.0178 and reg_all=0.1, none of which are points of the
# grid above -- it was produced by an earlier grid and should be regenerated.
print("Best MAE:", gs.best_score['mae'])
print("Best Parameters:", gs.best_params['mae'])

Best MAE: 0.7255399798669238
Best Parameters: {'n_factors': 152, 'lr_all': 0.01778279410038923, 'reg_all': 0.1}


In [11]:
import numpy as np

# Use the best model to generate predictions for the leaderboard dataset.
# NOTE(review): `gs` and `data` are whatever the most recently executed
# grid-search cell left in the kernel. The SVD and SVD++ cells both assign
# these names, so run this cell directly after the SVD search -- TODO confirm
# which model actually produced the saved predictions file.
best_model = gs.best_estimator['mae']
trainset = data.build_full_trainset()
best_model.fit(trainset)  # refit on all development ratings, not just CV folds

# Load leaderboard dataset
ratings_masked_path = "data_movie_lens_100k/ratings_masked_leaderboard_set.csv"
ratings_masked_df = pd.read_csv(ratings_masked_path)

# Generate one estimated rating per leaderboard row, in file order.
# Iterating the two id columns directly avoids DataFrame.iterrows(), which
# builds a Series per row and may upcast integer ids to float when the frame
# has mixed dtypes.
predictions = [
    best_model.predict(user_id, item_id).est
    for user_id, item_id in zip(ratings_masked_df['user_id'],
                                ratings_masked_df['item_id'])
]

# Save predictions to a file (one value per line, numpy's default format)
output_path = "predicted_ratings_leaderboard.txt"
np.savetxt(output_path, predictions)
print(f"Predictions saved to {output_path}")

Predictions saved to predicted_ratings_leaderboard.txt


In [9]:
from surprise import Dataset, Reader, SVDpp
from surprise.model_selection import GridSearchCV
import pandas as pd

# Seed Python's and NumPy's global RNGs before anything stochastic runs so the
# cross-validation split and the SVD++ initialisation are repeatable under
# "Restart Kernel -> Run All" (recipe from the Surprise FAQ on reproducibility).
import random

import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Load the development ratings and wrap them in a Surprise dataset.
# load_from_df expects the columns in (user, item, rating) order.
ratings_all_path = "data_movie_lens_100k/ratings_all_development_set.csv"
ratings_all_df = pd.read_csv(ratings_all_path)

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_all_df[['user_id', 'item_id', 'rating']], reader)

# Define the parameter grid for SVD++ (3 x 2 x 2 = 12 combinations)
param_grid = {
    'n_factors': [20, 50, 100],  # Number of latent factors
    'lr_all': [0.002, 0.005],    # Learning rate
    'reg_all': [0.02, 0.1],      # Regularization strength
}

# Perform grid search with SVD++, scoring each combination by 3-fold MAE.
# NOTE(review): the SVD search in this notebook uses cv=5, so the two
# "Best MAE" figures are not measured on the same splits -- compare with care.
gs = GridSearchCV(SVDpp, param_grid, measures=['mae'], cv=3)
gs.fit(data)

# Print the best MAE score and parameters
print("Best MAE:", gs.best_score['mae'])
print("Best Parameters:", gs.best_params['mae'])

import numpy as np

# Use the best model to generate predictions for the leaderboard dataset:
# take the best-MAE estimator from the grid search and refit it on every
# development rating rather than only on the CV training folds.
best_model = gs.best_estimator['mae']
trainset = data.build_full_trainset()
best_model.fit(trainset)

# Load leaderboard dataset
ratings_masked_path = "data_movie_lens_100k/ratings_masked_leaderboard_set.csv"
ratings_masked_df = pd.read_csv(ratings_masked_path)

# Generate one estimated rating per leaderboard row, in file order.
# Iterating the two id columns directly avoids DataFrame.iterrows(), which
# builds a Series per row and may upcast integer ids to float when the frame
# has mixed dtypes.
predictions = [
    best_model.predict(user_id, item_id).est
    for user_id, item_id in zip(ratings_masked_df['user_id'],
                                ratings_masked_df['item_id'])
]

# Save predictions to a file (one value per line, six decimal places)
output_path = "predicted_ratings_leaderboard_svdpp.txt"
np.savetxt(output_path, predictions, fmt='%.6f')
print(f"Predictions saved to {output_path}")

Best MAE: 0.7347440322379115
Best Parameters: {'n_factors': 20, 'lr_all': 0.005, 'reg_all': 0.02}
Predictions saved to predicted_ratings_leaderboard_svdpp.txt
