In [18]:
import numpy
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
import pandas as pd

file_name_test = "data/" + 'testTrack_hierarchy.txt'
file_name_train = "data/" + 'trainIdx2_matrix.txt'
output_file = 'output1.txt'

# Load the training data into a DataFrame
train_data = pd.read_csv(file_name_train, sep='|', header=None, names=["userID", "itemID", "score"])

# Prepare the dataset for matrix factorization
reader = Reader(rating_scale=(train_data['score'].min(), train_data['score'].max()))
data = Dataset.load_from_df(train_data, reader)

print(train_data.head())

   userID  itemID  score
0  199808  248969     90
1  199808    2663     90
2  199808   28341     90
3  199808   42563     90
4  199808   59092     90


In [3]:
# Split the dataset into a training set and a validation set
trainset, testset = train_test_split(data, test_size=0.2)

# Grid search for finding the best hyperparameters
param_grid = {
    "n_epochs": [5, 10, 20],
    "lr_all": [0.002, 0.005, 0.01],
    "reg_all": [0.02, 0.1, 0.5]
}

gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3, n_jobs=-1)
gs.fit(data)

print("Best RMSE score:", gs.best_score["rmse"])
print("Best parameters:", gs.best_params["rmse"])

# Train the SVD model using the best hyperparameters
algorithm = gs.best_estimator["rmse"]
algorithm.fit(trainset)





Best RMSE score: 25.775363641911724
Best parameters: {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.5}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x4932248e0>

In [16]:
test_data_list = []

with open(file_name_test, 'r') as f:
    for line in f:
        fields = line.strip().split('|')
        user_id, track_id, album_id, artist_id = fields[:4]
        genres = fields[4:]

        test_data_list.append({
            "userID": user_id,
            "trackID": track_id,
            "albumID": album_id,
            "artistID": artist_id,
            "genres": genres
        })

test_data = pd.DataFrame(test_data_list)

# Predict the user preference scores for the test dataset
predictions = []
for index, row in test_data.iterrows():
    user_id, track_id, album_id, artist_id = row["userID"], row["trackID"], row["albumID"], row["artistID"]
    
    # Predict the ratings for the album and artist using the trained model, if available
    album_pred = algorithm.predict(user_id, album_id) if album_id != 'None' else 0
    artist_pred = algorithm.predict(user_id, artist_id) if artist_id != 'None' else 0
    

    alb_pred = album_pred.est if album_pred else 0
    art_pred=  artist_pred.est if artist_pred else 0
	
    # Calculate the total rating
    total_rating = alb_pred + art_pred
        
    # Apply threshold for binary output (1 if the user would like it, 0 otherwise)
    like_or_not = 1 if total_rating >= 0.5 else 0
    
    predictions.append([f"{user_id}_{track_id}", like_or_not])

# Save predictions to a CSV file
predictions_df = pd.DataFrame(predictions, columns=["TrackID", "Predictor"])
predictions_df.to_csv("predictions.csv", index=False)