In [3]:
pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp39-cp39-linux_x86_64.whl size=3195783 sha256=b373f4f1e4286cc84d1bcef581797df7f4a5f4c98467f4cc3744d308c1f72aed
  Stored in directory: /root/.cache/pip/wheels/c6/3a/46/9b17b3512bdf283c6cb84f59929cdd5199d4e754d596d22784
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [4]:
import pandas as pd
from surprise import Reader, Dataset, KNNWithMeans
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse

# Read training data. Supplying `names` makes pandas treat the file as
# header-less, pipe-separated (user_id, item_id, rating) rows.
train_data = pd.read_csv("trainIdx2_matrix.txt", sep="|", names=["user_id", "item_id", "rating"])

# Parse the test hierarchy file. Each line is pipe-separated: the first four
# fields are labelled user_id, track_id, album_id, artist_id, and every
# additional field becomes a genre_N column. Lines may differ in length.
ID_COLUMNS = ["user_id", "track_id", "album_id", "artist_id"]
test_rows = []
with open("testTrack_hierarchy.txt", "r") as file:
    for line_number, line in enumerate(file, start=1):
        stripped = line.strip()
        if not stripped:
            # Blank lines (e.g. a trailing newline at EOF) carry no record.
            continue
        tokens = stripped.split("|")
        if len(tokens) < 2:
            # user_id and track_id are required by the prediction step.
            raise ValueError(
                f"testTrack_hierarchy.txt line {line_number}: expected at least "
                f"user_id|track_id, got {stripped!r}"
            )
        # Pad short records so every row has at least the four id columns.
        tokens += [None] * (len(ID_COLUMNS) - len(tokens))
        test_rows.append(tokens)

# Convert the parsed rows to a DataFrame. The widest row decides how many
# genre columns exist; pandas fills the missing trailing cells of shorter rows.
widest_row = max((len(row) for row in test_rows), default=len(ID_COLUMNS))
column_names = ID_COLUMNS + [f"genre_{i}" for i in range(1, widest_row - len(ID_COLUMNS) + 1)]
test_data = pd.DataFrame(test_rows, columns=column_names)

test_data.head()


  user_id track_id album_id artist_id genre_1 genre_2 genre_3 genre_4 genre_5  \
0  199810   208019   209288      None    None    None    None    None    None   
1  199810    74139   277282    271146  113360  173467  173655  192976  146792   
2  199810     9903     None      None   33722  123396   79926   73523    None   
3  199810   242681   190640    244574   61215   17453  274088    None    None   
4  199810    18515   146344     33168   19913   48505  154024    None    None   

  genre_6  ... genre_12 genre_13 genre_14 genre_15 genre_16 genre_17 genre_18  \
0    None  ...     None     None     None     None     None     None     None   
1   48505  ...     None     None     None     None     None     None     None   
2    None  ...     None     None     None     None     None     None     None   
3    None  ...     None     None     None     None     None     None     None   
4    None  ...     None     None     None     None     None     None     None   

  genre_19 genre_20 genre_

In [5]:
# The test ids were parsed from text, so cast the two lookup keys to integers.
for id_column in ("user_id", "track_id"):
    test_data[id_column] = test_data[id_column].astype(int)

# Wrap the training ratings for Surprise; ratings are declared on a 0-100 scale.
reader = Reader(rating_scale=(0, 100))
data = Dataset.load_from_df(df=train_data, reader=reader)

# Build the trainset from ALL available ratings (no hold-out split is made here).
trainset = data.build_full_trainset()

In [None]:
# Estimated rating (on the 0-100 scale) at or above which a track is recommended.
RECOMMEND_THRESHOLD = 50

# Create and train the collaborative filtering model: user-based k-NN with
# mean-centering, using up to 40 neighbours and pearson_baseline similarity.
# NOTE(review): user-based similarity is computed between every pair of users,
# so time and memory grow quadratically with the number of users. This cell has
# no recorded execution count and its output stops at "Estimating biases", so
# confirm the fit actually completes on the full training set.
algo = KNNWithMeans(k=40, sim_options={"name": "pearson_baseline", "user_based": True})
algo.fit(trainset)

# Score every (user, track) pair in the test set and flag the ones whose
# estimated rating reaches the threshold (1 = recommend, 0 = do not).
test_data["recommended"] = [
    int(algo.predict(user_id, track_id).est >= RECOMMEND_THRESHOLD)
    for user_id, track_id in zip(test_data["user_id"], test_data["track_id"])
]

# Save results to a CSV file. `.copy()` makes `output` an independent frame so
# adding the key column does not raise SettingWithCopyWarning.
output = test_data[["user_id", "track_id", "recommended"]].copy()
output["user_track"] = output["user_id"].astype(str) + "_" + output["track_id"].astype(str)
output[["user_track", "recommended"]].to_csv("recommendations_surprise.csv", index=False, header=["TrackID", "Predictor"])

Estimating biases using als...
