In [8]:
import pandas as pd
import numpy as np

# Read training data
# Pipe-separated file with one (user_id, item_id, rating) triple per line and no header row.
train_data = pd.read_csv("trainIdx2_matrix.txt", sep="|", names=["user_id", "item_id", "rating"])

# Preprocess test data
# Each line is split on "|": the first four fields are user, track, album and
# artist ids and every remaining field is treated as a genre id. The number of
# genre fields varies per line, so the file is parsed by hand and the ragged
# rows are padded when the DataFrame is built.
test_rows = []
with open("testTrack_hierarchy.txt", "r") as file:
    for line_number, line in enumerate(file, start=1):
        stripped = line.strip()
        if not stripped:
            # Blank lines (e.g. a trailing newline at end of file) carry no record.
            continue
        tokens = stripped.split("|")
        if len(tokens) < 4:
            # Fail with a message that points at the offending line instead of
            # an opaque tuple-unpacking error.
            raise ValueError(
                f"testTrack_hierarchy.txt line {line_number}: expected at least 4 "
                f"'|'-separated fields, got {len(tokens)}"
            )
        test_rows.append(tokens)

# Convert the preprocessed data to a DataFrame
# The widest row decides how many genre_<i> columns exist; shorter rows are
# padded with missing values by the DataFrame constructor. `default=4` keeps an
# empty input file from raising inside max().
n_genres = max((len(tokens) for tokens in test_rows), default=4) - 4
column_names = ["user_id", "track_id", "album_id", "artist_id"] + [f"genre_{i}" for i in range(1, n_genres + 1)]
test_data = pd.DataFrame(test_rows, columns=column_names)

# Convert user_id and track_id columns to integers
# (all fields were read as strings; the other columns are left as strings)
test_data["user_id"] = test_data["user_id"].astype(int)
test_data["track_id"] = test_data["track_id"].astype(int)

# Create user-item ratings matrix
# Rows are users, columns are items. Missing (user, item) pairs are filled with
# 0, so downstream code uses 0 as the "not rated" marker.
# NOTE(review): a genuine rating of 0 becomes indistinguishable from "not rated" - confirm
# that the rating scale makes this acceptable.
user_item_matrix = train_data.pivot_table(index="user_id", columns="item_id", values="rating").fillna(0)

def pearson_correlation(user1, user2):
    """Pearson correlation between two users over the items both have rated.

    Reads the module-level ``user_item_matrix`` (users x items), in which a
    value of 0 marks an unrated item. Only items with a non-zero rating from
    both users take part, and each user's mean is computed over those shared
    items only.

    Args:
        user1: Row label of the first user in ``user_item_matrix``.
        user2: Row label of the second user in ``user_item_matrix``.

    Returns:
        The correlation coefficient, or 0 when the users share no rated items
        or when either user's shared ratings have zero variance.
    """
    ratings_a = user_item_matrix.loc[user1]
    ratings_b = user_item_matrix.loc[user2]

    both_rated = ratings_a.ne(0) & ratings_b.ne(0)
    if not both_rated.any():
        return 0

    shared_a = ratings_a[both_rated]
    shared_b = ratings_b[both_rated]
    centered_a = shared_a - shared_a.mean()
    centered_b = shared_b - shared_b.mean()

    covariance = np.sum(centered_a * centered_b)
    spread = np.sqrt(np.sum(centered_a ** 2) * np.sum(centered_b ** 2))
    if spread == 0:
        # Constant ratings on the shared items: correlation is undefined.
        return 0
    return covariance / spread

def recommend_tracks(user_id, n_recommendations=5, similarity_threshold=0.2):
    """Recommend tracks the user has not rated, based on similar users' ratings.

    Every other user in the module-level ``user_item_matrix`` is compared with
    ``user_id`` via ``pearson_correlation``; those at or above
    ``similarity_threshold`` become neighbours. Each track is scored with the
    similarity-weighted mean of the neighbours' ratings (an unrated track
    counts as 0), and the best-scoring tracks that ``user_id`` has not rated
    are returned.

    Args:
        user_id: Row label of the target user.
        n_recommendations: Maximum number of tracks to return.
        similarity_threshold: Minimum correlation for a user to be a neighbour.

    Returns:
        A list of at most ``n_recommendations`` track ids, best first. The list
        is empty when the user is absent from ``user_item_matrix``, when no
        other user is similar enough, or when no neighbour has rated any track
        the user is missing.
    """
    if user_id not in user_item_matrix.index:
        # Cold-start user: there is no rating row to compare against.
        return []

    # The target user is left out of their own neighbourhood: their ratings on
    # the candidate tracks are all 0 by definition, so they add no information.
    other_users = user_item_matrix.index.drop(user_id)
    similarities = pd.Series(
        [pearson_correlation(user_id, other) for other in other_users],
        index=other_users,
        dtype=float,
    )
    neighbours = similarities[similarities >= similarity_threshold]
    if neighbours.empty:
        return []

    # Similarity-weighted mean rating per track, computed for all tracks at once.
    neighbour_ratings = user_item_matrix.loc[neighbours.index]
    scores = neighbour_ratings.mul(neighbours, axis=0).sum(axis=0) / neighbours.sum()

    # Keep only tracks the user has not rated and that at least one neighbour
    # has rated (score > 0); a zero score carries no evidence for the track.
    not_rated_by_user = user_item_matrix.loc[user_id].eq(0)
    candidates = scores[not_rated_by_user & scores.gt(0)].sort_values(ascending=False)
    return candidates.head(n_recommendations).index.tolist()

# Make recommendations for test tracks
# recommend_tracks() depends only on the user, so it is evaluated once per
# distinct user instead of once per test row; the result is kept as a set for
# cheap membership checks.
recommended_by_user = {
    user_id: set(recommend_tracks(user_id))
    for user_id in test_data["user_id"].unique()
}
# 1 when the row's track is among the tracks recommended to its user, else 0.
test_data["recommended"] = [
    int(track_id in recommended_by_user[user_id])
    for user_id, track_id in zip(test_data["user_id"], test_data["track_id"])
]

# Save results to a CSV file
# .copy() makes `output` an independent frame, so adding a column to it does
# not raise pandas' SettingWithCopyWarning.
output = test_data[["user_id", "track_id", "recommended"]].copy()
# Composite "<user_id>_<track_id>" key identifying each test row.
output["user_track"] = output["user_id"].astype(str) + "_" + output["track_id"].astype(str)
output[["user_track", "recommended"]].to_csv("recommendations.csv", index=False,



  test_data = pd.read_csv("data/testTrack_hierarchy.txt", sep="|", names=["user_id", "track_id", "album_id", "artist_id", *[f"genre_{i}" for i in range(1, 8)]], error_bad_lines=False)
Skipping line 14: expected 11 fields, saw 13
Skipping line 25: expected 11 fields, saw 12
Skipping line 68: expected 11 fields, saw 12
Skipping line 73: expected 11 fields, saw 14
Skipping line 79: expected 11 fields, saw 12
Skipping line 135: expected 11 fields, saw 12
Skipping line 145: expected 11 fields, saw 12
Skipping line 147: expected 11 fields, saw 15
Skipping line 237: expected 11 fields, saw 13
Skipping line 268: expected 11 fields, saw 15
Skipping line 297: expected 11 fields, saw 12
Skipping line 330: expected 11 fields, saw 14
Skipping line 332: expected 11 fields, saw 13
Skipping line 383: expected 11 fields, saw 13
Skipping line 418: expected 11 fields, saw 13
Skipping line 422: expected 11 fields, saw 13
Skipping line 440: expected 11 fields, saw 14
Skipping line 448: expected 11 fields

: 

: 

In [None]:

# Create similarity matrix
def create_similarity_matrix(matrix):
    """Compute pairwise cosine similarity between the rows of ``matrix``.

    Args:
        matrix: DataFrame whose rows are the vectors to compare (all values
            must be numeric).

    Returns:
        A square float DataFrame indexed and labelled by ``matrix.index`` on
        both axes. Entry ``[a, b]`` is the cosine similarity of rows ``a`` and
        ``b``. The diagonal is always 1, and any pair involving an all-zero
        row gets similarity 0 because the cosine is undefined there.
    """
    values = matrix.to_numpy(dtype=float)

    # Scale every row to unit length; the dot product of two unit vectors is
    # their cosine similarity. All-zero rows keep a divisor of 1 so they stay
    # zero instead of producing a division by zero.
    norms = np.linalg.norm(values, axis=1)
    safe_norms = np.where(norms == 0, 1.0, norms)
    unit_rows = values / safe_norms[:, np.newaxis]

    similarity = unit_rows @ unit_rows.T
    # Rounding can push values marginally outside the valid cosine range.
    similarity = np.clip(similarity, -1.0, 1.0)
    np.fill_diagonal(similarity, 1.0)

    return pd.DataFrame(similarity, index=matrix.index, columns=matrix.index)

# Build the user-user similarity table once from the rows of user_item_matrix;
# recommend() reads `user_similarity` as a module-level global.
user_similarity = create_similarity_matrix(user_item_matrix)

# Define recommendation function
def recommend(user_id, track_id, k=5, threshold=50):
    """Decide whether ``track_id`` should be recommended to ``user_id``.

    The predicted rating is the similarity-weighted mean of the ratings that
    the ``k`` most similar other users gave to the track, read from the
    module-level ``user_similarity`` and ``user_item_matrix`` tables.

    Args:
        user_id: Label of the target user in ``user_similarity``.
        track_id: Column label of the track in ``user_item_matrix``.
        k: Number of nearest neighbours to average over.
        threshold: Predicted rating that must be exceeded to recommend.

    Returns:
        1 if the predicted rating is greater than ``threshold``, otherwise 0.
        Also 0 when the user or the track is unknown, or when the neighbours'
        similarities sum to zero (no usable evidence).
    """
    if user_id not in user_similarity.columns or track_id not in user_item_matrix.columns:
        # Unknown user or track: nothing to base a prediction on.
        return 0

    # Get k most similar users
    # The user is removed by label before ranking, so a neighbour whose
    # similarity ties with the user's self-similarity cannot be dropped in the
    # user's place. astype(float) also lets nlargest work on object-dtype tables.
    neighbour_weights = (
        user_similarity[user_id]
        .drop(user_id, errors="ignore")
        .astype(float)
        .dropna()
        .nlargest(k)
    )

    total_weight = neighbour_weights.sum()
    if total_weight == 0:
        # No neighbours, or only neighbours with zero similarity.
        return 0

    # Calculate the predicted rating
    # NOTE(review): a 0 in user_item_matrix appears to mean "not rated" (the
    # matrix is built with fillna(0)), so neighbours who never rated the track
    # pull the prediction toward 0 - confirm this is intended.
    neighbour_ratings = user_item_matrix.loc[neighbour_weights.index, track_id]
    predicted_rating = (neighbour_ratings * neighbour_weights).sum() / total_weight

    # Apply the threshold for recommendation
    return 1 if predicted_rating > threshold else 0

# Generate recommendations
# Each entry is one output line of the form "<user_id>_<track_id>,<prediction>".
# The two id columns are iterated directly rather than through
# DataFrame.iterrows(): iterrows() does not preserve dtypes across a row, so
# integer ids can be upcast to float and formatted as "123.0" when the frame
# also holds float columns.
recommendations = [
    f"{user_id}_{track_id},{recommend(user_id, track_id)}"
    for user_id, track_id in zip(test_data["user_id"], test_data["track_id"])
]

# Save recommendations to a file
with open("recommendations.txt", "w") as f:
    f.write("TrackID,Predictor\n")
    f.writelines(f"{line}\n" for line in recommendations)
