In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split


# Minimum number of ratings a user needs in order to be kept by the filter
# further down; adjust as required
min_ratings_threshold = 0

# Input and output locations, relative to the working directory
training_data_dir = '../data/trainIdx2_matrix.txt'
track_data_dir = '../data/trackData2.txt'
output_dir = "../data/proccessed/dump.csv"

# Load the pipe-delimited ratings file. It is read as headerless, so the
# column names are supplied explicitly.
train_df = pd.read_csv(
    training_data_dir,
    sep='|',
    header=None,
    names=['userId', 'itemId', 'rating'],
)


# Parse the pipe-delimited track file one line at a time so the raw text is
# never held in memory alongside the parsed rows. The first field of every
# line is discarded; the remaining fields are kept in order (track id, album
# id, artist id, then zero or more genre ids).
track_data = []
with open(track_data_dir, 'r') as file:
    for line in file:
        fields = line.strip().split('|')
        # Blank or single-field lines carry no track id; skip them instead of
        # failing with an IndexError
        if len(fields) < 2:
            continue
        track_data.append(fields[1:])

# Rows may have different lengths (one entry per genre); pandas pads the
# shorter rows with missing values
track_data_df = pd.DataFrame(track_data)

# Replace the padding with empty strings so absent genres are easy to detect
track_data_df = track_data_df.fillna("")

# Every column beyond the first three holds a genre id
if track_data_df.shape[1] < 3:
    raise ValueError(
        f"{track_data_dir} must provide at least a track, album and artist "
        f"id per line; found {track_data_df.shape[1]} usable column(s)")
max_genres = track_data_df.shape[1] - 3

# Name the columns: three fixed ids followed by genreId_1 .. genreId_N
track_data_df.columns = ['trackId', 'albumId', 'artistId'] + \
    [f'genreId_{i}' for i in range(1, max_genres + 1)]

# Normalise the join key of each frame: parse it as a number (unparseable
# values become NaN), drop the rows that failed to parse, then store the key
# as an integer so both sides of the merge share the same dtype
for frame, id_column in ((train_df, 'itemId'), (track_data_df, 'trackId')):
    frame[id_column] = pd.to_numeric(frame[id_column], errors='coerce')
    frame.dropna(subset=[id_column], inplace=True)
    frame[id_column] = frame[id_column].astype(int)

# Names of the padded per-genre columns created when the track file was parsed
genre_columns = [f'genreId_{i}' for i in range(1, max_genres + 1)]

# Collapse the genre columns into one list per track BEFORE merging. The
# track table has one row per track, so this is far cheaper than filtering a
# list for every rating row after the merge, and the wide genre columns never
# have to be replicated across the merged frame.
track_features_df = track_data_df[['trackId', 'albumId', 'artistId']].copy()
track_features_df['genres'] = [
    # Empty strings are padding for tracks with fewer genres; drop them
    [genre for genre in row if genre != ""]
    for row in track_data_df[genre_columns].values.tolist()
]

# Attach the ratings to the tracks. A left merge keeps every track, so tracks
# that nobody rated appear once with NaN userId/rating (which is also why
# those two columns end up as floats).
merged_df = track_features_df.merge(
    train_df, left_on='trackId', right_on='itemId', how='left')

# 'itemId' duplicates 'trackId' after the merge; drop it and move 'genres' to
# the last position so the column order is trackId, albumId, artistId, the
# rating columns, genres
merged_df = merged_df.drop(columns=['itemId'])
merged_df['genres'] = merged_df.pop('genres')



# Keep only the rows whose user has at least `min_ratings_threshold` ratings.
# NOTE: value_counts() ignores missing values, so rows without a userId
# (e.g. tracks that matched no rating) are never in the allowed set and are
# always dropped here, even when the threshold is 0.
user_rating_counts = merged_df['userId'].value_counts()
users_with_min_ratings = user_rating_counts[user_rating_counts >=
                                            min_ratings_threshold].index
filtered_df = merged_df[merged_df['userId'].isin(users_with_min_ratings)]

# Row counts before and after filtering
original_row_count = len(merged_df)
filtered_row_count = len(filtered_df)

# Share of rows removed by the filter; an empty input removes nothing, and
# guarding here avoids a ZeroDivisionError
if original_row_count:
    percentage_removed = (
        (original_row_count - filtered_row_count) / original_row_count) * 100
else:
    percentage_removed = 0.0

# Report the effect of the filter
print(f"Original row count: {original_row_count}")
print(f"Filtered row count: {filtered_row_count}")
print(f"Percentage of data removed: {percentage_removed:.2f}%")


# From here on work with the filtered rows, ordered by user
merged_df = filtered_df.sort_values(by='userId')

# Quick visual sanity check of the result
print(merged_df.head())

# Make sure the destination directory exists so the export does not fail
# after all the processing above has already been done
Path(output_dir).parent.mkdir(parents=True, exist_ok=True)

# Persist the processed table without the DataFrame index
merged_df.to_csv(output_dir, index=False)


Original row count: 22128003
Filtered row count: 22128002
Percentage of data removed: 0.00%
          trackId albumId artistId    userId  rating                   genres
21224082   204650  177418   131552  199810.0    50.0                       []
10586553     9774   79500   158282  199810.0    50.0         [242383, 207648]
5006151      9774   79500   158282  199810.0    50.0  [242383, 207648, 47898]
5146696     26374  153568   158282  199810.0    50.0          [81520, 242383]
10580710   271229  293464   279143  199811.0    70.0          [173655, 98154]
