## Matrix Factorization Song Recommendation System

In [4]:
import pandas as pd
import numpy as np
import random
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split


In [6]:
NUM_PRED = 5

In [7]:
df_songs = pd.read_csv("cleaned_data/csr_df.csv")
df_songs.shape

(1855, 8516)

In [8]:
random_sample = df_songs.sample(axis='columns')
print(random_sample)

      blame it on waylon _ josh thompson
0                                      0
1                                      0
2                                      0
3                                      0
4                                      0
...                                  ...
1850                                   0
1851                                   0
1852                                   0
1853                                   0
1854                                   0

[1855 rows x 1 columns]


In [160]:
songs = df_songs.to_numpy()

#### i think this doesn't do anything?

In [161]:
x,y = songs.reshape((songs.shape[0], songs.shape[1])), range(songs.shape[0])

In [162]:
# Create a new user:
new_user_songs = ['greyhound _ swedish house mafia', 'saturday night _ the herbaliser', 'time to pretend _ mgmt']
# Create a new user row (all 0s, then set selected songs to 1)
new_user_row = np.zeros(songs.shape[1])
for song in new_user_songs:
    if song in df_songs.columns:
        new_user_row[df_songs.columns.get_loc(song)] = 1

In [163]:
new_user_row

array([0., 0., 0., ..., 0., 0., 0.])

In [164]:
list_of_songs = []

In [165]:
for i in range(5):
    # random seed
    rand = int(random.random()*100)

    # split into test and training data (20, 80)
    x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.24,random_state=42)

    # TruncatedSVD is a dimensionality reduction technique (similar to PCA)
    svd = TruncatedSVD(n_components=200, n_iter=10, random_state=rand)
    svd.fit(x)

    # fit the training data
    # contains the transformed low-dimensional representation
    train_mat = svd.transform(x_train)

    # dot multiplication of the training matrix and the svd components
    # meaning: reconstructs an approximate version of the original data
    approx_matrix = np.dot(train_mat, svd.components_)

    # Projecting the new user onto the reduced feature space
    new_user_mat = np.dot(new_user_row, svd.components_.T)

    # Reconstructing predictions in the original space
    new_predictions = np.dot(new_user_mat, svd.components_)

    # Sorts indices of predicted ratings in descending order (highest ratings first)
    recommendations = np.argsort(-new_predictions)

    # Selects the top NUM_PRED recommendations
    recommendations = [i for i in recommendations][:NUM_PRED]

    # Mapping indices to song titles
    recommended_songs = [df_songs.columns[i] for i in recommendations]

    print(f"{i+1}. Top Recommendations for New User:")
    print(recommended_songs)
    for song in recommended_songs:
        list_of_songs.append(song)

1. Top Recommendations for New User:
['million years _ gareth emery', "fool's gold _ jill scott", 'firefly _ breaking benjamin', 'greyhound _ swedish house mafia', 'girlfriend - single version _ bobby brown']
2. Top Recommendations for New User:
['girlfriend - single version _ bobby brown', 'frikitona _ plan b', "fool's gold _ jill scott", 'bigfoot _ w&w', 'detroit vs. everybody _ eminem']
3. Top Recommendations for New User:
['million years _ gareth emery', 'greyhound _ swedish house mafia', 'girlfriend - single version _ bobby brown', 'bigfoot _ w&w', 'love rain - (coffee shop mix) _ mos def']
4. Top Recommendations for New User:
['firefly _ breaking benjamin', 'bigfoot _ w&w', 'radical - original mix _ dyro', 'greyhound _ swedish house mafia', 'better in time _ leona lewis']
5. Top Recommendations for New User:
['bigfoot _ w&w', 'girlfriend - single version _ bobby brown', 'radical - original mix _ dyro', 'better in time _ leona lewis', 'million years _ gareth emery']


In [166]:
df_songs.values.sum()

8516

### sorts the song recommendations 

##### set - gets rid of duplicates
##### sorts the songs based on their frequency in list of songs st least frequent appears first

In [167]:
res = sorted(set(list_of_songs), key = lambda ele: list_of_songs.count(ele))
print(len(res))
res = res[len(res)-5:]
print(res)

11
['better in time _ leona lewis', 'million years _ gareth emery', 'greyhound _ swedish house mafia', 'girlfriend - single version _ bobby brown', 'bigfoot _ w&w']


In [168]:
top_songs = []

In [169]:
# res = sorted(set(list_of_songs), key = lambda ele: list_of_songs.count(ele))
i = len(res) - 1
while (i > len(res) - NUM_PRED - 1):
    top_songs.append(res[i])
    i = i - 1
print(top_songs)

['bigfoot _ w&w', 'girlfriend - single version _ bobby brown', 'greyhound _ swedish house mafia', 'million years _ gareth emery', 'better in time _ leona lewis']


In [None]:
def mf_recommender(user_songs, num_pred=5, data_path="cleaned_data/csr_df.csv", num_components=200, num_iter=10):
    """
    Recommends songs using Matrix Factorization (SVD).
    
    Parameters:
        user_songs (list): List of song names liked by the user.
        data_path (str): Path to the song interaction dataset.
        num_components (int): Number of components for SVD.
        num_iter (int): Number of iterations for SVD.

    Returns:
        list: Top recommended songs.
    """
    # Load dataset
    df_songs = pd.read_csv(data_path)

    # Convert to NumPy array
    songs = df_songs.to_numpy()

    # Initialize new user row (all 0s)
    new_user_row = np.zeros(songs.shape[1])
    
    # Mark songs the user likes
    for song in user_songs:
        if song in df_songs.columns:
            new_user_row[df_songs.columns.get_loc(song)] = 1

    list_of_songs = []

    for _ in range(5):  # Run multiple times for better recommendations
        rand = int(random.random() * 100)  # Random seed

        # Prepare training data
        x, y = songs.reshape((songs.shape[0], songs.shape[1])), range(songs.shape[0])
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.24, random_state=42)

        # Apply SVD for dimensionality reduction
        svd = TruncatedSVD(n_components=num_components, n_iter=num_iter, random_state=rand)
        svd.fit(x)

        # Transform training data
        train_mat = svd.transform(x_train)

        # Reconstruct an approximate version of original data
        approx_matrix = np.dot(train_mat, svd.components_)

        # Project new user onto reduced feature space
        new_user_mat = np.dot(new_user_row, svd.components_.T)

        # Reconstruct predictions in original space
        new_predictions = np.dot(new_user_mat, svd.components_)

        # Sort indices of predicted ratings in descending order
        recommendations = np.argsort(-new_predictions)[:num_pred]

        # Convert indices to song names
        recommended_songs = [df_songs.columns[i] for i in recommendations]
        list_of_songs.extend(recommended_songs)

    # Sort songs by frequency of occurrence in recommendations
    sorted_songs = sorted(set(list_of_songs), key=lambda song: list_of_songs.count(song))
    
    # Select the top num_pred songs (returns a list of recommended track keys)
    return sorted_songs[-num_pred:]

# Test 
user_songs = ['greyhound _ swedish house mafia', 'saturday night _ the herbaliser', 'time to pretend _ mgmt']
recommended_songs = mf_recommender(user_songs)
print("Recommended Songs:", recommended_songs)

Recommended Songs: ['greyhound _ swedish house mafia', 'detroit vs. everybody _ eminem', "fool's gold _ jill scott", 'radical - original mix _ dyro', 'girlfriend - single version _ bobby brown']


In [None]:
# from matplotlib_venn import venn3

# # Exploring how similar the outputs from the three methods are (how many recommended songs overlap)

# recommended_songs_euc = [song[0] for song in closest_songs_euc]

# # Create a Venn diagram
# venn3([set(top_songs), set(recommended_songs_euc), set(recommended_songs_cos)], set_labels=('SVD', 'Euclidean', 'Cosine'))
# plt.show()

# print([set(top_songs)])
# print([set(recommended_songs_euc)])
# print([set(recommended_songs_cos)])

ModuleNotFoundError: No module named 'matplotlib_venn'