In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from joblib import Parallel, delayed



In [2]:
song_df_1 = pd.read_csv('triplets_file.csv')
song_df_2 = pd.read_csv('song_data.csv')

In [3]:
song_df_1.columns = ['user_id', 'song_id', 'listen_count']

In [4]:
song_df = pd.merge(song_df_1, song_df_2.drop_duplicates(['song_id']), on="song_id", how="left")

In [5]:
song_df.rename({'title': 'song'}, axis=1, inplace=True)

In [6]:
song_df['user_id'] = preprocessing.LabelEncoder().fit_transform(song_df.user_id)

In [7]:
song_df['song_id'] = preprocessing.LabelEncoder().fit_transform(song_df.song_id)

In [8]:
song_df.head(100)

Unnamed: 0,user_id,song_id,listen_count,song,release,artist_name,year
0,54961,153,1,The Cove,Thicker Than Water,Jack Johnson,0
1,54961,413,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,54961,736,1,Stronger,Graduation,Kanye West,2007
3,54961,750,1,Constellations,In Between Dreams,Jack Johnson,2005
4,54961,1188,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999
...,...,...,...,...,...,...,...
95,66678,1664,3,Horn Concerto No. 4 in E flat K495: II. Romanc...,Mozart - Eine kleine Nachtmusik,Barry Tuckwell/Academy of St Martin-in-the-Fie...,0
96,66678,1731,4,Rhyme & Reason,Listener Supported,DAVE MATTHEWS BAND,1994
97,66678,2220,3,Sehr kosmisch,Musik von Harmonia,Harmonia,0
98,66678,3305,1,Someone Else's Arms,Everglow_ The,Mae,2005


In [9]:
song_grouped = song_df.groupby(['user_id']).agg({'listen_count': 'count'}).reset_index()

In [10]:
song_grouped

Unnamed: 0,user_id,listen_count
0,0,7
1,1,5
2,2,9
3,3,10
4,4,9
...,...,...
76348,76348,44
76349,76349,11
76350,76350,17
76351,76351,7


In [11]:
song_grouped.rename(columns = {'listen_count':'user_listen_count'}, inplace = True)

In [12]:
song_grouped

Unnamed: 0,user_id,user_listen_count
0,0,7
1,1,5
2,2,9
3,3,10
4,4,9
...,...,...
76348,76348,44
76349,76349,11
76350,76350,17
76351,76351,7


In [13]:
song_df = pd.merge(song_df, song_grouped, on="user_id", how="left")

In [14]:
song_df

Unnamed: 0,user_id,song_id,listen_count,song,release,artist_name,year,user_listen_count
0,54961,153,1,The Cove,Thicker Than Water,Jack Johnson,0,45
1,54961,413,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976,45
2,54961,736,1,Stronger,Graduation,Kanye West,2007,45
3,54961,750,1,Constellations,In Between Dreams,Jack Johnson,2005,45
4,54961,1188,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999,45
...,...,...,...,...,...,...,...,...
1999995,64590,3660,2,Ignorance (Album Version),Ignorance,Paramore,0,53
1999996,64590,3736,4,Two Is Better Than One,Love Drunk,Boys Like Girls featuring Taylor Swift,2009,53
1999997,64590,3744,3,What I've Done (Album Version),What I've Done,Linkin Park,2007,53
1999998,64590,3893,1,Up,My Worlds,Justin Bieber,2010,53


In [15]:
song_df['rating']=song_df['listen_count']/song_df['user_listen_count']

In [16]:
song_df

Unnamed: 0,user_id,song_id,listen_count,song,release,artist_name,year,user_listen_count,rating
0,54961,153,1,The Cove,Thicker Than Water,Jack Johnson,0,45,0.022222
1,54961,413,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976,45,0.044444
2,54961,736,1,Stronger,Graduation,Kanye West,2007,45,0.022222
3,54961,750,1,Constellations,In Between Dreams,Jack Johnson,2005,45,0.022222
4,54961,1188,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999,45,0.022222
...,...,...,...,...,...,...,...,...,...
1999995,64590,3660,2,Ignorance (Album Version),Ignorance,Paramore,0,53,0.037736
1999996,64590,3736,4,Two Is Better Than One,Love Drunk,Boys Like Girls featuring Taylor Swift,2009,53,0.075472
1999997,64590,3744,3,What I've Done (Album Version),What I've Done,Linkin Park,2007,53,0.056604
1999998,64590,3893,1,Up,My Worlds,Justin Bieber,2010,53,0.018868


In [17]:
user_item_matrix = csr_matrix((song_df['listen_count'], (song_df['user_id'], song_df['song_id'])))
user_item_matrix = user_item_matrix.tocsr()

In [18]:
def compute_svd(user_item_matrix_csr, n_components):
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    user_factors = svd.fit_transform(user_item_matrix_csr)
    item_factors = svd.components_.T
    return user_factors, item_factors

In [19]:
n_components = 50
num_cores = 4
results = Parallel(n_jobs=num_cores)(delayed(compute_svd)(user_item_matrix, n_components) for _ in range(num_cores))
user_factors = np.concatenate([result[0] for result in results], axis=1)
item_factors = np.concatenate([result[1] for result in results], axis=1)

In [22]:
def get_recommendations(user_id, n=10):
    user_vector = user_factors[user_id, :]
    scores = item_factors.dot(user_vector)
    top_item_indices = np.argsort(scores)[::-1][:n]
    top_items = np.arange(user_item_matrix.shape[1])[top_item_indices]
    recommendations = song_df[song_df['song_id'].isin(top_items)][['song_id', 'song']].drop_duplicates()
    return recommendations

In [23]:
user_id = 22570
recommendations = get_recommendations(user_id)
print("Top 10 recommended songs for user_id {}: ".format(user_id))
print(recommendations)

Top 10 recommended songs for user_id 22570: 
       song_id                                          song
76        1811  Ghosts 'n' Stuff (Original Instrumental Mix)
94        1334                              Hey_ Soul Sister
108       8138                                Drop The World
118       4152                                 The Scientist
121       6293                                        Yellow
158       4864                         Sinisten tähtien alla
652       7496                                      The Gift
2518      2893                                   Firestarter
2840      4558                              Ode To My Family
32718     8723               Hymne A L'Amour (Album Version)
