In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
# Raw inputs, read from the notebook's working directory.
# song_df_1: listening triplets (its columns are named in the next cell).
# song_df_2: per-song metadata, joined on 'song_id' further down.
song_df_1 = pd.read_csv(filepath_or_buffer='triplets_file.csv')
song_df_2 = pd.read_csv(filepath_or_buffer='song_data.csv')

In [3]:
# Give the triplet frame explicit column names (user, song, play count).
# NOTE(review): presumably the CSV already has a header row; if it does not,
# read_csv consumed the first data row as the header - confirm.
song_df_1.columns = pd.Index(['user_id', 'song_id', 'listen_count'])

In [4]:
# Keep a single metadata row per song_id so the left join cannot multiply
# interaction rows; every triplet is preserved even without metadata.
song_metadata = song_df_2.drop_duplicates(subset=['song_id'])
song_df = song_df_1.merge(song_metadata, on="song_id", how="left")

In [5]:
# The metadata calls the track name 'title'; the rest of the notebook uses 'song'.
song_df = song_df.rename(columns={'title': 'song'})

In [6]:
# Replace the raw user identifiers with integer codes.
user_encoder = preprocessing.LabelEncoder()
song_df['user_id'] = user_encoder.fit_transform(song_df['user_id'])

In [7]:
# Replace the raw song identifiers with integer codes.
song_encoder = preprocessing.LabelEncoder()
song_df['song_id'] = song_encoder.fit_transform(song_df['song_id'])

In [8]:
# Preview the first 100 merged rows.
song_df.iloc[:100]

Unnamed: 0,user_id,song_id,listen_count,song,release,artist_name,year
0,54961,153,1,The Cove,Thicker Than Water,Jack Johnson,0
1,54961,413,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,54961,736,1,Stronger,Graduation,Kanye West,2007
3,54961,750,1,Constellations,In Between Dreams,Jack Johnson,2005
4,54961,1188,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999
...,...,...,...,...,...,...,...
95,66678,1664,3,Horn Concerto No. 4 in E flat K495: II. Romanc...,Mozart - Eine kleine Nachtmusik,Barry Tuckwell/Academy of St Martin-in-the-Fie...,0
96,66678,1731,4,Rhyme & Reason,Listener Supported,DAVE MATTHEWS BAND,1994
97,66678,2220,3,Sehr kosmisch,Musik von Harmonia,Harmonia,0
98,66678,3305,1,Someone Else's Arms,Everglow_ The,Mae,2005


In [9]:
# Distinct encoded users, in order of first appearance; the cell shows the count.
users = pd.unique(song_df['user_id'])
len(users)

76353

In [10]:
# Distinct song titles (not song ids), in order of first appearance; the cell
# shows the count.
songs = pd.unique(song_df['song'])
len(songs)

9567

In [11]:
# Hold out 20% of the interaction rows; the fixed seed keeps the split repeatable.
train_data, test_data = train_test_split(
    song_df,
    test_size=0.2,
    random_state=0,
)

In [12]:
# Display the merged, label-encoded interaction table (pandas truncates the
# rendering of large frames, as in the saved output below).
song_df

Unnamed: 0,user_id,song_id,listen_count,song,release,artist_name,year
0,54961,153,1,The Cove,Thicker Than Water,Jack Johnson,0
1,54961,413,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,54961,736,1,Stronger,Graduation,Kanye West,2007
3,54961,750,1,Constellations,In Between Dreams,Jack Johnson,2005
4,54961,1188,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999
...,...,...,...,...,...,...,...
1999995,64590,3660,2,Ignorance (Album Version),Ignorance,Paramore,0
1999996,64590,3736,4,Two Is Better Than One,Love Drunk,Boys Like Girls featuring Taylor Swift,2009
1999997,64590,3744,3,What I've Done (Album Version),What I've Done,Linkin Park,2007
1999998,64590,3893,1,Up,My Worlds,Justin Bieber,2010


In [13]:
class content_based_recommender_py():
    """Recommend songs whose metadata text is most similar to a seed song.

    Each distinct ``song_id`` is described by the text
    ``"<song> <release> <artist_name>"``. The texts are TF-IDF vectorised and
    compared with cosine similarity; ``recommend`` returns the best matches
    for a given seed song.

    Similarity is computed over the *unique songs* found in the frame passed
    to ``create``, not over every row of it. An interaction table repeats the
    same song once per listener, so a row-level similarity matrix would be
    (number of rows) x (number of rows) and its top matches would mostly be
    copies of the seed song itself.

    Attributes:
        user_id, song_id, listen_count, song, release, artist_name, year:
            the corresponding columns of the frame passed to ``create``,
            kept as given (one entry per input row).
        tfidf_matrix: TF-IDF matrix with one row per unique song.
        cosine_similarities: square similarity matrix over the unique songs.
            Memory grows with the square of the number of unique songs.
    """

    # Columns copied into the de-duplicated song catalogue.
    _CATALOG_COLUMNS = ['song_id', 'song', 'release', 'artist_name', 'year']

    def __init__(self):
        self.user_id = None
        self.song_id = None
        self.listen_count = None
        self.song = None
        self.release = None
        self.artist_name = None
        self.year = None
        self.tfidf_matrix = None
        self.cosine_similarities = None
        # One row per unique song_id, RangeIndex; row i matches row i of
        # tfidf_matrix and cosine_similarities.
        self._catalog = None

    @staticmethod
    def _as_text(column):
        """Return ``column`` as strings, with missing values replaced by ''."""
        return column.astype(object).where(column.notna(), '').astype(str)

    def create(self, df):
        """Build the song catalogue and its similarity matrix from ``df``.

        Args:
            df: DataFrame with columns 'user_id', 'song_id', 'listen_count',
                'song', 'release', 'artist_name' and 'year'. It is not
                modified.

        Raises:
            KeyError: if a required column is missing.
            ValueError: raised by the vectoriser when no usable text is
                available (for example an empty frame).
        """
        self.user_id = df['user_id']
        self.song_id = df['song_id']
        self.listen_count = df['listen_count']
        self.song = df['song']
        self.release = df['release']
        self.artist_name = df['artist_name']
        self.year = df['year']

        # First occurrence of each song wins; resetting the index makes the
        # row label equal to the row position used to address the matrices.
        catalog = (
            df.loc[:, self._CATALOG_COLUMNS]
            .drop_duplicates(subset='song_id')
            .reset_index(drop=True)
        )

        # Missing metadata becomes an empty string; otherwise one NaN field
        # would turn the whole concatenated text into NaN, which the
        # vectoriser rejects.
        text = (
            self._as_text(catalog['song'])
            + ' ' + self._as_text(catalog['release'])
            + ' ' + self._as_text(catalog['artist_name'])
        )

        vectorizer = TfidfVectorizer()
        self.tfidf_matrix = vectorizer.fit_transform(text)
        self.cosine_similarities = cosine_similarity(self.tfidf_matrix, self.tfidf_matrix)
        self._catalog = catalog

    def recommend(self, user_id, song_id, top_n=10):
        """Return the songs most similar to ``song_id``.

        Args:
            user_id: copied into the 'user_id' column of the result; it does
                not influence the ranking.
            song_id: seed song, looked up in the catalogue built by ``create``.
            top_n: maximum number of recommendations (default 10).

        Returns:
            DataFrame with columns 'user_id', 'song_id', 'listen_count'
            (always 0), 'song', 'release', 'artist_name' and 'year', ordered
            from most to least similar; ties keep catalogue order. The seed
            song is never included. The frame is empty when ``song_id`` is
            unknown or ``top_n`` is not positive.

        Raises:
            RuntimeError: if ``create`` has not been called.
        """
        if self._catalog is None or self.cosine_similarities is None:
            raise RuntimeError("create() must be called before recommend()")

        catalog = self._catalog
        seed_positions = np.flatnonzero((catalog['song_id'] == song_id).to_numpy())

        if seed_positions.size == 0 or top_n <= 0:
            picks = np.array([], dtype=int)
        else:
            seed = int(seed_positions[0])
            scores = np.asarray(self.cosine_similarities[seed]).ravel()
            # Stable sort on the negated scores: descending similarity with
            # ties left in catalogue order.
            ranked = np.argsort(-scores, kind='stable')
            picks = ranked[ranked != seed][:top_n]

        chosen = catalog.iloc[picks]
        return pd.DataFrame(
            {
                'user_id': user_id,
                'song_id': chosen['song_id'],
                'listen_count': 0,
                'song': chosen['song'],
                'release': chosen['release'],
                'artist_name': chosen['artist_name'],
                'year': chosen['year'],
            },
            index=chosen.index,
        )


In [14]:
def print_recommendations(recommendations_df):
    """Print each recommended song as a labelled block of fields.

    Args:
        recommendations_df: DataFrame with the columns 'song_id', 'song',
            'artist_name', 'release', 'year' and 'listen_count'.

    Prints "No recommendations found." for an empty frame. Otherwise prints a
    "Recommendations:" header, then one group of lines per row, each group
    closed by a dashed separator.
    """
    if recommendations_df.empty:
        print("No recommendations found.")
        return

    # (label, column) pairs in the order they are printed for every row.
    fields = (
        ("Song ID: ", 'song_id'),
        ("Song: ", 'song'),
        ("Artist: ", 'artist_name'),
        ("Release: ", 'release'),
        ("Year: ", 'year'),
        ("Listen Count: ", 'listen_count'),
    )

    print("Recommendations:")
    for _, record in recommendations_df.iterrows():
        for label, column in fields:
            print(label, record[column])
        print("----------")


In [None]:
# Seed pair taken from the saved preview above: encoded user 54961 has a row
# for song_id 750 ("Constellations").
user_id, song_id = 54961, 750

# Fit the recommender on the merged table, then list songs similar to the seed.
cn = content_based_recommender_py()
cn.create(song_df)

recommendations_df = cn.recommend(user_id=user_id, song_id=song_id)
print_recommendations(recommendations_df)