In [0]:
import pandas
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [0]:
class item_similarity_recommender_py():
    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.cooccurence_matrix = None
        
    def build_co_occurance_matrix(self, user_songs, all_songs):
        user_songs_users = []        
        for i in range(0, len(user_songs)):
        	item_data = self.train_data[self.train_data[self.item_id] == user_songs[i]]
        	item_users = set(item_data[self.user_id].unique())
        	user_songs_users.append(item_users)

        cooccurence_matrix = np.matrix(np.zeros(shape=(len(user_songs), len(all_songs))), float)
           

        for i in range(0,len(all_songs)):
            songs_i_data = self.train_data[self.train_data[self.item_id] == all_songs[i]]
            users_i = set(songs_i_data[self.user_id].unique())
            print(i)
            for j in range(0,len(user_songs)):       
                    
                users_j = user_songs_users[j]
                    
                users_intersection = users_i.intersection(users_j)
                
                if len(users_intersection) != 0:
                    users_union = users_i.union(users_j)
                    cooccurence_matrix[j,i] = float(len(users_intersection))/float(len(users_union))
                else:
                    cooccurence_matrix[j,i] = 0
                    
        
        return cooccurence_matrix

    
    def generate_top_recommendations(self, user, cooccurence_matrix, all_songs, user_songs):
        print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccurence_matrix))
        
        user_sim_scores = cooccurence_matrix.sum(axis=0)/float(cooccurence_matrix.shape[0])
        user_sim_scores = np.array(user_sim_scores)[0].tolist()
 

        sort_index = sorted(((e,i) for i,e in enumerate(list(user_sim_scores))), reverse=True)
    
        columns = ['user_id', 'song', 'score', 'rank']
        df = pandas.DataFrame(columns=columns)
         
        rank = 1 
        for i in range(0,len(sort_index)):
            if ~np.isnan(sort_index[i][0]) and all_songs[sort_index[i][1]] not in user_songs and rank <= 10:
                df.loc[len(df)]=[user,all_songs[sort_index[i][1]],sort_index[i][0],rank]
                rank = rank+1
        
        if df.shape[0] == 0:
            print("The current user has no songs for training the item similarity based recommendation model.")
            return -1
        else:
            return df
 
    def create(self, train_data, user_id, item_id):
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

    def recommend(self, user):
    	user_data = self.train_data[self.train_data[self.user_id] == user]
    	user_items = list(user_data[self.item_id].unique())
    	user_songs = user_items  
    	print("No. of unique songs for the user: %d" % len(user_songs))
    	all_songs = list(self.train_data[self.item_id].unique())
    	print("no. of unique songs in the training set: %d" % len(all_songs))
    	cooccurence_matrix = self.build_co_occurance_matrix(user_songs, all_songs)
    	df_recommendations = self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs)
    	return df_recommendations
    
    def get_similar_items(self, item_list):
        
        user_songs = item_list
        all_songs = list(self.train_data[self.item_id].unique())
        
        print("no. of unique songs in the training set: %d" % len(all_songs))
        cooccurence_matrix = self.build_co_occurance_matrix(user_songs, all_songs)
        user = ""
        df_recommendations = self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs)
         
        return df_recommendations





In [0]:
def global_top_50(user_id=1):
	#Get a count of user_ids for each unique song as recommendation score
	train_data_grouped = train_data.groupby(['song']).agg({'user_id': 'count'}).reset_index()
	train_data_grouped.rename(columns = {'user_id': 'score'},inplace=True)

	#Sort the songs based upon recommendation score
	train_data_sort = train_data_grouped.sort_values(['score', 'song'], ascending = [0,1])

	#Generate a recommendation rank based upon score
	train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=0, method='first')  

	#Get the top 50 recommendations
	popularity_recommendations = train_data_sort.head(50)

	user_id = users[5]
	#Add user_id column for which the recommendations are being generated
	popularity_recommendations['user_id'] = user_id

	#Bring user_id column to the front
	cols = popularity_recommendations.columns.tolist()
	cols = cols[-1:] + cols[:-1]

	popularity_recommendations = popularity_recommendations[cols]

	return(popularity_recommendations)

In [0]:
#Datasets imported
triplets_file = 'https://static.turi.com/datasets/millionsong/10000.txt'
songs_metadata_file = 'https://static.turi.com/datasets/millionsong/song_data.csv'

#datasets fitted in the pandas dataframe
song_df_1 = pandas.read_table(triplets_file,header=None)
song_df_1.columns = ['user_id', 'song_id', 'listen_count']
song_df_2 =  pandas.read_csv(songs_metadata_file)

#Merge the two dataframes above to create input dataframe for recommender systems
#simlilar to left joint in DBMS
song_df = pandas.merge(song_df_1, song_df_2.drop_duplicates(['song_id']), on="song_id", how="left")

#merger the title and artist name into a single colomn named song
song_df['song'] = song_df['title'].map(str) + " - " + song_df['artist_name']

#print(song_df.head())
song_grouped = song_df.groupby(['song']).agg({'listen_count': 'count'}).reset_index()
grouped_sum = song_grouped['listen_count'].sum()
song_grouped['percentage']  = song_grouped['listen_count'].div(grouped_sum)*100
temp_df = song_grouped.sort_values(['listen_count', 'song'], ascending = [0,1])

#print(temp_df.head())

users = song_df['user_id'].unique()
songs = song_df['song'].unique()

#print(len(users))
#print(len(songs))

train_data, test_data = train_test_split(song_df, test_size = 0.20, random_state=0)
#print(train_data.head(5))

print(train_data.head())

                                          user_id  ...                                               song
608812   7b8fbe766a49e5d7618452149dfab920621fc4fb  ...  Two Is Better Than One - Boys Like Girls featu...
623729   d24956cd68ff84b6d0271286ae6866ee1c89ff77  ...                          Royal Gregory - Holy Fuck
583106   da7b91b6cab1ca11227ee7720c4d2e03e8c31579  ...                        Mr Sandman - The Chordettes
435735   5f633da6ad4845350949c3c76ce6c4ef6f167476  ...                 A Days Work (feat. P.O.S) - BK-One
1361953  01ad0fabd01af750700a1e80bb0055abcb3edd28  ...                 Rockin' Rollin' Stone - Andy Starr

[5 rows x 8 columns]


In [0]:
print("------------------------------------------------------------------------------------")
print("Welcome to the Sportify Music")
print("------------------------------------------------------------------------------------")
temp_user = int(input("Enter your user id: "))
while(True):
	flag_function = input("1. Global Top 50"+"\n"+"2. Your Mix"+"\n"+"3. Knn- recommendations"+"\n"+"4. Exit")
	if(str(flag_function) == '1'):
		test1  = global_top_50()
		test1 = test1[['song' , 'Rank']]
		print("------------------------------------------------------------------------------------")
		print(test1)
		print("------------------------------------------------------------------------------------")
	if(str(flag_function)== '4'):
		break
	if(str(flag_function) == '2'):
		is_model = item_similarity_recommender_py()
		is_model.create(train_data, 'user_id', 'song')
		user_id = users[temp_user]
		print("------------------------------------------------------------------------------------")
		print(is_model.recommend(user_id))
		print("------------------------------------------------------------------------------------")
	if(str(flag_function) == '3'):
   		knn_df = song_df.drop(['song_id', 'artist_name', 'release', 'year', 'title' ], axis=1)
   		knn_df.drop_duplicates(subset ='song', keep = "first", inplace = True)
   		wide_artist_data = knn_df.pivot(index = 'song', columns = 'user_id', values = 'listen_count').fillna(0)
   		wide_artist_data_sparse = csr_matrix(wide_artist_data.values)
   		model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
   		model_knn.fit(wide_artist_data_sparse)
   		query_index = temp_user
   		distances, indices = model_knn.kneighbors(wide_artist_data.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)
   		print("------------------------------------------------------------------------------------")
   		for i in range(0, len(distances.flatten())):
   			if i == 0:
   				print ('Recommendations for user', query_index, wide_artist_data.index[query_index])
   			else:
   				print (i, wide_artist_data.index[indices.flatten()[i]])
   		print("------------------------------------------------------------------------------------")
	else:
		print("------------------------------------------------------------------------------------")
		print("Please select a valid Option")
		print("------------------------------------------------------------------------------------")


------------------------------------------------------------------------------------
Welcome to the Sportify Music
------------------------------------------------------------------------------------
Enter your user id: 5
1. Global Top 50
2. Your Mix
3. Knn- recommendations
4. Exit1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


------------------------------------------------------------------------------------
                                                   song  Rank
7127                           Sehr kosmisch - Harmonia   1.0
9084                                       Undo - Björk   2.0
2068  Dog Days Are Over (Radio Edit) - Florence + Th...   3.0
9880                     You're The One - Dwight Yoakam   4.0
6774                            Revelry - Kings Of Leon   5.0
7115                              Secrets - OneRepublic   6.0
3613  Horn Concerto No. 4 in E flat K495: II. Romanc...   7.0
2717                     Fireflies - Charttraxx Karaoke   8.0
3485                           Hey_ Soul Sister - Train   9.0
8847                                 Tive Sim - Cartola  10.0
5968                    OMG - Usher featuring will.i.am  11.0
9140                       Use Somebody - Kings Of Leon  12.0
2227                Drop The World - Lil Wayne / Eminem  13.0
5223                                   Marry Me