Adapted from https://cambridgespark.com/content/tutorials/implementing-your-own-recommender-systems-in-Python/index.html

In [310]:
import pandas as pd 
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
#from mpl_toolkits.mplot3d import Axes3D
import numpy as np

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
#%matplotlib notebook

from sklearn import cross_validation as cv
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse



In [311]:
# datasets provided by https://github.com/llSourcell/recommender_live
dataset1 = '/home/kristina/Downloads/10000.txt'
dataset2 = '/home/kristina/Downloads/song_data.csv'

temp_df1 = pd.read_csv(dataset1, sep="\t", names =["user_id", "song_id", "listen_count"])
temp_df2 = pd.read_csv(dataset2)

final_df = pd.merge(temp_df1, temp_df2)
final_df.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1.0,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2.0,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,930d2be6c85315d72cab9823ec0f7bfe7e477794,SOBBMDR12A8C13253B,1.0,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1.0,Stronger,Graduation,Kanye West,2007
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1.0,Stronger,Graduation,Kanye West,2007


In [312]:
temp = final_df.copy()
temp.columns = ['user_id_orig', 'song_id_orig', 'listen_count_orig', 'title_orig', 'release_orig', 'artist_name_orig', 'year_orig']

final_df["user_id"] = pd.Categorical(final_df["user_id"])
final_df['user_id'] = final_df["user_id"].cat.codes

final_df["song_id"] = pd.Categorical(final_df["song_id"])
final_df['song_id'] = final_df["song_id"].cat.codes

final_df["title"] = pd.Categorical(final_df["title"])
final_df['title'] = final_df["title"].cat.codes

final_df["release"] = pd.Categorical(final_df["release"])
final_df['release'] = final_df["release"].cat.codes

final_df["artist_name"] = pd.Categorical(final_df["artist_name"])
final_df['artist_name'] = final_df["artist_name"].cat.codes

final_df.head()

compare = pd.concat([temp, final_df.copy()], axis=1)
compare.head()

Unnamed: 0,user_id_orig,song_id_orig,listen_count_orig,title_orig,release_orig,artist_name_orig,year_orig,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1.0,The Cove,Thicker Than Water,Jack Johnson,0,278,78,1.0,4404,2929,847,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2.0,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976,278,233,2.0,1272,946,1385,1976
2,930d2be6c85315d72cab9823ec0f7bfe7e477794,SOBBMDR12A8C13253B,1.0,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976,219,233,1.0,1272,946,1385,1976
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1.0,Stronger,Graduation,Kanye West,2007,278,414,1.0,4169,1065,982,2007
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1.0,Stronger,Graduation,Kanye West,2007,278,414,1.0,4169,1065,982,2007


In [313]:
df = final_df[['user_id', 'song_id', 'listen_count']]
print("Matrix size "  + str(df.shape))
print("Unique matrix size " + "(" + str(n_users) + "," + str(n_items) + ")")
df.head()

Matrix size (11364, 3)
Unique matrix size (396,5475)


Unnamed: 0,user_id,song_id,listen_count
0,278,78,1.0
1,278,233,2.0
2,219,233,1.0
3,278,414,1.0
4,278,414,1.0


In [314]:
subset_df = final_df
matrix_df = pd.crosstab(subset_df.user_id, subset_df.song_id, values = subset_df.listen_count, aggfunc="first")
matrix_df.fillna(0, inplace=True)

magnitude = np.sqrt(np.square(matrix_df).sum(axis=1))
matrix_df = matrix_df.divide(magnitude, axis='index')
data_sparse = sparse.csr_matrix(matrix_df)

user_prediction = cosine_similarity(data_sparse.transpose())
user_df = pd.DataFrame(data=user_prediction, index=matrix_df.columns, columns= matrix_df.columns)

In [315]:
user_df.insert(0, "user_id", temp["user_id_orig"])
user_df = user_df.drop_duplicates(subset=['user_id'], keep="first")
user_df.head()


song_id,user_id,0,1,2,3,4,5,6,7,8,...,5465,5466,5467,5468,5469,5470,5471,5472,5473,5474
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,930d2be6c85315d72cab9823ec0f7bfe7e477794,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.023868,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,9fba771d9731561eba47216f6fbfc0023d88641b,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,85952991b8e3ca5803a08b0b2f9c6d71abf9bb5b,0.0,0.0,0.023868,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,537340ff896dea11328910013cfe759413e1eeb3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [316]:
user = "85952991b8e3ca5803a08b0b2f9c6d71abf9bb5b"
user_index = user_df[user_df.user_id == user].index.tolist()[0]
user_rating_vector = user_df.ix[user_index]

user_df_without_user = user_df.drop('user_id', 1)
user_rating_vector = user_rating_vector.drop('user_id', 0)

score = user_df_without_user.dot(user_rating_vector).div(user_df_without_user.sum(axis=1))
score_df = pd.DataFrame({'song_id':score.index, 'score':score.values})

score_df.sort_values(by="score", ascending=[False], inplace=True)
score_df.head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,score,song_id
3,0.211077,7
93,0.111127,187
117,0.0252545,267
222,0.0252545,958
90,0.0252545,183


In [318]:
recommend = score_df.head(n=50)
recommend_sam = pd.DataFrame(data = recommend.song_id.values)                   
res_compare= compare.loc[compare['song_id'].isin(recommend_sam.values.flatten())]

print("User:" + user)

result_df =res_compare.drop_duplicates(['title_orig'], keep="first")
result_df = result_df[["title_orig", "artist_name_orig"]]
result_df.head(n= 10)

User:85952991b8e3ca5803a08b0b2f9c6d71abf9bb5b


Unnamed: 0,title_orig,artist_name_orig
3,Stronger,Kanye West
704,Blow Me Away,Breaking Benjamin
932,Harder Better Faster Stronger,Daft Punk
1434,Lucky (Album Version),Jason Mraz & Colbie Caillat
1562,Rianna,Fisher
1591,Almaz,Randy Crawford
3444,Hips Don't Lie (featuring Wyclef Jean),Shakira Featuring Wyclef Jean
3445,Hips Don't Lie,Shakira ft. Wyclef Jean
3715,Halo,Beyoncé
3731,Move Along,The All-American Rejects


In [319]:
train_data, test_data = cv.train_test_split(final_df, test_size=0.25)

In [320]:
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [323]:
rmse_user = user_prediction[test_data_matrix.nonzero()].flatten()
rmse_test = test_data_matrix[test_data_matrix.nonzero()].flatten()
result = sqrt(mean_squared_error(rmse_user, rmse_test))

print ('Evaluation for User Collaboration: ' + str(result))
print('Result represents "Error regression loss" - a perfect result is 0')

Evaluation for User Collaboration: 6.143222784567765
Result represents "Error regression loss" - a perfect result is 0
