Adapted from https://cambridgespark.com/content/tutorials/implementing-your-own-recommender-systems-in-Python/index.html

In [363]:
import pandas as pd 
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
#from mpl_toolkits.mplot3d import Axes3D
import numpy as np

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
#%matplotlib notebook

from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse



In [375]:
# datasets provided by https://github.com/llSourcell/recommender_live
dataset1 = '/home/kristina/Downloads/10000.txt'
dataset2 = '/home/kristina/Downloads/song_data.csv'

temp_df1 = pd.read_csv(dataset1, sep="\t", names =["user_id", "song_id", "listen_count"])
temp_df2 = pd.read_csv(dataset2)

final_df = pd.merge(temp_df1, temp_df2)
final_df.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1.0,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2.0,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,930d2be6c85315d72cab9823ec0f7bfe7e477794,SOBBMDR12A8C13253B,1.0,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1.0,Stronger,Graduation,Kanye West,2007
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1.0,Stronger,Graduation,Kanye West,2007


In [376]:
temp = final_df.copy()

final_df["user_id"] = pd.Categorical(final_df["user_id"])
final_df['user_id'] = final_df["user_id"].cat.codes

final_df["song_id"] = pd.Categorical(final_df["song_id"])
final_df['song_id'] = final_df["song_id"].cat.codes

final_df["title"] = pd.Categorical(final_df["title"])
final_df['title'] = final_df["title"].cat.codes

final_df["release"] = pd.Categorical(final_df["release"])
final_df['release'] = final_df["release"].cat.codes

final_df["artist_name"] = pd.Categorical(final_df["artist_name"])
final_df['artist_name'] = final_df["artist_name"].cat.codes

final_df.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,278,78,1.0,4404,2929,847,0
1,278,233,2.0,1272,946,1385,1976
2,219,233,1.0,1272,946,1385,1976
3,278,414,1.0,4169,1065,982,2007
4,278,414,1.0,4169,1065,982,2007


In [377]:
train_data, test_data = cv.train_test_split(final_df, test_size=0.25)

In [395]:
df = final_df[['user_id', 'song_id', 'listen_count']]
#df = final_df.loc[cluster_map["Cluster Class"]== 4]

n_users = df.user_id.unique().shape[0]
n_items = df.song_id.unique().shape[0]

print("Matrix size "  + str(df.shape))
print("Unique matrix size " + "(" + str(n_users) + "," + str(n_items) + ")")


Matrix size (11364, 3)
Unique matrix size (396,5475)


In [391]:
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [388]:
user_likeness = pairwise_distances(train_data_matrix, metric='cosine')
mean_user_rating = train_data_matrix.mean(axis=1)
ratings_diff = (train_data_matrix - mean_user_rating[:, np.newaxis])
user_prediction = mean_user_rating[:, np.newaxis] + user_likeness.dot(ratings_diff) / np.array([np.abs(user_likeness).sum(axis=1)]).T
print(user_prediction)

[[ 0.0057037   0.00824488  0.00316252 ...,  0.03365669  0.00557563
   0.01586843]
 [-0.00790567 -0.00537402 -0.01043732 ...,  0.01994243 -0.00790567
   0.00222091]
 [-0.00769407 -0.00515792 -0.01023021 ...,  0.02020352 -0.00769407
   0.00245051]
 ..., 
 [-0.00405655 -0.00152395 -0.00658916 ...,  0.02380209 -0.00405655
   0.00607386]
 [ 0.00182484  0.00432648 -0.00071134 ...,  0.02972283  0.00182484
   0.01196956]
 [-0.00294808 -0.00044692 -0.00548327 ...,  0.02493899 -0.00294808
   0.00719267]]


In [389]:
user_df = pd.DataFrame(user_prediction, columns = temp.song_id.unique())
user_df.insert(0, "user_id", temp["user_id"])
user_df = user_df.drop_duplicates(subset=['user_id'], keep="first")
user_df.head()


Unnamed: 0,user_id,SOAKIMP12A8C130995,SOBBMDR12A8C13253B,SOBXHDL12A81C204C0,SOBYHAJ12A6701BF1D,SODACBL12A8C13C273,SODDNQT12A6D4F5F7E,SODXRTY12AB0180F3B,SOFGUAY12AB017B0A8,SOFRQTD12A81C233C0,...,SOOEGCA12A6D4F8B80,SOOLOQL12A8C143F4B,SOQBFRA12A6D4F854E,SOSFRXX12A58A7B4C8,SOULYQW12A8C13708E,SOXMOQF12A6701FB8B,SOXRJYC12A6D4FB48B,SOXSSIQ12A8C13AFFB,SOYHIOC12AB018DB2E,SOZIBZP12A6701C434
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,0.005704,0.008245,0.003163,0.005704,0.010786,0.005704,0.008245,0.01075,0.008245,...,0.007989,0.015799,0.005704,0.007989,0.020951,0.005704,0.08448,0.033657,0.005576,0.015868
2,930d2be6c85315d72cab9823ec0f7bfe7e477794,-0.007694,-0.005158,-0.01023,-0.007694,-0.002622,-0.007694,-0.005158,-0.002622,-0.005158,...,-0.005158,0.002451,-0.007694,-0.005158,0.007523,-0.007694,0.070926,0.020204,-0.007694,0.002451
5,9fba771d9731561eba47216f6fbfc0023d88641b,-0.007723,-0.005191,-0.010254,-0.007723,-0.002659,-0.007723,-0.005191,-0.002659,-0.005191,...,-0.005191,0.002404,-0.007723,-0.005191,0.007467,-0.007723,0.070758,0.020126,-0.007723,0.002404
7,85952991b8e3ca5803a08b0b2f9c6d71abf9bb5b,0.003459,0.005967,0.000919,0.003459,0.008427,0.003459,0.005967,0.008506,0.005927,...,0.005992,0.013412,0.003459,0.005992,0.018666,0.003459,0.082021,0.031399,0.003455,0.013619
9,537340ff896dea11328910013cfe759413e1eeb3,0.005026,0.007545,0.002427,0.005026,0.010222,0.005026,0.007545,0.010211,0.007624,...,0.007624,0.015005,0.005026,0.007624,0.020596,0.005026,0.08469,0.033607,0.005026,0.015413


In [383]:
user_df.set_index(["user_id"], inplace = True)
user_df = user_df.transpose()
user_df.head()



user_id,b80344d063b5ccb3212f76538f3d9e43d87dca9e,930d2be6c85315d72cab9823ec0f7bfe7e477794,9fba771d9731561eba47216f6fbfc0023d88641b,85952991b8e3ca5803a08b0b2f9c6d71abf9bb5b,537340ff896dea11328910013cfe759413e1eeb3,8fce200f3912e9608e3b1463cdb9c3529aab5c08,c24ec42f0e449ff39a95a01f0795f833b898f71b,1645b689f873529ab85e3b72742be44813e82bd3,6f8453b0d9d2199f98c1992995a8445ad6837fd8,12768858f6a825452e412deb1df36d2d1d9c6791,...,4449f7ea241ceb4a2d1e2298c4aead417b26899a,6b7a5895d266599bc414b9eb8fbad59c95b9a99d,4ad8037982f52ce3e38d7ae7f6897401630141ad,0a00498b9d607844a8826184ae7278097d1c008a,529b42cdbc379ad2e765aec6d3bad8a192038741,332de6dcfc5b597500a661ab51a84653705b492b,73d0a0c725c9b2c541635672bb0572bfcb7eb2b4,62f2f9b881dc320d745a90c0c10528d18e10deb1,f47116f998e030f2dab275b81fb2a04a9dc06c33,ed3664f9cd689031fe4d0ed6c66503bdc3ad7cb6
SOAKIMP12A8C130995,0.005704,-0.007694,-0.007723,0.003459,0.005026,-0.005338,0.033781,-0.008589,-0.007539,0.007966,...,-0.007343,-0.005148,0.002904,-0.000943,-0.006414,0.005682,-0.003994,-0.006782,-0.000201,-0.002948
SOBBMDR12A8C13253B,0.008245,-0.005158,-0.005191,0.005967,0.007545,-0.002801,0.036298,-0.006043,-0.005008,0.010404,...,-0.004885,-0.002615,0.005392,0.001598,-0.003895,0.008039,-0.001459,-0.004309,0.002271,-0.000447
SOBXHDL12A81C204C0,0.003163,-0.01023,-0.010254,0.000919,0.002427,-0.007876,0.031186,-0.011135,-0.010071,0.005383,...,-0.009893,-0.007681,0.000347,-0.003484,-0.00898,0.003144,-0.006554,-0.009318,-0.002744,-0.005483
SOBYHAJ12A6701BF1D,0.005704,-0.007694,-0.007723,0.003459,0.005026,-0.005338,0.033781,-0.008589,-0.007539,0.007966,...,-0.007343,-0.005148,0.002875,-0.000943,-0.006414,0.005682,-0.004072,-0.006782,-0.000201,-0.002948
SODACBL12A8C13C273,0.010786,-0.002622,-0.002659,0.008427,0.010222,-0.000263,0.038937,-0.003497,-0.002476,0.01313,...,-0.002668,-8.1e-05,0.008019,0.00414,-0.001284,0.010757,0.001126,-0.001709,0.004886,0.002122


In [387]:
rmse_user = user_prediction[test_data_matrix.nonzero()].flatten()
rmse_test = test_data_matrix[test_data_matrix.nonzero()].flatten()
result = sqrt(mean_squared_error(rmse_user, rmse_test))

print ('Evaluation for User Collaboration: ' + str(result))

  

Evaluation for User Collaboration: 6.3482365565885095
