In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# this allows plots to appear directly in the notebook
%matplotlib inline

In [2]:
# read the raw listening history into a DataFrame; later cells rely on its
# user_id, song and listen_count columns
song_data = pd.read_csv('song_data.csv')
# preview the first few rows
song_data.head()

Unnamed: 0,user_id,song_id,listen_count,title,artist,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Jack Johnson,The Cove - Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia,Entre Dos Aguas - Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Kanye West,Stronger - Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson,Constellations - Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,Foo Fighters,Learn To Fly - Foo Fighters


In [3]:
# let's limit things to the top 250 songs.
# "top" here means most rows in song_data: value_counts() orders the song
# labels by how often they occur, so the first n index labels are the n most
# frequently occurring songs (listen_count is not taken into account).
n = 250
top_n = song_data.song.value_counts().index[:n]
song_data_top = song_data[song_data.song.isin(top_n)]
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3
print(song_data_top.shape)

(251302, 6)


In [31]:
# collect the top 250 users from this top 250 songs data to be used as test
# users in the sql data. Users are ranked by their number of rows in
# song_data_top, and `n` is the same cutoff that was used for the songs.
top_users_n = song_data_top.user_id.value_counts().index[:n]
user_data_top = song_data_top[song_data_top.user_id.isin(top_users_n)]
# single-argument print(...) works on both Python 2 and Python 3
print(user_data_top.shape)

(8723, 6)


In [41]:
# save the top-user subset (selected above as test users) to a CSV file
user_data_top.to_csv('user_data_top.csv')

In [4]:
# Sum listen_count for every (song, user_id) pair, then unstack user_id into
# columns: the result has one row per song and one column per user, with NaN
# wherever a user has no rows for a song.
# single-argument print(...) works on both Python 2 and Python 3
print("melting...")
song_wide = pd.pivot_table(song_data_top, values=["listen_count"],
                           index=["song", "user_id"],
                           aggfunc=np.sum).unstack()

melting...


In [5]:
# a missing cell means the user has no listens recorded for that song,
# so treat it as zero listens
song_wide = song_wide.fillna(value=0)

In [6]:
# this is the key. we're going to use cosine_similarity from scikit-learn
# to compute the pairwise similarity between all songs (the rows of song_wide).
# NOTE: the values are similarities (higher means more alike), not distances;
# the variable keeps the name `dists` because later cells refer to it.
# single-argument print(...) works on both Python 2 and Python 3
print("calculating similarity")

# stuff the similarity matrix into a dataframe so it's easier to operate on,
# labelling both the rows and the columns with the song labels of song_wide
dists = pd.DataFrame(cosine_similarity(song_wide),
                     index=song_wide.index,
                     columns=song_wide.index)

calculating similarity


In [42]:
# store the song-to-song similarity frame on disk as a pickle file
dists.to_pickle('song_similarity.pkl')

In [18]:
# a single all-zero row with one column per song label in `dists`
a = np.zeros((1, len(dists)))
user_df = pd.DataFrame(data=a, columns=dists.index)

In [19]:
# display the one-row user vector
user_df

song,'Till I Collapse - Eminem / Nate Dogg,16 Candles - The Crests,A Beggar On A Beach Of Gold - Mike And The Mechanics,A-Punk (Album) - Vampire Weekend,Ain't Misbehavin - Sam Cooke,Ain't No Rest For The Wicked (Original Version) - Cage The Elephant,Alejandro - Lady GaGa,All I Do Is Win (feat. T-Pain_ Ludacris_ Snoop Dogg & Rick Ross) - DJ Khaled,All The Right Moves - OneRepublic,Almaz - Randy Crawford,...,Word Up! - Cameo,Yeah! - Usher Featuring Lil' Jon & Ludacris,Yellow - Coldplay,You And Me (Wedding Version) - Lifehouse,You And Me Jesus - Jake Hess,You Belong With Me - Taylor Swift,You Give Love A Bad Name - Bon Jovi,You Know I'm No Good - Amy Winehouse,You're The One - Dwight Yoakam,You've Got The Love - Florence + The Machine
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# listen counts for one example user, keyed by song label
user_data = {
    '16 Candles - The Crests': 2,
    'Alejandro - Lady GaGa': 5,
    "Yeah! - Usher Featuring Lil' Jon & Ludacris": 1,
}
# write each count into the matching column of row 0 of the user vector
for key, listens in user_data.items():
    user_df.loc[0, key] = listens

In [22]:
# score every song for this user: the one-row listen vector times the
# song-by-song similarity matrix
single_user_matrix_multiply = user_df.dot(dists)

In [27]:
# turn the single row of scores into a column so it can be handled as a Series
single_user_matrix_transpose = single_user_matrix_multiply.transpose()
# column 0 holds the user's scores; rank the songs from highest to lowest score
song_reco = single_user_matrix_transpose[0].sort_values(ascending=False)
# Drop the songs the user already has in user_data and keep the first 10 that
# remain. `~mask` replaces the non-idiomatic `mask == False`, and the dict keys
# are passed as a real list so isin() never receives a Python 3 dict view.
already_heard = song_reco.index.isin(list(user_data))
song_reco_10 = song_reco.index[~already_heard][:10]

In [30]:
# show the recommended song labels as a plain array;
# single-argument print(...) works on both Python 2 and Python 3
print(song_reco_10.values)

['Love Story - Taylor Swift'
 'Catch You Baby (Steve Pitron & Max Sanna Radio Edit) - Lonnie Gordon'
 'Halo - Beyonc\xc3\xa9' 'Sehr kosmisch - Harmonia'
 "Just Dance - Lady GaGa / Colby O'Donis" 'Monster - Lady GaGa'
 'Whataya Want From Me - Adam Lambert' 'Bring Me To Life - Evanescence'
 "Nothin' On You [feat. Bruno Mars] (Album Version) - B.o.B"
 'Whatcha Say - Jason Derulo']


In [None]:
# Build the user x song listen-count matrix from the SAME top-song subset that
# `dists` was computed from. Building it from the full `song_data` would give
# one column per song in the whole file; whenever that includes songs outside
# the top-n subset, the columns no longer match the index of `dists` and
# `user_wide.dot(dists)` cannot align the two frames.
# Side effect: users with no rows in song_data_top are not part of user_wide.
user_wide = pd.crosstab(song_data_top.user_id, song_data_top.song,
                        values=song_data_top.listen_count,
                        aggfunc=np.sum, rownames=['user_id'], colnames=['song'])
# any cells that are missing data (i.e. a user didn't listen to a song)
# we're going to set to 0
user_wide = user_wide.fillna(0)
# make the column labels and their order identical to the rows of `dists`
user_wide = user_wide.reindex(columns=dists.index, fill_value=0)

In [None]:
# score every song for every user: the user-by-song listen counts times the
# song-by-song similarity matrix
user_matrix_multiply = user_wide.dot(dists)
# preview the first few rows of scores
user_matrix_multiply.head()

In [None]:
def some_fn(x):
    """Return 0 when ``x`` is greater than zero, otherwise 1."""
    return 0 if x > 0 else 1

In [None]:
# 1 where the user has NOT listened to the song, 0 where they have. This is the
# vectorised equivalent of applying some_fn to every cell (0 if x > 0 else 1):
# applymap calls a Python function once per cell and is deprecated in recent
# pandas releases, while the comparison below runs on the whole frame at once.
user_song_notheard = (~(user_wide > 0)).astype(int)
# zero out the scores of songs each user has already listened to, so only
# unheard songs keep a non-zero score
user_multiply = user_matrix_multiply * user_song_notheard

In [None]:
# swap rows and columns so that a column-wise apply can process the scores
user_multiply_transpose = user_multiply.transpose()

In [None]:
def sorted(s, num):
    """Return the index labels of the ``num`` largest values in ``s``.

    NOTE: this name shadows Python's built-in ``sorted`` for the rest of the
    notebook; it is kept because a later cell calls ``sorted(x, 10)``.

    Parameters
    ----------
    s : pandas.Series
        Values to rank.
    num : int
        Maximum number of labels to return.

    Returns
    -------
    pandas.Series
        The labels of ``s`` ordered from largest to smallest value, on a fresh
        0-based integer index and carrying the same ``name`` as ``s``.
    """
    # Series.order() was deprecated and later removed from pandas;
    # sort_values() is its replacement and is already used elsewhere in this
    # notebook. iloc makes the "first num rows" slice explicitly positional.
    top = s.sort_values(ascending=False).iloc[:num]
    # keep only the labels; the values themselves are discarded
    return pd.Series(top.index.values, name=top.name)

In [None]:
# for every column of the transposed score frame, keep the labels of the
# 10 highest-scoring rows.
# NOTE: `sorted` here is the helper defined in this notebook, not Python's built-in
song_recommend = user_multiply_transpose.apply(lambda x: sorted(x, 10))