# Q3b Song Recommendation system for female users

In [1]:
import pandas as pd
import numpy as np

# Data preprocessing & transformation

In [2]:
# First data file "usersha1-artmbid-artname-plays.tsv"
# This contains the historical information on the playing song for each user

In [3]:
# Loading the file "usersha1-artmbid-artname-plays.tsv"
df=pd.read_csv("usersha1-artmbid-artname-plays.tsv",delimiter='\t',names=[ "user-mboxsha","musicbrainz-artist-id", "artist-name","plays"])

In [4]:
df.head()

Unnamed: 0,user-mboxsha,musicbrainz-artist-id,artist-name,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706


In [5]:
df.dropna(inplace=True)

In [6]:
df.drop(df.index[-47:-1],inplace=True)
df.drop(df.index[-1],inplace=True)

In [7]:
# Store the artist name in a separate data frame

In [8]:
df_aname=df[["musicbrainz-artist-id","artist-name"]]

In [9]:
df_aname=df_aname.drop_duplicates(subset="musicbrainz-artist-id",keep='first')

In [10]:
df.drop(['artist-name'],axis=1,inplace=True)

In [11]:
df.head()

Unnamed: 0,user-mboxsha,musicbrainz-artist-id,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,706


In [12]:
# User feature matrix

In [13]:
# Loading the data file with user basic information
df1=pd.read_csv("usersha1-profile.tsv",delimiter='\t',names=["user-mboxsha", "gender","age","country","signup"])

In [14]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359347 entries, 0 to 359346
Data columns (total 5 columns):
user-mboxsha    359347 non-null object
gender          326572 non-null object
age             284447 non-null float64
country         359347 non-null object
signup          359347 non-null object
dtypes: float64(1), object(4)
memory usage: 13.7+ MB


In [15]:
df1=df1.set_index('user-mboxsha')

In [16]:
df1.dropna(subset=['gender'],inplace=True)

In [17]:
# Fill the missing values in age with the median
median=df1['age'].median()
df1['age']=df1['age'].fillna(median)

In [18]:
# Fill the unreasonable age values with the median
mask=(df1.age>150) | (df1.age<0)
df1.loc[mask,'age']=median

In [19]:
# Normalize the age column
mean=df1['age'].mean()
std=df1['age'].std()
df1['age']=(df1['age']-mean)/std

In [20]:
# Encoding the categorical variables
df1=pd.concat([df1,pd.get_dummies(df1['gender'],drop_first=True)],axis=1)
df1=pd.concat([df1,pd.get_dummies(df1['country'],drop_first=True)],axis=1)

In [21]:
df1.drop(['gender','country'],axis=1,inplace=True)

In [22]:
# Transform the date in string format onto the datetime format
df1['signup']=pd.to_datetime(df1['signup'])

In [23]:
# Calculate the difference between the signup date and today
df1['signup']=abs(df1['signup']-pd.datetime.now().date())

In [24]:
# Normalize the signup column
mean=df1['signup'].mean()
std=df1['signup'].std()
df1['signup']=(df1['signup']-mean)/std

In [25]:
# dataframe df1 stores the user feature matrix

In [26]:
df1.head()

Unnamed: 0_level_0,age,signup,m,Albania,Algeria,American Samoa,Andorra,Angola,Anguilla,Antarctica,...,Vanuatu,Venezuela,Viet Nam,"Virgin Islands, British","Virgin Islands, U.s.",Wallis and Futuna,Western Sahara,Yemen,Zambia,Zimbabwe
user-mboxsha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000c289a1829a808ac09c00daf10bc3c4e223b,-0.335593,0.425123,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00001411dc427966b17297bf4d69e7e193135d89,-0.217526,-0.299309,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,-0.689796,-0.644953,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00007a47085b9aab8af55f52ec8846ac479ac4fe,0.372812,1.30107,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0000c176103e538d5c9828e695fed4f7ae42dd01,-0.571728,1.331847,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Create Rating Matrix

In [27]:
# Subset the user histortical information from the dataframe df 

In [28]:
user=df['user-mboxsha'].unique()

In [29]:
user[100]
last_index=df[df['user-mboxsha']==user[100]].index[-1]
sdf=df[:last_index]
Rating_matrix=sdf.pivot_table(index='user-mboxsha',columns='musicbrainz-artist-id',values='plays')

In [30]:
first_index=last_index
for i in range(200,1200,100):
    last_index=df[df['user-mboxsha']==user[i]].index[-1]
    sdf=df[first_index:last_index]
    Rating_matrix2=sdf.pivot_table(index='user-mboxsha',columns='musicbrainz-artist-id',values='plays')
    Rating_matrix=pd.concat([Rating_matrix,Rating_matrix2],axis=0)
    first_index=last_index

In [31]:
# Remove the duplicate user id
Rating_matrix=Rating_matrix.sum(level=0)

In [32]:
Rating_matrix.fillna(0.0,inplace=True)

In [33]:
# Rating of each user is calculated by dividing the total number of plays of that user
Rating_matrix=Rating_matrix.div(Rating_matrix.sum(axis=1),axis=0)

In [34]:
common_index=sorted(list(set(df1.index).intersection(Rating_matrix.index)))

In [35]:
Rating_matrix=Rating_matrix[Rating_matrix.index.isin(common_index)]

In [36]:
Rating_matrix.head()

Unnamed: 0_level_0,00034ede-a1f1-4219-be39-02f36853373e,0004537a-4b12-43eb-a023-04009e738d2e,000842dd-08e9-485f-a9b6-8ada9f1c4a12,000b1990-4dd8-4835-abcd-bb6038c13ac7,000d90ec-d64c-48a1-b775-e726fd240e9f,000fc734-b7e1-4a01-92d1-f544261b43f5,000fecd9-ae03-49bc-9a08-636dde5d405d,0011f816-f0a8-4603-b82d-6f021d214672,0017e52a-e23a-437a-a5c1-18ed4ebe968e,0019749d-ee29-4a5f-ab17-6bfa11deb969,...,ffdf4cb7-5d2e-403e-a5ba-b19c18d0feb4,ffdf7f27-a12c-40fe-8ae3-050b46fc2f79,ffe16bba-4d84-409b-8f22-5242c60b930f,ffe53025-23f7-4c80-ac9d-442a3275d4a6,ffe54de6-5cee-4c0f-8329-f5946f9bc3ff,ffe8acdb-3fa3-4f7e-8939-c3862138502d,ffe9ec08-6b6b-4993-9394-e280b429dbfd,ffee2206-eb9e-4b39-9447-8977e269ed0e,fffc6a35-ae65-48be-bcc0-9d604e05665e,fffed9ff-98c6-458a-8379-47e7fb4ba6ec
user-mboxsha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000c289a1829a808ac09c00daf10bc3c4e223b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00001411dc427966b17297bf4d69e7e193135d89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00007a47085b9aab8af55f52ec8846ac479ac4fe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0000c176103e538d5c9828e695fed4f7ae42dd01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Now the Rating matrix and the user-feature matrix have the common user
df1=df1[df1.index.isin(common_index)]

In [38]:
df1.head()

Unnamed: 0_level_0,age,signup,m,Albania,Algeria,American Samoa,Andorra,Angola,Anguilla,Antarctica,...,Vanuatu,Venezuela,Viet Nam,"Virgin Islands, British","Virgin Islands, U.s.",Wallis and Futuna,Western Sahara,Yemen,Zambia,Zimbabwe
user-mboxsha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000c289a1829a808ac09c00daf10bc3c4e223b,-0.335593,0.425123,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00001411dc427966b17297bf4d69e7e193135d89,-0.217526,-0.299309,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,-0.689796,-0.644953,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00007a47085b9aab8af55f52ec8846ac479ac4fe,0.372812,1.30107,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0000c176103e538d5c9828e695fed4f7ae42dd01,-0.571728,1.331847,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Subset the required artist name corresponding to the Rating matrix

In [40]:
Sub_aname=df_aname[df_aname['musicbrainz-artist-id'].isin(Rating_matrix.columns)]
Sub_aname.head()

Unnamed: 0,musicbrainz-artist-id,artist-name
0,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch
1,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte
2,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge
3,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking
4,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks


In [41]:
Sub_aname.sort_values(by='musicbrainz-artist-id',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [42]:
Sub_aname=Sub_aname.set_index('musicbrainz-artist-id')

In [43]:
Sub_aname.head()

Unnamed: 0_level_0,artist-name
musicbrainz-artist-id,Unnamed: 1_level_1
00034ede-a1f1-4219-be39-02f36853373e,o rappa
0004537a-4b12-43eb-a023-04009e738d2e,ultra naté
000842dd-08e9-485f-a9b6-8ada9f1c4a12,rumillajta
000b1990-4dd8-4835-abcd-bb6038c13ac7,hayden
000d90ec-d64c-48a1-b775-e726fd240e9f,get cape. wear cape. fly


# Recommendation 

In [44]:
# The train and test sets are obtained by dividing the Rating matrix in a half
row,column=Rating_matrix.shape
train=Rating_matrix[0:int(row*.5)]
test=Rating_matrix[int(row*.5):2*int(row*.5)]

In [45]:
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix

In [46]:
# Transform the train and test sets into the format coo and csr matrices

In [47]:
train_row,train_column=train.shape
train_coo = coo_matrix(train, shape=(train_row, train_column))

In [48]:
test_row,test_column=test.shape
test_coo = coo_matrix(test, shape=(test_row, test_column))

In [49]:
# Obtain the user feature matrices for the train and test sets

In [50]:
ufeature_train=df1[0:train_row]
ufeature_test=df1[train_row:2*train_row]

In [51]:
row1,column1=ufeature_train.shape
ufeafure_train_csr=csr_matrix(ufeature_train,shape=(row1,column1))

In [52]:
row2,column2=ufeature_test.shape
ufeafure_test_csr=csr_matrix(ufeature_test,shape=(row2,column2))

In [53]:
# Subset the female index and location from the user feature matrix

In [54]:
female_index=ufeature_train[ufeature_train['m']==0].index

In [55]:
female_list=[]
for i in female_index:
    female_list.append(train.index.get_loc(i))

In [56]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k,auc_score



In [66]:
# Parameters for training model
alpha = 5e-3
model = LightFM(no_components=50,learning_rate=0.001, loss='warp',learning_schedule='adagrad')

In [67]:
# Training the data set with the train set

In [68]:
model.fit(train_coo, user_features=ufeafure_train_csr,epochs=200)

<lightfm.lightfm.LightFM at 0x1f1826227f0>

In [69]:
train_precision = precision_at_k(model, train_coo, user_features=ufeafure_train_csr, k=5).mean()
test_precision = precision_at_k(model, test_coo,user_features=ufeafure_test_csr,k=5).mean()

In [70]:
# Precision for predicting top k recommendations 
train_precision, test_precision

(0.17405191, 0.17085829)

In [71]:
# AUC scores for train and test sets
train_auc = auc_score(model, train_coo,user_features=ufeafure_train_csr).mean()
test_auc = auc_score(model, test_coo,user_features=ufeafure_test_csr).mean()

In [72]:
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

AUC: train 0.86, test 0.72.


# Recommendation function

In [73]:
def recommendation(model, data, aname, user_ids_loc, user_id_name):

    """This function provides the top 5 recommendation of the users.
    argument:
    model = trained model,
    data = rating matrix
    aname = np.array stores the artist names,
    user_ids_loc = the required user index location,
    user_id_name = the required user id,
    """

    n_users, n_items = data.shape

    for index, user_id in enumerate(user_ids_loc):
        
        scores = model.predict(user_id, np.arange(n_items))
        top_items = aname[np.argsort(-scores)]
        
        print("User %s" % user_id_name[index])
    
        print("     Recommended:")
        
        for x in top_items[:5]:
            print("        %s" % x)

# Example: Top 5 recommendations for 5 female users

In [74]:
recommendation(model, train, Sub_aname.values, female_list[0:5], female_index)

User 00000c289a1829a808ac09c00daf10bc3c4e223b
     Recommended:
        ['fred astaire']
        ['michelle williams']
        ['mel tormé']
        ['blossom dearie']
        ['james morrison']
User 00001411dc427966b17297bf4d69e7e193135d89
     Recommended:
        ['the shins']
        ['death cab for cutie']
        ['the decemberists']
        ['sigur rós']
        ['iron & wine']
User 0000ef373bbd0d89ce796abae961f2705e8c1faf
     Recommended:
        ['radiohead']
        ['coldplay']
        ['the beatles']
        ['the cure']
        ['muse']
User 0000f687d4fe9c1ed49620fbc5ed5b0d7798ea20
     Recommended:
        ['radiohead']
        ['coldplay']
        ['the cure']
        ['portishead']
        ['the beatles']
User 0001bd96207f323b53652bf400702719ad456d3c
     Recommended:
        ['radiohead']
        ['coldplay']
        ['the cure']
        ['portishead']
        ['the beatles']
