In [1]:
import pandas as pd

In [2]:
# Load the user/artist listening records from a tab-separated file, resolved
# relative to the current working directory.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError, so
# 'user_artists.dat' must be placed next to the notebook (or the path adjusted).
user_artists = pd.read_csv('user_artists.dat', sep='\t')
user_artists.head()

FileNotFoundError: [Errno 2] File b'user_artists.dat' does not exist: b'user_artists.dat'

In [None]:
# Load the artist lookup table (tab-separated). Its 'id' column is renamed to
# 'artistID' further down so it can be merged with user_artists.
artists = pd.read_csv('artists.dat', sep='\t')
artists.head()

In [None]:
# Raise pandas' display limits to 500 rows / 500 columns for rendered frames.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# Optional tweaks, currently disabled:
# pd.set_option('display.width', 1000)
# NOTE(review): newer pandas expects None instead of -1 here -- confirm before enabling.
#pd.set_option('display.max_colwidth', -1)


In [None]:
# Give the artist table the same key name ('artistID') that the merge below
# joins on; only the 'id' column label changes.
artists = artists.rename(columns={'id': 'artistID'})
artists.head()

In [None]:
# Inner-join the artist table with the user/artist records on the shared
# 'artistID' key (artist columns first, as before).
user_and_artists = artists.merge(user_artists, on='artistID')
user_and_artists.head()

In [None]:
user_and_artists.shape

In [None]:
# Number of distinct (non-null) artist names in the merged table.
user_and_artists['name'].nunique()

In [None]:
user_and_artists = user_and_artists.sort_values(by='userID')
user_and_artists.head()

In [None]:
user_and_artists.to_csv('user_and_artists.csv', index=False)

In [None]:
# Read back the CSV written by the previous cell (presumably a round-trip check).
# NOTE(review): `df` is not referenced again in the visible notebook.
df = pd.read_csv('user_and_artists.csv')
df.head()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

user_and_artists['weight'].hist(bins=3)

In [None]:
user_and_artists['weight'].describe()

In [None]:
# Infer a 1-10 rating for each (user, artist) row from how many times the user
# listened to the artist ('weight').  Rating k (1-9) covers weights up to and
# including RATING_UPPER_BOUNDS[k - 1]; anything above the last bound gets 10.
RATING_UPPER_BOUNDS = [25.0, 50.0, 107.0, 175.0, 260.0, 400.0, 614.0, 1000.0, 5000.0]

# Vectorized bucketing instead of a Python loop over every row.
# side='left' returns the number of bounds strictly below each weight, so a
# weight equal to a bound lands in the lower bucket (same as `weight <= bound`).
ratings = np.searchsorted(RATING_UPPER_BOUNDS, user_and_artists['weight'].values, side='left') + 1

user_and_artists['artistRatings'] = ratings
user_and_artists.head()

In [None]:
user_and_artists['artistRatings'].value_counts(normalize=True)

### Recommendations based on ratings counts

In [None]:
# Count the ratings each artist received, then list the most-rated artists.
rating_count = (
    user_and_artists
    .groupby('name')['artistRatings']
    .count()
    .to_frame()
)
rating_count.sort_values('artistRatings', ascending=False).head()

For the last.fm dataset, the artist that received the most rating counts is Lady Gaga.  Something that binds every artist in this list, with the exception of The Beatles, is that they're all pop singers.

### Recommendations based on correlations

We use Pearson's R correlation coefficient.  First we need to find out the average rating, and the number of ratings each artist received.

In [None]:
# Mean inferred rating per artist, shown from highest to lowest.
average_rating = (
    user_and_artists
    .groupby('name')['artistRatings']
    .mean()
    .to_frame()
)
average_rating.sort_values(by='artistRatings', ascending=False).head(50)

In [None]:
# Attach the number of ratings per artist. The grouped counts are indexed by
# 'name', so they line up with average_rating's index on assignment.
ratings_per_artist = user_and_artists.groupby('name')['artistRatings'].count()
average_rating['ratingCount'] = ratings_per_artist
average_rating.sort_values('ratingCount', ascending=False).head()

In this dataset, the artist that received the most rating counts was not highly rated at all.  As a result, if we were to use recommendations based on rating counts, we would definitely make mistakes here.  So we need a better system.

### Ratings Matrix

We convert the ratings table to a 2D matrix.  The matrix will be sparse because not every user rated every artist.

In [None]:
user_and_artists.head()

In [None]:
# This is the core of the correlation method: a userID x artist-name matrix of
# inferred ratings (NaN where a user has no rating for an artist).
#
# Passing values='artistRatings' pivots only the column we need.  Pivoting with
# no `values` first builds a users x artists table for every remaining column
# and then discards all but one, which costs far more memory for the same result.
user_and_artists_pivot = user_and_artists.pivot(index='userID', columns='name', values='artistRatings')
userID = user_and_artists_pivot.index
name = user_and_artists_pivot.columns
print(user_and_artists_pivot.shape)
user_and_artists_pivot.head()

Let's find out which artists are correlated with Katy Perry.  

In [None]:
# Correlate the 'Katy Perry' ratings column with every artist column of
# user_and_artists_pivot, which accounts for users with similar tastes in music.
katy_ratings = user_and_artists_pivot['Katy Perry']
similar_to_katy = user_and_artists_pivot.corrwith(katy_ratings)

# Artists whose correlation is undefined come back as NaN and are dropped.
corr_katy = similar_to_katy.to_frame(name='pearson').dropna()
corr_katy.head()

In [None]:
corr_katy.head(10)

In [None]:
corr_summary = corr_katy.join(average_rating['ratingCount'])
corr_summary.head(10)

In [None]:
corr_summary.shape

In [None]:
# Our results: keep artists with at least 100 ratings, then rank by correlation.
has_enough_ratings = corr_summary['ratingCount'] >= 100
corr_summary.loc[has_enough_ratings].sort_values('pearson', ascending=False).head(10)

### How accurate are our results for Katy Perry?

In [None]:
from IPython.display import HTML

# Render an embedded YouTube player (560x315) in the cell output.
HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/KlyXNRrsk4A" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>')


In [None]:
# Duran Duran

HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/nTizYn3-QN0" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>')

In [None]:
# Queen

HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/fJ9rUzIMcZQ" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>')

#### These aren't necessarily the best recommendations in terms of similarity, but may be for people with eclectic musical tastes.

In [None]:
pwd

In [None]:
# Same correlation recipe as for Katy Perry, this time against the
# '30 Seconds to Mars' ratings column.
mars_ratings = user_and_artists_pivot['30 Seconds to Mars']
similar_to_mars = user_and_artists_pivot.corrwith(mars_ratings)

# Artists whose correlation is undefined come back as NaN and are dropped.
corr_mars = similar_to_mars.to_frame(name='pearson').dropna()
corr_mars.head()

In [None]:
# Add rating counts, keep artists with at least 100 ratings, rank by correlation.
corr_summary = corr_mars.join(average_rating['ratingCount'])
well_rated = corr_summary['ratingCount'] >= 100
corr_summary.loc[well_rated].sort_values('pearson', ascending=False).head(10)

In [None]:
# These recommendations seem a little better, although I'm not sure how Jennifer Lopez snuck up into the top-ten here.

### Collaborative Filtering Using k-Nearest Neighbors

The kNN method finds the artists whose rating vectors are closest to a given artist's, based on the ratings they share from common users.

In [None]:
user_and_artists.head()

In [None]:
# Two-column table: artist 'name' and how many ratings it has ('totalRatingCount').
artist_ratingCount = (
    user_and_artists
    .groupby('name')['artistRatings']
    .count()
    .rename('totalRatingCount')
    .reset_index()
)
artist_ratingCount.head()


In [None]:
artist_ratingCount.head(25)

In [None]:
# Left-join every rating row onto its artist's total rating count via 'name'.
rating_with_artistRatingCount = pd.merge(
    artist_ratingCount,
    user_and_artists,
    how='left',
    on='name',
)
rating_with_artistRatingCount.head()

In [None]:
print(artist_ratingCount['totalRatingCount'].describe())

The median artist has been rated only once.  Let's look at the top of the distribution:

In [None]:
import numpy as np

print(artist_ratingCount['totalRatingCount'].quantile(np.arange(.9, 1, .01)))

About 1% of the artists received 81 or more ratings.  

Because we don't have as much data as we think we need, we will try to use most of the data.

In [None]:
artist_ratingCount.shape

In [None]:
# Keep only rows whose artist has at least `popularity_threshold` ratings.
popularity_threshold = 10

is_popular = rating_with_artistRatingCount['totalRatingCount'] >= popularity_threshold
rating_popular_artist = rating_with_artistRatingCount[is_popular]
rating_popular_artist.head()

In [None]:
rating_popular_artist.shape

### Implementing kNN

We convert our table to a 2D matrix, and fill the missing values with zeros (since we will calculate distances between rating vectors).  We then transform the values (ratings) of the matrix dataframe into a scipy sparse matrix for more efficient calculations.

### Finding the Nearest Neighbors

We use unsupervised algorithms with sklearn.neighbors.  The algorithm we use to compute the nearest neighbors is "brute", and we specify "metric=cosine" so that the algorithm will calculate the cosine similarity between rating vectors.  And then we fit the model.

In [None]:
from scipy.sparse import csr_matrix

In [None]:
# Build the artist x user rating matrix: one row per artist name, one column per
# userID, with 0 filled in wherever a user has no rating for an artist.
# NOTE(review): pivot raises if any (name, userID) pair occurs more than once --
# confirm that artist names are unique per artistID.
rating_popular_artist_pivot = rating_popular_artist.pivot(index='name', columns='userID', values='artistRatings').fillna(0)
rating_popular_artist_pivot.head()

In [None]:
rating_popular_artist_matrix = csr_matrix(rating_popular_artist_pivot.values)
type(rating_popular_artist_matrix)

In [None]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(rating_popular_artist_matrix)

### Test our model and make some recommendations:

In this step, the kNN model measures the cosine distance between rating vectors to determine the "closeness" of artists.  Because the model is unsupervised, nothing is classified: for a query artist it simply returns the nearest neighbors, which we present as recommendations.

In [None]:
rating_popular_artist_pivot.shape

In [None]:
query_index = np.random.choice(rating_popular_artist_pivot.shape[0]) # picks a random index number
query_index

In [None]:
# Ask for 6 neighbours of the query artist's rating vector. Position 0 is not
# printed (presumably the query artist itself -- confirm), leaving 5 results.
query_vector = rating_popular_artist_pivot.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

neighbor_positions = indices.flatten()
neighbor_distances = distances.flatten()

print('Recommendations for {0}:\n'.format(rating_popular_artist_pivot.index[query_index]))
for rank in range(1, len(neighbor_distances)):
    print('{0}: {1}, with distances of {2}:'.format(rank, rating_popular_artist_pivot.index[neighbor_positions[rank]], neighbor_distances[rank]))

In [None]:
# Pick another random artist row and list its nearest neighbours. Position 0 of
# the result is not printed (presumably the query artist itself -- confirm).
query_index = np.random.choice(rating_popular_artist_pivot.shape[0])
query_vector = rating_popular_artist_pivot.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

neighbor_positions = indices.flatten()
neighbor_distances = distances.flatten()

print('Recommendations for {0}:\n'.format(rating_popular_artist_pivot.index[query_index]))
for rank in range(1, len(neighbor_distances)):
    print('{0}: {1}, with distances of {2}:'.format(rank, rating_popular_artist_pivot.index[neighbor_positions[rank]], neighbor_distances[rank]))

In [None]:
# I think these are pretty decent.

### Collaborative Filtering Using Matrix Factorization

Matrix Factorization is simply a mathematical tool for playing around with matrices.  Matrix Factorization techniques are usually more effective because they allow users to discover the latent (hidden) features underlying the interactions between users and items (artists).

We use singular value decomposition (SVD) - one of the Matrix Factorization models for identifying latent factors.

Similar to kNN we convert our rating_popular_artist table into a 2D matrix (called a utility matrix here) and fill the missing values with zeros.

In [None]:
rating_popular_artist_pivot_2 = rating_popular_artist.pivot(index='userID', columns='name', values='artistRatings').fillna(0)
rating_popular_artist_pivot_2.head()

We transpose this utility matrix, so that the artist names become rows and the userIDs become columns.  

In [None]:
rating_popular_artist_pivot_2.shape

In [None]:
X = rating_popular_artist_pivot_2.values.T
X.shape

In [None]:
rating_popular_artist_pivot_2.T.head()

In [None]:
from sklearn.decomposition import TruncatedSVD

# Reduce each row of X (one row per artist after the transpose above) to 12
# components; random_state is fixed so the decomposition is repeatable.
SVD = TruncatedSVD(n_components=12, random_state=42)
matrix = SVD.fit_transform(X)
matrix.shape

We calculate the Pearson's R correlation coefficient for every artist pair in our final matrix.  To compare this with the results from kNN, we pick the same artist: 'Bruno Mars' to find the artists that have high correlation coefficients (between 0.9 and 1.0) with it.

In [None]:
import warnings
# NOTE(review): this filter stays active for the rest of the kernel session, so
# RuntimeWarnings raised by later cells are hidden as well.
warnings.filterwarnings("ignore", category=RuntimeWarning)
# Pairwise Pearson correlation between the rows of `matrix` (one row per artist).
corr = np.corrcoef(matrix)
corr.shape

In [None]:
# Artist names in the column order of the utility matrix, plus a plain Python
# list of the same names for .index() lookups.
rating_popular_artist_name = rating_popular_artist_pivot_2.columns
rating_popular_artist_list = rating_popular_artist_name.tolist()


In [None]:
Bruno_Mars = rating_popular_artist_list.index("Bruno Mars")

In [None]:
Bruno_Mars

In [None]:
corr_Bruno_Mars = corr[Bruno_Mars]

In [None]:
corr_Bruno_Mars

In [None]:
list(rating_popular_artist_name[(corr_Bruno_Mars<1.0) & (corr_Bruno_Mars>0.9)])

In [None]:
# Another one: 2NE1

kPop = rating_popular_artist_list.index("2NE1")

kPop

In [None]:
kPop = rating_popular_artist_list.index("2NE1")
corr_kPop = corr[kPop]
list(rating_popular_artist_name[(corr_kPop<1.0) & (corr_kPop>0.9)])

In [None]:
print(list(rating_popular_artist_pivot_2.columns))

In [None]:
# One-column frame of the artist names that passed the popularity filter.
# NOTE(review): this rebinds `artists`, which earlier held the table loaded from
# 'artists.dat'; re-running the earlier rename/merge cells after this one will
# no longer operate on the original artist table.
artists = pd.DataFrame(list(rating_popular_artist_pivot_2.columns))
artists.head()

In [None]:
artists.to_csv('artist_list.csv', index=False)

In [None]:
MCR = rating_popular_artist_list.index("My Chemical Romance")
corr_MCR = corr[MCR]
list(rating_popular_artist_name[(corr_MCR<1.0) & (corr_MCR>0.9)])

In [None]:
# This seems magical even after studying Data Science for 6 months

In [None]:
pwd

In [None]:
ADI = rating_popular_artist_list.index("At the Drive-In")
corr_ADI = corr[ADI]
list(rating_popular_artist_name[(corr_ADI<1.0) & (corr_ADI>0.85)])

In [None]:
NSYNC = rating_popular_artist_list.index("*NSYNC")
corr_NSYNC = corr[NSYNC]
list(rating_popular_artist_name[(corr_NSYNC<1.0) & (corr_NSYNC>0.9)])

In [None]:
stones = rating_popular_artist_list.index("12 Stones")
corr_stones = corr[stones]
list(rating_popular_artist_name[(corr_stones<1.0) & (corr_stones>0.9)])

In [None]:
tupac = rating_popular_artist_list.index("2Pac")
corr_tupac = corr[tupac]
list(rating_popular_artist_name[(corr_tupac<1.0) & (corr_tupac>0.9)])

In [None]:
kPop = rating_popular_artist_list.index("2NE1")
corr_kPop = corr[kPop]
list(rating_popular_artist_name[(corr_kPop<1.0) & (corr_kPop>0.9)])

In [None]:
doors_down = rating_popular_artist_list.index("3 Doors Down")
corr_doors_down = corr[doors_down]
list(rating_popular_artist_name[(corr_doors_down<1.0) & (corr_doors_down>0.85)])

In [None]:
to_mars = rating_popular_artist_list.index("30 Seconds to Mars")
corr_to_mars = corr[to_mars]
list(rating_popular_artist_name[(corr_to_mars<1.0) & (corr_to_mars>0.85)])

In [None]:
three_eleven = rating_popular_artist_list.index("311")
corr_three_eleven = corr[three_eleven]
list(rating_popular_artist_name[(corr_three_eleven<1.0) & (corr_three_eleven>0.9)])

In [None]:
fifty = rating_popular_artist_list.index("50 Cent")
corr_fifty = corr[fifty]
list(rating_popular_artist_name[(corr_fifty<1.0) & (corr_fifty>0.9)])

In [None]:
seagulls = rating_popular_artist_list.index("A Flock of Seagulls")
corr_seagulls = corr[seagulls]
list(rating_popular_artist_name[(corr_seagulls<1.0) & (corr_seagulls>0.99)])

In [None]:
circle = rating_popular_artist_list.index("A Perfect Circle")
corr_circle = corr[circle]
list(rating_popular_artist_name[(corr_circle<1.0) & (corr_circle>0.85)])

In [None]:
# Unfinished lookup cell: the artist name was left as the empty string.  Unless an
# artist is literally named "", list.index("") raises ValueError and stops a
# "Run All".  Set `artist_name` to a real artist to use this cell; a name that
# is not in the list now produces an empty result instead of an exception.
artist_name = ""
if artist_name in rating_popular_artist_list:
    fifty = rating_popular_artist_list.index(artist_name)
    corr_fifty = corr[fifty]
    # Strong positive correlation, excluding values of exactly 1.0.
    similar_artists = list(rating_popular_artist_name[(corr_fifty<1.0) & (corr_fifty>0.9)])
else:
    similar_artists = []
similar_artists