In [536]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.feature_extraction.text import TfidfVectorizer

In [537]:
# Raise pandas' display limits so long frames and long sequences
# (e.g. an Index repr) are printed instead of being elided.
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_seq_items', 10000)


In [544]:
# Read both input tables from the shared data directory; in each file the
# first CSV column is used as the row index.
DATA_DIR = '../data'
album_df = pd.read_csv(f'{DATA_DIR}/album_df.csv', index_col=[0])
columns = pd.read_csv(f'{DATA_DIR}/column.csv', index_col=[0])


In [564]:
# Show the column labels of the table loaded from column.csv.
columns.columns

Index(['Alternative Rock', 'Art Rock', 'Progressive Rock', 'Conscious Hip Hop',
       'West Coast Hip Hop', 'Jazz Rap', 'Experimental Rock', 'Electronic',
       'Shoegaze', 'Noise Pop', 'Pop Rock', 'Abstract Hip Hop', 'Glam Rock',
       'Psychedelic Pop', 'New Wave', 'Post-Punk', 'Avant-Garde Jazz',
       'Third Stream', 'Spiritual Jazz', 'Dream Pop', 'Neo-Psychedelia',
       'Heavy Metal', 'Hard Rock', 'Modal Jazz', 'Cool Jazz',
       'East Coast Hip Hop', 'Boom Bap', 'Hardcore Hip Hop', 'Post-Rock',
       'Baroque Pop', 'Indie Folk', 'Indie Rock', 'Pop Rap', 'Hip Hop',
       'Post-Hardcore', 'Math Rock', 'Contemporary Folk', 'Singer/Songwriter',
       'Gothic Rock', 'Rock', 'Jangle Pop', 'Indie Pop', 'Folk Rock',
       'Jazz Fusion', 'Progressive Pop', 'Ambient', 'Trip Hop', 'Art Pop',
       'Art Punk', 'Instrumental Hip Hop', 'Experimental Hip Hop',
       'Plunderphonics', 'Symphonic Prog', 'Psychedelic Rock',
       'Psychedelic Folk', 'Lo-Fi / Slacker Rock', 'Blues Roc

In [550]:
# Preview the first rows of the album table.
album_df.head()

Unnamed: 0,Album,Artist,Genres,Secondary_Genres,Album_Descriptors
0,OK Computer,Radiohead,"Alternative Rock, Art Rock",,"melancholic, anxious, futuristic, male vocals,..."
1,Wish You Were Here,Pink Floyd,"Progressive Rock, Art Rock","Space Rock, Psychedelic Rock","melancholic, atmospheric, progressive, male vo..."
2,In the Court of the Crimson King,King Crimson,"Progressive Rock, Art Rock","Symphonic Prog, Jazz-Rock, Free Improvisation,...","fantasy, epic, progressive, complex, philosoph..."
3,To Pimp a Butterfly,Kendrick Lamar,"Conscious Hip Hop, West Coast Hip Hop, Jazz Rap","Political Hip Hop, Neo-Soul, Funk, Poetry, Exp...","political, conscious, poetic, protest, concept..."
4,Kid A,Radiohead,"Art Rock, Experimental Rock, Electronic","Ambient, Electronic, IDM","cold, melancholic, futuristic, anxious, atmosp..."


In [500]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [506]:
# Fit a TF-IDF model on each album's combined genre/descriptor text, using
# word uni-, bi- and tri-grams with English stop words removed.
# min_df=1 (keep every term) replaces min_df=0: the two are equivalent, but
# recent scikit-learn releases reject an integer min_df below 1.
# NOTE(review): 'Album_Genres_Descriptors' is not among the columns shown by
# album_df.head() earlier in this notebook — confirm the cell that builds it
# still exists and runs before this one on a fresh kernel.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(album_df['Album_Genres_Descriptors'])

In [421]:
# Dimensions of the fitted TF-IDF matrix: (documents, n-gram features).
tfidf_matrix.shape

(4739, 48539)

In [422]:

# Pairwise similarity between all albums. TfidfVectorizer L2-normalises each
# row by default, so the plain dot product computed by linear_kernel is the
# cosine similarity. With a single argument the matrix is compared to itself.
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix)

# Lookup table from album title to that album's row label in album_df.
Album = album_df['Album']
indices = pd.Series(data=album_df.index, index=Album)

In [430]:
# Check which columns album_df currently holds.
album_df.columns

Index(['Album', 'Artist', 'Album_Genres_Descriptors'], dtype='object')

In [509]:
def get_recommendations(df, column, value, value_list, limit=1):
    """Return the ``limit`` rows of ``df`` most similar to an ad-hoc query item.

    The query is described by ``value_list`` (e.g. genres and descriptors).
    It is appended as one extra row to a copy of ``df``, the
    ``Album_Genres_Descriptors`` text of every row is TF-IDF vectorized, and
    the rows are ranked by cosine similarity to the query row.

    Parameters
    ----------
    df : pandas.DataFrame
        Catalogue to search. Must contain ``column`` and
        ``Album_Genres_Descriptors``. The caller's frame is not modified.
    column : str
        Column whose values are reported for each recommended row.
    value : str
        Placeholder name written to the 'Album' and 'Artist' fields of the
        temporary query row. The query row never appears in the result.
    value_list : iterable
        Terms describing the query; each is converted with ``str`` and the
        terms are joined with ', '.
    limit : int, default 1
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``'index'`` (row position in ``df``; equal to the row label
        when ``df`` has a default integer index), ``'Album'`` (the value of
        ``column`` for that row — the name is kept for backward
        compatibility) and ``'Scores'`` (cosine similarity to the query),
        ordered from most to least similar.
    """
    # Turn the list of query terms into one comma-separated document.
    query_text = ', '.join(str(term) for term in value_list)

    # Append the query as the last row. pd.concat replaces DataFrame.append,
    # which was removed in pandas 2.0.
    query_row = pd.DataFrame([{'Album': value,
                               'Artist': value,
                               'Album_Genres_Descriptors': query_text}])
    df = pd.concat([df, query_row], ignore_index=True)

    # The query is always the last row. Addressing it by position avoids an
    # ambiguous label lookup when `value` already occurs in `column`.
    query_pos = len(df) - 1

    # Vectorize the descriptor text. min_df=1 keeps every term (same effect as
    # the former min_df=0, which newer scikit-learn rejects). Missing text is
    # treated as an empty document instead of crashing the vectorizer.
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
    tfidf_matrix = tf.fit_transform(df['Album_Genres_Descriptors'].fillna(''))

    # Only the query row's similarities are needed, so compare that single row
    # against all rows rather than building the full n x n matrix. The slice
    # keeps the row two-dimensional.
    scores = linear_kernel(tfidf_matrix[query_pos:query_pos + 1], tfidf_matrix).ravel()

    # Rank by descending score; the stable sort keeps ties in row order.
    # The query row is removed explicitly instead of assuming it ranks first.
    ranked = np.argsort(-scores, kind='stable')
    ranked = ranked[ranked != query_pos][:max(limit, 0)]

    return pd.DataFrame(
        {
            'index': ranked,
            'Album': df[column].iloc[ranked].to_numpy(),
            'Scores': scores[ranked],
        },
        columns=['index', 'Album', 'Scores'],
    )

In [526]:
# Describe a hypothetical album by genres and descriptors, then ask for the
# closest real albums in the catalogue.
query_terms = ['Thrash Metal', 'Acoustic Rock', 'energetic', 'male vocals',
               'Alternative Rock', 'Art Rock', 'melancholic']

# limit is passed explicitly: the function's default is 1, which would return
# a single row and leave nothing for .head() to preview.
recommendations = get_recommendations(album_df,
                                      'Album',
                                      'new-album',
                                      query_terms,
                                      limit=5)

recommendations.head()

Unnamed: 0,index,Album,Artist
0,3103,Live at the Fillmore,0.71147
1,3900,O monstro precisa de amigos,0.215864
2,3921,O monstro precisa de amigos,0.215864
3,4472,Innuendo,0.213631
4,0,OK Computer,0.210782


In [529]:
# Inspect one album row, selected by index label.
# NOTE(review): 1640 is not among the recommendation indices shown in the
# previous output — confirm which row this cell is meant to check.
print(album_df.loc[1640])

Album                                                              Plays Live
Artist                                                          Peter Gabriel
Album_Genres_Descriptors    Art Rock, Progressive Rock, Pop Rock, male vocals
Name: 1640, dtype: object
