# Music Dataset Recommender

### Import Libraries and data

In [1]:
import pandas as pd
import numpy as np
import spacy
import en_core_web_lg
import string
import pickle
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# #note: pitchfork.csv cleaned in Google Colab due to NLTK import issues to create pitchfork_cleaned
#using the following functions:

# # modified from: https://stackoverflow.com/questions/5511708/adding-words-to-nltk-stoplist
# stop_words = nltk.corpus.stopwords.words('english')
# additional_stopwords = ["“", "’", "”", "."]
# stop_words.extend(additional_stopwords)

# #function to take each review and set it as lowercase and remove stop words/punctuation
# def sentence_cleaner(sentence):
#     cleaner = set(stop_words + list(string.punctuation))
#     new_sentence = [i for i in word_tokenize(sentence.lower()) if i not in cleaner]
#     new_sentence = " ".join(new_sentence)
#     return new_sentence

# cleaned_list = []
# for i in range(len(music_df['content'])):
#     processed_sentence = sentence_cleaner(music_df['content'][i])
#     cleaned_list.append(processed_sentence)
    
# #create cleaned review dataframe
# cleaned_content_df = pd.DataFrame(cleaned_list, columns=['cleaned_content'])

# #dataframe with original text, cleaned text
# music_df_cleaned = music_df.join(cleaned_content_df, how='outer')

# #export dataframe
# music_df_cleaned.to_csv('pitchfork_cleaned.csv')

In [3]:
music_df = pd.read_csv('./data/clean/pitchfork_cleaned.csv', index_col=0)

In [4]:
music_df.shape

(18375, 10)

In [5]:
music_df.head(1)

Unnamed: 0,index,reviewid,score,title,artist,url,content,genre,year,cleaned_content
0,0,22703,9.3,mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,"“Trip-hop” eventually became a ’90s punchline,...",electronic,1998.0,trip-hop eventually became 90s punchline music...


In [6]:
music_df.drop_duplicates(subset=['content'], inplace=True)
music_df.dropna(subset=['artist'], inplace=True)
music_df.dropna(subset=['title'], inplace=True)
music_df.dropna(subset=['cleaned_content'], inplace=True)
music_df.reset_index(inplace=True)

### Data Cleaning

In [7]:
nlp = spacy.load('en_core_web_lg')

In [8]:
music_df['cleaned_content'].isnull().sum()

0

Note: these cells previous run and output as "spacy_list"

In [9]:
# spacy_list = []
# for i in range(len(music_df['content'])):
#     spacy_object = nlp(music_df['cleaned_content'][i])
#     spacy_list.append(spacy_object)
#     print(f'{i}')

In [10]:
# #save spacy_list as txt file to load in streamlit
# # https://stackoverflow.com/questions/899103/writing-a-list-to-a-file-with-python/899176
# with open('./streamlit_app/data/spacy_list.txt', 'w', encoding="utf-8") as f:
#     for item in spacy_list:
#         f.write("%s\n" % item)

In [11]:
# open txt file as spacy_list
# https://stackoverflow.com/questions/30523943/how-to-read-just-the-first-column-of-each-row-of-a-csv-file
with open('./data/clean/spacy_list.txt', encoding="utf-8") as f:
    spacy_list = [line.split(',')[0] for line in f]

In [12]:
# music_df.to_pickle('./streamlit_app/data/music.pk1')

### Create Recommender Function

In [13]:
#recomender with genre filter
def recomender(input):
    recomender_list = []
    genre = music_df['genre'][input]
    for i in range(len(spacy_list)):
        vector = nlp(spacy_list[i]).similarity(nlp(spacy_list[input]))
        recomender_list.append(vector)
        recomender_df = pd.DataFrame(recomender_list)
        recomender_df['artist'] = music_df['artist']
        recomender_df['title'] = music_df['title']
        recomender_df['genre'] = music_df['genre']
        recomender_df = recomender_df.sort_values(by=0, ascending=False)
        recomender_df = recomender_df[recomender_df['genre'] == genre]
    return recomender_df[0:6]

In [14]:
recomender(2171)

Unnamed: 0,0,artist,title,genre
2171,1.0,kendrick lamar,to pimp a butterfly,rap
2557,0.985875,run the jewels,run the jewels 2,rap
1081,0.985583,"macklemore, ryan lewis",this unruly mess i've made,rap
4178,0.985397,kanye west,yeezus,rap
1152,0.984933,"vic spencer, chris crack",who the fuck is chris spencer??,rap
4919,0.984865,the coup,sorry to bother you,rap


In [30]:
recomender(4278)

Unnamed: 0,0,artist,title,genre
4278,1.0,daft punk,random access memories,electronic
612,0.990074,the avalanches,wildflower,electronic
4459,0.98717,goldie,the alchemist: the best of goldie 1992-2012,electronic
707,0.987001,cornelius,fantasma,electronic
2633,0.986076,underworld,dubnobasswithmyheadman (20th anniversary remas...,electronic
95,0.98546,justice,woman,electronic


In [31]:
recomender(3391)

Unnamed: 0,0,artist,title,genre
3391,1.0,beck,morning phase,rock
6379,0.9891,screaming trees,last words: the final recordings,rock
15527,0.988082,starflyer 59,old,rock
17988,0.988054,grandaddy,the sophtware slump,rock
8664,0.987978,radiohead,amnesiac: special collectors edition,rock
1984,0.987834,elvis depressedly,new alhambra,rock


In [32]:
recomender(18286)

Unnamed: 0,0,artist,title,genre
18286,1.0,red hot chili peppers,californication,rock
6299,0.975886,red hot chili peppers,i'm with you,rock
15792,0.971209,verbena,la musica negra,rock
16364,0.968323,gary wilson,forgotten lovers,rock
16300,0.967378,the warlocks,phoenix album,rock
18127,0.967179,the smashing pumpkins,machina/the machines of god,rock


In [36]:
recomender(6523)

Unnamed: 0,0,artist,title,genre
6523,1.0,bon iver,"bon iver, bon iver",rock
6268,0.98875,girls,"father, son, holy ghost",rock
8563,0.98767,volcano choir,unmap,rock
8463,0.987592,atlas sound,logos,rock
11410,0.9871,the clientele,god save the clientele,rock
9459,0.987001,animal collective,merriweather post pavilion,rock


### Add polarity and subjectivity to recommender

In [21]:
analyzer = SentimentIntensityAnalyzer()
sentiment_list = []
polarity_list = []
for i in range(len(music_df)):
    text = music_df['cleaned_content'][i]
    sentiment_score = TextBlob(text).sentiment[0]
    polarity_score = analyzer.polarity_scores(text)['compound']
    sentiment_list.append(sentiment_score)
    polarity_list.append(polarity_score)

In [22]:
sentiment_df = pd.DataFrame(sentiment_list, columns=['sentiment_score'])
polarity_df = pd.DataFrame(polarity_list, columns=['polarity_score'])
music_df = music_df.join(sentiment_df, how='outer')
music_df = music_df.join(polarity_df, how='outer')

In [23]:
#recomender with genre filter + polarity and sentiment engineered features
def recomender_sentiment(input):
    recomender_list = []
    genre = music_df['genre'][input]
    for i in range(len(spacy_list)):
        vector = nlp(spacy_list[i]).similarity(nlp(spacy_list[input]))
        recomender_list.append(vector)
        recomender_df = pd.DataFrame(recomender_list)
        recomender_df['artist'] = music_df['artist']
        recomender_df['title'] = music_df['title']
        recomender_df['genre'] = music_df['genre']
        recomender_df['sentiment_score'] = music_df['sentiment_score']
        recomender_df['polarity_score'] = music_df['polarity_score']
        recomender_df['engineered_rec'] = recomender_df[0] + recomender_df['sentiment_score'] + recomender_df['polarity_score']
        recomender_df = recomender_df.sort_values(by='engineered_rec', ascending=False)
        recomender_df = recomender_df[recomender_df['genre'] == genre]
    return recomender_df[0:6]

In [24]:
# #return index value for artist name
# def get_artist_name(artist)
#     which(sapply(music_df, function(x) any(x == "artist")))

In [25]:
recomender_sentiment(2171)

Unnamed: 0,0,artist,title,genre,sentiment_score,polarity_score,engineered_rec
14201,0.950983,pete rock,the surviving elements: from the soul survivor...,rap,0.330952,0.982,2.263935
2648,0.949415,"birdman, young thug, rich homie quan",tha tour part 1,rap,0.302375,0.9973,2.24909
16494,0.961187,rhettmatic,the wedding mixer,rap,0.292799,0.9925,2.246487
12176,0.961189,missy elliott,respect m.e.,rap,0.266222,0.9977,2.225111
8123,0.959465,"freeway, jake one",the stimulus package,rap,0.264847,0.9991,2.223412
1541,0.966105,young thug,slime season,rap,0.239374,0.998,2.203479


In [26]:
recomender_sentiment(4278)

Unnamed: 0,0,artist,title,genre,sentiment_score,polarity_score,engineered_rec
1267,0.974239,kyle hall,from joy,electronic,0.300501,0.9986,2.27334
1644,0.969038,the foreign exchange,tales from the land of milk and honey,electronic,0.301649,0.9975,2.268187
6232,0.96173,tropics,parodia flare,electronic,0.282197,0.9906,2.234526
17817,0.963224,boards of canada,in a beautiful place out in the country ep,electronic,0.265983,0.9898,2.219007
17605,0.976854,schlammpeitziger,collected simplesongs of my temporary past,electronic,0.24505,0.9971,2.219004
18344,0.952562,looper,up a tree,electronic,0.273148,0.989,2.21471
