In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF, LatentDirichletAllocation as LDA
from nltk.stem import PorterStemmer, WordNetLemmatizer

### DATA

In [None]:
# Read the Spotify songs CSV from the working directory and print a column summary.
df = pd.read_csv(filepath_or_buffer='spotify_songs.csv')
df.info()

In [None]:
# Keep only the rows whose `language` column is 'en'.
is_english = df['language'] == 'en'
df1 = df[is_english]

In [None]:
# Drop rows whose lyrics repeat an earlier row, then renumber the index from 0
# so that row labels and row positions coincide in the cells below.
df1 = df1.drop_duplicates(subset=['lyrics'])
df1 = df1.reset_index(drop=True)

In [None]:
# Cast the lyrics column to str before any text processing is applied to it.
lyrics_as_text = df1['lyrics'].astype(str)
df1['lyrics'] = lyrics_as_text

### TFIDF - Cleaning Lyrics

In [None]:
# Lemmatizer used to reduce each token to its dictionary form.
wordnet = WordNetLemmatizer()

# NLTK's English stop words plus lyric-specific filler words.
# Kept as a list because the same object is later passed to TfidfVectorizer(stop_words=...).
stop_words = stopwords.words("english")
stop_words.extend(['oh','yeah','na','ya','us','make','oooo','right','gotta','wo','ong','ou','x5','uhm','em','x8','x2','x4','gonna',
                   'oooooooh','wanna','wooo','go','back','said','come','things','get','one','place','would','like','cause'])
# Set copy of the list so the per-token membership test below is O(1).
stop_word_set = set(stop_words)


def clean_lyrics(text):
    """Return `text` lowercased, stripped of punctuation, stop words removed, and lemmatized.

    Parameters
    ----------
    text : str
        Raw lyrics of one song.

    Returns
    -------
    str
        Space-separated cleaned tokens.
    """
    # Replace everything that is not a letter, digit or whitespace with a space.
    # Raw string: `\d` / `\s` are regex escapes, not Python string escapes.
    alnum_only = re.sub(r"[^a-zA-Z\d\s]", " ", text)
    # Lowercase BEFORE filtering: the stop-word list is lowercase, so capitalised
    # tokens ("Yeah", "The") would otherwise never be recognised as stop words.
    tokens = alnum_only.lower().split()
    return " ".join(wordnet.lemmatize(token) for token in tokens if token not in stop_word_set)


# Overwrite the raw lyrics with their cleaned form.
df1['lyrics'] = df1['lyrics'].apply(clean_lyrics)

In [None]:
# TF-IDF vectorizer that ignores the extended stop-word list.
# Alternative configuration left by the author:
#   TfidfVectorizer(stop_words=stop_words, sublinear_tf=True, norm='l2')
tfidf = TfidfVectorizer(stop_words=stop_words)
# Learn the vocabulary from the cleaned lyrics and encode each song as one TF-IDF row.
lyrics_corpus = df1['lyrics']
tfidf_matrix = tfidf.fit_transform(lyrics_corpus)
# Display the matrix dimensions (documents x vocabulary terms).
tfidf_matrix.shape

### Recommender

In [None]:
class SongRecommender:
    """Print the closest matches for a song from a precomputed similarity lookup.

    The lookup maps a track name to a sequence of entries indexed as
    ``entry[0]`` = similarity score, ``entry[1]`` = track name,
    ``entry[2]`` = track artist. Entries are used in the order they are stored.
    """

    def __init__(self, matrix):
        """Keep a reference to ``matrix``, the track-name -> similar-songs lookup."""
        self.matrix_similar = matrix

    def _print_message(self, song, artist, recom_song):
        """Print a header line followed by a numbered entry for each recommendation."""
        total = len(recom_song)

        print(f'The {total} recommended songs for {song} by {artist} are:')
        for position, entry in enumerate(recom_song, start=1):
            score, title, performer = entry[0], entry[1], entry[2]
            print(f"Number {position}:")
            print(f"{title} by {performer} with {score} similarity score")
            print("--------------------")

    def recommend(self, recommendation):
        """Look up and print the recommendations for one request.

        ``recommendation`` is a mapping with the keys ``track_name``,
        ``track_artist`` and ``number_songs``. A ``KeyError`` propagates when a
        key is missing or the track name is not in the lookup.
        """
        # Read the three request fields in a fixed order.
        song, artist, number_songs = (
            recommendation[field]
            for field in ('track_name', 'track_artist', 'number_songs')
        )
        # Leading `number_songs` stored entries for this track name.
        top_matches = self.matrix_similar[song][:number_songs]
        self._print_message(song=song, artist=artist, recom_song=top_matches)

### Recommendations with TFIDF

In [None]:
# Pairwise cosine similarity between the TF-IDF rows of all songs.
cosine_similarities = cosine_similarity(X=tfidf_matrix)

In [None]:
# Empty lookup: track name -> list of (score, track name, track artist) neighbours.
similarities = dict()

In [None]:
# For every song, store its 50 most similar songs (by TF-IDF cosine similarity) as
# (score, track name, track artist) tuples, best match first.
# NOTE: the lookup is keyed by track name, so songs sharing a title overwrite each other.
N_SIMILAR = 50
# Plain arrays give positional access regardless of the DataFrame's index labels.
track_names = df1['track_name'].to_numpy()
track_artists = df1['track_artist'].to_numpy()
for i in range(len(cosine_similarities)):
    scores = cosine_similarities[i]
    # Best-scoring positions in descending order; one extra so that N_SIMILAR
    # remain after the query song itself is removed.
    ranked = scores.argsort()[::-1][:N_SIMILAR + 1]
    # Remove the query song by position instead of assuming it is ranked first:
    # tied scores or an all-zero vector can place another song ahead of it.
    neighbours = [x for x in ranked if x != i][:N_SIMILAR]
    similarities[track_names[i]] = [(scores[x], track_names[x], track_artists[x]) for x in neighbours]

In [None]:
# Recommender backed by the TF-IDF neighbour lookup.
recommedations = SongRecommender(matrix=similarities)

In [None]:
## Lady Gaga - Just Dance
# Title and artist are both read from row 2507 so the request describes a single song
# (the same row the later LDA / audio-feature cells use for this track).
song_position = 2507
recommendation = {
    "track_name": df1['track_name'].iloc[song_position],
    "track_artist": df1['track_artist'].iloc[song_position],
    "number_songs": 20
}

## Recommendations
recommedations.recommend(recommendation)

In [None]:
## Coldplay - The Scientist
# Request the 10 leading TF-IDF neighbours of the song stored at row 12313.
recommendation1 = dict(
    track_name=df1['track_name'].iloc[12313],
    track_artist=df1['track_artist'].iloc[12313],
    number_songs=10,
)
## Recommendations
recommedations.recommend(recommendation1)

### Recommendations with LDA

In [None]:
# 30-topic LDA model. random_state is pinned so that the fitted topics, and every
# recommendation derived from them, are reproducible across kernel restarts.
lda = LDA(n_components=30, random_state=42)

In [None]:
## Creating the topics
# Fit the LDA model on the TF-IDF matrix and get one row of topic weights per song.
topic_values = lda.fit_transform(X=tfidf_matrix)
# Display the dimensions of the fitted topic-term matrix.
lda.components_.shape

In [None]:
## Creating a data frame with topic values
# One row per song; columns get the default integer labels 0, 1, 2, ...
topics_df = pd.DataFrame(data=topic_values)
topics_df

In [None]:
## Joining dataframes
# Index-aligned join that appends the topic columns to the song rows.
df1_wtopics = df1.join(other=topics_df)
df1_wtopics

In [None]:
## Using topic values to calculate cosine similarities
# Select the topic-weight columns, which carry the integer labels 0..29.
topic_columns = list(range(30))
X = df1_wtopics[topic_columns]

In [None]:
# Empty neighbour lookup for the topic-based recommender, and the pairwise
# cosine similarity between the songs' topic vectors.
similarities1 = dict()
cosine_similarities1 = cosine_similarity(X=X)

In [None]:
# For every song, store its 50 most similar songs (by topic-vector cosine similarity) as
# (score, track name, track artist) tuples, best match first.
# NOTE: the lookup is keyed by track name, so songs sharing a title overwrite each other.
N_SIMILAR = 50
# Plain arrays give positional access regardless of the DataFrame's index labels.
track_names = df1_wtopics['track_name'].to_numpy()
track_artists = df1_wtopics['track_artist'].to_numpy()
for i in range(len(cosine_similarities1)):
    scores = cosine_similarities1[i]
    # Best-scoring positions in descending order; one extra so that N_SIMILAR
    # remain after the query song itself is removed.
    ranked = scores.argsort()[::-1][:N_SIMILAR + 1]
    # Remove the query song by position instead of assuming it is ranked first:
    # songs with identical topic vectors tie with it and may be sorted ahead.
    neighbours = [x for x in ranked if x != i][:N_SIMILAR]
    similarities1[track_names[i]] = [(scores[x], track_names[x], track_artists[x]) for x in neighbours]

In [None]:
# Recommender backed by the LDA topic neighbour lookup.
recommedations1 = SongRecommender(matrix=similarities1)

In [None]:
## Lady Gaga - Just Dance
# Request the 10 leading topic-based neighbours of the song stored at row 2507.
recommendation_lda = dict(
    track_name=df1_wtopics['track_name'].iloc[2507],
    track_artist=df1_wtopics['track_artist'].iloc[2507],
    number_songs=10,
)
recommedations1.recommend(recommendation_lda)

In [None]:
## Coldplay - The Scientist
# Request the 10 leading topic-based neighbours of the song stored at row 12313.
recommendation_lda1 = dict(
    track_name=df1_wtopics['track_name'].iloc[12313],
    track_artist=df1_wtopics['track_artist'].iloc[12313],
    number_songs=10,
)
recommedations1.recommend(recommendation_lda1)

### Recommendations with Audio Features

In [None]:
## Audio Features
# Columns used as the audio-feature representation of each song.
audio_columns = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
]
Audio = df1_wtopics[audio_columns]

In [None]:
# Standardise each audio column (zero mean, unit variance) before measuring similarity.
# NOTE(review): the columns presumably live on very different scales (e.g. tempo and
# loudness versus ratio-like features) — confirm with Audio.describe(). On raw values
# the cosine similarity would be dominated by the large-magnitude columns.
audio_scaled = StandardScaler().fit_transform(Audio)
# Pairwise cosine similarity between the standardised audio-feature vectors.
cosine_similarities_Audio = cosine_similarity(audio_scaled)
# Empty neighbour lookup for the audio-based recommender.
similarities_Audio = {}

In [None]:
# For every song, store its 50 most similar songs (by audio-feature cosine similarity) as
# (score, track name, track artist) tuples, best match first.
# NOTE: the lookup is keyed by track name, so songs sharing a title overwrite each other.
N_SIMILAR = 50
# Plain arrays give positional access regardless of the DataFrame's index labels.
track_names = df1_wtopics['track_name'].to_numpy()
track_artists = df1_wtopics['track_artist'].to_numpy()
for i in range(len(cosine_similarities_Audio)):
    scores = cosine_similarities_Audio[i]
    # Best-scoring positions in descending order; one extra so that N_SIMILAR
    # remain after the query song itself is removed.
    ranked = scores.argsort()[::-1][:N_SIMILAR + 1]
    # Remove the query song by position instead of assuming it is ranked first:
    # songs with identical audio features tie with it and may be sorted ahead.
    neighbours = [x for x in ranked if x != i][:N_SIMILAR]
    similarities_Audio[track_names[i]] = [(scores[x], track_names[x], track_artists[x]) for x in neighbours]

In [None]:
# Recommender backed by the audio-feature neighbour lookup.
recommedations_Audio = SongRecommender(matrix=similarities_Audio)

In [None]:
## Lady Gaga - Just Dance
# Request the 10 leading audio-feature neighbours of the song stored at row 2507.
recommendation_Audio = dict(
    track_name=df1_wtopics['track_name'].iloc[2507],
    track_artist=df1_wtopics['track_artist'].iloc[2507],
    number_songs=10,
)
recommedations_Audio.recommend(recommendation_Audio)

In [None]:
## Coldplay - The Scientist
# Request the 10 leading audio-feature neighbours of the song stored at row 12313.
recommendation_Audio1 = dict(
    track_name=df1_wtopics['track_name'].iloc[12313],
    track_artist=df1_wtopics['track_artist'].iloc[12313],
    number_songs=10,
)
recommedations_Audio.recommend(recommendation_Audio1)

### Recommendations with Audio Features and LDA 

In [None]:
# Combined representation: the audio-feature columns followed by the topic columns 0..29.
audio_feature_names = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
]
topic_labels = list(range(30))
A_wlyrics = df1_wtopics[audio_feature_names + topic_labels]

In [None]:
# Standardise every column (zero mean, unit variance) before measuring similarity.
# NOTE(review): the audio columns presumably have much larger magnitudes than the
# topic weights — confirm with A_wlyrics.describe(). On raw values the topic columns
# would contribute almost nothing to the cosine similarity.
a_wlyrics_scaled = StandardScaler().fit_transform(A_wlyrics)
# Pairwise cosine similarity between the standardised audio + topic vectors.
cosine_similarities_A_wlyrics = cosine_similarity(a_wlyrics_scaled)
# Empty neighbour lookup for the combined recommender.
similarities_A_wlyrics = {}

In [None]:
# For every song, store its 50 most similar songs (by audio + topic cosine similarity) as
# (score, track name, track artist) tuples, best match first.
# NOTE: the lookup is keyed by track name, so songs sharing a title overwrite each other.
N_SIMILAR = 50
# Plain arrays give positional access regardless of the DataFrame's index labels.
track_names = df1_wtopics['track_name'].to_numpy()
track_artists = df1_wtopics['track_artist'].to_numpy()
for i in range(len(cosine_similarities_A_wlyrics)):
    scores = cosine_similarities_A_wlyrics[i]
    # Best-scoring positions in descending order; one extra so that N_SIMILAR
    # remain after the query song itself is removed.
    ranked = scores.argsort()[::-1][:N_SIMILAR + 1]
    # Remove the query song by position instead of assuming it is ranked first:
    # songs with identical feature vectors tie with it and may be sorted ahead.
    neighbours = [x for x in ranked if x != i][:N_SIMILAR]
    similarities_A_wlyrics[track_names[i]] = [(scores[x], track_names[x], track_artists[x]) for x in neighbours]

In [None]:
# Recommender backed by the combined audio + topic neighbour lookup.
recommedations_A_wlyrics = SongRecommender(matrix=similarities_A_wlyrics)

In [None]:
## Lady Gaga - Just Dance
# Request the 10 leading audio + topic neighbours of the song stored at row 2507.
recommendation_A_wlyrics = dict(
    track_name=df1_wtopics['track_name'].iloc[2507],
    track_artist=df1_wtopics['track_artist'].iloc[2507],
    number_songs=10,
)
recommedations_A_wlyrics.recommend(recommendation_A_wlyrics)

In [None]:
## Coldplay - The Scientist
# Request the 10 leading audio + topic neighbours of the song stored at row 12313.
recommendation_A_wlyrics1 = dict(
    track_name=df1_wtopics['track_name'].iloc[12313],
    track_artist=df1_wtopics['track_artist'].iloc[12313],
    number_songs=10,
)
recommedations_A_wlyrics.recommend(recommendation_A_wlyrics1)