# Playlist Group Based on Genre Similarity

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_rows', None)

In [3]:
# Read in the data
data = pd.read_csv('../data/ph_spotify_daily_charts_artists1.csv')

# Filter out rows with empty genres
data = data[data['genres'] != '[]']
data.head()

Unnamed: 0,artist_id,artist_name,total_followers,genres,popularity
0,0du5cEVh5yTK9QJze8zA0C,Bruno Mars,47387027,"['dance pop', 'pop']",89
1,4IWBUUAFIplrNtaOHcJPRM,James Arthur,11471232,"['pop', 'talent show', 'uk pop']",82
2,69GGBxA162lTqCwzJG5jLp,The Chainsmokers,20036566,"['dance pop', 'edm', 'electropop', 'pop', 'pop...",81
3,04gDigrS5kc9YWfZHwBETP,Maroon 5,40125006,['pop'],86
4,5p7f24Rk5HkUZsaS3BLG5F,Hailee Steinfeld,8535540,"['dance pop', 'pop', 'post-teen pop']",73


In [4]:
# Filter for artists with genres
data = data[data['genres'].notnull()]

In [5]:
# Convert genre strings to lists
data['genres'] = data['genres'].apply(lambda x: x.strip('[]').split(', '))

In [6]:
# Remove duplicates and reset index
data = data.drop_duplicates(subset=['artist_id']).reset_index(drop=True)

In [7]:
# Filter for Dilaw songs
dilaw = data[data['artist_name'] == 'Dilaw']

In [8]:
# Create CountVectorizer object and fit_transform on genres
cv = CountVectorizer(tokenizer=lambda x: x, lowercase=False)
genres_cv = cv.fit_transform(data['genres'].apply(lambda x: ' '.join(x)))



In [9]:
# Compute cosine similarity matrix
cos_sim = cosine_similarity(genres_cv)

In [10]:
# Get index of Dilaw songs in the data
dilaw_index = dilaw.index[0]

In [11]:
# Compute cosine similarity scores of all songs to Dilaw songs
dilaw_scores = cos_sim[dilaw_index]

In [12]:
# Get indices of songs with highest cosine similarity scores
most_similar_indices = dilaw_scores.argsort()[:-100:-1]
most_similar_genres = data.iloc[most_similar_indices][['artist_name', 'genres', 'popularity']]
most_similar_genres = most_similar_genres.explode('genres')
most_similar_genres = most_similar_genres.groupby('genres').agg({'artist_name': ', '.join, 'popularity': 'mean'}).reset_index()
most_similar_genres.rename(columns={'artist_name': 'artists'}, inplace=True)

# Create a new dataframe with cosine distances
similar_cosine_distances = pd.DataFrame({'cosine_distance': 1 - dilaw_scores[most_similar_indices]})

# Concatenate the two dataframes
most_similar_genres = pd.concat([most_similar_genres, similar_cosine_distances], axis=1)

In [21]:
# Display the most similar genres
print('Most Similar Genres:')
most_similar_genres.head(50)

Most Similar Genres:


Unnamed: 0,genres,artists,popularity,cosine_distance
0,'acoustic cover',Boyce Avenue,71.0,0.0
1,'afrofuturism',Janelle Monáe,67.0,0.047543
2,'alt z',"VÉRITÉ, Chelsea Cutler, Carlie Hanson, Claire ...",58.857143,0.055189
3,'alternative hip hop',"Dilaw, Oliver Tree",72.0,0.061045
4,'alternative metal',Lifehouse,65.0,0.07277
5,'alternative pop rock',Dominic Fike,74.0,0.074737
6,'alternative r&b',Janelle Monáe,67.0,0.079651
7,'alternative rock',Dilaw,64.0,0.081944
8,'argentine hip hop',Bizarrap,88.0,0.083562
9,'art pop',"Kate Bush, King Princess, Hayley Williams",66.0,0.084167


In [19]:
# Get indices of songs with lowest cosine similarity scores
least_similar_indices = dilaw_scores.argsort()[:200]
least_similar_genres = data.iloc[least_similar_indices][['artist_name', 'genres', 'popularity']]
least_similar_genres = least_similar_genres.explode('genres')
least_similar_genres = least_similar_genres.groupby('genres').agg({'artist_name': ', '.join, 'popularity': 'mean'}).reset_index()
least_similar_genres.rename(columns={'artist_name': 'artists'}, inplace=True)

# Create a new dataframe with cosine distances
distant_cosine_distances = pd.DataFrame({'cosine_distance': 1 - dilaw_scores[least_similar_indices]})

# Concatenate the two dataframes
least_similar_genres = pd.concat([least_similar_genres, distant_cosine_distances], axis=1)

In [20]:
# Display the least similar genres
print('Least Similar Genres:')
least_similar_genres.head(50)

Least Similar Genres:


Unnamed: 0,genres,artists,popularity,cosine_distance
0,'a cappella',Pentatonix,65.0,0.609428
1,'adult standards',"Darlene Love, Nat King Cole, Bobby Helms",54.666667,0.557899
2,'afro r&b',CKay,72.0,0.532047
3,'afrofuturism',Steve Lacy,83.0,0.532047
4,'alt z',Rosa Linn,75.0,0.532047
5,'bass trap',Zookeepers,44.0,0.532047
6,'basshall',Aya Nakamura,74.0,0.532047
7,'bedroom pop',"d4vd, A-Wall",70.5,0.532047
8,'brooklyn drill',Pop Smoke,84.0,0.521648
9,'bubblegum pop',Edison Lighthouse,59.0,0.485683
