In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from ast import literal_eval

data = pd.read_csv('spotify_data/data.csv')
data['artists'] = data['artists'].apply(literal_eval)
data_w_genres = pd.read_csv('spotify_data/data_w_genres.csv')

In [2]:
artist_genres = data_w_genres[['artists', 'genres']]

separated_artists = data.explode('artists')
separated_artists = separated_artists.join(artist_genres.set_index('artists'), on='artists')

cols = set(separated_artists.columns)
cols.remove('artists')
cols.remove('genres')
songs_w_genres = separated_artists.groupby(list(cols), as_index=False).agg({'artists': [list], 'genres': [lambda x: x.iloc[0]]})
songs_w_genres.columns = songs_w_genres.columns.droplevel(-1)
songs_w_genres = songs_w_genres[songs_w_genres.genres != '[]'].dropna()

genre_corpus = []

for genres in songs_w_genres['genres']:
    for genre in literal_eval(genres):
        genre_corpus.append(genre)

vectorizer = CountVectorizer(min_df=5750, ngram_range=(1,3))
vectorizer.fit_transform(genre_corpus)
# manually_pruned = set(['adult', 'album', 'alternative', 'and', 'art', 'contemporary', 'cool', 'dance', 
#                    'early', 'era', 'gold', 'hard', 'hip', 'hop', 'new', 'post', 'roots', 'singer', 
#                    'soft', 'songwriter', 'standards', 'mellow', 'wave', 'vocal jazz', 'cool jazz'])

manually_pruned_combos = set(['psychedelic rock', 'classic rock', 'blues rock', 'adult', 'hip', 'hop', 'post', 
                              'songwriter', 'pop rap', 'roots rock', 'folk rock', 'and', 'era', 'art rock', 
                              'pop rock', 'new', 'hard rock', 'alternative rock', 'early', 'soft rock', 'cool jazz', 
                              'vocal jazz', 'dance pop', 'album rock', 'band', 'country rock', 'gold', 'singer', 
                              'mellow', 'standards', 'new wave'])

genres = set(vectorizer.get_feature_names())

genres = genres - manually_pruned_combos

print(len(genres))
print(genres)


42
{'romantic', 'rap', 'classical', 'folk', 'psychedelic', 'modern', 'bop', 'soft', 'vocal', 'hard', 'rock', 'roots', 'grunge', 'country', 'indie', 'adult standards', 'classic', 'wave', 'swing', 'southern', 'lounge', 'tango', 'soul', 'contemporary', 'jazz', 'metal', 'bebop', 'singer songwriter', 'pop', 'mellow gold', 'cool', 'romantic era', 'hip hop', 'funk', 'art', 'blues', 'punk', 'alternative', 'dance', 'latin', 'traditional', 'album'}


In [3]:
def process_genres(genre_list):
    vec = CountVectorizer(ngram_range=(1,3))
    lst = literal_eval(genre_list)
    new_genre_list = []
    if lst:
        try:
            vec.fit_transform(lst)
            for genre in vec.get_feature_names():
                if genre in genres:
                    new_genre_list.append(genre)
        except:
            if lst[0] in genres:
                new_genre_list.append(lst[0])
                
    return new_genre_list if new_genre_list else np.nan

artist_genres['genres'] = artist_genres['genres'].apply(process_genres)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  artist_genres['genres'] = artist_genres['genres'].apply(process_genres)


In [4]:
useless_columns = ['artists', 'year', 'release_date', 'id']
separated_artists = data.explode('artists')
separated_artists = separated_artists.join(artist_genres.set_index('artists'), on='artists')

cols = set(separated_artists.columns)
cols.remove('artists')
cols.remove('genres')
songs_w_genres = separated_artists.groupby(list(cols), as_index=False).agg({'artists': [list], 'genres': [lambda x: x.iloc[0]]})
songs_w_genres.columns = songs_w_genres.columns.droplevel(-1)
songs_w_genres = songs_w_genres.drop(useless_columns, axis=1)
songs_w_genres = songs_w_genres[songs_w_genres.genres != '[]'].dropna()

print(songs_w_genres.shape)
songs_w_genres.to_csv('spotify_data/processed_data.csv', index=False)

(136879, 16)
