# Import Modules and Settings

In [4]:
import music
import os
import pickle
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", None)
pd.set_option("display.precision", 3)
pd.set_option("display.max_columns", None)
pd.options.display.float_format = "{:.2f}".format

import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.oauth2 as oauth2

from datetime import datetime, timedelta

from fuzzywuzzy import process

import requests
from bs4 import BeautifulSoup as BS
import pprint
pp = pprint.PrettyPrinter(indent=4)

path = os.getcwd() + '/data/'
raw_path = path + 'raw/'
billboard_path = path + 'external/billboard_weeks'
fuzzy_path = path + 'interim/'
model_path = path + 'model/'

# Functions

In [1]:
def user_playlist_tracks_full(spotify_connection, user, playlist_id = None, fields = None, market = None):
    """ 
    args:
        user: User ID of playlist owner
        playlist_id: ID of the Spotify playlist
        fields: Can select specific fields to return. Refer to Spotify documentation for further details
        market: Country code
        
    """
    # first run through also retrieves total no of songs in library
    response = spotify_connection.user_playlist_tracks(user, playlist_id, fields=fields, limit=100, market=market)
    results = response["items"]

    # subsequently runs until it hits the user-defined limit or has read all songs in the library
    while len(results) < response["total"]:
        response = spotify_connection.user_playlist_tracks(
            user, playlist_id, fields = fields, limit = 100, offset = len(results), market = market)
        results.extend(response["items"])
    return results

def scrape_spotify(user, playlist):
    
    '''
    Function to create a dataframe of audio features, given the user name and playlist id
    Args:
        spotify_connection: spotipy.Spotify(auth = token) object
        user: Name of user that playlist is extracted from. Enter in string format
        playlist_id: id of Spotify playlist. Under share, select copy Spotify uri
    
    Returns: DataFrame object
    '''
    client_id = '6a7cb31c7b4d48bfa37ad4a2fda6b0e4'
    client_secret = 'fb1aa602521a4f618c316e9301f79f30'

    credentials = oauth2.SpotifyClientCredentials(
        client_id = client_id,
        client_secret = client_secret)

    token = credentials.get_access_token()
    spotify = spotipy.Spotify(auth = token)
    
    search = user_playlist_tracks_full(spotify, user, playlist_id = playlist , market = 'US')
    
    artist_name = [search[i]['track']['artists'][0]['name'] for i in range(len(search))]
    artist_id = [search[i]['track']['artists'][0]['id'] for i in range(len(search))]
    track_name = [search[i]['track']['name'] for i in range(len(search))]
    track_id = [search[i]['track']['id'] for i in range(len(search))]
    release_date = [search[i]['track']['album']['release_date'] for i in range(len(search))]
    popularity_metric = [search[i]['track']['popularity'] for i in range(len(search))]
    
    audio_features = spotify.audio_features(track_id[0:100])
    for i in range(0, int(len(search)/100)-1):
        audio_features.extend(spotify.audio_features(track_id[100+100*i: 200+100*i]))
        
    genre = [spotify.artists(artist_id[0:50])['artists'][i]['genres'] for i in range(0, 50)]
    for i in range(0, int(len(search)/50)-1):
        for g in range(0,50):
            genre.append(spotify.artists(artist_id[50+(50*i): 100+(50*i)])['artists'][g]['genres'])
        
    song_df = pd.DataFrame(audio_features)
    song_df['track'] = track_name
    song_df['artist'] = artist_name
    song_df['release_date'] = release_date
    song_df['artist_id'] = artist_id
    song_df['track_id'] = track_id
    song_df['popularity'] = popularity_metric
    song_df['genre'] = genre
    
    return song_df    

def scrape_billboard(ranking_week): #format '2018-01-06'

    data_path = os.getcwd() + '/data/billboard_weeks/billboard_{}.pickle'.format(ranking_week)
    url = 'https://www.billboard.com/charts/hot-100/{}'.format(ranking_week)
    response = requests.get(url)
    page = response.text
    soup = BS(page, 'lxml')

    items = soup.find_all('div', {'class': 'chart-list-item'})
    rank = soup.find_all('div', {'class': 'chart-list-item__rank'})
    song_title = soup.find_all('div', {'class': 'chart-list-item__title'})
    artist_name = soup.find_all('div', {'class': 'chart-list-item__artist'})

    headers = ['ranking', 'title', 'artist', 'rank_week']
    weekly_rank = [rank[i].string.replace('\n','').strip() for i in range(len(items))]
    title = [song_title[i].text.replace('\n','').strip() for i in range(len(items))]
    artist = [artist_name[i].text.replace('\n','').strip() for i in range(len(items))]
    wk_ranking = [ranking_week] * len(items)

    billboard_top = dict(zip(headers, [weekly_rank, title, artist, wk_ranking]))

    return pd.DataFrame(billboard_top).to_pickle(data_path)

def generate_billboard_weeks(start_date, end_date):
    '''
    Returns a pickle file for the range of weeks given;
    '''
    start = datetime.datetime.strptime(start_date, '%m/%d/%y')
    assert start.weekday() == 5, 'The start of the week must begin on a Saturday'
    end = datetime.datetime.strptime(end_date, '%m/%d/%y')

    for week in range(0, ((end-start).days// 7) + 1):
        scrape_billboard(datetime.datetime.strftime(start + datetime.timedelta(days = 7 * week), '%Y-%m-%d'))
    return None

def merge_billboard(start_date, end_date):

    start = datetime.datetime.strptime(start_date, '%m/%d/%y')
    assert start.weekday() == 5, 'The start of the week must begin on a Saturday'
    end = datetime.datetime.strptime(end_date, '%m/%d/%y')
    '''
    Merges the pickle files in the given date range
    start = dt.datetime.strptime(start_date, '%m/%d/%y')
    assert start.weekday() == 5, 'The start of the week must begin on a Saturday'
    end = dt.datetime.strptime(end_date, '%m/%d/%y')
    '''
    dfs = []
    for week in range(0, ((end-start).days// 7) + 1):
        formatted_date = datetime.datetime.strftime(start + datetime.timedelta(days = 7 * week), '%Y-%m-%d')
        dfs.append(pd.read_pickle(os.getcwd() + '/data/billboard_weeks/billboard_{}.pickle'.format(formatted_date)))
        billboard_df = pd.concat(dfs, axis = 0, ignore_index = True)
    return billboard_df

def fuzzy_merge(df_1, df_2, key1, key2, threshold, limit):
    
    '''
    df_1 is the left table to join
    df_2 is the right table to join
    key1 is the key column of the left table
    key2 is the key column of the right table
    threshold is how close the matches should be to return a match
    limit is the amount of matches will get returned, these are sorted high to low
    '''
    
    s = df_2[key2].tolist()
    
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))
    df_1['matches'] = m
    
    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2
    
    return df_1

In [2]:
def fuzzy_merge(df_1, df_2, key1, key2, threshold, limit):
    
    '''
    df_1 is the left table to join
    df_2 is the right table to join
    key1 is the key column of the left table
    key2 is the key column of the right table
    threshold is how close the matches should be to return a match
    limit is the amount of matches will get returned, these are sorted high to low
    '''
    
    s = df_2[key2].tolist()
    
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))
    df_1['matches'] = m
    
    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2
    
    return df_1

# Obtain Audio Features from Spotify Web API via Spotipy

#### Without genres

In [8]:
client_id = '6a7cb31c7b4d48bfa37ad4a2fda6b0e4'
client_secret = 'fb1aa602521a4f618c316e9301f79f30'

credentials = oauth2.SpotifyClientCredentials(
              client_id = client_id,
              client_secret = client_secret)

token = credentials.get_access_token()
spotify = spotipy.Spotify(auth = token)

In [4]:
# Available playlist to choose from:
songs_2018 = '7drSV2jTKxdPXsLHM12s5I'
# Main Project 3 Playlist: 10,000 songs across 2018. Not carefully selected
albums_2018 = '64EuP1gJFje1HJn0PrZlS8'
# Playlist based on albums released in 2018; may not capture singles released
singles_2018 = '6qr9d6frpDxgbk8KxpJ1rL'
# Playlist that captures songs not in either list 
may_2018_playlist = '19vrv5El1RKLqkNArxvo6F'
# Test playlist for fun; shared playlist
study_playlist = '4X7oYDGcLzW9llFdFkDilo'
# Additional playlist for fun; personal playlist

In [5]:
column_order = ['track', 'artist','acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
                'speechiness', 'tempo', 'valence', 'genre', 'duration_ms', 'release_date', 'id', 'artist_id', 'track_id', 'popularity', 'analysis_url', 'time_signature', 'track_href',
                'type', 'uri']

In [21]:
# Functions extracts audio features from 10000 song playlist
song_df = scrape_spotify('jc98924', songs_2018)

# Reorder the columns accordingly 
song_df = song_df[column_order]

# Save to .csv file
song_df.to_csv(raw_path + 'song_df_unaltered.csv')

In [None]:
# Functions extracts audio features from curated playlist based on albums released + select songs
album_df = scrape_spotify('jc98924', albums_2018)

# Reorder the columns accordingly 
album_df = album_df[column_order]

album_df.to_csv(raw_path + 'album_df_unaltered.csv')

In [70]:
# Run code to obtain the billboard_df for the specified date range. 
billboard_df = merge_billboard('01/06/18', '06/29/19')
# Save to .csv file
billboard_df.to_csv(raw_path + 'billboard_df_unaltered.csv')

#### With genres included

In [None]:
client_id = '6a7cb31c7b4d48bfa37ad4a2fda6b0e4'
client_secret = 'fb1aa602521a4f618c316e9301f79f30'

credentials = oauth2.SpotifyClientCredentials(
    client_id = client_id,
    client_secret = client_secret)

token = credentials.get_access_token()
spotify = spotipy.Spotify(auth = token)

search = user_playlist_tracks_full(spotify, 'jc98924', playlist_id = albums_2018 , market = 'US')

artist_name = [search[i]['track']['artists'][0]['name'] for i in range(len(search))]
artist_id = [search[i]['track']['artists'][0]['id'] for i in range(len(search))]
track_name = [search[i]['track']['name'] for i in range(len(search))]
track_id = [search[i]['track']['id'] for i in range(len(search))]
release_date = [search[i]['track']['album']['release_date'] for i in range(len(search))]
popularity_metric = [search[i]['track']['popularity'] for i in range(len(search))]

audio_features = spotify.audio_features(track_id[0:100])
for i in range(0, int(len(search)/100)-1):
    audio_features.extend(spotify.audio_features(track_id[100+100*i: 200+100*i]))

song_df = pd.DataFrame(audio_features)
song_df['track'] = track_name
song_df['artist'] = artist_name
song_df['release_date'] = release_date
song_df['artist_id'] = artist_id
song_df['track_id'] = track_id
song_df['popularity'] = popularity_metric

# Extremely inefficient line of code, need to find a better method
genre = [spotify.artists(artist_id[0:50])['artists'][i]['genres'] for i in range(0, 50)]
for i in range(0, int(len(search)/50)-1):
    for g in range(0,50):
        genre.append(spotify.artists(artist_id[50+(50*i): 100+(50*i)])['artists'][g]['genres'])

song_df['genre'] = genre

song_df.to_pickle(raw_path + 'song_df_genre.pickle')

In [None]:
client_id = '6a7cb31c7b4d48bfa37ad4a2fda6b0e4'
client_secret = 'fb1aa602521a4f618c316e9301f79f30'

credentials = oauth2.SpotifyClientCredentials(
    client_id = client_id,
    client_secret = client_secret)

token = credentials.get_access_token()
spotify = spotipy.Spotify(auth = token)

search = user_playlist_tracks_full(spotify, 'jc98924', playlist_id = albums_2018 , market = 'US')

artist_name = [search[i]['track']['artists'][0]['name'] for i in range(len(search))]
artist_id = [search[i]['track']['artists'][0]['id'] for i in range(len(search))]
track_name = [search[i]['track']['name'] for i in range(len(search))]
track_id = [search[i]['track']['id'] for i in range(len(search))]
release_date = [search[i]['track']['album']['release_date'] for i in range(len(search))]
popularity_metric = [search[i]['track']['popularity'] for i in range(len(search))]

audio_features = spotify.audio_features(track_id[0:100])
for i in range(0, int(len(search)/100)-1):
    audio_features.extend(spotify.audio_features(track_id[100+100*i: 200+100*i]))

album_df = pd.DataFrame(audio_features)
album_df['track'] = track_name
album_df['artist'] = artist_name
album_df['release_date'] = release_date
album_df['artist_id'] = artist_id
album_df['track_id'] = track_id
album_df['popularity'] = popularity_metric

# Extremely inefficient line of code, need to find a better method
genre = [spotify.artists(artist_id[0:50])['artists'][i]['genres'] for i in range(0, 50)]
for i in range(0, int(len(search)/50)-1):
    for g in range(0,50):
        genre.append(spotify.artists(artist_id[50+(50*i): 100+(50*i)])['artists'][g]['genres'])
        
album_df['genre'] = genre

album_df.to_pickle(raw_path + 'album_df_genre.pickle')

# DataFrame Cleaning and Merging

In [16]:
# For using the playlist based on 10,000 songs
billboard_df = pd.read_csv(raw_path + 'billboard_df_unaltered.csv')

song_df = pd.read_pickle(raw_path + 'song_df_with_genres')

song_df['index'] = song_df['track'] + ' ' + song_df['artist']
billboard_df['index'] = billboard_df['title'] + ' ' + billboard_df['artist']

billboard_df.drop_duplicates(subset = ['index'], keep = 'first', inplace = True)
song_df.drop_duplicates(subset = ['track','artist'], keep = 'first', inplace = True)
print('The length of {} is {}'.format('song_df',len(song_df)))
print('The length of {} is {}'.format('billboard_df',len(billboard_df)))

The length of song_df is 9621
The length of billboard_df is 955


In [15]:
# For the playlist based on 5400 songs
billboard_df = pd.read_csv(raw_path + 'billboard_df_unaltered.csv')

album_df = pd.read_pickle(raw_path + 'album_df_with_genres.pickle')

billboard_df['index'] = billboard_df['title'] + ' ' + billboard_df['artist']
album_df['index'] = album_df['track'] + ' ' + album_df['artist']
billboard_df.drop_duplicates(subset = ['index'], keep = 'first', inplace = True)

album_df.drop_duplicates(subset = 'index', keep = 'first', inplace = True)
print('The length of {} is {}'.format('album_df',len(album_df)))
print('The length of {} is {}'.format('billboard_df',len(billboard_df)))

The length of album_df is 5393
The length of billboard_df is 955


## FuzzyMerging with Billboard Data

#### album_df: threshold 88

In [109]:
album_88 = fuzzy_merge(album_df, billboard_df, 'index', 'index', threshold = 88, limit = 1)
album_88.to_csv(fuzzy_path + 'album_88_genre.csv')

In [45]:
album_88 = pd.read_csv(fuzzy_path + 'album_88_genre.csv')
len(album_88[~album_88['matches'].isnull()])
# Dataframe is 5393 rows long, has 414 match entries, 7.67%; Most correct except Hunch Jack

414

#### album_df: threshold 92

In [47]:
album_merge_92 = fuzzy_merge(album_df, billboard_df, 'index', 'index', threshold = 92, limit = 1)
album_merge_92.to_csv(fuzzy_path + 'album_merge_92.csv')

In [55]:
album_merge_92 = pd.read_csv(fuzzy_path + 'album_merge_92.csv')
len(album_merge_92[~album_merge_92['matches'].isnull()])
# Album_df length is 5393, 298 matched entries, 5.52% 

298

#### song_df: threshold 86

In [None]:
merge_df_86 = fuzzy_merge(song_df, billboard_df, 'index', 'index', threshold = 86, limit = 1)
merge_df_86.to_csv(fuzzy_path + 'fuzzy_merge_86.csv')

In [52]:
merge_86 = pd.read_csv(fuzzy_path + 'fuzzy_merge_86.csv')
len(merge_86[~merge_86['matches'].isnull()])
# Do not use model, too many false positives

7551

#### song_df: threshold 88

In [None]:
merge_df_88 = fuzzy_merge(song_df, billboard_df, 'index', 'index', threshold = 86, limit = 1)
merge_df_88.to_csv(fuzzy_path + 'fuzzy_merge_88.csv')

In [21]:
merge_88 = pd.read_csv(fuzzy_path + 'fuzzy_merge_88.csv')
len(merge_88[~merge_88['matches'].isnull()])
# Do not use model, too many false positives

594

#### song_df: threshold 90 (Best Sample)

In [None]:
merge_90 = fuzzy_merge(song_df, billboard_df, 'index', 'index', threshold = 90, limit =1)
merge_90.to_csv(fuzzy_path + 'merge_90_with_genre.csv')

In [20]:
merge_90 = pd.read_csv(fuzzy_path + 'merge_90_with_genre.csv')
len(merge_90[~merge_90['matches'].isnull()])

523

#### song_df: threshold 92

In [81]:
merge_df_92 = fuzzy_merge(song_df, billboard_df, 'index', 'index', threshold = 92, limit = 1)
merge_df_92.to_csv(fuzzy_path + 'fuzzy_merge_92.csv')

In [22]:
merge_92 = pd.read_csv(fuzzy_path + 'fuzzy_merge_92.csv')
len(merge_92[~merge_92['matches'].isnull()])
# Original DataFrame, 92 threshold, length of 9621, 409 positives, 4.25%

409

#### song_df: threshold 94

In [None]:
merge_df_94 = fuzzy_merge(song_df, billboard_df, 'index', 'index', threshold = 94, limit = 1)
merge_df_94.to_csv(fuzzy_path + 'fuzzy_merge_94.csv')

In [23]:
merge_94 = pd.read_csv(fuzzy_path + 'fuzzy_merge_94.csv')
len(merge_94[~merge_94['matches'].isnull()])
# Original DataFrame, 94 threshold, length of 9621, 405 positives, 4.20%

405

# Create positive class labels and further cleaning

In [9]:
# Chose to go the path of the albums (5,400 songs vs 10,000)
album_88 = pd.read_csv(fuzzy_path + 'album_88_genre.csv')
df = album_88.copy()

In [None]:
def clean_analysis_df(df, attribute_scale):
    df['pop'] = [1 if 'pop' in x else 0 for x in df['genre']]
    df['rap'] = [1 if 'rap' in x else 0 for x in df['genre']]
    df['hip-hop'] = [1 if 'hip' in x else 0 for x in df['genre']]
    df['country'] = [1 if 'country' in x else 0 for x in df['genre']]
    df['electronic'] = [1 if 'edm' in x else 0 for x in df['genre']]
    df['rock'] = [1 if 'rock' in x else 0 for x in df['genre']]
    df['indie'] = [1 if 'indie' in x else 0 for x in df['genre']]
    df['house'] = [1 if 'house' in x else 0 for x in df['genre']]
    df['metal'] = [1 if 'metal' in x else 0 for x in df['genre']]
    df['jazz'] = [1 if 'jazz' in x else 0 for x in df['genre']]
    df['soul'] = [1 if 'soul' in x else 0 for x in df['genre']]
    df['reggaeton'] = [1 if 'reggaeton' in x else 0 for x in df['genre']]

    df['sum'] = df['pop'] + df['rap'] + df['hip-hop'] \
    + df['country'] + df['electronic'] + df['rock'] + df['indie'] \
    + df['house'] + df['metal'] + df['jazz'] + df['soul'] + df['reggaeton']\

    df['other'] = [1 if x == 0 else 0 for x in df['sum']]
    
    df['matches'].fillna(0, inplace = True)
    df['success'] = [0 if x == 0 else 1 for x in df['matches']]
    
    feature_selection = ['success','acousticness', 'danceability', 'energy', 
                         'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
                         'speechiness', 'tempo', 'valence', 'duration_ms', 'release_date',
                         'popularity','artist','track', 'pop', 'rap', 'hip-hop', 'country',
                         'electronic', 'rock', 'indie', 'house', 'metal', 'jazz', 'soul', 
                         'reggaeton', 'other']
    
    spotify_df = df[feature_selection].copy()
    spotify_df['index'] = spotify_df['artist'] + ': ' + spotify_df['track']
    spotify_df.set_index('index', inplace = True)
    spotify_df['minutes'] = spotify_df['duration_ms']/60000
    spotify_df['release_date'] = pd.to_datetime(spotify_df['release_date'])
    spotify_df.drop(['artist', 'track', 'duration_ms'], axis = 1, inplace = True)
    
    spotify_df['acousticness'] = spotify_df['acousticness'] * attribute_scale
    spotify_df['danceability'] = spotify_df['danceability'] * attribute_scale
    spotify_df['energy'] = spotify_df['energy'] * attribute_scale
    spotify_df['instrumentalness'] = spotify_df['instrumentalness'] * attribute_scale
    spotify_df['liveness'] = spotify_df['liveness'] * attribute_scale
    spotify_df['speechiness'] = spotify_df['speechiness'] * attribute_scale
    spotify_df['valence'] = spotify_df['valence'] * attribute_scale
    
    return spotify_df  

In [None]:
album_sp_df = clean_analysis_df(df, 100)
# Manually change specific predictors that were classed as positive 
album_sp_df['success'].iloc[5362] = 0
album_sp_df['success'].iloc[5367:5373] = 0

album_sp_df.to_pickle(model_path + 'spotify_album_analysis.pickle')

In [None]:
spotify_df = clean_analysis_df(pd.read_csv(fuzzy_path + 'merge_90_with_genre.csv'), 100)

# Export to pickle file; Analysis of model located in project_3_spotify_analysis
spotify_df.to_pickle(model_path + 'spotify_analysis.pickle')

#### Data for Tableau (with Artist/Track names)

In [10]:
def tableau_clean_df(df, attribute_scale):
    df['pop'] = [1 if 'pop' in x else 0 for x in df['genre']]
    df['rap'] = [1 if 'rap' in x else 0 for x in df['genre']]
    df['hip-hop'] = [1 if 'hip' in x else 0 for x in df['genre']]
    df['country'] = [1 if 'country' in x else 0 for x in df['genre']]
    df['electronic'] = [1 if 'edm' in x else 0 for x in df['genre']]
    df['rock'] = [1 if 'rock' in x else 0 for x in df['genre']]
    df['indie'] = [1 if 'indie' in x else 0 for x in df['genre']]
    df['house'] = [1 if 'house' in x else 0 for x in df['genre']]
    df['metal'] = [1 if 'metal' in x else 0 for x in df['genre']]
    df['jazz'] = [1 if 'jazz' in x else 0 for x in df['genre']]
    df['soul'] = [1 if 'soul' in x else 0 for x in df['genre']]
    df['reggaeton'] = [1 if 'reggaeton' in x else 0 for x in df['genre']]

    df['sum'] = df['pop'] + df['rap'] + df['hip-hop'] \
    + df['country'] + df['electronic'] + df['rock'] + df['indie'] \
    + df['house'] + df['metal'] + df['jazz'] + df['soul'] + df['reggaeton']\

    df['other'] = [1 if x == 0 else 0 for x in df['sum']]
    
    df['matches'].fillna(0, inplace = True)
    df['success'] = [0 if x == 0 else 1 for x in df['matches']]
    
    feature_selection = ['success','acousticness', 'danceability', 'energy', 
                         'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
                         'speechiness', 'tempo', 'valence', 'duration_ms', 'release_date',
                         'popularity','artist','track', 'pop', 'rap', 'hip-hop', 'country',
                         'electronic', 'rock', 'indie', 'house', 'metal', 'jazz', 'soul', 
                         'reggaeton', 'other']
    
    spotify_df = df[feature_selection].copy()
    spotify_df['index'] = spotify_df['artist'] + ': ' + spotify_df['track']
    spotify_df.set_index('index', inplace = True)
    spotify_df['minutes'] = spotify_df['duration_ms']/60000
    spotify_df['release_date'] = pd.to_datetime(spotify_df['release_date'])
    spotify_df.drop(['duration_ms'], axis = 1, inplace = True)
    
    spotify_df['acousticness'] = spotify_df['acousticness'] * attribute_scale
    spotify_df['danceability'] = spotify_df['danceability'] * attribute_scale
    spotify_df['energy'] = spotify_df['energy'] * attribute_scale
    spotify_df['instrumentalness'] = spotify_df['instrumentalness'] * attribute_scale
    spotify_df['liveness'] = spotify_df['liveness'] * attribute_scale
    spotify_df['speechiness'] = spotify_df['speechiness'] * attribute_scale
    spotify_df['valence'] = spotify_df['valence'] * attribute_scale
    
    return spotify_df    

In [11]:
tableau_df = tableau_clean_df(df, 100)
tableau_df['success'].iloc[5362] = 0
tableau_df['success'].iloc[5367:5373] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [12]:
df = tableau_df.copy()

In [13]:
df['hip_merge'] = df['rap'] + df['hip-hop'] + df['reggaeton']
df['rock_merge'] = df['rock'] + df['metal']
df['edm_merge'] = df['electronic'] + df['house']
df['jazz_merge'] = df['jazz'] + df['soul']

df['rap/hip-hop'] = [1 if x >= 1 else 0 for x in df['hip_merge']]
df['ROCK'] = [1 if x >= 1 else 0 for x in df['rock_merge']]
df['edm'] = [1 if x >= 1 else 0 for x in df['edm_merge']]
df['JAZZ'] = [1 if x >= 1 else 0 for x in df['jazz_merge']]

conditions = [
    (df['pop'] == 1) & (df['rap/hip-hop'] == 1),
    (df['pop'] == 1) & (df['rap/hip-hop'] == 0)
]
choices = [0, 1]
df['POP'] = np.select(conditions, choices, default = 0)

df.drop(['pop','rap','hip-hop','electronic','rock','house','metal','jazz','soul','reggaeton','rock_merge','edm_merge','jazz_merge','hip_merge'],
        axis = 1, inplace = True)
df.rename(columns = lambda x: x.lower(), inplace = True)

In [14]:
df['pop'].value_counts()

0    2846
1    2547
Name: pop, dtype: int64

In [15]:
df.to_pickle(model_path + 'album_df_additional_genre_cleaning')

In [67]:
df.to_pickle('tabl_genre.pickle')

In [None]:
# if df['pop'] == 1 & df['rap/hip-hop'] == 1, df['pop'] = 0
#else if df['pop'] == 1, df['POP'] = 1

In [27]:
df['rap/hip-hop'].value_counts()
df['rap/hip-hop'] = [1 if x >= 1 else 0 for x in df['hip_sum']]

In [24]:
    df['other'] = [1 if x == 0 else 0 for x in df['sum']]

0    5031
1     362
Name: other, dtype: int64

In [15]:
pop_mask = ((tableau_df['pop'] == 1))
rap_mask = ((tableau_df))


In [None]:
tableau_df[pop_mask]

In [8]:
tableau_df.to_csv('tableau_df_artist.csv')