# Set-up

In [92]:
import pandas as pd
import re
import billboard
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [88]:
def split_artists(artist, separators):
    if pd.isna(artist):
        return []
    pattern = '|'.join(map(re.escape, separators))
    return [name.strip() for name in re.split(pattern, artist) if name.strip()]

def get_artist_albums(artist_name):
    result = sp.search(q=artist_name, type='artist', limit=1)
    
    if not result['artists']['items']:
        return f"No artist found with name: {artist_name}"
    
    artist_id = result['artists']['items'][0]['id']
    
    albums = []
    results = sp.artist_albums(artist_id, album_type='album', limit=50)  # 'album', 'single', 'appears_on', etc.
    
    while results:
        albums.extend(results['items'])
        if results['next']:
            results = sp.next(results)
        else:
            break
    
    return [album['name'] for album in albums]

# Artists presence in Billboard Year end top 100

In [52]:
pop_songs = pd.DataFrame(columns=['title', 'artist', 'rank', 'year'])
years = range(2013,2025)

for year in years:
    chart = billboard.ChartData('hot-100-songs', year=year)
    for song in chart:
        pop_songs = pd.concat([pop_songs, pd.DataFrame({'title': [song.title], 'artist': [song.artist], 'rank': [song.rank], 'year': [year]})])

pop_songs.head()

Unnamed: 0,title,artist,rank,year
0,Thrift Shop,Macklemore & Ryan Lewis Featuring Wanz,1,2013
0,Blurred Lines,Robin Thicke Featuring T.I. + Pharrell,2,2013
0,Radioactive,Imagine Dragons,3,2013
0,Harlem Shake,Baauer,4,2013
0,Can't Hold Us,Macklemore & Ryan Lewis Featuring Ray Dalton,5,2013


In [54]:
pop_songs.shape

(1199, 4)

In [53]:
separators = ['Featuring', ',', '&', 'With', 'Duet With', ' X ', ' + ']

pop_songs['artist'] = pop_songs['artist'].apply(lambda x: split_artists(x, separators))
pop_songs['artist'].to_list()

[['Macklemore', 'Ryan Lewis', 'Wanz'],
 ['Robin Thicke', 'T.I.', 'Pharrell'],
 ['Imagine Dragons'],
 ['Baauer'],
 ['Macklemore', 'Ryan Lewis', 'Ray Dalton'],
 ['Justin Timberlake'],
 ['P!nk', 'Nate Ruess'],
 ['Bruno Mars'],
 ['Florida Georgia Line', 'Nelly'],
 ['Katy Perry'],
 ['Bruno Mars'],
 ['The Lumineers'],
 ['Rihanna', 'Mikky Ekko'],
 ['Daft Punk', 'Pharrell Williams'],
 ['Lorde'],
 ['Taylor Swift'],
 ['Miley Cyrus'],
 ['Miley Cyrus'],
 ['Avicii'],
 ['Justin Timberlake', 'JAY Z'],
 ['Anna Kendrick'],
 ['Jay Z', 'Justin Timberlake'],
 ['will.i.am', 'Britney Spears'],
 ['Zedd', 'Foxes'],
 ['AWOLNATION'],
 ['Swedish House Mafia', 'John Martin'],
 ['Rihanna'],
 ['Icona Pop', 'Charli XCX'],
 ['Capital Cities'],
 ['Bruno Mars'],
 ['Ariana Grande', 'Mac Miller'],
 ['Drake'],
 ['Selena Gomez'],
 ['Drake', 'Majid Jordan'],
 ['Maroon 5'],
 ['Pitbull', 'Christina Aguilera'],
 ['Lady Gaga'],
 ['Maroon 5'],
 ['Lil Wayne', 'Drake', 'Future'],
 ['Fall Out Boy'],
 ['A$AP Rocky', 'Drake', '2 Chai

In [55]:
unique_artists = list(set([artist for artists in pop_songs['artist'] for artist in artists]))
len(unique_artists)

586

In [58]:
#count how many times each artist appears in the dataset
artist_counts = {artist: 0 for artist in unique_artists}
for artists in pop_songs['artist']:
    for artist in artists:
        artist_counts[artist] += 1

pd.Series(artist_counts).sort_values(ascending=False).head(20)


Drake            59
Ariana Grande    27
Justin Bieber    27
Nicki Minaj      25
Taylor Swift     25
The Weeknd       24
Post Malone      21
Morgan Wallen    21
Cardi B          19
Doja Cat         18
Future           18
Ed Sheeran       17
Luke Combs       17
Maroon 5         16
Bad Bunny        16
Rihanna          16
SZA              15
Chris Brown      15
Lil Baby         15
21 Savage        14
dtype: int64

In [64]:
pd.Series(artist_counts).describe()

count    586.000000
mean       2.993174
std        4.497287
min        1.000000
25%        1.000000
50%        1.000000
75%        3.000000
max       59.000000
dtype: float64

In [70]:
len([(artist, count) for artist, count in artist_counts.items() if count >= 10])

38

In [None]:
artists_df = pd.DataFrame({'artist': list(artist_counts.keys()), 'num_songs_on_billboard': list(artist_counts.values())})

# Discographies

In [77]:
# include size and range of discography
artists_over_10 = [artist for artist, count in artist_counts.items() if count >= 10]

In [93]:
client_id = '7226939d4c5e43ab969715d406ad11d9'
client_secret = '9e8ee07ae94f49928c65c59ffd4a3dec'
auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)

sp = spotipy.Spotify(auth_manager=auth_manager)

In [94]:
artists_df['num_albums'] = artists_df['artist'].apply(lambda x: len(get_artist_albums(x)))
artists_df.head()

Unnamed: 0,artist,songs in hot 100,num_albums
0,Teddy Swims,1,4
1,Imagine Dragons,13,11
2,Eric Church,2,15
3,Kacey Musgraves,2,8
4,Rae Sremmurd,4,4


In [105]:
artists_df['num_albums'].describe()

count    586.000000
mean       8.453925
std       12.788300
min        0.000000
25%        3.000000
50%        6.000000
75%       11.000000
max      201.000000
Name: num_albums, dtype: float64

In [109]:
artists_df.columns

Index(['artist', 'songs in hot 100', 'num_albums'], dtype='object')

In [113]:
artists_df.query('num_songs_on_billboard > 9 and num_albums > 10')

Unnamed: 0,artist,num_songs_on_billboard,num_albums
1,Imagine Dragons,13,11
44,Meghan Trainor,10,13
90,Justin Bieber,27,15
140,Lil Baby,15,12
160,21 Savage,14,15
182,Ariana Grande,27,14
191,Shawn Mendes,11,13
195,Sam Smith,12,12
207,Rihanna,16,16
219,DaBaby,10,12


In [114]:
artists_df.to_csv(r"C:\Users\asarr\Documents\Projects\pop_music\data\artists.csv")