In [1]:
import datetime
import json
import os
import re

import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Structure Raw Data

### Tik Tok

In [2]:
# Load the saved tokboard record (JSON) into memory.
with open("./data/tokboard_record.json", "r") as tokboard_file:
    tiktok_charts = json.load(tokboard_file)

In [351]:
# charting TikTok sounds
# Collect, for every (title, account) pair, a {month: rank} mapping across all charts.
pre_df = {}
for month, chart in tiktok_charts.items():
    for rank, song in chart.items():
        key = (song['title'].lower(), song['artist'].lower())
        pre_df.setdefault(key, {})[month] = int(rank)

# One row per (title, account) pair, one column per chart month.
tok_charts_df = pd.DataFrame(pre_df).transpose()
tok_charts_df.index = tok_charts_df.index.set_names(["Title", "Artist"])

In [355]:
# Report how many distinct sounds and posting accounts appear in the TikTok charts.
tok_accounts = tok_charts_df.index.get_level_values("Artist")
num_songs = len(tok_charts_df)
num_artists = len(set(tok_accounts))
summary_lines = [
    "Tik Tok Trending Sounds:",
    "---",
    f"{num_songs} total songs",
    f"{num_artists} unique accounts",
]
print("\n".join(summary_lines))
print("\n")

Tik Tok Trending Sounds:
---
1890 total songs
1703 unique accounts




### Spotify
- Spotify Viral 50 (Global)
- Spotify Top 200 (Global)
- Spotify Viral 50 (USA)
- Spotify Top 200 (USA)

In [3]:
# Global Viral 50: load the saved chart JSON.
with open("./data/global_viral_50.json", "r") as chart_file:
    spot_v50 = json.load(chart_file)

In [4]:
# USA Viral 50: load the saved chart JSON.
with open("./data/usa_viral_50.json", "r") as chart_file:
    spot_v50_usa = json.load(chart_file)

In [5]:
# Global Top 200: load the saved chart JSON.
with open("./data/global_top_200.json", "r") as chart_file:
    spot_top200 = json.load(chart_file)

In [6]:
# USA Top 200: load the saved chart JSON.
with open("./data/usa_top_200.json", "r") as chart_file:
    spot_top200_usa = json.load(chart_file)

In [316]:
# charting spotify songs history
def get_spot_df(all_charts):
    """Pivot a Spotify chart history into one row per (title, artist).

    Args:
        all_charts: Mapping of week label -> chart, where each chart maps a
            rank key (convertible with ``int``) to a dict holding ``'title'``
            and ``'artist'`` strings.

    Returns:
        A DataFrame indexed by a ("Title", "Artist") MultiIndex with one
        column per week (the first 10 characters of the week label). Cells
        hold ``int(rank) + 1``; weeks in which a song did not chart are NaN.
        An empty input yields an empty frame with the same index names
        instead of raising.
    """
    pre_df = {}
    for week, chart in all_charts.items():
        week_key = week[:10]
        for rank, song in chart.items():
            # Titles/artists are lower-cased and the HTML entity for "&" is decoded.
            title = song['title'].lower().replace("&amp;", "&")
            # The first three characters of the artist string are dropped
            # (presumably a "by " prefix in the source data -- TODO confirm).
            artist = song['artist'][3:].lower().replace("&amp;", "&")
            pre_df.setdefault((title, artist), {})[week_key] = int(rank) + 1

    if not pre_df:
        # With no rows the transposed frame has a flat index, and assigning
        # two level names to it raises ValueError; build the index explicitly.
        empty_index = pd.MultiIndex.from_arrays([[], []], names=["Title", "Artist"])
        return pd.DataFrame(index=empty_index)

    df = pd.DataFrame(pre_df).transpose()
    df.index = df.index.set_names(["Title", "Artist"])
    return df

# Build one rank-history frame per Spotify chart (global pair first, then USA pair).
spot_v50_df, spot_top200_df = get_spot_df(spot_v50), get_spot_df(spot_top200)
spot_v50_usa_df, spot_top200_usa_df = get_spot_df(spot_v50_usa), get_spot_df(spot_top200_usa)

In [358]:
# Summarize each Spotify chart: number of distinct songs and distinct artists.
spotify_summaries = [
    ("Spotify Viral 50 (Global):", spot_v50_df),
    ("Spotify Top 200 (Global):", spot_top200_df),
    ("Spotify Viral 50 (USA):", spot_v50_usa_df),
    # Fixed: this heading used to read "Spotify Top 200(Global):" even though
    # the frame being summarized is the USA Top 200 chart.
    ("Spotify Top 200 (USA):", spot_top200_usa_df),
]
for heading, chart_df in spotify_summaries:
    num_songs = len(chart_df)
    num_artists = len(set(chart_df.index.get_level_values("Artist")))
    print(heading)
    # The separator is printed for every chart; it was previously commented
    # out for the USA Viral 50 summary only.
    print("---")
    print(num_songs, "total songs")
    print(num_artists, "unique artists")
    print("\n")

Spotify Viral 50 (Global):
---
2242 total songs
1744 unique artists


Spotify Top 200 (Global):
---
2516 total songs
1061 unique artists


Spotify Viral 50 (USA):
2395 total songs
1655 unique artists


Spotify Top 200(Global):
---
2990 total songs
859 unique artists




### Billboard 
- Billboard Top 200
- Billboard Hot 100

In [7]:
# Load the saved Billboard 200 chart JSON.
with open("./data/bilboard_200.json", "r") as chart_file:
    bil_200 = json.load(chart_file)

In [8]:
# Load the saved Billboard Hot 100 chart JSON.
with open("./data/bilboard_hot_100.json", "r") as chart_file:
    bil_h100 = json.load(chart_file)

In [345]:
# charting billboard history

def get_bil_df(bil):
    """Pivot a Billboard chart history into one row per (title, artist).

    Args:
        bil: Mapping of week label -> chart, where each chart maps a rank key
            (convertible with ``int``) to a dict holding ``'song'`` and
            ``'artist'`` strings.

    Returns:
        A DataFrame indexed by a ("Title", "Artist") MultiIndex with one
        column per week (the first 10 characters of the week label). Cells
        hold ``int(rank) + 1``; weeks in which an entry did not chart are NaN.
        An empty input yields an empty frame with the same index names
        instead of raising.
    """
    pre_df = {}
    for week, chart in bil.items():
        week_key = week[:10]
        for rank, song in chart.items():
            # Titles/artists are lower-cased and the HTML entity for "&" is decoded.
            title = song['song'].lower().replace("&amp;", "&")
            artist = song['artist'].lower().replace("&amp;", "&")
            pre_df.setdefault((title, artist), {})[week_key] = int(rank) + 1

    if not pre_df:
        # With no rows the transposed frame has a flat index, and assigning
        # two level names to it raises ValueError; build the index explicitly.
        empty_index = pd.MultiIndex.from_arrays([[], []], names=["Title", "Artist"])
        return pd.DataFrame(index=empty_index)

    df = pd.DataFrame(pre_df).transpose()
    df.index = df.index.set_names(["Title", "Artist"])
    return df

# Build the rank-history frames for both Billboard charts.
bil_200_df, bil_h100_df = get_bil_df(bil_200), get_bil_df(bil_h100)

In [346]:
# Print entry / artist counts for each Billboard chart.
billboard_summaries = (
    ("Billboard Top 200:", bil_200_df),
    ("Billboard Hot 100:", bil_h100_df),
)
for heading, chart_df in billboard_summaries:
    num_songs = len(chart_df)
    num_artists = len(set(chart_df.index.get_level_values("Artist")))
    report = [heading, "---", f"{num_songs} total songs", f"{num_artists} unique artists"]
    print("\n".join(report))
    print("\n")

Billboard Top 200:
---
1975 total songs
1201 unique artists


Billboard Hot 100:
---
1757 total songs
936 unique artists




# Merge Songs From All Platforms

In [32]:
# Spotify and Billboard frames share the same layout (ISO-date columns), so
# they are processed by one loop; `names` supplies each chart's column prefix.
charts = [spot_v50_df, spot_top200_df, spot_v50_usa_df, spot_top200_usa_df, bil_200_df, bil_h100_df]
names = ["spot_v50", "spot_top200", "usa_v50", "usa_top200", "bil_200", "bil_h100"]

# Union of every (title, artist) key seen on any platform.
all_songs = set(tok_charts_df.index)
for chart in charts:
    all_songs.update(chart.index)

# Sort the keys so the row order is reproducible between runs; iterating a
# set of strings directly gives an arbitrary order.
big_bucks = pd.DataFrame(
    index=pd.MultiIndex.from_tuples(sorted(all_songs), names=["Title", "Artist"])
)

for idx in big_bucks.index:
    if idx in tok_charts_df.index:
        # Ranks for the months this sound charted, in the frame's column order.
        tok = tok_charts_df.loc[idx[0], idx[1]].dropna()
        big_bucks.at[idx, "tok_first_date"] = datetime.datetime.strptime(tok.index[0], "%B-%Y").strftime("%Y-%m")
        # .iloc is used for positional access: plain `series[0]` on a
        # label-indexed Series is deprecated positional fallback in pandas.
        big_bucks.at[idx, "tok_first_rank"] = tok.iloc[0]
        tok = tok.sort_values()
        big_bucks.at[idx, "tok_best_date"] = datetime.datetime.strptime(tok.index[0], "%B-%Y").strftime("%Y-%m")
        big_bucks.at[idx, "tok_best_rank"] = tok.iloc[0]

    for chart, name in zip(charts, names):
        if idx in chart.index:
            # Sorted by rank, so position 0 is the best (lowest) rank.
            history = chart.loc[idx[0], idx[1]].dropna().sort_values()
            # Column labels are ISO-style date strings, so the smallest label
            # is the earliest charting week.
            first_date = history.index.sort_values()[0]
            first_rank = history[first_date]
            big_bucks.at[idx, f"{name}_first_date"] = first_date[:7]
            big_bucks.at[idx, f"{name}_first_rank"] = first_rank
            # NOTE(review): the best date keeps the full column label while
            # the first date is truncated to YYYY-MM -- confirm this is intended.
            big_bucks.at[idx, f"{name}_best_date"] = history.index[0]
            big_bucks.at[idx, f"{name}_best_rank"] = history.iloc[0]

# Remove the entry with blank title and artist; errors="ignore" keeps the
# cell from failing when no chart produced such an entry.
big_bucks = big_bucks.drop(("", ""), errors="ignore")
big_bucks.index = big_bucks.index.set_names(["Title", "Artist"])

In [497]:
# Replace empty artist names
# Rebuild the (Title, Artist) index, substituting "-" wherever the artist is blank.
all_index = big_bucks.index.tolist()
titles = [entry[0] for entry in all_index]
artists = ["-" if entry[1] == "" else entry[1] for entry in all_index]
big_bucks = big_bucks.set_index([titles, artists])
big_bucks.index = big_bucks.index.set_names(["Title", "Artist"])

In [504]:
# Write the merged cross-platform song table to song_library.csv.
big_bucks.to_csv("song_library.csv")

# Retrieve Song Data

In [39]:
# Spotify API credentials are read from the environment so that no secret is
# stored in the notebook. The id/secret pair that used to be hardcoded here
# was saved in plain text and should be rotated in the Spotify dashboard.
CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID")
CLIENT_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET")
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        "Set the SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment "
        "variables before running this cell."
    )

# Client-credentials flow: app-level access, no user login involved.
auth_manager = spotipy.oauth2.SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=auth_manager)

In [267]:
def parse_artist(results):
    """Build the stored profile for one artist from a Spotify artist object.

    Args:
        results: Dict with the keys 'id', 'popularity', 'followers'
            (containing 'total') and 'genres'.

    Returns:
        Dict with the artist id, popularity, follower total, genres, a list
        of (name, id) tuples for related artists fetched through the
        module-level ``sp`` client, and an empty ``"songs"`` mapping.
    """
    profile = {
        "artist_id": results['id'],
        "artist_popularity": results['popularity'],
        "followers": results['followers']['total'],
        "genres": results['genres'],
    }
    # One extra API call per artist to fetch the related-artist list.
    neighbours = sp.artist_related_artists(profile["artist_id"])['artists']
    profile["related"] = [(entry['name'], entry['id']) for entry in neighbours]
    profile["songs"] = {}
    return profile

def parse_song(song_results):
    """Extract stored metadata and audio features for one Spotify track.

    Args:
        song_results: Track dict with the keys 'album' (containing
            'release_date'), 'duration_ms', 'popularity' and 'id'.

    Returns:
        Dict with the keys "song_id", "release_date", "duration",
        "song_popularity" and "audio_feats" (the first element returned by
        ``sp.audio_features`` for this track id).
    """
    # Fixed: the body used to read the module-level name `song` instead of
    # the `song_results` argument, so it only worked when a global `song`
    # happened to hold the same track.
    release_date = song_results['album']['release_date']
    duration = song_results['duration_ms']
    song_popularity = song_results['popularity']
    song_id = song_results['id']
    audio_feats = sp.audio_features(song_id)[0]
    return {"song_id": song_id,
            "release_date": release_date,
            "duration": duration,
            "song_popularity": song_popularity,
            "audio_feats": audio_feats}

# Query Spotify for every (title, artist) row of big_bucks and collect the
# results in artist_info: artist name -> profile dict whose "songs" entry maps
# each title to the parsed track metadata ({} when no track was found).
artist_info = {}
# (title, artist) pairs for which no artist profile could be found.
missed = []
for i, (title, artist) in enumerate(big_bucks.index):
#     print(title, artist)
    # First attempt: look the artist up by name, keeping only the top hit.
    if artist not in artist_info and artist != "":
        results = sp.search(q=artist, type="artist", limit=1)['artists']['items']
        if len(results) > 0:
            artist_info[artist] = parse_artist(results[0])
    
    # Track search restricted to the artist; fall back to a title-only search.
    song = sp.search(q=f"{title} artist:{artist}", type="track")['tracks']['items']
    temp = {}
    if len(song) == 0:
        song = sp.search(q=title, type="track")['tracks']['items']
    if len(song) > 0:
        # `song` is rebound from the list of hits to the first hit.
        song = song[0]
        temp = parse_song(song)
        
        # Second attempt at the artist profile: use the first credited artist
        # of the matched track.
        if artist not in artist_info:
            artist_id = song['artists'][0]['id']
            artist_results = sp.artist(artist_id)
            artist_info[artist] = parse_artist(artist_results)
            
    # still no results after 2 tries
    if artist not in artist_info:
        artist_info[artist] = {"songs": {}}
        missed.append((title, artist))
        print(f"Missed: {artist}")
    
    artist_info[artist]["songs"][title] = temp
    
    # Checkpoint every 100 rows so that progress is saved during the run.
    # NOTE(review): the content is JSON although the file name ends in .csv.
    if i % 100 == 0:
        with open("artist_song_info.csv", "w") as f:
            json.dump(artist_info, f)
            
# Final write once every row has been processed.
with open("artist_song_info.csv", "w") as f:
    json.dump(artist_info, f)

Missed: lv.01 napkins
Missed: iamkhizarkhan
Missed: rajput banna
Missed: vunnypubgm
Missed: jeytee4u
Missed: quantumwolf115
Missed: mike moonnight &amp; dm'boys
Missed: status boy ammu_87
Missed: .gacha._.cartoons
Missed: wowbrey
Missed: goalsounds
Missed: anmol1908
Missed: 岩谷竜宇 《りょー🗽》
Missed: mind your business ❤️
Missed: maddiefckinsmokez
Missed: sounds420
Missed: heyjacxb
Missed: di_ana_ss
Missed: elizabeth_chetwynd
Missed: ravecentral
Missed: jadenhasnoclout
Missed: zachfromdisney
Missed: bhinda aujla &amp; bobby layal feat. sunny boy
Missed: èsco upp🗣
Missed: quincy boykins🐺🌪
Missed: _ucil👑
Missed: moolesh_bheel88
Missed: thedylanmiller
Missed: picoslander is a sussy baka
Missed: stonerreacts🍃😏
Missed: sm0l.ash
Missed: 👑 h a r s h 👑
Missed: ashi khanna 
Missed: ꧁justin7u7mike꧂
Missed: bhavik_dhanak
Missed: amzorcen
Missed: øceancollab
Missed: dcfreestyle_official
Missed: call me jony 🤓
Missed: robirawks_
Missed: rishhpgautam
Missed: rauw alejandro, j balvin
Missed: lil.st1nky
Miss

In [505]:
# Display the artist names collected as keys of artist_info.
artist_info.keys()



In [521]:
# For every collected artist, fetch the related artists by Spotify artist id;
# artists stored without an id get an empty list.
related_artists = {}
for i, artist in enumerate(artist_info):
    info = artist_info[artist]
    if "artist_id" in info:
        lookup = sp.artist_related_artists(info['artist_id'])
        related_artists[artist] = [(r['name'], r['id']) for r in lookup['artists']]
        # Periodic checkpoint; only reached on iterations whose artist has an id.
        if i % 500 == 0:
            with open("related_artists.json", "w") as f:
                json.dump(related_artists, f)
    else:
        related_artists[artist] = []

# Final write once every artist has been processed.
with open("related_artists.json", "w") as f:
    json.dump(related_artists, f)