In [1]:
# pip install duckdb
# pip install jupysql
# pip install notebook
# pip install tabulate
# pip install sqlalchemy

Note: you may need to restart the kernel to use updated packages.


In [3]:
# If on windows, this only works if you're running Jupyter as an admin
# pip install duckdb-engine

Collecting duckdb-engine
  Using cached duckdb_engine-0.13.1-py3-none-any.whl (47 kB)
Collecting packaging>=21
  Using cached packaging-24.1-py3-none-any.whl (53 kB)
Installing collected packages: packaging, duckdb-engine
  Attempting uninstall: packaging
    Found existing installation: packaging 20.9
    Uninstalling packaging-20.9:
      Successfully uninstalled packaging-20.9
Successfully installed duckdb-engine-0.13.1 packaging-24.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import duckdb
import sqlalchemy
from datetime import date
import json
import pathlib
import importlib
%load_ext sql

In [2]:
# Set up SQL connection
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

# Change the billboard_observation_date!

In [3]:
current_date = date.today().strftime('%Y-%m-%d')
billboard_observation_date = '2024-07-16'

In [4]:
# Replace with your filepath. Forward slashes work for both Mac and Windows
%sql duckdb:///C:/Users/Admin/billboard/raw_db.duckdb

In [5]:
# Test connection
%sql USE raw_db;
%sql USE raw_db.billboard;
%sql SELECT * FROM raw_db.billboard.artist LIMIT 2;

Unnamed: 0,artist_id,artist_name,pct_female,last_update
0,1,Zach Bryan,0.0,2024-08-23
1,2,Morgan Wallen,0.0,2024-08-23


# Update upload/rank_by_name

In [6]:
# Update this for this week's rankings in upload/rank_by_name.py
# This is easier in an IDE that allows opt+arrow_key to move lines up and down

In [7]:
# TODO: figure out importlib.reload() here so I don't have to kill the kernel to reload
from upload.rank_by_name import rankings

if len(rankings) != 50:
    print(f'{len(rankings)} rankings, but expected 50')

rankings[:5] + ['...'] + rankings[45:]

['A Bar Song (Tipsy)',
 'I Had Some Help',
 'Lies Lies Lies',
 'Pink Skies',
 '28',
 '...',
 'New To Country',
 'Nine Ball',
 'Young Love & Saturday Nights',
 'Devil You Know',
 'Love You Again']

In [8]:
%%sql df_song_labeled <<
SELECT
    ss.song_id
    , ss.song_name
    , art.artist_name
    , s2a.relationship_type
    , ss.untracked_weeks_on_chart
FROM song ss
LEFT JOIN song_to_artist s2a
    ON ss.song_id = s2a.song_id
LEFT JOIN artist art
    ON s2a.artist_id = art.artist_id
;

In [13]:
df_song_labeled[df_song_labeled['song_name'] == 'Jolene']['song_id'].count()

2

In [16]:
existing_song_to_rank = {}
new_song_to_rank = {}
for song, rank_ix in zip(rankings, range(1,51)):
    n_rows = df_song_labeled[df_song_labeled['song_name'] == song]['song_id'].count()
    if n_rows == 0:
        new_song_to_rank[song] = rank_ix
    else:
        print(df_song_labeled.loc[df_song_labeled['song_name'] == song].to_markdown())
        song_id = input("Please enter the song_id or 'New Song'\n")

        if song_id == '': # hit enter to take the most common song ID
            song_id = int(
                df_song_labeled.loc[
                    df_song_labeled['song_name'] == song
                ]['song_id'].mode()
            )
        
        if str(song_id).lower() == 'new song':
            new_song_to_rank[song] = rank_ix
        else:
            existing_song_to_rank[int(song_id)] = rank_ix

|    |   song_id | song_name          | artist_name   | relationship_type   |   untracked_weeks_on_chart |
|---:|----------:|:-------------------|:--------------|:--------------------|---------------------------:|
| 16 |        34 | A Bar Song (Tipsy) | Shaboozey     | Lead artist         |                          0 |
Please enter the song_id or 'New Song'

|    |   song_id | song_name       | artist_name   | relationship_type   |   untracked_weeks_on_chart |
|---:|----------:|:----------------|:--------------|:--------------------|---------------------------:|
| 20 |        38 | I Had Some Help | Morgan Wallen | Other main artist   |                          0 |
| 83 |        38 | I Had Some Help | Post Malone   | Lead artist         |                          0 |
Please enter the song_id or 'New Song'

|    |   song_id | song_name      | artist_name   | relationship_type   |   untracked_weeks_on_chart |
|---:|----------:|:---------------|:--------------|:--------------------|-------

Please enter the song_id or 'New Song'

|    |   song_id | song_name                    | artist_name   | relationship_type   |   untracked_weeks_on_chart |
|---:|----------:|:-----------------------------|:--------------|:--------------------|---------------------------:|
| 70 |        92 | The Great American Bar Scene | Zach Bryan    | Lead artist         |                          0 |
Please enter the song_id or 'New Song'

|    |   song_id | song_name   | artist_name   | relationship_type   |   untracked_weeks_on_chart |
|---:|----------:|:------------|:--------------|:--------------------|---------------------------:|
| 40 |        61 | Bass Boat   | Zach Bryan    | Lead artist         |                          0 |
Please enter the song_id or 'New Song'

|     |   song_id | song_name   | artist_name     | relationship_type   |   untracked_weeks_on_chart |
|----:|----------:|:------------|:----------------|:--------------------|---------------------------:|
|  72 |        94 | Pur

In [17]:
print(json.dumps(existing_song_to_rank, indent = 2))
print(json.dumps(new_song_to_rank, indent = 2))

{
  "34": 1,
  "38": 2,
  "58": 3,
  "43": 4,
  "57": 5,
  "32": 6,
  "37": 7,
  "56": 8,
  "1": 9,
  "27": 10,
  "52": 11,
  "50": 12,
  "40": 13,
  "48": 14,
  "47": 15,
  "53": 16,
  "59": 17,
  "60": 18,
  "21": 19,
  "24": 20,
  "73": 22,
  "92": 23,
  "61": 24,
  "94": 25,
  "71": 26,
  "96": 27,
  "95": 28,
  "30": 29,
  "55": 32,
  "42": 33,
  "22": 34,
  "41": 35,
  "25": 36,
  "97": 38,
  "39": 40,
  "51": 42,
  "26": 44,
  "75": 46,
  "28": 47,
  "29": 48,
  "36": 49,
  "23": 50
}
{
  "We Ride": 21,
  "Wine Into Whiskey": 30,
  "Memphis; The Blues": 31,
  "Like Ida": 37,
  "Towers": 39,
  "Funny Man": 41,
  "Lucky Enough (Poem)": 43,
  "Bathwater": 45
}


## Add artists on any new songs that aren't in the artist table yet to its CSV

- We Ride: Bryan Martin (+26)
- Wine Into Whiskey: Tucker Wetmore
- Memphis; The Blues: Zach Bryan feat. John Moreland
- Like Ida: Zach Bryan
- Towers: Zach Bryan
- Funny Man: Zach Bryan
- Lucky Enough (Poem): Zach Bryan
- Bathwater: Zach Bryan

In [18]:
df_artist = pd.read_csv('source_data/artist.csv')

In [19]:
new_song_artists = [
    'Zach Bryan' # 1
    , 'Bryan Martin' # 19
    , 'Tucker Wetmore' # 9
    , 'John Moreland' # new
]
df_artist[df_artist['artist_name'].isin(new_song_artists)]

Unnamed: 0,artist_id,artist_name,pct_female,last_update
0,1,Zach Bryan,0.0,2024-08-23
8,9,Tucker Wetmore,0.0,2024-08-23
18,19,Bryan Martin,0.0,2024-08-23


In [20]:
df_artist['artist_id'].max()

63

In [21]:
df_artist = pd.concat(
    [
        df_artist
        , pd.DataFrame(
            np.array([
                [64, 'John Moreland', 0, current_date]
            ])
            , columns = ['artist_id', 'artist_name', 'pct_female', 'last_update']
        )
    ]
    , ignore_index = True
)

In [22]:
df_artist.to_csv('source_data/artist.csv', index=False)

## Add all new songs to the song table CSV

In [23]:
df_song_raw = pd.read_csv('source_data/song.csv')

In [None]:
## Paste new songs list here

In [24]:
max_song_id = df_song_raw['song_id'].max()
array_new_songs = []
for song_name in [
    'We Ride'
    , 'Wine Into Whiskey'
    , 'Memphis; The Blues'
    , 'Like Ida'
    , 'Towers'
    , 'Funny Man'
    , 'Lucky Enough (Poem)'
    , 'Bathwater'
]:
    max_song_id += 1
    array_new_songs += [[max_song_id, song_name, 0, current_date]]

array_new_songs

[[98, 'We Ride', 0, '2024-09-09'],
 [99, 'Wine Into Whiskey', 0, '2024-09-09'],
 [100, 'Memphis; The Blues', 0, '2024-09-09'],
 [101, 'Like Ida', 0, '2024-09-09'],
 [102, 'Towers', 0, '2024-09-09'],
 [103, 'Funny Man', 0, '2024-09-09'],
 [104, 'Lucky Enough (Poem)', 0, '2024-09-09'],
 [105, 'Bathwater', 0, '2024-09-09']]

In [25]:
df_song_raw = pd.concat(
    [
        df_song_raw
        , pd.DataFrame(
            np.array(array_new_songs)
            , columns = ['song_id', 'song_name', 'untracked_weeks_on_chart', 'last_update']
        )
    ]
    , ignore_index = True
)

In [28]:
# Update untracked_weeks_on_chart if needed
df_song_raw[-len(array_new_songs)-1:]

Unnamed: 0,song_id,song_name,untracked_weeks_on_chart,last_update
96,97,Northern Thunder,0.0,2024-09-08
97,98,We Ride,26.0,2024-09-09
98,99,Wine Into Whiskey,0.0,2024-09-09
99,100,Memphis; The Blues,0.0,2024-09-09
100,101,Like Ida,0.0,2024-09-09
101,102,Towers,0.0,2024-09-09
102,103,Funny Man,0.0,2024-09-09
103,104,Lucky Enough (Poem),0.0,2024-09-09
104,105,Bathwater,0.0,2024-09-09


In [27]:
# Adjust untracked weeks if they aren't zero
# Example: df_song_raw.loc[92, 'untracked_weeks_on_chart'] = 6
df_song_raw.loc[97, 'untracked_weeks_on_chart'] = 26

In [29]:
df_song_raw.to_csv('source_data/song.csv', index=False)

## Add all new songs to the song_to_artist table CSV

In [None]:
## Paste new songs markdown here
- We Ride: Bryan Martin (+26)
- Wine Into Whiskey: Tucker Wetmore
- Memphis; The Blues: Zach Bryan feat. John Moreland
- Like Ida: Zach Bryan
- Towers: Zach Bryan
- Funny Man: Zach Bryan
- Lucky Enough (Poem): Zach Bryan
- Bathwater: Zach Bryan

In [30]:
df_artist[df_artist['artist_name'].isin(new_song_artists)]

Unnamed: 0,artist_id,artist_name,pct_female,last_update
0,1,Zach Bryan,0.0,2024-08-23
8,9,Tucker Wetmore,0.0,2024-08-23
18,19,Bryan Martin,0.0,2024-08-23
63,64,John Moreland,0.0,2024-09-09


In [31]:
df_s2a_raw = pd.read_csv('source_data/song_to_artist.csv').fillna('')

In [32]:
array_new_s2a = []
for s2a in [ # song_id, artist_id, relationship_conjunction, relationship_index
    [98, 19, '', '']
    , [99, 9, '', '']
    , [100, 1, '', '']
    , [100, 64, '', 1]
    , [101, 1, '', '']
    , [102, 1, '', '']
    , [103, 1, '', '']
    , [104, 1, '', '']
    , [105, 1, '', '']
]:
    if s2a[3] == '': # Only lead artists don't have a relationship index
        reln = 'Lead artist'
    elif s2a[2] == '': # Featured artists don't have an explicit relationship conjunction
        reln = 'Featured artist' 
    else:
        reln = 'Other main artist'
        
    array_new_s2a += [[
        '-'.join([str(s2a[0]), reln, str(s2a[3])])
        , s2a[0]
        , s2a[1]
        , reln
        , s2a[2]
        , s2a[3]
        , current_date
    ]]
    
array_new_s2a

[['98-Lead artist-', 98, 19, 'Lead artist', '', '', '2024-09-09'],
 ['99-Lead artist-', 99, 9, 'Lead artist', '', '', '2024-09-09'],
 ['100-Lead artist-', 100, 1, 'Lead artist', '', '', '2024-09-09'],
 ['100-Featured artist-1', 100, 64, 'Featured artist', '', 1, '2024-09-09'],
 ['101-Lead artist-', 101, 1, 'Lead artist', '', '', '2024-09-09'],
 ['102-Lead artist-', 102, 1, 'Lead artist', '', '', '2024-09-09'],
 ['103-Lead artist-', 103, 1, 'Lead artist', '', '', '2024-09-09'],
 ['104-Lead artist-', 104, 1, 'Lead artist', '', '', '2024-09-09'],
 ['105-Lead artist-', 105, 1, 'Lead artist', '', '', '2024-09-09']]

In [33]:
df_s2a_raw = pd.concat(
    [
        df_s2a_raw
        , pd.DataFrame(
            np.array(array_new_s2a)
            , columns = [
                'song_to_artist_pk'
                , 'song_id'
                , 'artist_id'
                , 'relationship_type'
                , 'relationship_conjunction'
                , 'relationship_index'
                , 'last_update'
            ]
        )
    ]
    , ignore_index = True
)

In [34]:
df_s2a_raw.tail(1)

Unnamed: 0,song_to_artist_pk,song_id,artist_id,relationship_type,relationship_conjunction,relationship_index,last_update
122,105-Lead artist-,105,1,Lead artist,,,2024-09-09


In [35]:
df_s2a_raw.to_csv('source_data/song_to_artist.csv', index=False)

## Convert new_song_to_rank to the id_to_rankix format of existing_song_to_rank
Then stack the two dicts of songs and insert to CSV with more info like current_date for last_update

In [36]:
all_song_to_rank = existing_song_to_rank.copy()
for song_name, rank in new_song_to_rank.items():
    songs = list(df_song_raw[df_song_raw['song_name'] == song_name]['song_id'])
    
    if len(songs) == 1:
        song_id = songs[0]
    elif len(songs) > 1:
        song_id = int(input(f'{str(songs)}: Choose ID of song actually at rank {rank}\n'))
    else:
        raise Exception(f'The song at rank {rank} is missing from the song CSV')
    
    all_song_to_rank[int(song_id)] = rank

In [37]:
array_new_ranking = [
    [
        '-'.join([billboard_observation_date, str(rank)])
        , billboard_observation_date
        , rank
        , song_id
        , current_date
    ]
    for song_id, rank
    in all_song_to_rank.items()
]

array_new_ranking # Make sure it's 50 rows

[['2024-07-16-1', '2024-07-16', 1, 34, '2024-09-09'],
 ['2024-07-16-2', '2024-07-16', 2, 38, '2024-09-09'],
 ['2024-07-16-3', '2024-07-16', 3, 58, '2024-09-09'],
 ['2024-07-16-4', '2024-07-16', 4, 43, '2024-09-09'],
 ['2024-07-16-5', '2024-07-16', 5, 57, '2024-09-09'],
 ['2024-07-16-6', '2024-07-16', 6, 32, '2024-09-09'],
 ['2024-07-16-7', '2024-07-16', 7, 37, '2024-09-09'],
 ['2024-07-16-8', '2024-07-16', 8, 56, '2024-09-09'],
 ['2024-07-16-9', '2024-07-16', 9, 1, '2024-09-09'],
 ['2024-07-16-10', '2024-07-16', 10, 27, '2024-09-09'],
 ['2024-07-16-11', '2024-07-16', 11, 52, '2024-09-09'],
 ['2024-07-16-12', '2024-07-16', 12, 50, '2024-09-09'],
 ['2024-07-16-13', '2024-07-16', 13, 40, '2024-09-09'],
 ['2024-07-16-14', '2024-07-16', 14, 48, '2024-09-09'],
 ['2024-07-16-15', '2024-07-16', 15, 47, '2024-09-09'],
 ['2024-07-16-16', '2024-07-16', 16, 53, '2024-09-09'],
 ['2024-07-16-17', '2024-07-16', 17, 59, '2024-09-09'],
 ['2024-07-16-18', '2024-07-16', 18, 60, '2024-09-09'],
 ['2024-07-

In [38]:
df_billboard_ranking_raw = pd.concat(
    [
        pd.read_csv('source_data/billboard_ranking.csv').fillna('')
        , pd.DataFrame(
            np.array(array_new_ranking)
            , columns = [
                'billboard_ranking_pk'
                , 'billboard_observation_date'
                , 'billboard_rank'
                , 'song_id'
                , 'last_update'
            ]
        )
    ]
    , ignore_index = True
)

In [39]:
df_billboard_ranking_raw.to_csv('source_data/billboard_ranking.csv', index=False)

## Push to duckdb

In [40]:
# TODO: instead of purging and rebuilding, use the raw data to run updates
%sql --close duckdb:///C:/Users/Admin/billboard/raw_db.duckdb --quiet
pathlib.Path('raw_db.duckdb').unlink(missing_ok = True)

TODO: integrate into jupyter using `%python`

In a command terminal
- `python table_configs/run_schema_configs.py billboard_schema.sql`
- `python table_configs/run_table_configs.py`
- `python main.py`

## Run the ranking SQL script

In [49]:
with duckdb.connect('raw_db.duckdb') as conn:
    with open('analysis/artist_ranking.sql') as sql_file:
        artist_ranking = conn.sql(sql_file.read())
    
    # TODO: Grab headers from the SQL too
    df_artist_ranking = pd.DataFrame(artist_ranking.fetchall())

In [50]:
df_artist_ranking

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1,Zach Bryan,77,77,77,101,101,101,21,21,21,21,21,21,4.0,4.0,2024-07-16,1
1,15,Post Malone,47,47,47,47,47,47,18,18,18,18,18,18,2.0,2.0,2024-07-16,2
2,2,Morgan Wallen,16,24,32,28,36,44,2,3,4,2,3,4,2.0,2.0,2024-07-16,3
3,5,Jelly Roll,12,17,27,12,17,28,3,4,6,3,4,6,10.0,10.0,2024-07-16,4
4,16,Koe Wetzel,16,16,16,16,16,16,2,2,2,2,2,2,9.0,9.0,2024-07-16,5
5,6,Luke Combs,14,14,21,49,49,56,2,2,4,3,3,5,3.0,3.0,2024-07-16,6
6,4,Megan Moroney,14,14,14,14,14,14,3,3,3,3,3,3,21.0,21.0,2024-07-23,7
7,11,Lainey Wilson,12,12,14,12,47,49,2,2,3,2,3,4,19.0,19.0,2024-07-16,8
8,8,Bailey Zimmerman,12,12,12,12,12,12,2,2,2,2,2,2,28.0,28.0,2024-07-16,9
9,25,Shaboozey,10,10,10,10,10,10,2,2,2,2,2,2,1.0,1.0,2024-07-16,10


In [51]:
with duckdb.connect('raw_db.duckdb') as conn:
    print(conn.sql(
        """
        SELECT billboard_observation_date, COUNT(1) n
        FROM raw_db.billboard.billboard_ranking
        GROUP BY 1
        ORDER BY 1 ASC
        """
    ))

┌────────────────────────────┬───────┐
│ billboard_observation_date │   n   │
│            date            │ int64 │
├────────────────────────────┼───────┤
│ 2024-07-16                 │    50 │
│ 2024-07-23                 │    50 │
│ 2024-07-31                 │    50 │
│ 2024-08-07                 │    50 │
│ 2024-08-13                 │    50 │
│ 2024-08-23                 │    50 │
│ 2024-08-27                 │    50 │
│ 2024-09-05                 │    50 │
└────────────────────────────┴───────┘

