# Download Data

In [5]:
import json
import billboard
from datetime import datetime
import time
from typing import List

def to_date(date: str) -> datetime:
    return datetime.strptime(date, '%Y-%m-%d')

def retrieve_previous_charts(chart, end_date: datetime) -> List[billboard.ChartData]:
    while to_date(chart.previousDate) > end_date:
        chart = billboard.ChartData(chart.name, chart.previousDate, fetch=True)
        yield chart
        

In [14]:
CHART_NAME = 'country-songs'
start = '2006-10-05'
end = datetime(1995, 2, 26)
chart1 = billboard.ChartData(CHART_NAME, date=start)

In [15]:
charts = []
for chart in retrieve_previous_charts(chart1, end):
    print(chart.date, end=" ")
    charts.append(chart)
    time.sleep(1)

2006-09-30 2006-09-23 2006-09-16 2006-09-09 2006-09-02 2006-08-26 2006-08-19 2006-08-12 2006-08-05 2006-07-29 2006-07-22 2006-07-15 2006-07-08 2006-07-01 2006-06-24 2006-06-17 2006-06-10 2006-06-03 2006-05-27 2006-05-20 2006-05-13 2006-05-06 2006-04-29 2006-04-22 2006-04-15 2006-04-08 2006-04-01 2006-03-25 2006-03-18 2006-03-11 2006-03-04 2006-02-25 2006-02-18 2006-02-11 2006-02-04 2006-01-28 2006-01-21 2006-01-14 2006-01-07 2005-12-31 2005-12-24 2005-12-17 2005-12-10 2005-12-03 2005-11-26 2005-11-19 2005-11-12 2005-11-05 2005-10-29 2005-10-22 2005-10-15 2005-10-08 2005-10-01 2005-09-24 2005-09-17 2005-09-10 2005-09-03 2005-08-27 2005-08-20 2005-08-13 2005-08-06 2005-07-30 2005-07-23 2005-07-16 2005-07-09 2005-07-02 2005-06-25 2005-06-18 2005-06-11 2005-06-04 2005-05-28 2005-05-21 2005-05-14 2005-05-07 2005-04-30 2005-04-23 2005-04-16 2005-04-09 2005-04-02 2005-03-26 2005-03-19 2005-03-12 2005-03-05 2005-02-26 2005-02-19 2005-02-12 2005-02-05 2005-01-29 2005-01-22 2005-01-15 2005-01-08

# Process Data

In [16]:
def get_songs(charts: List[billboard.ChartData]) -> List[billboard.ChartEntry]:
    """ Remove duplicate songs from a chart list based on str(song) """
    songs = {str(song): song for chart in charts for song in chart}
    return list(songs.values())
    
def is_collaboration(song: billboard.ChartEntry):
    return any(x in song.artist for x in ["Featuring", '&'])

def song_data(song: billboard.ChartEntry) -> dict:
    return {**song.__dict__, 'identifier': str(song)}


# Get song data, remove duplicates from the charts
songs = get_songs(charts)
songs_data = list(map(song_data, songs))

# Insert into Database

In [17]:
# DB CONFIG
from pymongo import MongoClient

client = MongoClient('mongo', 27017)
db = client.music_db
songs_collection = db[CHART_NAME]

In [18]:
# Bulk Insert
songs_collection.insert_many(songs_data)

<pymongo.results.InsertManyResult at 0x7fcb0a8e2a88>

In [19]:
results = list(songs_collection.find())
len(results)

3374

In [20]:
results[:10]

[{'_id': ObjectId('5bb8bdf6054ef10022aec29a'),
  'title': 'Love Lies',
  'artist': 'Khalid & Normani',
  'peakPos': 40,
  'lastPos': 0,
  'weeks': 0,
  'rank': 40,
  'isNew': True,
  'identifier': "'Love Lies' by Khalid & Normani"},
 {'_id': ObjectId('5bb8bdf6054ef10022aec29b'),
  'title': 'Better Now',
  'artist': 'Post Malone',
  'peakPos': 32,
  'lastPos': 0,
  'weeks': 0,
  'rank': 32,
  'isNew': True,
  'identifier': "'Better Now' by Post Malone"},
 {'_id': ObjectId('5bb8bdf6054ef10022aec29c'),
  'title': 'Girls Like You',
  'artist': 'Maroon 5 Featuring Cardi B',
  'peakPos': 26,
  'lastPos': 0,
  'weeks': 0,
  'rank': 26,
  'isNew': True,
  'identifier': "'Girls Like You' by Maroon 5 Featuring Cardi B"},
 {'_id': ObjectId('5bb8bdf6054ef10022aec29d'),
  'title': 'Youngblood',
  'artist': '5 Seconds Of Summer',
  'peakPos': 37,
  'lastPos': 0,
  'weeks': 0,
  'rank': 37,
  'isNew': True,
  'identifier': "'Youngblood' by 5 Seconds Of Summer"},
 {'_id': ObjectId('5bb8bdf6054ef10022a