# Download Data

In [25]:
import json
import time
from datetime import datetime
from typing import Iterator, List

import billboard

def to_date(date: str) -> datetime:
    """Parse a ``YYYY-MM-DD`` string into a ``datetime`` (time part is midnight).

    Raises ``ValueError`` if ``date`` does not match that format.
    """
    day_format = '%Y-%m-%d'
    parsed = datetime.strptime(date, day_format)
    return parsed

def retrieve_previous_charts(chart, end_date: datetime) -> Iterator[billboard.ChartData]:
    """Lazily walk backwards in time from ``chart``, yielding each earlier chart.

    This is a generator: nothing is fetched until the caller iterates, and each
    iteration step performs one ``billboard.ChartData`` fetch.

    Args:
        chart: The chart to start from. It is NOT yielded itself; only the
            charts reached through its ``previousDate`` chain are.
        end_date: Cutoff. The walk continues only while the next chart's
            ``previousDate`` is strictly later than this value.

    Yields:
        Each previous chart, most recent first.
    """
    # A chart may carry no previousDate (None / empty string) when there is no
    # earlier chart to link to; end the walk there instead of letting
    # to_date() raise on a non-string value.
    while chart.previousDate and to_date(chart.previousDate) > end_date:
        chart = billboard.ChartData(chart.name, chart.previousDate, fetch=True)
        yield chart
      

In [26]:
# Chart identifier passed to billboard.ChartData; it is also reused further
# down as the name of the MongoDB collection.
CHART_NAME = 'hot-100'
# Date (YYYY-MM-DD string) of the chart the backwards walk starts from.
start = '2018-06-05'
# Cutoff handed to retrieve_previous_charts(): the walk only continues while
# the next previousDate is strictly later than this datetime.
end = datetime(1995, 2, 26)
# Seed chart. The download loop collects only the charts that precede it;
# chart1 itself is never appended to the collected list.
chart1 = billboard.ChartData(CHART_NAME, date=start)

In [27]:
# Collect every chart produced by the backwards walk from chart1 towards `end`.
charts = []
for chart in retrieve_previous_charts(chart1, end):
    # Print each chart date on a single space-separated line as progress output.
    print(chart.date, end=" ")
    charts.append(chart)
    # Wait one second per iteration — presumably to throttle requests to the
    # chart site; TODO confirm the intended rate limit.
    time.sleep(1)

2018-06-02 2018-05-26 2018-05-19 2018-05-12 2018-05-05 2018-04-28 2018-04-21 2018-04-14 2018-04-07 2018-03-31 2018-03-24 2018-03-17 2018-03-10 2018-03-03 2018-02-24 2018-02-17 2018-02-10 2018-02-03 2018-01-27 2018-01-20 2018-01-13 2018-01-06 2018-01-03 2017-12-30 2017-12-23 2017-12-16 2017-12-09 2017-12-02 2017-11-25 2017-11-18 2017-11-11 2017-11-04 2017-10-28 2017-10-21 2017-10-14 2017-10-07 2017-09-30 2017-09-23 2017-09-16 2017-09-09 2017-09-02 2017-08-26 2017-08-19 2017-08-12 2017-08-05 2017-07-29 2017-07-22 2017-07-15 2017-07-08 2017-07-01 2017-06-24 2017-06-17 2017-06-10 2017-06-03 2017-05-27 2017-05-20 2017-05-13 2017-05-06 2017-04-29 2017-04-22 2017-04-15 2017-04-08 2017-04-01 2017-03-25 2017-03-18 2017-03-11 2017-03-04 2017-02-25 2017-02-18 2017-02-11 2017-02-04 2017-01-28 2017-01-21 2017-01-14 2017-01-07 2016-12-31 2016-12-24 2016-12-17 2016-12-10 2016-12-03 2016-11-26 2016-11-19 2016-11-12 2016-11-05 2016-10-29 2016-10-22 2016-10-15 2016-10-08 2016-10-01 2016-09-24 2016-09-17

# Process Data

In [28]:
def get_songs(charts: List[billboard.ChartData]) -> List[billboard.ChartEntry]:
    """Flatten ``charts`` into a list of entries that are unique by ``str(entry)``.

    All entries are first ordered by ascending ``peakPos`` (stable sort, so
    ties keep the order in which they appear in ``charts``). For every distinct
    ``str(entry)`` only the first entry in that ordering is kept.
    """
    entries = []
    for chart in charts:
        entries.extend(chart)
    entries.sort(key=lambda entry: entry.peakPos)

    first_seen = {}
    for entry in entries:
        # setdefault stores the entry only when its key is not present yet.
        first_seen.setdefault(str(entry), entry)

    return list(first_seen.values())
    
def is_collaboration(song: billboard.ChartEntry):
    """Return True when the artist credit contains "Featuring" or "&"."""
    for marker in ("Featuring", '&'):
        if marker in song.artist:
            return True
    return False

def song_data(song: billboard.ChartEntry) -> dict:
    """Return a copy of the entry's attribute dict plus an ``'identifier'`` key.

    The identifier is ``str(song)``; the entry's own ``__dict__`` is not modified.
    """
    record = dict(song.__dict__)
    record['identifier'] = str(song)
    return record


# De-duplicate the entries of all downloaded charts, then turn each remaining
# song into a plain dict
songs = get_songs(charts)
songs_data = [song_data(song) for song in songs]

# Insert into Database

In [34]:
# DB CONFIG
from pymongo import MongoClient

# Connect to the MongoDB server at host 'mongo', port 27017.
client = MongoClient('mongo', 27017)
# Database used by this notebook.
db = client.music_db
# The target collection is named after CHART_NAME.
songs_collection = db[CHART_NAME]
# Last expression of the cell, so the database's collection names are displayed.
db.list_collection_names()

# Spotify API: Pop, Latin, Rock, Jazz, Christian, Country
# Lyrics: Pop, Latin, Country

['pop-songs',
 'latin-songs',
 'top_songs',
 'country-songs',
 'rock-songs',
 'songs',
 'jazz-songs',
 'hot-100',
 'all-songs',
 'christian-songs',
 'rap-song']

In [30]:
# Bulk Insert
# Writes every dict in songs_data to songs_collection in a single call.
# NOTE(review): PyMongo documents that inserts add an '_id' key to documents
# that lack one, mutating the dicts in place. Re-running this cell on the same
# songs_data is therefore expected to fail on duplicate keys, while re-running
# it after rebuilding songs_data would store every song a second time —
# confirm before re-executing.
songs_collection.insert_many(songs_data)

<pymongo.results.InsertManyResult at 0x7f3e7c2c0d08>

In [33]:
# Read every document of the collection into memory (find() with no filter)
# and display how many there are.
results = list(songs_collection.find())
len(results)

8672

In [32]:
# Preview the first ten stored documents.
results[:10]

[{'_id': ObjectId('5bfd4efc1a5cfe00bcda15b4'),
  'title': 'Nice For What',
  'artist': 'Drake',
  'peakPos': 1,
  'lastPos': 2,
  'weeks': 7,
  'rank': 1,
  'isNew': False,
  'identifier': "'Nice For What' by Drake"},
 {'_id': ObjectId('5bfd4efc1a5cfe00bcda15b5'),
  'title': 'This Is America',
  'artist': 'Childish Gambino',
  'peakPos': 1,
  'lastPos': 1,
  'weeks': 3,
  'rank': 2,
  'isNew': False,
  'identifier': "'This Is America' by Childish Gambino"},
 {'_id': ObjectId('5bfd4efc1a5cfe00bcda15b6'),
  'title': "God's Plan",
  'artist': 'Drake',
  'peakPos': 1,
  'lastPos': 3,
  'weeks': 18,
  'rank': 3,
  'isNew': False,
  'identifier': "'God's Plan' by Drake"},
 {'_id': ObjectId('5bfd4efc1a5cfe00bcda15b7'),
  'title': 'Perfect',
  'artist': 'Ed Sheeran',
  'peakPos': 1,
  'lastPos': 10,
  'weeks': 39,
  'rank': 15,
  'isNew': False,
  'identifier': "'Perfect' by Ed Sheeran"},
 {'_id': ObjectId('5bfd4efc1a5cfe00bcda15b8'),
  'title': 'Rockstar',
  'artist': 'Post Malone Featuring 2