# Download Data

In [1]:
import json
import billboard
from datetime import datetime
import time
from typing import List

def to_date(date: str) -> datetime:
    return datetime.strptime(date, '%Y-%m-%d')

def retrieve_previous_charts(chart, end_date: datetime) -> List[billboard.ChartData]:
    while to_date(chart.previousDate) > end_date:
        chart = billboard.ChartData(chart.name, chart.previousDate, fetch=True)
        yield chart
      

In [2]:
CHART_NAME = 'jazz-songs'
start = '2008-06-05'
end = datetime(1995, 2, 26)
chart1 = billboard.ChartData(CHART_NAME, date=start)

In [131]:
charts = []
for chart in retrieve_previous_charts(chart1, end):
    print(chart.date, end=" ")
    charts.append(chart)
    time.sleep(1)

2008-05-31 2008-05-24 2008-05-17 2008-05-10 2008-05-03 2008-04-26 2008-04-19 2008-04-12 2008-04-05 2008-03-29 2008-03-22 2008-03-15 2008-03-08 2008-03-01 2008-02-23 2008-02-16 2008-02-09 2008-02-02 2008-01-26 2008-01-19 2008-01-12 2008-01-05 2007-12-29 2007-12-22 2007-12-15 2007-12-08 2007-12-01 2007-11-24 2007-11-17 2007-11-10 2007-11-03 2007-10-27 2007-10-20 2007-10-13 2007-10-06 2007-09-29 2007-09-22 2007-09-15 2007-09-08 2007-09-01 2007-08-25 2007-08-18 2007-08-11 2007-08-04 2007-07-28 2007-07-21 2007-07-14 2007-07-07 2007-06-30 2007-06-23 2007-06-16 2007-06-09 2007-06-02 2007-05-26 2007-05-19 2007-05-12 2007-05-05 2007-04-28 2007-04-21 2007-04-14 2007-04-07 2007-03-31 2007-03-24 2007-03-17 2007-03-10 2007-03-03 2007-02-24 2007-02-17 2007-02-10 2007-02-03 2007-01-27 2007-01-20 2007-01-13 2007-01-06 2006-12-30 2006-12-23 2006-12-16 2006-12-09 2006-12-02 2006-11-25 2006-11-18 2006-11-11 2006-11-04 2006-10-28 2006-10-21 2006-10-14 2006-10-07 2006-09-30 2006-09-23 2006-09-16 2006-09-09

TypeError: strptime() argument 1 must be str, not None

# Process Data

In [132]:
def get_songs(charts: List[billboard.ChartData]) -> List[billboard.ChartEntry]:
    """ Remove duplicate songs from a chart list based on str(song) """
    songs = {str(song): song for chart in charts for song in chart}
    return list(songs.values())
    
def is_collaboration(song: billboard.ChartEntry):
    return any(x in song.artist for x in ["Featuring", '&'])

def song_data(song: billboard.ChartEntry) -> dict:
    return {**song.__dict__, 'identifier': str(song)}


# Get song data, remove duplicates from the charts
songs = get_songs(charts)
songs_data = list(map(song_data, songs))

# Insert into Database

In [3]:
# DB CONFIG
from pymongo import MongoClient

client = MongoClient('mongo', 27017)
db = client.music_db
songs_collection = db[CHART_NAME]
db.list_collection_names()

# Spotify Api: Pop, Latin, Rock, Jazz, Christian, Country
# Lyrics:      -

['pop-songs',
 'latin-songs',
 'top_songs',
 'country-songs',
 'rock-songs',
 'songs',
 'jazz-songs',
 'christian-songs',
 'rap-song']

In [134]:
# Bulk Insert
songs_collection.insert_many(songs_data)

<pymongo.results.InsertManyResult at 0x7fcb0a0fdcc8>

In [135]:
results = list(songs_collection.find())
len(results)

1339

In [136]:
results[:10]

[{'_id': ObjectId('5bbb2685054ef10022af405a'),
  'title': 'Everyone But You',
  'artist': 'Cindy Bradley',
  'peakPos': 24,
  'lastPos': 0,
  'weeks': 0,
  'rank': 24,
  'isNew': True,
  'identifier': "'Everyone But You' by Cindy Bradley"},
 {'_id': ObjectId('5bbb2685054ef10022af405b'),
  'title': 'Ready To Go',
  'artist': 'Threestyle Featuring Magdalena Chovancova',
  'peakPos': 24,
  'lastPos': 0,
  'weeks': 0,
  'rank': 24,
  'isNew': True,
  'identifier': "'Ready To Go' by Threestyle Featuring Magdalena Chovancova"},
 {'_id': ObjectId('5bbb2685054ef10022af405c'),
  'title': 'Deep As The Night',
  'artist': 'Special EFX',
  'peakPos': 29,
  'lastPos': 0,
  'weeks': 0,
  'rank': 29,
  'isNew': True,
  'identifier': "'Deep As The Night' by Special EFX"},
 {'_id': ObjectId('5bbb2685054ef10022af405d'),
  'title': 'Up And Up',
  'artist': 'Jeff Ryan',
  'peakPos': 30,
  'lastPos': 0,
  'weeks': 0,
  'rank': 30,
  'isNew': True,
  'identifier': "'Up And Up' by Jeff Ryan"},
 {'_id': Objec