In [None]:
!pip install billboard.py

import pandas as pd
import concurrent.futures
import time
import billboard
from datetime import datetime, timedelta
from random import randrange
import itertools
import numpy as np

Collecting billboard.py
  Downloading https://files.pythonhosted.org/packages/88/47/0620193719b369221550d7f676c5b5fe3c30649612b42675ee13826ba4fe/billboard.py-6.2.1-py2.py3-none-any.whl
Installing collected packages: billboard.py
Successfully installed billboard.py-6.2.1


In [None]:
# Function returning the date of the most recent SATURDAY = date of the latest Billboard Hot 100 chart

"""
'A new chart is compiled and officially released to the public by Billboard on Tuesdays. Each chart is post-dated with the
"week-ending" issue date four days after the charts are refreshed online (i.e. the following Saturday).'
https://it.qaz.wiki/wiki/Billboard_Hot_100
"""

def get_last_saturday():
  """Return the most recent Saturday (today included) as a datetime.

  Only the date is moved back, by 0 to 6 whole days; the time-of-day of the
  current moment is kept as is.
  """
  saturday = 5  # datetime.weekday(): Monday is 0, so Saturday is 5
  now = datetime.today()
  # Whole days elapsed since the latest Saturday (0 when today is Saturday).
  days_back = (now.weekday() - saturday) % 7
  return now - timedelta(days=days_back)

# More accurate alternative: read the date from the latest published chart

def get_last_chart_date():
  """Return the date of the current 'hot-100' chart as a datetime.

  The chart is obtained with ``billboard.ChartData('hot-100')`` and its
  ``date`` attribute is parsed using the '%Y-%m-%d' format.
  """
  latest_chart = billboard.ChartData('hot-100')
  return datetime.strptime(latest_chart.date, '%Y-%m-%d')


# Generator of weekly chart dates, walking backwards from the latest Hot 100

def date_generator(from_date=None, until_date='1960-01-01'):
  """Yield chart dates as 'YYYY-MM-DD' strings, newest first, 7 days apart.

  Args:
    from_date: first (most recent) date to yield, given as a datetime or a
      'YYYY-MM-DD' string. When None (the default), the date returned by
      ``get_last_chart_date()`` is used.
    until_date: earliest date that may still be yielded (inclusive), given
      as a datetime or a 'YYYY-MM-DD' string. Defaults to '1960-01-01'.

  Yields:
    str: ``from_date``, then one date per 7-day step back in time, for as
    long as the date is not earlier than ``until_date``.

  Raises:
    ValueError: if a string argument does not match 'YYYY-MM-DD'.
  """
  def _as_datetime(value):
    # Accept datetime objects as they are; parse anything else as a string.
    if isinstance(value, datetime):
      return value
    return datetime.strptime(value, '%Y-%m-%d')

  current = get_last_chart_date() if from_date is None else _as_datetime(from_date)
  lower_bound = _as_datetime(until_date)  # parsed once, not on every iteration
  one_week = timedelta(days=7)

  while current >= lower_bound:
    # isoformat() on the date part always gives a zero-padded YYYY-MM-DD
    yield current.date().isoformat()
    current = current - one_week


# Fetch a single Hot 100 chart and turn it into a DataFrame

def hot100_to_df(chart_date):
  """Download the 'hot-100' chart for ``chart_date`` and return it as a DataFrame.

  Args:
    chart_date: chart date, passed unchanged as the second argument of
      ``billboard.ChartData`` (the notebook supplies 'YYYY-MM-DD' strings).

  Returns:
    pandas.DataFrame with one row per chart entry and the columns
    'title', 'artist' and 'weeks'. An empty chart yields an empty frame
    that still has these three columns.
  """
  chart = billboard.ChartData('hot-100', chart_date)
  entries = (chart[i] for i in range(len(chart)))
  # Build the frame straight from Python tuples. Going through np.array()
  # would coerce the mixed str/int rows to one string dtype (so 'weeks'
  # ends up as text) and would fail on an empty chart, because an empty
  # 1-D array cannot be matched against three column names.
  records = [(entry.title, entry.artist, entry.weeks) for entry in entries]
  chart_df = pd.DataFrame(records, columns=['title', 'artist', 'weeks'])
  print("chart %s completed!" % chart_date)
  return chart_df

In [None]:
# Download every weekly chart concurrently and stack them into one DataFrame.
#
# Threads are used instead of processes: the work is network-bound, and
# ProcessPoolExecutor requires the submitted function to be importable by the
# worker processes, which functions defined in an interactive session are not
# on platforms that spawn (rather than fork) their workers.
MAX_WORKERS = 8  # upper bound on simultaneous chart downloads

time_0 = time.perf_counter()

chart_dates = list(date_generator())

with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # executor.map returns results in the order of chart_dates,
    # regardless of which download finishes first.
    dfs = list(executor.map(hot100_to_df, chart_dates))

print("Completato in %.4f secondi" % (time.perf_counter()-time_0))

# ignore_index gives df_tot one continuous index instead of repeating each
# chart's own 0..N-1 labels.
df_tot = pd.concat(dfs, ignore_index=True)

chart 2021-01-09 completed!
chart 2021-01-16 completed!
chart 2020-12-26 completed!
chart 2021-01-02 completed!
chart 2020-12-12 completed!
chart 2020-12-19 completed!
chart 2020-11-28 completed!
chart 2020-12-05 completed!
chart 2020-11-21 completed!
chart 2020-11-14 completed!
chart 2020-11-07 completed!
chart 2020-10-31 completed!
chart 2020-10-24 completed!
chart 2020-10-17 completed!
chart 2020-10-03 completed!
chart 2020-10-10 completed!
chart 2020-09-26 completed!
chart 2020-09-12 completed!
chart 2020-09-19 completed!
chart 2020-09-05 completed!
chart 2020-08-29 completed!
chart 2020-08-22 completed!
chart 2020-08-15 completed!
chart 2020-08-08 completed!
chart 2020-08-01 completed!
chart 2020-07-25 completed!
chart 2020-07-18 completed!
chart 2020-07-11 completed!
chart 2020-07-04 completed!
chart 2020-06-27 completed!
chart 2020-06-20 completed!
chart 2020-06-13 completed!
chart 2020-06-06 completed!
chart 2020-05-30 completed!
chart 2020-05-16 completed!
chart 2020-05-23 com

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab runtime before writing to it
drive.mount('/content/drive', force_remount=True)

# Save the complete dataset (all downloaded chart rows) as CSV on Drive
full_csv_path = '/content/drive/My Drive/Colab Notebooks/datasets/billboard_dataset.csv'
df_tot.to_csv(full_csv_path)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.activity.readonly&response_type=code

Enter your authorization code:
[authorization code redacted]
Mounted at /content/drive


In [None]:
# Number of non-missing 'title' values across all downloaded charts
df_tot['title'].count()

318587

In [None]:
!pip install -U -q PyDrive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Build a PyDrive client. The statements below must run in this order:
# presumably authenticate_user() performs the Colab sign-in that makes the
# application-default credentials available - TODO confirm.
auth.authenticate_user()
gauth = GoogleAuth()
# Reuse the application-default credentials for PyDrive
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, which an earlier cell imported as the
# google.colab `drive` module; any later cell that calls drive.mount() has to
# import that module again.
drive = GoogleDrive(gauth)

In [None]:
# Fetch the dataset file from Drive (addressed by its file id) into the
# current working directory
remote_file = drive.CreateFile({'id': '1nF1AArc7KzV01rLhy7qlY-w9KfUACzWl'})
remote_file.GetContentFile('billboard_dataset.csv')

# Load the CSV and discard its 'Unnamed: 0' column
# (presumably the index that to_csv wrote alongside the data)
df = (
    pd.read_csv("billboard_dataset.csv")
    .drop(columns='Unnamed: 0')
)

In [None]:
# Preview the first five rows of the loaded dataset
df.head(5)

Unnamed: 0,title,artist,weeks
0,Mood,24kGoldn Featuring iann dior,22
1,Positions,Ariana Grande,11
2,Blinding Lights,The Weeknd,57
3,Holy,Justin Bieber Featuring Chance The Rapper,16
4,Go Crazy,Chris Brown & Young Thug,35


In [None]:
# Remove duplicates: keep only the first row seen for each (title, artist) pair
df_noDupl = df[~df.duplicated(subset=['title', 'artist'], keep='first')]

In [None]:
# Number of non-missing 'title' values left after de-duplication
df_noDupl['title'].count()

28349

In [None]:
# `drive` was rebound to a PyDrive client in an earlier cell, so the
# google.colab module has to be imported again before mounting
from google.colab import drive

# Attach Google Drive to the Colab runtime before writing to it
drive.mount('/content/drive', force_remount=True)

# Save the de-duplicated dataset as CSV on Drive
unique_csv_path = '/content/drive/My Drive/Colab Notebooks/datasets/billboard_dataset_unique.csv'
df_noDupl.to_csv(unique_csv_path)

Mounted at /content/drive
