## Dataset preprocess & creation
This notebook preprocesses the charts data and uses the Spotify API to extract and save the fields needed for the dataset.


In [25]:
# Import the necessary libraries
import numpy as np
import pandas as pd

import spotipy
from spotipy.oauth2 import SpotifyOAuth
from collections import Counter
from tqdm import tqdm

## Initialize the SpotifyOAuth and SpotifyClient objects

In [54]:
# Load the YAML secrets
import yaml
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
secret_path = "/Users/kbora/Documents/Spoti-Youtube/source/secrets.yaml"
with open(secret_path, 'r') as stream:
    # Let yaml.YAMLError propagate: swallowing it would leave `secrets`
    # undefined and the cell would fail below with an unrelated NameError.
    secrets = yaml.safe_load(stream)

# An empty file parses to None, so treat that like a file with no keys and
# report every missing credential at once instead of failing on the first one.
required_keys = ('CLIENT_ID', 'CLIENT_SECRET', 'REDIRECT_URI', 'SCOPE')
missing_keys = [key for key in required_keys if key not in (secrets or {})]
if missing_keys:
    raise KeyError(f"{secret_path} is missing required keys: {missing_keys}")

# Build the OAuth manager from the credentials stored in the secrets file
sp_auth = SpotifyOAuth(
    client_id=secrets['CLIENT_ID'],
    client_secret=secrets['CLIENT_SECRET'],
    redirect_uri=secrets['REDIRECT_URI'],
    scope=secrets['SCOPE'],
)

In [55]:
# Create the Spotify client, handing it the OAuth manager as its auth_manager
sp = spotipy.Spotify(auth_manager=sp_auth)

## Load the charts dataset

In [6]:
# Location of the charts CSV.
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
CHARTS_PATH = "/Users/kbora/Documents/Spoti-Youtube/data/charts.csv"

# Read the charts CSV into a DataFrame and preview its first rows
charts = pd.read_csv(CHARTS_PATH)
charts.head()

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
0,Chantaje (feat. Maluma),1,2017-01-01,Shakira,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,Argentina,top200,SAME_POSITION,253019.0
1,Vente Pa' Ca (feat. Maluma),2,2017-01-01,Ricky Martin,https://open.spotify.com/track/7DM4BPaS7uofFul...,Argentina,top200,MOVE_UP,223988.0
2,Reggaetón Lento (Bailemos),3,2017-01-01,CNCO,https://open.spotify.com/track/3AEZUABDXNtecAO...,Argentina,top200,MOVE_DOWN,210943.0
3,Safari,4,2017-01-01,"J Balvin, Pharrell Williams, BIA, Sky",https://open.spotify.com/track/6rQSrBHf7HlZjtc...,Argentina,top200,SAME_POSITION,173865.0
4,Shaky Shaky,5,2017-01-01,Daddy Yankee,https://open.spotify.com/track/58IL315gMSTD37D...,Argentina,top200,MOVE_UP,153956.0


In [7]:
# Show the column names of the charts DataFrame
charts.columns

Index(['title', 'rank', 'date', 'artist', 'url', 'region', 'chart', 'trend',
       'streams'],
      dtype='object')

## Build a dataset with spotify API

In [62]:
# Functions to get the audio features of tracks
from typing import List, Dict, Any
import time
from functools import wraps


def sleep_wrapper(t = 0.33):
    """
    Build a decorator that pauses before every call of the decorated function.

    Used to space out consecutive requests to the Spotify API.

    Args:
        t: Number of seconds to sleep before the decorated function runs.

    Returns:
        A decorator. The decorated function sleeps for ``t`` seconds, then
        forwards all positional and keyword arguments to the original
        function and returns its result.
    """
    def throttle(target):
        def delayed(*call_args, **call_kwargs):
            # The pause happens first, so the wrapped call itself is what gets delayed
            time.sleep(t)
            outcome = target(*call_args, **call_kwargs)
            return outcome
        # Copy the name, docstring and other metadata of the original function
        return wraps(target)(delayed)
    return throttle

@sleep_wrapper(0.5)
def get_audio_feature(sp : spotipy.Spotify, track_ids: List[str]) -> List[Dict[str, Any]]:
    """
    Get the audio features for one batch of tracks.

    Every call is preceded by a 0.5 second pause (see the ``sleep_wrapper``
    decorator) to space out requests to the Spotify API.

    Args:
        sp: Spotify client used to issue the request.
        track_ids: Track IDs sent in a single ``sp.audio_features`` call.

    Returns:
        The result of ``sp.audio_features(track_ids)`` unchanged.
        NOTE(review): presumably a list with one entry per requested ID, where
        an entry can be ``None`` (the notebook filters ``None`` values out
        later) - confirm against the spotipy documentation.
    """
    return sp.audio_features(track_ids)

def get_audio_features(sp : spotipy.Spotify, track_ids: List[str], tracks_per_call : int = 100) -> List[Dict[str, Any]]:
    """
    Get the audio features of a list of tracks, fetching them in batches.

    The lookup is best-effort: if a batch fails, the error is reported and the
    features collected so far are returned instead of raising.

    Args:
        sp: Spotify client used to issue the requests.
        track_ids: Track IDs to look up.
        tracks_per_call: Number of track IDs sent per request. Must be at
            least 1.

    Returns:
        The concatenated results of all successful batches, in request order.
        The list is shorter than ``track_ids`` when a batch failed.

    Raises:
        ValueError: If ``tracks_per_call`` is smaller than 1.
    """
    # A zero step makes range() fail with an unrelated message, and a negative
    # step silently produces no chunks at all, so reject both up front.
    if tracks_per_call < 1:
        raise ValueError(f"tracks_per_call must be a positive integer, got {tracks_per_call}")

    # Split the track ids into chunks of at most `tracks_per_call` ids
    track_ids_chunks = [track_ids[i:i + tracks_per_call] for i in range(0, len(track_ids), tracks_per_call)]

    # Get the audio features of each chunk
    audio_features = []
    for chunk_number, chunk in enumerate(tqdm(track_ids_chunks), start=1):
        try:
            audio_features.extend(get_audio_feature(sp, chunk))
        except Exception as e:
            # Keep what was fetched so far, but say where the run stopped so a
            # partial result is not mistaken for a complete one.
            print(f"Stopped at chunk {chunk_number} of {len(track_ids_chunks)}: {e}")
            break
    return audio_features


In [57]:
# Collect every distinct track id; the id is the last path segment of the chart URL
unique_track_ids = (
    charts["url"]
    .map(lambda url: url.rsplit("/", 1)[-1])
    .unique()
    .tolist()
)

# Smoke test: request the audio features of the first track only
try:
    audio_features = get_audio_features(sp, unique_track_ids[:1], tracks_per_call = 100)
except Exception as err:
    print(err)

100%|██████████| 1/1 [00:03<00:00,  3.86s/it]


In [59]:
# Audio-feature fields kept in the dataset, in the order they are written to disk
AUDIO_FEATURES_COLUMNS = [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'key', 'liveness', 'loudness',
    'mode', 'speechiness', 'tempo', 'time_signature',
    'valence', 'id',
]

# Fetch the audio features of every unique track id (long-running)
audio_features = get_audio_features(sp, unique_track_ids, tracks_per_call = 100)

# Discard the entries that are None
audio_features = list(filter(lambda features: features is not None, audio_features))

# Build a dataframe from the remaining entries, restricted to the selected columns
audio_features_df = pd.DataFrame(audio_features)[AUDIO_FEATURES_COLUMNS]

# Write the audio features to disk
audio_features_df.to_csv('audio_features.csv', index = False)

100%|██████████| 1981/1981 [23:02<00:00,  1.43it/s]


# Preprocess the charts

Some track IDs in the charts have no audio features. Remove every chart row whose track has no matching audio features.

In [61]:
# IDs of the tracks that have audio features
audio_features_track_ids = audio_features_df["id"].tolist()

# Track id of every chart row (last path segment of its URL)
chart_track_ids = charts["url"].map(lambda url: url.rsplit("/", 1)[-1])

# Keep only the chart rows whose track has audio features
has_audio_features = chart_track_ids.isin(audio_features_track_ids)
charts = charts[has_audio_features]

# Sanity check: the remaining chart tracks and the audio-feature tracks form the same set
assert set(chart_track_ids[has_audio_features].tolist()) == set(audio_features_track_ids)

# Write the filtered charts to disk
charts.to_csv('charts_processed.csv', index = False)