In [4]:
import pandas as pd
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import time
import os
import ast

from api_handling import api_handling
# from Song_Consolidation import Song_Consolidation
import tensorflow as tf
import pickle

In [5]:
# Build the project's api_handling helper with '../Data/' as its save location.
data = api_handling(save_location='../Data/')
# NOTE(review): csv_pull presumably loads the saved CSVs into data.frames (read later as
# data.frames['full_song_data']); what overwrite=True replaces is not visible here - confirm.
data.csv_pull(overwrite=True)

- The function below creates the time series index (the sample timestamps, in milliseconds) that is used to sample the other data points

In [6]:
def create_TS_map(duration_ts, sample_rate):
    """Build the list of sample timestamps (in ms) for one song.

    Parameters
    ----------
    duration_ts : number
        End of the timeline in milliseconds; used as the (exclusive) stop value.
    sample_rate : number
        Sampling rate in samples per second; converted to a step in milliseconds.

    Returns
    -------
    list
        Timestamps ``step, 2 * step, ...`` below ``duration_ts``. The first
        element is therefore equal to the step size itself.
    """
    # samples per second -> samples per millisecond
    samples_per_ms = sample_rate / 1000
    # distance between two consecutive samples, in ms
    step_ms = 1 / samples_per_ms
    return list(np.arange(step_ms, duration_ts, step_ms))

- See Below: Given a feature column (a list of event dictionaries), the function traverses its events, converts their times to milliseconds, and turns them into arrays of time series samples

In [7]:
def column_TS_map(column_value_map, time_series_map):
    '''
    Map one song's event column onto the sampled time axis.

    To be mapped on every individual song's column on a dataframe
        - such that the input value is a list of dictionaries
          (or the string representation of such a list)

    Parameters
    ----------
    column_value_map : list of dict, or str
        Events with 'start' and 'duration' keys, both in seconds. A string is
        parsed with ast.literal_eval first.
    time_series_map : list
        Ascending sample timestamps in milliseconds whose first element equals
        the step size (the layout produced by create_TS_map).

    Returns
    -------
    list
        [start_output, duration_output], each as long as time_series_map.
        start_output is 1 at the first sample at or after an event start
        (within one step); duration_output is 1 from that sample for as many
        steps as the event duration covers. Everything else stays 0.
        Both vectors are all zeros when there are no events or no samples.

    Raises
    ------
    AssertionError
        If the (parsed) input is not a list, or its first element is not a dict.
    '''
    n_samples = len(time_series_map)
    start_output = list(np.zeros(n_samples))
    duration_output = list(np.zeros(n_samples))

    if isinstance(column_value_map, str):
        column_value_map = ast.literal_eval(column_value_map)
    assert isinstance(column_value_map, list), 'reassess the data type being time-mapped'

    # Nothing to map: an empty event list or a song shorter than one step.
    # Previously both cases crashed with an IndexError on the [0] lookups below.
    if not column_value_map or n_samples == 0:
        return [start_output, duration_output]
    assert isinstance(column_value_map[0], dict), 'events are not stored as dictionaries'

    # Index of the sample where the last event was applied. Events are delivered in
    # sequential order by the spotify api, so each search only needs to look forward
    # from here. It is advanced ONLY on a successful match, so one event that cannot
    # be placed (e.g. it starts after the last sample) no longer hides later events.
    time_axis_last_applied = 0

    # The time map starts at one step, so its first element is the step size.
    # An event belongs to the first sample at or after its start, provided that
    # sample is at most one step later.
    step_size = time_series_map[0]

    for event in column_value_map:
        # every tatum in the list, represented as a dictionary, for example

        # converts into milliseconds
        event_start = event['start'] * 1000
        event_duration = event['duration'] * 1000

        for t in range(time_axis_last_applied, n_samples):
            sample_time = time_series_map[t]

            if sample_time < event_start:
                # this sample is still before the event
                continue
            if sample_time - event_start > step_size:
                # samples are ascending: we are already past the event, nothing later can match
                break

            time_axis_last_applied = t

            # applying the start event explicitly
            start_output[t] = 1

            # applying duration at the following samples, one step at a time,
            # stopping at the end of the time axis
            remaining_duration = float(event_duration)
            temp_index = t
            while remaining_duration > 0 and temp_index < n_samples:
                duration_output[temp_index] = 1
                remaining_duration -= step_size
                temp_index += 1
            break

    return [start_output, duration_output]

- See Below: Given a 'segments' cell which holds a list of segment dictionaries, it converts the segment times to milliseconds and turns the loudness values into an array of time series samples

In [8]:
def construct_loudness_vector(column_value_map, time_series_map):
    '''
    Turn a song's 'segments' data into a sampled, step-wise loudness curve.

    Each segment contributes three breakpoints: loudness_start at the segment
    start, loudness_max at the segment peak and loudness_end at the segment end.
    A breakpoint is written to the first sample at or after its time (within one
    step); samples between two breakpoints repeat the most recent value.

    Parameters
    ----------
    column_value_map : list of dict, or str
        Segments with 'start', 'duration', 'loudness_start', 'loudness_max',
        'loudness_max_time' and 'loudness_end' keys; times are in seconds.
        A string is parsed with ast.literal_eval first.
    time_series_map : list
        Ascending sample timestamps in milliseconds whose first element equals
        the step size (the layout produced by create_TS_map).

    Returns
    -------
    list
        One loudness value per sample. Samples before the first placed
        breakpoint and after the last one stay 0. All zeros when there are no
        segments; empty when there are no samples.

    Raises
    ------
    AssertionError
        If the (parsed) input is not a list of dictionaries.
    '''
    n_samples = len(time_series_map)
    output = list(np.zeros(n_samples))

    if isinstance(column_value_map, str):
        column_value_map = ast.literal_eval(column_value_map)
    assert isinstance(column_value_map, list), 'reassess the data type being time-mapped'

    # Nothing to map: no segments, or a song shorter than one step.
    # Previously both cases crashed with an IndexError on the [0] lookups below.
    if not column_value_map or n_samples == 0:
        return output
    assert isinstance(column_value_map[0], dict), 'events are not stored as dictionaries'

    # Search position on the time axis. Breakpoints arrive in sequential order from
    # the spotify api, so each search only looks forward from the last position.
    time_axis_last_applied = 0

    # The time map starts at one step, so its first element is the step size.
    step_size = time_series_map[0]

    # Most recent loudness value that was placed; None until the first one is found.
    # It is repeated on every sample until the next loudness value is found.
    lagging_value = None

    for event in column_value_map:
        assert isinstance(event, dict)
        # every song segment

        # converts into milliseconds
        event_start = event['start'] * 1000.0
        event_end = event_start + event['duration'] * 1000.0
        # loudness_max_time is documented by Spotify as an offset relative to the segment
        # start, so it has to be shifted by event_start to land on the song's time axis.
        # NOTE(review): assumes the stored segments are unmodified API output - confirm.
        event_max_time = event_start + event['loudness_max_time'] * 1000.0

        breakpoints = (
            (event_start, float(event['loudness_start'])),
            (event_max_time, float(event['loudness_max'])),
            (event_end, float(event['loudness_end'])),
        )

        for loudness_time, loudness_value in breakpoints:
            for t in range(time_axis_last_applied, n_samples):
                sample_time = time_series_map[t]

                if sample_time < loudness_time:
                    # breakpoint not reached yet: keep holding the previous value
                    if lagging_value is not None:
                        output[t] = lagging_value
                    time_axis_last_applied = t
                    continue

                if sample_time - loudness_time <= step_size:
                    # first sample at or after the breakpoint: apply it explicitly
                    output[t] = loudness_value
                    lagging_value = loudness_value
                    time_axis_last_applied = t
                # Otherwise the breakpoint lies before this sample's window and cannot be
                # placed; skip it without consuming the rest of the time axis.
                break

    return output

- See Below: Helper Methods to traverse the rows, or songs, in the dataframe

In [9]:
def prepare_song_ts_data_with_duration(row, columns, sample_rate):
    '''
    For one song row, build the start AND duration vectors of every requested column.

    The time axis is derived from row['duration_ms_x'] at the given sample_rate
    (samples per second). Each column contributes two consecutive entries to the
    result: its start vector followed by its duration vector.
    '''
    timeline = create_TS_map(row['duration_ms_x'], sample_rate=sample_rate)
    vectors = []
    for name in columns:
        # column_TS_map returns [start_vector, duration_vector]; keep both, flattened
        vectors.extend(column_TS_map(row[name], time_series_map=timeline))
    return vectors

In [10]:
def prepare_song_ts_data_no_duration(row, columns, sample_rate):
    '''
    For one song row, build a single time series vector per requested column.

    The time axis is derived from row['duration_ms_x'] at the given sample_rate
    (samples per second). The 'segments' column becomes a loudness vector; every
    other column contributes only its start vector (the duration vector is dropped).
    '''
    timeline = create_TS_map(row['duration_ms_x'], sample_rate=sample_rate)
    return [
        construct_loudness_vector(row[name], time_series_map=timeline)
        if name == 'segments'
        else column_TS_map(row[name], time_series_map=timeline)[0]
        for name in columns
    ]

In [11]:
def prepare_song_single_data(row,columns):
    '''Return the row's values for the given columns, in the order the columns are listed.'''
    return [row[name] for name in columns]

In [12]:
def load_training_data(df, single_unit_columns, time_series_columns_with_duration, time_series_columns_no_duration, ts_sample_rate, split, verbose = False):
    '''
    Pulls all data as categorized, one entry per row (song) of df.

    The reason for the no_duration category is that, for data such as sections, the
    beginning of a new section implies that the previous one has concluded, so a
    single start vector carries all the useful information.

    Parameters
    ----------
    df : DataFrame
        One row per song; must contain every requested column. The helpers called
        here also read the 'duration_ms_x' column.
    single_unit_columns : list
        Columns whose raw cell value is copied as-is.
    time_series_columns_with_duration : list
        Event columns that yield a start vector and a duration vector each.
    time_series_columns_no_duration : list
        Event columns that yield a single vector each.
    ts_sample_rate : number
        Samples per second used to build each song's time axis.
    split : bool
        True  -> return the single values and the time series separately
                 (a progress line is printed while running).
        False -> return one combined list per song.
    verbose : bool
        When True, the requested column names are appended to the return value.

    Returns
    -------
    split=True  : (single_data, ts_data) or (single_data, ts_data, column_names)
    split=False : output or (output, column_names)

    Raises
    ------
    AssertionError
        If a requested column is missing from df.
    '''
    column_names = single_unit_columns + time_series_columns_with_duration + time_series_columns_no_duration
    for col in column_names:
        assert col in df.columns, 'send valid columns'

    total_rows = df.shape[0]
    single_data = []
    ts_data = []
    for row_index in range(total_rows):
        row = df.iloc[row_index]
        single_data.append(prepare_song_single_data(row, single_unit_columns))
        ts_data.append(
            prepare_song_ts_data_with_duration(row=row, columns=time_series_columns_with_duration, sample_rate=ts_sample_rate)
            + prepare_song_ts_data_no_duration(row=row, columns=time_series_columns_no_duration, sample_rate=ts_sample_rate)
        )
        if split:
            # Fixed-width percentage: with '\r' a shorter line would otherwise leave stale
            # characters from the previous one behind. Reported after the row is done so
            # the last update reads 100%.
            print(f'\r Progress: {(row_index + 1) * 100 / total_rows:6.2f}%', end='')

    if split:
        if total_rows:
            # finish the progress line so later output starts on a fresh line
            print()
        if verbose:
            return single_data, ts_data, column_names
        return single_data, ts_data

    # combined layout: single values first, then the time series vectors
    output = [single + series for single, series in zip(single_data, ts_data)]
    if verbose:
        return output, column_names
    return output

In [13]:
# Build the training set from the 'full_song_data' frame.
# With split=True and the default verbose=False, load_training_data returns a
# (single_data, ts_data) tuple, so training_data is that tuple.
# ts_sample_rate is in samples per second: 10 means one sample every 100 ms.
training_data = load_training_data(
    data.frames['full_song_data'],
    single_unit_columns=['popularity','tempo','valence'],
    time_series_columns_with_duration=['beats','tatums','bars'],
    time_series_columns_no_duration=['segments','sections'],
    ts_sample_rate=10,
    split=True
    )

 Progress: 99.98551774076756%%%

In [22]:
# Persist the prepared training data to 'training_2.pickle' (path relative to the working directory).
# NOTE: only unpickle this file from a trusted source - pickle.load can execute arbitrary code.
with open('training_2.pickle', 'wb') as f:
    pickle.dump(training_data, f)