# Dataset restructuring, missing value imputation
In this notebook the updated Spotify dataset (containing artist followers) is transformed into a dataset containing all the **unique songs** with their respective **audio features, sum of artist followers and points for the first 14 days** since the song's debut in the charts. For songs whose first 14 days include dates that are missing from the dataset, missing value imputation is performed.

In [1]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm
import os

In [32]:
# Load the Spotify Top 200 chart data (already enriched with artist followers)
# from the project's Data directory, one level above this notebook
path_to_dataset = os.path.join(
    "..",
    "Data",
    "spotify_updated_2.csv",
)
spotify_dataframe = pd.read_csv(
    path_to_dataset,
    sep=",",
)

In [38]:
# One row per unique song: keep only the first chart entry seen for every
# distinct (title, artists) combination
song_identity_columns = ["Title", "Artists"]
unique_songs = spotify_dataframe.drop_duplicates(subset=song_identity_columns)

In [34]:
# Dates for which the dataset holds no chart data at all, as numpy datetime64
# values (order matters: the names below are unpacked positionally)
missing_dates = pd.to_datetime(
    [
        "2017-02-23",
        "2017-06-02",
        "2017-05-30",
        "2017-05-31",
    ]
).values

# Give every missing date its own readable name
feb_23, june_2, may_30, may_31 = missing_dates


In [35]:
# Function for imputing the points for songs on dates that are missing
def impute_points(current_date, song_dataframe):
    """Estimate a song's points on a date that is missing from the dataset.

    The estimate is the rounded average of the song's points on the nearest
    dates before and after ``current_date`` that do exist in the dataset. A
    neighbouring date on which the song has no chart entry contributes 0.

    Args:
        current_date: One of the four missing dates (2017-02-23, 2017-05-30,
            2017-05-31, 2017-06-02). Anything ``pd.Timestamp`` accepts.
        song_dataframe: Chart entries of a single song, with a datetime
            "Date" column and a "Points (Total)" column.

    Returns:
        The rounded mean of the points on the two neighbouring dates.

    Raises:
        ValueError: If ``current_date`` is not one of the known missing dates.
    """
    # (days back, days forward) to the closest dates that are present in the
    # dataset. 30 and 31 May are adjacent gaps, so each has to skip the other.
    neighbour_offsets = {
        pd.Timestamp("2017-02-23"): (1, 1),
        pd.Timestamp("2017-06-02"): (1, 1),
        pd.Timestamp("2017-05-30"): (1, 2),
        pd.Timestamp("2017-05-31"): (2, 1),
    }

    # Normalise numpy datetime64 / datetime inputs so the table lookup works
    current_date = pd.Timestamp(current_date)
    if current_date not in neighbour_offsets:
        raise ValueError(f"{current_date} is not one of the known missing dates")
    days_back, days_forward = neighbour_offsets[current_date]

    def points_on(target_date):
        # Compare against the column VALUES: `target_date in series` would
        # test the index labels instead and never find a date.
        on_target = song_dataframe["Date"] == target_date
        if not on_target.any():
            return 0
        return song_dataframe.loc[on_target, "Points (Total)"].values[0]

    points_on_previous_day = points_on(current_date - datetime.timedelta(days=days_back))
    points_on_next_day = points_on(current_date + datetime.timedelta(days=days_forward))

    # Calculate the points on the missing date by taking the average of the previous and next day points
    return round((points_on_previous_day + points_on_next_day) / 2)


In [39]:
row_list = []
row_index = 0

# Audio-feature columns copied unchanged into every output row
audio_feature_columns = ["Danceability", "Energy", "Loudness", "Speechiness",
                         "Acousticness", "Instrumentalness", "Valence"]

# Iterate over unique songs (passing the total lets tqdm show overall progress)
for _, unique_song in tqdm(unique_songs.iterrows(), total=len(unique_songs)):

    title = unique_song["Title"]
    artists = unique_song["Artists"]

    # Extract all chart entries with the specified song title and artists
    is_this_song = (spotify_dataframe["Title"] == title) & (spotify_dataframe["Artists"] == artists)
    song_dataframe = spotify_dataframe[is_this_song]

    # Obtain the artists of the song and their respective followers
    # (one row per distinct "# of Artist" value)
    unique_artists = song_dataframe.drop_duplicates("# of Artist")[["# of Artist", "Followers"]]

    # Calculate the total followers all the artists of a song have
    total_followers = np.sum(unique_artists.iloc[:, 1].astype(int))

    # Obtain one entry per unique date. Work on an explicit copy so that the
    # "Date" assignment below does not write into a slice of spotify_dataframe
    song_dataframe = song_dataframe.drop_duplicates("Date").copy()

    # Convert the date to 'datetime' data type and sort the entries in ascending order
    song_dataframe["Date"] = pd.to_datetime(song_dataframe["Date"], format="%d/%m/%Y")
    song_dataframe = song_dataframe.sort_values("Date")

    # Obtain the date when the song first appeared in the charts
    first_date = song_dataframe["Date"].iloc[0]

    # Obtain the 'datetime' objects for the first 14 days, starting with the song's debut
    next_14_dates = [first_date + datetime.timedelta(days=day) for day in range(14)]

    points_14_days = []

    # Iterate over the 14 days since the song's debut
    for date in next_14_dates:

        # If the 14-day window includes one of the missing dates, impute the points
        if date == may_30 or date == may_31 or date == feb_23 or date == june_2:
            points_on_date = impute_points(date, song_dataframe)

        # If the song was on the chart on the given date, take the points it received
        elif (song_dataframe["Date"] == date).any():
            points_on_date = song_dataframe[song_dataframe["Date"] == date]["Points (Total)"].values[0]

        # If the song wasn't on the chart on the given date, set points to 0
        else:
            points_on_date = 0

        points_14_days.append(points_on_date)

    # Construct the output row for the song: identity, follower total and audio
    # features (taken from the debut-day entry), debut date, then the 14 daily points
    output_row = (
        [song_dataframe["Title"].iloc[0], song_dataframe["Artists"].iloc[0], total_followers]
        + [song_dataframe[feature].iloc[0] for feature in audio_feature_columns]
        + [first_date]
        + points_14_days
    )

    # Add the output row to the list of song rows
    row_list.append(output_row)
    row_index += 1
    


7801it [09:37, 13.50it/s]


In [40]:
# Column names of the restructured dataset: the song description columns
# followed by one points column for each of the first 14 days
columns = [
    'Title', 'Artists', 'Total Followers', 'Danceability', 'Energy',
    'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
    'Valence', 'First Date',
]
columns.extend(f"Points_{day}" for day in range(1, 15))

# Assemble the collected per-song rows into the new dataframe
song_rank_df = pd.DataFrame(row_list, columns=columns)

In [41]:
# Clean up the loudness values (intended range: [-60;0])

# Inspection of the dataset showed that values less than or equal to -1000
# are the actual loudness scaled by a factor of 1000, so those values are
# divided by 1000. Values between -1000 and -60 are left as they are.
mis_scaled = song_rank_df["Loudness"] <= -1000
song_rank_df.loc[mis_scaled, "Loudness"] = song_rank_df.loc[mis_scaled, "Loudness"] / 1000

# Keep only the songs whose loudness is not greater than 0
non_positive_loudness = song_rank_df["Loudness"] <= 0
song_rank_df = song_rank_df[non_positive_loudness]



In [42]:
# Report how many songs remain after the loudness clean-up
song_count = song_rank_df.shape[0]
print(f"The dataset contains {song_count} unique songs before normalisation")

The dataset contains 7798 unique songs before normalisation


In [44]:
# Write the restructured dataset to the Data directory, without the row index
song_dataset_path = os.path.join("..", "Data", "song_dataset.csv")
song_rank_df.to_csv(song_dataset_path, index=False)

In [5]:
def normalise_testing_set(dataframe, means, stds):
    """Z-score normalise the numerical features with precomputed statistics.

    Args:
        dataframe: Dataframe containing the numerical feature columns listed
            below; any other columns are passed through unchanged.
        means: Mapping from feature name to the mean to subtract.
        stds: Mapping from feature name to the standard deviation to divide by.

    Returns:
        A normalised copy of ``dataframe``; the input is not modified.

    Raises:
        KeyError: If a numerical feature is missing from ``dataframe``,
            ``means`` or ``stds``.
    """
    dataframe = dataframe.copy()
    numerical_features = ['Total Followers', 'Danceability', 'Energy',
                          'Loudness', 'Speechiness', 'Acousticness',
                          'Instrumentalness', 'Valence']

    # Normalising each feature using Z-score normalisation
    for column in numerical_features:
        # Plain column assignment replaces the column, so an integer-valued
        # feature can become float; `.loc[:, column] = ...` would instead try
        # to write the floats into the existing integer column.
        dataframe[column] = (dataframe[column] - means[column]) / stds[column]

    return dataframe

In [4]:
def normalise_training_set(dataframe):
    """Z-score normalise the numerical features using their own statistics.

    Args:
        dataframe: Dataframe containing the numerical feature columns listed
            below; any other columns are passed through unchanged.

    Returns:
        A tuple ``(normalised, means, stds)``: a normalised copy of
        ``dataframe`` (the input is not modified) and two dicts mapping each
        feature name to the mean and the standard deviation (pandas default,
        i.e. sample standard deviation) that were used.

    Raises:
        KeyError: If a numerical feature is missing from ``dataframe``.
    """
    dataframe = dataframe.copy()
    numerical_features = ['Total Followers', 'Danceability', 'Energy',
                          'Loudness', 'Speechiness', 'Acousticness',
                          'Instrumentalness', 'Valence']

    # Normalising each feature using Z-score normalisation
    means = {}
    stds = {}
    for column in numerical_features:
        # Calculate the mean and the standard deviation, and remember them so
        # the same transformation can be applied to other splits
        mean = dataframe[column].mean()
        std = dataframe[column].std()
        means[column] = mean
        stds[column] = std

        # Plain column assignment replaces the column, so an integer-valued
        # feature can become float; `.loc[:, column] = ...` would instead try
        # to write the floats into the existing integer column.
        dataframe[column] = (dataframe[column] - mean) / std

    return dataframe, means, stds

In [None]:
# Shuffle the dataset rows (fixed seed so the split is reproducible)
random_state = 10
shuffled_dataset = song_rank_df.sample(frac=1, random_state=random_state)

# Create training (90%) and test (10%) splits. A single cut point yields
# exactly two pieces, so only two names are bound.
split_index = int(len(shuffled_dataset) * 0.9)
train = shuffled_dataset.iloc[:split_index]
test = shuffled_dataset.iloc[split_index:]

train.to_csv(os.path.join("..", "Data", "training.csv"), index=False)
test.to_csv(os.path.join("..", "Data", "testing.csv"), index=False)

# Normalise the training split, then apply the training statistics to the
# test split so both are scaled identically
normalised_training, means, stds = normalise_training_set(train)
normalised_testing = normalise_testing_set(test, means, stds)

normalised_training.to_csv(os.path.join("..", "Data", "training_normalised.csv"), index=False)
normalised_testing.to_csv(os.path.join("..", "Data", "testing_normalised.csv"), index=False)