---
title: "Fixing inconsistencies in track data for predicting decade of release of songs"
subtitle: "DSAN 5300 Final Project"
authors: ["Jorge Bris Moreno", "William McGloin", "Kangheng Liu", "Isfar Baset"]
date: last-modified
date-format: long
format:
  html:
    self-contained: true
    toc: true
    code-overflow: wrap
    code-fold: true
---

**Note:** We will print every step of the code to ensure the quality of this synthetic data generation process.

In [1]:
# Import libraries
import pandas as pd
import numpy as np

In [2]:
# read data from clean_data folder tracks.csv
tracks = pd.read_csv('../data/clean_data/tracks.csv')

tracks.head()

Unnamed: 0,artist_name,artist_id,album_id,album_release_year,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_id,time_signature,duration_ms,track_name,album_name,key_mode,decade
0,$uicideboy$,1VPmR4DJC1PlOtd0IADAO0,7mxSvZIgElLmVTdUfVNQFz,2022,0.787,0.889,-3.125,0.128,0.00951,0.000322,0.652,0.677,156.027,49YpGS0rVcRLtiDvx5JQyp,4,172399,Sorry for the Delay,DIRTIESTNASTIEST$UICIDE,D major,2020s
1,$uicideboy$,1VPmR4DJC1PlOtd0IADAO0,7mxSvZIgElLmVTdUfVNQFz,2022,0.759,0.833,-5.01,0.0779,0.00026,0.0573,0.178,0.522,140.026,5dol1hrERJOReznLRJ2VVQ,4,183919,BUCKHEAD,DIRTIESTNASTIEST$UICIDE,B major,2020s
2,$uicideboy$,1VPmR4DJC1PlOtd0IADAO0,7mxSvZIgElLmVTdUfVNQFz,2022,0.84,0.934,-3.717,0.119,0.0484,0.0,0.0961,0.67,149.994,3QQXpvZd9qmzHZ02wDf2im,4,145842,I Dream of Chrome,DIRTIESTNASTIEST$UICIDE,C major,2020s
3,$uicideboy$,1VPmR4DJC1PlOtd0IADAO0,7mxSvZIgElLmVTdUfVNQFz,2022,0.894,0.767,-4.695,0.137,0.0231,2.4e-05,0.574,0.412,144.077,1UsvO5U72YRU8Xnq8Lp14O,4,140288,Champagne Face,DIRTIESTNASTIEST$UICIDE,A# minor,2020s
4,$uicideboy$,1VPmR4DJC1PlOtd0IADAO0,7mxSvZIgElLmVTdUfVNQFz,2022,0.78,0.78,-2.857,0.0858,0.00147,0.0,0.472,0.446,118.014,2CkpD7gqMXrrpTCJ9TZ0bw,4,177289,The Serpent and the Rainbow,DIRTIESTNASTIEST$UICIDE,C major,2020s


In [3]:
# drop columns that will not be used in our models for obvious reasons 
tracks = tracks.drop(['artist_name', 'artist_id', 'album_id', 'album_release_year', 'track_id', 'track_name', 'album_name'], axis=1)

tracks.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_ms,key_mode,decade
0,0.787,0.889,-3.125,0.128,0.00951,0.000322,0.652,0.677,156.027,4,172399,D major,2020s
1,0.759,0.833,-5.01,0.0779,0.00026,0.0573,0.178,0.522,140.026,4,183919,B major,2020s
2,0.84,0.934,-3.717,0.119,0.0484,0.0,0.0961,0.67,149.994,4,145842,C major,2020s
3,0.894,0.767,-4.695,0.137,0.0231,2.4e-05,0.574,0.412,144.077,4,140288,A# minor,2020s
4,0.78,0.78,-2.857,0.0858,0.00147,0.0,0.472,0.446,118.014,4,177289,C major,2020s


In [4]:
# split 'key_mode' column
tracks['key'] = tracks['key_mode'].apply(lambda x: x.split(' ')[0])
tracks['mode'] = tracks['key_mode'].apply(lambda x: x.split(' ')[1])

# drop key_mode
tracks.drop('key_mode', axis=1, inplace=True)
# covert mode into binary
tracks['mode'] = tracks['mode'].apply(lambda x: 1 if x == 'major' else 0)

# convert key into numerical
key_dict = {'C': 0, 'C#': 1, 'D': 2, 'D#': 3, 'E': 4, 'F': 5, 'F#': 6, 'G': 7, 'G#': 8, 'A': 9, 'A#': 10, 'B': 11}
tracks['key'] = tracks['key'].apply(lambda x: key_dict[x])

# what does the data look like now?
tracks.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_ms,decade,key,mode
0,0.787,0.889,-3.125,0.128,0.00951,0.000322,0.652,0.677,156.027,4,172399,2020s,2,1
1,0.759,0.833,-5.01,0.0779,0.00026,0.0573,0.178,0.522,140.026,4,183919,2020s,11,1
2,0.84,0.934,-3.717,0.119,0.0484,0.0,0.0961,0.67,149.994,4,145842,2020s,0,1
3,0.894,0.767,-4.695,0.137,0.0231,2.4e-05,0.574,0.412,144.077,4,140288,2020s,10,0
4,0.78,0.78,-2.857,0.0858,0.00147,0.0,0.472,0.446,118.014,4,177289,2020s,0,1


In [5]:
# Delete row with decade column == 1940s
tracks = tracks[tracks['decade'] != '1940s']

In [6]:
tracks.value_counts('decade')

decade
2010s    18863
2020s    14510
2000s     7590
1990s     4637
1960s     3738
1970s     3241
1980s     2519
1950s     1159
Name: count, dtype: int64

In [8]:
from sklearn.utils import resample

# Assuming 'decade' is your target and all other columns are features
X = tracks.drop('decade', axis=1)
y = tracks['decade']

# Combine the features and target into one DataFrame
tracks_df = pd.concat([X, y], axis=1)

# Get the number of samples in the smallest class
min_class_size = y.value_counts().min()

# Resample the classes
resampled_df_list = []
for decade_value, group in tracks_df.groupby('decade'):
    resampled_group = resample(group,
                               replace=True,
                               n_samples=min_class_size,
                               random_state=137)
    resampled_df_list.append(resampled_group)

# Combine the resampled dataframes
balanced_tracks = pd.concat(resampled_df_list)

# Print the new class distribution
print("New class distribution after resampling:")
print(balanced_tracks['decade'].value_counts())

# Save the balanced dataset to a CSV file
balanced_tracks.to_csv('../data/clean_data/balanced_tracks_no_smote.csv', index=False)

print("Balanced dataset created and saved successfully without using SMOTE.")


New class distribution after resampling:
decade
1950s    1159
1960s    1159
1970s    1159
1980s    1159
1990s    1159
2000s    1159
2010s    1159
2020s    1159
Name: count, dtype: int64
Balanced dataset created and saved successfully without using SMOTE.


In [7]:
from sklearn.utils import resample

# Assuming 'decade' is your target and all other columns are features
X = tracks.drop('decade', axis=1)
y = tracks['decade']

# Combining the features and target into one DataFrame for convenience
tracks_df = pd.concat([X, y], axis=1)

# Get the number of samples in the smallest class to match other classes to this size
min_class_size = tracks_df['decade'].value_counts().min()

# Resample each class to have the same number of samples as the smallest class
resampled_dfs = [resampled_class_df for _, resampled_class_df in tracks_df.groupby('decade').apply(lambda x: resample(x, replace=True, n_samples=min_class_size, random_state=44))]

# Combine the resampled dataframes back into one balanced dataframe
balanced_tracks = pd.concat(resampled_dfs)

# Print the new class distribution
print("New class distribution after resampling:")
print(balanced_tracks['decade'].value_counts())

# Save the balanced dataset to a CSV file
balanced_tracks.to_csv('../data/clean_data/balanced_tracks_no_smote.csv', index=False)

print("Balanced dataset created and saved successfully without using SMOTE.")


  resampled_dfs = [resampled_class_df for _, resampled_class_df in tracks_df.groupby('decade').apply(lambda x: resample(x, replace=True, n_samples=min_class_size, random_state=44))]


ValueError: too many values to unpack (expected 2)

In [17]:
from imblearn.over_sampling import SMOTE

# Assuming 'decade' is your target and all other columns are features
X = tracks.drop('decade', axis=1)
y = tracks['decade']

# Print the initial class distribution
print("Initial class distribution:")
print(y.value_counts())

# Apply SMOTE
smote = SMOTE(random_state=44, k_neighbors=5)  # using 5 as a safe small number
X_res, y_res = smote.fit_resample(X, y)

# Print the new class distribution
print("New class distribution after SMOTE:")
print(pd.Series(y_res).value_counts())

# Combine the resampled features and target into a new DataFrame
balanced_tracks = pd.DataFrame(X_res, columns=X.columns)
balanced_tracks['decade'] = y_res

# Save the balanced dataset to a CSV file
balanced_tracks.to_csv('../data/clean_data/balanced_tracks.csv', index=False)

print("Balanced dataset created and saved successfully.")

Initial class distribution:
decade
2010s    18863
2020s    14510
2000s     7590
1990s     4637
1960s     3738
1970s     3241
1980s     2519
1950s     1159
Name: count, dtype: int64
New class distribution after SMOTE:
decade
2020s    18863
2010s    18863
2000s    18863
1990s    18863
1980s    18863
1970s    18863
1960s    18863
1950s    18863
Name: count, dtype: int64
Balanced dataset created and saved successfully.
