---
title: "Fixing inconsistencies in track data for predicting decade of release of songs"
subtitle: "DSAN 5300 Final Project"
authors: ["Jorge Bris Moreno", "William McGloin", "Kangheng Liu", "Isfar Baset"]
date: last-modified
date-format: long
format:
  html:
    self-contained: true
    toc: true
    code-overflow: wrap
    code-fold: true
---

**Note:** We will print every step of the code to ensure the quality of this synthetic data generation process.

In [2]:
# Import libraries
import pandas as pd
import numpy as np

In [3]:
# read data from clean_data folder tracks.csv
tracks = pd.read_csv('../data/clean_data/tracks.csv')

tracks.head()

Unnamed: 0,artist_name,artist_id,album_id,album_release_year,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_id,time_signature,duration_ms,track_name,album_name,key_mode,decade
0,$uicideboy$,1VPmR4DJC1PlOtd0IADAO0,7mxSvZIgElLmVTdUfVNQFz,2022,0.787,0.889,-3.125,0.128,0.00951,0.000322,0.652,0.677,156.027,49YpGS0rVcRLtiDvx5JQyp,4,172399,Sorry for the Delay,DIRTIESTNASTIEST$UICIDE,D major,2020s
1,$uicideboy$,1VPmR4DJC1PlOtd0IADAO0,7mxSvZIgElLmVTdUfVNQFz,2022,0.759,0.833,-5.01,0.0779,0.00026,0.0573,0.178,0.522,140.026,5dol1hrERJOReznLRJ2VVQ,4,183919,BUCKHEAD,DIRTIESTNASTIEST$UICIDE,B major,2020s
2,$uicideboy$,1VPmR4DJC1PlOtd0IADAO0,7mxSvZIgElLmVTdUfVNQFz,2022,0.84,0.934,-3.717,0.119,0.0484,0.0,0.0961,0.67,149.994,3QQXpvZd9qmzHZ02wDf2im,4,145842,I Dream of Chrome,DIRTIESTNASTIEST$UICIDE,C major,2020s
3,$uicideboy$,1VPmR4DJC1PlOtd0IADAO0,7mxSvZIgElLmVTdUfVNQFz,2022,0.894,0.767,-4.695,0.137,0.0231,2.4e-05,0.574,0.412,144.077,1UsvO5U72YRU8Xnq8Lp14O,4,140288,Champagne Face,DIRTIESTNASTIEST$UICIDE,A# minor,2020s
4,$uicideboy$,1VPmR4DJC1PlOtd0IADAO0,7mxSvZIgElLmVTdUfVNQFz,2022,0.78,0.78,-2.857,0.0858,0.00147,0.0,0.472,0.446,118.014,2CkpD7gqMXrrpTCJ9TZ0bw,4,177289,The Serpent and the Rainbow,DIRTIESTNASTIEST$UICIDE,C major,2020s


In [4]:
# read in rock genre data
rock = pd.read_csv('../data/raw_data/genre_data/rock.csv')

rock.head()

Unnamed: 0,id,name,popularity,followers.total,genre
0,2FXC3k01G6Gw61bmprjgqS,Hozier,86,6777892,rock
1,6XyY86QOPPrYVGvF9ch6wz,Linkin Park,86,25042065,rock
2,3WrFJ7ztbogyGnTHbHJFl2,The Beatles,85,27079356,rock
3,53XhwfbYqKCa1cC15pYq2q,Imagine Dragons,87,51925637,rock
4,7Ln80lUS6He07XvHI8qqHH,Arctic Monkeys,86,24122317,rock


In [7]:
# extrack name column to list
rock_artists = rock['id'].tolist()

# only keep rows in tracks that have artists in rock_artists
rock_tracks = tracks[tracks['artist_id'].isin(rock_artists)]

# print difference in rows
print('Number of rows in tracks:', len(tracks), '\nNumber of rows in rock_tracks:', len(rock_tracks))

Number of rows in tracks: 56258 
Number of rows in rock_tracks: 5943


In [8]:
# drop columns that will not be used in our models for obvious reasons 
rock_tracks = rock_tracks.drop(['artist_name', 'artist_id', 'album_id', 'album_release_year', 'track_id', 'track_name', 'album_name'], axis=1)

rock_tracks.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_ms,key_mode,decade
943,0.455,0.79,-4.049,0.0432,4e-06,0.00021,0.0866,0.22,125.964,4,217466,G major,2020s
944,0.526,0.871,-4.132,0.032,1.7e-05,0.000142,0.0573,0.745,109.82,4,246466,G major,2020s
945,0.599,0.787,-4.708,0.0338,2.1e-05,5.3e-05,0.064,0.822,114.065,4,185133,D major,2020s
946,0.57,0.753,-5.98,0.0365,2e-06,0.627,0.133,0.477,119.99,4,212226,A major,2020s
947,0.675,0.93,-4.515,0.0319,8.3e-05,0.000454,0.315,0.778,110.013,4,190333,C major,2020s


In [9]:
# split 'key_mode' column
rock_tracks['key'] = rock_tracks['key_mode'].apply(lambda x: x.split(' ')[0])
rock_tracks['mode'] = rock_tracks['key_mode'].apply(lambda x: x.split(' ')[1])

# drop key_mode
rock_tracks.drop('key_mode', axis=1, inplace=True)
# covert mode into binary
rock_tracks['mode'] = rock_tracks['mode'].apply(lambda x: 1 if x == 'major' else 0)

# convert key into numerical
key_dict = {'C': 0, 'C#': 1, 'D': 2, 'D#': 3, 'E': 4, 'F': 5, 'F#': 6, 'G': 7, 'G#': 8, 'A': 9, 'A#': 10, 'B': 11}
rock_tracks['key'] = rock_tracks['key'].apply(lambda x: key_dict[x])

# what does the data look like now?
rock_tracks.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_ms,decade,key,mode
943,0.455,0.79,-4.049,0.0432,4e-06,0.00021,0.0866,0.22,125.964,4,217466,2020s,7,1
944,0.526,0.871,-4.132,0.032,1.7e-05,0.000142,0.0573,0.745,109.82,4,246466,2020s,7,1
945,0.599,0.787,-4.708,0.0338,2.1e-05,5.3e-05,0.064,0.822,114.065,4,185133,2020s,2,1
946,0.57,0.753,-5.98,0.0365,2e-06,0.627,0.133,0.477,119.99,4,212226,2020s,9,1
947,0.675,0.93,-4.515,0.0319,8.3e-05,0.000454,0.315,0.778,110.013,4,190333,2020s,0,1


In [10]:
# Delete row with decade column == 1940s
rock_tracks = rock_tracks[rock_tracks['decade'] != '1940s']

In [11]:
rock_tracks.value_counts('decade')

decade
2010s    1814
2000s    1194
2020s     813
1990s     797
1980s     543
1970s     527
1960s     255
Name: count, dtype: int64

In [13]:
from sklearn.utils import resample

# Assuming 'decade' is your target and all other columns are features
X = rock_tracks.drop('decade', axis=1)
y = rock_tracks['decade']

# Combine the features and target into one DataFrame
tracks_df = pd.concat([X, y], axis=1)

# Get the mean number of samples
min_class_size = int(y.value_counts().mean())

# Resample the classes
resampled_df_list = []
for decade_value, group in tracks_df.groupby('decade'):
    resampled_group = resample(group,
                               replace=True,
                               n_samples=min_class_size,
                               random_state=137)
    resampled_df_list.append(resampled_group)

# Combine the resampled dataframes
balanced_tracks = pd.concat(resampled_df_list)

# Print the new class distribution
print("New class distribution after resampling:")
print(balanced_tracks['decade'].value_counts())

# Save the balanced dataset to a CSV file
balanced_tracks.to_csv('../data/clean_data/balanced_rock_tracks.csv', index=False)

print("Balanced dataset created and saved successfully without using SMOTE.")


New class distribution after resampling:
decade
1960s    849
1970s    849
1980s    849
1990s    849
2000s    849
2010s    849
2020s    849
Name: count, dtype: int64
Balanced dataset created and saved successfully without using SMOTE.


# ----stop here-----

In [7]:
from sklearn.utils import resample

# Assuming 'decade' is your target and all other columns are features
X = tracks.drop('decade', axis=1)
y = tracks['decade']

# Combining the features and target into one DataFrame for convenience
tracks_df = pd.concat([X, y], axis=1)

# Get the number of samples in the smallest class to match other classes to this size
min_class_size = tracks_df['decade'].value_counts().min()

# Resample each class to have the same number of samples as the smallest class
resampled_dfs = [resampled_class_df for _, resampled_class_df in tracks_df.groupby('decade').apply(lambda x: resample(x, replace=True, n_samples=min_class_size, random_state=44))]

# Combine the resampled dataframes back into one balanced dataframe
balanced_tracks = pd.concat(resampled_dfs)

# Print the new class distribution
print("New class distribution after resampling:")
print(balanced_tracks['decade'].value_counts())

# Save the balanced dataset to a CSV file
balanced_tracks.to_csv('../data/clean_data/balanced_tracks_no_smote.csv', index=False)

print("Balanced dataset created and saved successfully without using SMOTE.")


  resampled_dfs = [resampled_class_df for _, resampled_class_df in tracks_df.groupby('decade').apply(lambda x: resample(x, replace=True, n_samples=min_class_size, random_state=44))]


ValueError: too many values to unpack (expected 2)

In [17]:
from imblearn.over_sampling import SMOTE

# Assuming 'decade' is your target and all other columns are features
X = tracks.drop('decade', axis=1)
y = tracks['decade']

# Print the initial class distribution
print("Initial class distribution:")
print(y.value_counts())

# Apply SMOTE
smote = SMOTE(random_state=44, k_neighbors=5)  # using 5 as a safe small number
X_res, y_res = smote.fit_resample(X, y)

# Print the new class distribution
print("New class distribution after SMOTE:")
print(pd.Series(y_res).value_counts())

# Combine the resampled features and target into a new DataFrame
balanced_tracks = pd.DataFrame(X_res, columns=X.columns)
balanced_tracks['decade'] = y_res

# Save the balanced dataset to a CSV file
balanced_tracks.to_csv('../data/clean_data/balanced_tracks.csv', index=False)

print("Balanced dataset created and saved successfully.")

Initial class distribution:
decade
2010s    18863
2020s    14510
2000s     7590
1990s     4637
1960s     3738
1970s     3241
1980s     2519
1950s     1159
Name: count, dtype: int64
New class distribution after SMOTE:
decade
2020s    18863
2010s    18863
2000s    18863
1990s    18863
1980s    18863
1970s    18863
1960s    18863
1950s    18863
Name: count, dtype: int64
Balanced dataset created and saved successfully.
