In [1]:
#import packages
import os

import numpy as np
import pandas as pd

In [2]:
#read data
# The dataset location can be overridden with the SPOTIFY_DATASET_PATH environment
# variable so the notebook also runs on machines other than the original author's;
# when the variable is unset, the original local path is used unchanged.
DATASET_PATH = os.environ.get(
    "SPOTIFY_DATASET_PATH",
    r"C:\Users\keith\OneDrive\Desktop\OMSA\CSE6242\Project\dataset.csv",
)
df = pd.read_csv(DATASET_PATH)

In [3]:
# Preview the first three rows of the raw frame as the cell output.
df.head(3)

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic


In [4]:
def move_column(df, col_name, col_index):
    """Return ``df`` with the column ``col_name`` relocated to position ``col_index``.

    The column is first taken out of the column list and then re-inserted, so
    ``col_index`` refers to a position among the *remaining* columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be reordered; only read here, not assigned to.
    col_name : label
        Name of the column to move. ``ValueError`` is raised by ``list.index``
        when it is not one of ``df``'s columns.
    col_index : int
        Target position, with ``list.insert`` semantics.

    Returns
    -------
    pandas.DataFrame
        ``df`` indexed by the reordered column list.
    """
    ordered = df.columns.tolist()

    # Take the column out of its current slot ...
    current_position = ordered.index(col_name)
    moving = ordered.pop(current_position)

    # ... and put it back at the requested one.
    ordered.insert(col_index, moving)

    return df[ordered]

In [5]:
def make_categorical_column(df, col_name, col_index):
    """Add a four-level bucket column for a numeric column scaled to the 0-1 range.

    A new column named ``<col_name>_categorical`` is created by cutting
    ``df[col_name]`` into the bins ``[0, .25]``, ``(.25, .50]``, ``(.50, .75]``
    and ``(.75, 1]`` labelled 'Low', 'Moderate', 'High' and 'Very High'. The new
    column is moved to position ``col_index`` and the source column is renamed
    to ``<col_name>_numeric``.

    Values outside the bins (below 0, above 1, or missing) are left without a
    category by ``pd.cut``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``. It is not modified; a new frame is returned.
    col_name : str
        Name of the numeric column to bucket.
    col_index : int
        Position for the new categorical column (passed to ``move_column``).

    Returns
    -------
    pandas.DataFrame
        New frame with the categorical column added and the source column renamed.
    """
    bins = [0, .25, .50, .75, 1]
    labels = ['Low', 'Moderate', 'High', 'Very High']
    categorical_name = col_name + '_categorical'

    # KD: Ensure that if the value is zero, it gets assigned "Low". If value is 1, it gets assigned "Very High"
    # include_lowest=True closes the first bin on the left, so an exact 0 lands in
    # 'Low'; the last bin is already right-closed, so an exact 1 lands in 'Very High'.
    buckets = pd.cut(df[col_name], bins=bins, labels=labels, include_lowest=True)

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    df = df.assign(**{categorical_name: buckets})

    df = move_column(df, categorical_name, col_index)
    df = df.rename(columns={col_name: col_name + '_numeric'})
    return df

In [31]:
def clean_project_data(df):
    """
    Clean the Spotify tracks data and return the cleaned frame.

    Function defined using fields from the Kaggle dataset.
    https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset?resource=download

    Steps, in order:
      * drop the first column (treated as a saved row index);
      * drop rows whose "artists" is NaN, blank, or the text "nan", then split
        "artists" on ';' into artist_1..artist_3 (extra artists are discarded);
      * cast "album_name" and "track_name" to str;
      * rescale "popularity" from 0-100 to 0-1 and add a categorical version;
      * replace "duration_ms" with "duration_minutes" (rounded to 2 decimals);
      * add categorical versions of "danceability", "energy" and "speechiness";
      * drop "key" and "loudness".

    The positions passed to move_column / make_categorical_column are hard-coded
    and assume the column order of the Kaggle file.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from the dataset CSV, first column included.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    max_artists = 3  # number of artist_N columns to produce

#     remove index
    # Positional drop: the input must still carry the index column first.
    df = df.drop(df.columns[0], axis=1)

#    "artists"
    df = df[~pd.isna(df['artists']) & (df['artists'].str.strip() != "") & (df['artists'].str.strip() != "nan")] # KD: remove rows where artist is NaN, blank, or "nan" (only 1 case found in kaggle dataset)
    artist_cols = df['artists'].str.split(';', expand=True) #separate artists into individual columns
    artist_cols = artist_cols.iloc[:, :max_artists].copy() #keep only first 3 artists
    # When no row has 3 artists the split yields fewer than 3 columns (e.g. when
    # cleaning a small sample); pad with empty columns so the naming below cannot
    # fail with a length mismatch.
    for missing_position in range(artist_cols.shape[1], max_artists):
        artist_cols[missing_position] = None
    # NOTE: astype(str) turns absent artists into text rather than missing values,
    # which is why the recorded isna() check reports 0 for artist_2 / artist_3.
    artist_cols = artist_cols.astype(str)
    artist_cols.columns = [f"artist_{position}" for position in range(1, max_artists + 1)] #name the new columns
    df = pd.concat([artist_cols, df], axis=1).drop(columns=['artists']) #replace original "artists" column in df

#     "album_name" and "track_name": convert to string
    df['album_name'] = df['album_name'].astype(str)
    df['track_name'] = df['track_name'].astype(str)

#     "popularity": leave original column but add a new categorical column
# note: popularity ranges from 0-100 with 0 being not popular and 100 being popular
    df['popularity'] = df['popularity'].astype(int) #convert to int
    df['popularity'] = df['popularity'] / 100 #convert to 0-1 scale to be consistent with other fields
    df = make_categorical_column(df, 'popularity', 6)

#     "duration_ms": convert to minutes
    df['duration_minutes'] = df['duration_ms'] / 60000
    df['duration_minutes'] = df['duration_minutes'].round(2)
    df = move_column(df, 'duration_minutes', 8)
    df = df.drop(columns=['duration_ms'])

#     "danceability"
# note: danceability ranges from 0-1 with 0 being not danceable and 1 being danceable
    df = make_categorical_column(df, 'danceability', 10)

#     "energy"
# ranges from 0-1
    df = make_categorical_column(df, 'energy', 12)

################# Consider dropping either energy or danceability? They seem similar

#     key
    #unsure of relevance at this time
    df = df.drop(columns=['key'])

#     loudness
    #unsure of relevance at this time
    df = df.drop(columns=['loudness'])

#     mode
    #unsure of what this field is, need more info

#     speechiness
# ranges from 0-1 (already on the same scale as the other bucketed fields)
    df = make_categorical_column(df, 'speechiness', 15)

    return df

In [33]:
# Run the cleaning function on the raw frame; the result is bound to a new name,
# so `df` still refers to the frame read from the CSV.
cleaned_df = clean_project_data(df)
# Bare last expression: show the cleaned frame as the cell output.
cleaned_df

Unnamed: 0,artist_1,artist_2,artist_3,track_id,album_name,track_name,popularity_categorical,popularity_numeric,duration_minutes,explicit,...,mode,speechiness_categorical,speechiness_numeric,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,Gen Hoshino,,,5SuOikwiRyPMVoIQDJUgSV,Comedy,Comedy,High,0.73,3.84,False,...,0,Low,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,4,acoustic
1,Ben Woodward,,,4qPNDBW1i3p13qLCt0Ki3A,Ghost (Acoustic),Ghost - Acoustic,High,0.55,2.49,False,...,1,Low,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,4,acoustic
2,Ingrid Michaelson,ZAYN,,1iJBSr7s7jYXzM8EGcbK5b,To Begin Again,To Begin Again,High,0.57,3.51,False,...,1,Low,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,4,acoustic
3,Kina Grannis,,,6lfxq3CG4xtTiEg7opyCyx,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,High,0.71,3.37,False,...,1,Low,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,3,acoustic
4,Chord Overstreet,,,5vjLSffimiIP26QG5WcN2K,Hold On,Hold On,Very High,0.82,3.31,False,...,1,Low,0.0526,0.4690,0.000000,0.0829,0.1670,119.949,4,acoustic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113995,Rainy Lullaby,,,2C3TZjDRiAzdyViavDJ217,#mindfulness - Soft Rain for Mindful Meditatio...,Sleep My Little Boy,Low,0.21,6.42,False,...,1,Low,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,5,world-music
113996,Rainy Lullaby,,,1hIz5L4IB9hN3WRYPOCGPw,#mindfulness - Soft Rain for Mindful Meditatio...,Water Into Light,Low,0.22,6.42,False,...,0,Low,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,4,world-music
113997,Cesária Evora,,,6x8ZfSoqDjuNa5SVP5QjvX,Best Of,Miss Perfumado,Low,0.22,4.52,False,...,0,Low,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,4,world-music
113998,Michael W. Smith,,,2e6sXL2bYv4bSz6VTdnfLs,Change Your World,Friends,Moderate,0.41,4.73,False,...,1,Low,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,4,world-music


In [15]:
# Show the dtype of every column in the cleaned frame.
cleaned_df.dtypes

artist_1                      object
artist_2                      object
artist_3                      object
track_id                      object
album_name                    object
track_name                    object
popularity_categorical      category
popularity_numeric           float64
duration_minutes             float64
explicit                        bool
danceability_categorical    category
danceability_numeric         float64
energy_categorical          category
energy_numeric               float64
mode                           int64
speechiness_categorical     category
speechiness_numeric          float64
acousticness                 float64
instrumentalness             float64
liveness                     float64
valence                      float64
tempo                        float64
time_signature                 int64
track_genre                   object
dtype: object

In [17]:
# Count missing values per column in the cleaned frame.
cleaned_df.isna().sum()

artist_1                    0
artist_2                    0
artist_3                    0
track_id                    0
album_name                  0
track_name                  0
popularity_categorical      0
popularity_numeric          0
duration_minutes            0
explicit                    0
danceability_categorical    0
danceability_numeric        0
energy_categorical          0
energy_numeric              0
mode                        0
speechiness_categorical     0
speechiness_numeric         0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
time_signature              0
track_genre                 0
dtype: int64

In [19]:
# Display max and min values for all numeric columns
numeric_columns = cleaned_df.select_dtypes(include='number')  # selected once, used for both summaries
max_values = numeric_columns.max()
min_values = numeric_columns.min()

# Print each summary under its heading: max first, then min.
for heading, values in (
    ("Max values in numeric columns:\n", max_values),
    ("\nMin values in numeric columns:\n", min_values),
):
    print(heading, values)

Max values in numeric columns:
 popularity_numeric        1.000
duration_minutes         87.290
danceability_numeric      0.985
energy_numeric            1.000
mode                      1.000
speechiness_numeric       0.965
acousticness              0.996
instrumentalness          1.000
liveness                  1.000
valence                   0.995
tempo                   243.372
time_signature            5.000
dtype: float64

Min values in numeric columns:
 popularity_numeric      0.00
duration_minutes        0.14
danceability_numeric    0.00
energy_numeric          0.00
mode                    0.00
speechiness_numeric     0.00
acousticness            0.00
instrumentalness        0.00
liveness                0.00
valence                 0.00
tempo                   0.00
time_signature          0.00
dtype: float64


In [21]:
# Walk the columns from the 7th one onward and list the distinct values of
# every column whose dtype is object, category, or bool.
non_numeric_kinds = ('object', 'category', 'bool')
for col in cleaned_df.columns[6:]:
    col_dtype = cleaned_df[col].dtype
    # Skip anything that is not one of the non-numeric dtypes above
    if not any(col_dtype == kind for kind in non_numeric_kinds):
        continue
    # Display the unique values in the non-numeric column
    print(f"Unique values in column '{col}':")
    print(cleaned_df[col].unique())
    print("\n")

Unique values in column 'popularity_categorical':
['High', 'Very High', 'Low', 'Moderate']
Categories (4, object): ['Low' < 'Moderate' < 'High' < 'Very High']


Unique values in column 'explicit':
[False  True]


Unique values in column 'danceability_categorical':
['High', 'Moderate', 'Very High', 'Low']
Categories (4, object): ['Low' < 'Moderate' < 'High' < 'Very High']


Unique values in column 'energy_categorical':
['Moderate', 'Low', 'High', 'Very High']
Categories (4, object): ['Low' < 'Moderate' < 'High' < 'Very High']


Unique values in column 'speechiness_categorical':
['Low', 'Moderate', 'High', 'Very High']
Categories (4, object): ['Low' < 'Moderate' < 'High' < 'Very High']


Unique values in column 'track_genre':
['acoustic' 'afrobeat' 'alt-rock' 'alternative' 'ambient' 'anime'
 'black-metal' 'bluegrass' 'blues' 'brazil' 'breakbeat' 'british'
 'cantopop' 'chicago-house' 'children' 'chill' 'classical' 'club' 'comedy'
 'country' 'dance' 'dancehall' 'death-metal' 'deep-house' '

In [23]:
# Write the cleaned frame to a CSV file in the current working directory;
# index=False leaves the row index out of the file.
cleaned_df.to_csv('cleaned_spotify_data.csv', index=False)