# Data Wrangling, Cleaning, Features and Dimensionality

In [1]:
import pandas as pd
import numpy as np

In [2]:
user_df = pd.read_csv('User Listening History.csv')
music_df = pd.read_csv('Music Info.csv')

In [3]:
music_df.head()

Unnamed: 0,track_id,name,artist,spotify_preview_url,spotify_id,tags,genre,year,duration_ms,danceability,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,TRIOREW128F424EAF0,Mr. Brightside,The Killers,https://p.scdn.co/mp3-preview/4d26180e6961fd46...,09ZQ5TmUG8TSL56n0knqrj,"rock, alternative, indie, alternative_rock, in...",,2004,222200,0.355,...,1,-4.36,1,0.0746,0.00119,0.0,0.0971,0.24,148.114,4
1,TRRIVDJ128F429B0E8,Wonderwall,Oasis,https://p.scdn.co/mp3-preview/d012e536916c927b...,06UfBBDISthj1ZJAtX4xjj,"rock, alternative, indie, pop, alternative_roc...",,2006,258613,0.409,...,2,-4.373,1,0.0336,0.000807,0.0,0.207,0.651,174.426,4
2,TROUVHL128F426C441,Come as You Are,Nirvana,https://p.scdn.co/mp3-preview/a1c11bb1cb231031...,0keNu0t0tqsWtExGM3nT1D,"rock, alternative, alternative_rock, 90s, grunge",RnB,1991,218920,0.508,...,4,-5.783,0,0.04,0.000175,0.000459,0.0878,0.543,120.012,4
3,TRUEIND128F93038C4,Take Me Out,Franz Ferdinand,https://p.scdn.co/mp3-preview/399c401370438be4...,0ancVQ9wEcHVd0RrGICTE4,"rock, alternative, indie, alternative_rock, in...",,2004,237026,0.279,...,9,-8.851,1,0.0371,0.000389,0.000655,0.133,0.49,104.56,4
4,TRLNZBD128F935E4D8,Creep,Radiohead,https://p.scdn.co/mp3-preview/e7eb60e9466bc3a2...,01QoK9DA7VTeTSE3MNzp4I,"rock, alternative, indie, alternative_rock, in...",RnB,2008,238640,0.515,...,7,-9.935,1,0.0369,0.0102,0.000141,0.129,0.104,91.841,4


### 1. Wrangling & Cleaning

In [4]:
music_df.isnull().sum()

track_id                   0
name                       0
artist                     0
spotify_preview_url        0
spotify_id                 0
tags                    1127
genre                  28335
year                       0
duration_ms                0
danceability               0
energy                     0
key                        0
loudness                   0
mode                       0
speechiness                0
acousticness               0
instrumentalness           0
liveness                   0
valence                    0
tempo                      0
time_signature             0
dtype: int64

In [5]:
print(music_df['genre'].unique())

[nan 'RnB' 'Rock' 'Pop' 'Metal' 'Electronic' 'Jazz' 'Punk' 'Country'
 'Folk' 'Reggae' 'Rap' 'Blues' 'New Age' 'Latin' 'World']


In [6]:
music_df['tags'].value_counts()

tags
country                                                                             506
reggae                                                                              454
black_metal                                                                         442
rap, hip_hop                                                                        378
drum_and_bass                                                                       365
                                                                                   ... 
rock, alternative, alternative_rock, experimental, singer_songwriter, blues, 00s      1
indie, experimental, folk, acoustic, male_vocalists, guitar                           1
electronic, experimental, punk, industrial, punk_rock                                 1
rock, experimental, punk, punk_rock, 70s, grunge                                      1
rock, alternative_rock, japanese, cover                                               1
Name: count, Length: 20057,

In [7]:
# genre to lowercase
music_df['genre'] = music_df['genre'].str.lower()

# tags and genre null values empty string
music_df['tags'].fillna('', inplace=True)
music_df['genre'].fillna('', inplace=True)

In [8]:
def concatenate_genre_tags(row):
    genre = row['genre'].strip()
    tags = row['tags'].strip()

    if genre and genre not in tags:
        combined = f"{genre}, {tags}" if tags else genre
    else:
        combined = tags
    return combined


In [9]:
# make column tagsplus that is tags plus genre with comma and space
music_df['tagsplus'] = music_df.apply(concatenate_genre_tags, axis=1)

In [10]:
music_df['tagsplus'].to_list()

['rock, alternative, indie, alternative_rock, indie_rock, 00s',
 'rock, alternative, indie, pop, alternative_rock, british, 90s, love, britpop',
 'rnb, rock, alternative, alternative_rock, 90s, grunge',
 'rock, alternative, indie, alternative_rock, indie_rock, british, 00s, britpop',
 'rnb, rock, alternative, indie, alternative_rock, indie_rock, british, 90s, britpop',
 'rock, alternative, indie, pop, alternative_rock, indie_rock',
 'rock, alternative, indie, pop, alternative_rock, british, britpop',
 'rock, alternative, indie, alternative_rock, indie_rock, british, 90s, britpop',
 'rock, alternative, indie, pop, alternative_rock, british, piano, love, beautiful, 00s, britpop, mellow',
 'rock, alternative, indie, pop, alternative_rock, british, chillout, piano, britpop',
 'pop, rock, alternative, alternative_rock, 90s, funk',
 'rock, alternative, indie, alternative_rock, indie_rock',
 'rnb, rock, electronic, alternative, indie, pop, alternative_rock, dance, british, hip_hop, trip_hop',

In [11]:
# drop rows where tagsplus is empty string
music_df = music_df[music_df['tagsplus'] != '']

In [12]:
# for tagsplus count the amount of empty strings
music_df['tagsplus'].value_counts()

tagsplus
country                                                                                                    447
reggae                                                                                                     406
black_metal                                                                                                396
rap, hip_hop                                                                                               349
reggae, ska                                                                                                271
                                                                                                          ... 
indie, folk, indie_rock, singer_songwriter, chillout, 90s, acoustic, indie_pop, love, beautiful, mellow      1
indie, folk, soundtrack, acoustic, indie_pop, love, beautiful, mellow                                        1
indie, folk, indie_rock, chillout, indie_pop, love, beautiful, mellow                                  

In [13]:
len(music_df)

49988

### 2. Feature Engineering

In [14]:
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [15]:
# drop genre, tags and spotify fields
music_df.drop(columns=['track_id', 'name', 'artist', 'genre', 'tags', 'spotify_preview_url', 'spotify_id'], inplace=True)

#### 2.1 Numerical scaling

In [16]:
# numeric features and transformer
numeric_features = ['year', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 
                    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])


In [17]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)])

In [18]:
numeric_processed = preprocessor.fit_transform(music_df)

In [19]:
numeric_processed.shape

(49988, 14)

#### 2.2 Genre one-hot encoding

In [20]:
# tagsplus string to list
tagsplus_df = music_df['tagsplus'].apply(lambda x: x.split(', ') if isinstance(x, str) else [])

In [21]:
# use multilabelbinarizer to encode tagsplus
mlb = MultiLabelBinarizer()

genre_encoded = mlb.fit_transform(tagsplus_df)

genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_, index=tagsplus_df.index)

In [22]:
genre_df.head()

Unnamed: 0,00s,60s,70s,80s,90s,acoustic,alternative,alternative_rock,ambient,american,...,soul,soundtrack,swedish,symphonic_metal,synthpop,techno,thrash_metal,trance,trip_hop,world
0,1,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# join list of numeric_features with genre_df dataframe
features_df = pd.concat([pd.DataFrame(numeric_processed, columns=numeric_features, index=music_df.index), genre_df])

In [24]:
features_df.head()

Unnamed: 0,year,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,soul,soundtrack,swedish,symphonic_metal,synthpop,techno,thrash_metal,trance,trip_hop,world
0,0.000849,-0.266861,-0.774918,0.921271,-1.208266,0.864513,0.763151,-0.017041,-0.702649,-0.666324,...,,,,,,,,,,
1,0.226113,0.071953,-0.472605,0.818013,-0.928036,0.861657,0.763151,-0.557177,-0.703913,-0.666324,...,,,,,,,,,,
2,-1.463366,-0.297381,0.081637,0.555897,-0.367576,0.551888,-1.310357,-0.472863,-0.705998,-0.664961,...,,,,,,,,,,
3,0.000849,-0.128909,-1.200397,-0.087478,1.033573,-0.122133,0.763151,-0.511068,-0.705292,-0.664379,...,,,,,,,,,,
4,0.451377,-0.113891,0.120826,-1.016799,0.473114,-0.360281,0.763151,-0.513702,-0.672915,-0.665906,...,,,,,,,,,,


### 2.3 User playcounts

In [25]:
# turn unique identifiers track_id and user_id into unique numeric values
# using the LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# fit and transform track_id
user_df['track_id'] = le.fit_transform(user_df['track_id'].astype('str'))

# fit and transform user_id
user_df['user_id'] = le.fit_transform(user_df['user_id'].astype('str'))

In [26]:
x = 20
user_df_lt_x = user_df[user_df['playcount'] <= x]

In [27]:
scaler = StandardScaler()

user_df_lt_x['playcount_scale'] = scaler.fit_transform(user_df_lt_x[['playcount']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_df_lt_x['playcount_scale'] = scaler.fit_transform(user_df_lt_x[['playcount']])


In [28]:
user_df_lt_x.head()

Unnamed: 0,track_id,user_id,playcount,playcount_scale
0,10705,691377,1,-0.483281
1,7334,691377,1,-0.483281
2,14212,691377,1,-0.483281
3,23206,691377,1,-0.483281
4,8936,691377,1,-0.483281


In [29]:
import pickle
pickle.dump(features_df, open('features.pkl', 'wb'))
pickle.dump(user_df_lt_x, open('playcounts.pkl', 'wb'))