---
title: "Artist Classification"
subtitle: "DSAN 5300 Final Project"
authors: ["Jorge Bris Moreno", "William McGloin", "Kangheng Liu", "Isfar Baset"]
date: last-modified
date-format: long
format:
  html:
    self-contained: true
    toc: true
    code-overflow: wrap
    code-fold: true
---

# Artists

In [35]:
# import relevant libraries
import numpy as np
import pandas as pd

# read the cleaned artist-level table from disk into a DataFrame
artists = pd.read_csv(filepath_or_buffer='../data/clean_data/artists.csv')

# preview the first 20 rows to check the columns and their value ranges
artists.head(n=20)

Unnamed: 0,artist_name,artist_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,key_mode,genre1,genre2,genre3
0,Green Day,7oPftvlwr6VrsViSDV7fJY,0.435219,0.895942,-4.404705,0.07197,0.029052,0.02420668,0.2101,0.65843,140.041634,189080.946429,D major,rock,,
1,Dusty Springfield,5zaXYwewAXedKNCff45U5l,0.485341,0.498333,-10.186623,0.04303,0.51877,0.01524729,0.222696,0.565521,116.199674,195254.23913,D major,jazz,,
2,Hozier,2FXC3k01G6Gw61bmprjgqS,0.448021,0.516437,-7.487792,0.046327,0.384334,0.04871918,0.136029,0.339523,116.836062,241630.729167,G major,rock,,
3,Ms. Lauryn Hill,2Mu5NfyYm8n5iTomuKAEHl,0.652385,0.493627,-12.291385,0.302931,0.270838,0.00041714,0.25895,0.610846,102.295308,281465.692308,C# major,soul,,
4,Incubus,3YcBF2ttyueytpXtEzn1Za,0.508414,0.786576,-6.192253,0.057951,0.080398,0.0761958,0.181052,0.43214,134.341747,245848.59596,A major,funk,,
5,Farruko,329e4yvIujISKGKz1BZZbO,0.695963,0.745696,-5.090474,0.100282,0.214355,0.0005877183,0.217191,0.5978,123.408267,223879.296296,B minor,latin,,
6,Red Hot Chili Peppers,0L8ExT028jH3ddEcZwqJJ5,0.530749,0.787411,-5.798831,0.067306,0.068541,0.05678086,0.170656,0.500953,120.363904,259293.369863,C major,funk,rock,
7,Kali Uchis,1U1el3k54VvEUzo3ybLPlM,0.636603,0.555097,-7.84109,0.067583,0.286546,0.02188057,0.191645,0.473921,117.950526,180635.435897,F# minor,pop,,
8,Swedish House Mafia,1h6Cn3P4NGzXbaXidqURXs,0.58544,0.65728,-7.71132,0.09074,0.082635,0.390617,0.215492,0.201592,124.436,269980.92,D major,edm,,
9,Dean Martin,49e4v89VmlDcFCMyDv9wQ9,0.495437,0.428639,-10.407168,0.039445,0.689061,0.003668141,0.200671,0.532756,117.147185,162275.12605,D# major,jazz,,


In [36]:
# stack the three genre columns end to end into one Series,
# then count how many times each genre label occurs (missing values are not counted)
all_genres = pd.concat(
    [artists[genre_column] for genre_column in ('genre1', 'genre2', 'genre3')]
)
all_genres.value_counts()

rock       53
pop        52
hip hop    50
country    50
soul       49
jazz       48
latin      48
edm        47
funk       46
rap        40
Name: count, dtype: int64

The genre labels are fairly balanced: each of the ten genres appears between 40 and 53 times across the three genre columns.

In [37]:
# split 'key_mode' (e.g. "C# major") into its two space-separated parts.
# The split is done once with the vectorized .str accessor; a missing
# 'key_mode' value, or one without a second part, yields NaN instead of raising.
key_mode_parts = artists['key_mode'].str.split(' ')
artists['key'] = key_mode_parts.str[0]
artists['mode'] = key_mode_parts.str[1]

# drop key_mode now that both parts live in their own columns
# (rebinding the name rather than mutating the frame in place)
artists = artists.drop(columns='key_mode')

# what does the data look like now?
artists.head(20)

Unnamed: 0,artist_name,artist_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,genre1,genre2,genre3,key,mode
0,Green Day,7oPftvlwr6VrsViSDV7fJY,0.435219,0.895942,-4.404705,0.07197,0.029052,0.02420668,0.2101,0.65843,140.041634,189080.946429,rock,,,D,major
1,Dusty Springfield,5zaXYwewAXedKNCff45U5l,0.485341,0.498333,-10.186623,0.04303,0.51877,0.01524729,0.222696,0.565521,116.199674,195254.23913,jazz,,,D,major
2,Hozier,2FXC3k01G6Gw61bmprjgqS,0.448021,0.516437,-7.487792,0.046327,0.384334,0.04871918,0.136029,0.339523,116.836062,241630.729167,rock,,,G,major
3,Ms. Lauryn Hill,2Mu5NfyYm8n5iTomuKAEHl,0.652385,0.493627,-12.291385,0.302931,0.270838,0.00041714,0.25895,0.610846,102.295308,281465.692308,soul,,,C#,major
4,Incubus,3YcBF2ttyueytpXtEzn1Za,0.508414,0.786576,-6.192253,0.057951,0.080398,0.0761958,0.181052,0.43214,134.341747,245848.59596,funk,,,A,major
5,Farruko,329e4yvIujISKGKz1BZZbO,0.695963,0.745696,-5.090474,0.100282,0.214355,0.0005877183,0.217191,0.5978,123.408267,223879.296296,latin,,,B,minor
6,Red Hot Chili Peppers,0L8ExT028jH3ddEcZwqJJ5,0.530749,0.787411,-5.798831,0.067306,0.068541,0.05678086,0.170656,0.500953,120.363904,259293.369863,funk,rock,,C,major
7,Kali Uchis,1U1el3k54VvEUzo3ybLPlM,0.636603,0.555097,-7.84109,0.067583,0.286546,0.02188057,0.191645,0.473921,117.950526,180635.435897,pop,,,F#,minor
8,Swedish House Mafia,1h6Cn3P4NGzXbaXidqURXs,0.58544,0.65728,-7.71132,0.09074,0.082635,0.390617,0.215492,0.201592,124.436,269980.92,edm,,,D,major
9,Dean Martin,49e4v89VmlDcFCMyDv9wQ9,0.495437,0.428639,-10.407168,0.039445,0.689061,0.003668141,0.200671,0.532756,117.147185,162275.12605,jazz,,,D#,major


In [38]:
# convert mode into a binary flag: 1 when the value is 'major', 0 for anything else
artists['mode'] = (artists['mode'] == 'major').astype('int64')

# convert key into a numerical code (C = 0, C# = 1, ..., B = 11);
# a key name that is not in the table raises KeyError
key_dict = {
    key_name: key_number
    for key_number, key_name in enumerate(
        ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
    )
}
artists['key'] = artists['key'].map(key_dict.__getitem__)

# store key and mode as categorical (factor) columns rather than plain integers
artists = artists.astype({'key': 'category', 'mode': 'category'})

# rename mode to major, since a value of 1 now means "major"
artists = artists.rename(columns={'mode': 'major'})

# what does the data look like now?
artists.head(20)

Unnamed: 0,artist_name,artist_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,genre1,genre2,genre3,key,major
0,Green Day,7oPftvlwr6VrsViSDV7fJY,0.435219,0.895942,-4.404705,0.07197,0.029052,0.02420668,0.2101,0.65843,140.041634,189080.946429,rock,,,2,1
1,Dusty Springfield,5zaXYwewAXedKNCff45U5l,0.485341,0.498333,-10.186623,0.04303,0.51877,0.01524729,0.222696,0.565521,116.199674,195254.23913,jazz,,,2,1
2,Hozier,2FXC3k01G6Gw61bmprjgqS,0.448021,0.516437,-7.487792,0.046327,0.384334,0.04871918,0.136029,0.339523,116.836062,241630.729167,rock,,,7,1
3,Ms. Lauryn Hill,2Mu5NfyYm8n5iTomuKAEHl,0.652385,0.493627,-12.291385,0.302931,0.270838,0.00041714,0.25895,0.610846,102.295308,281465.692308,soul,,,1,1
4,Incubus,3YcBF2ttyueytpXtEzn1Za,0.508414,0.786576,-6.192253,0.057951,0.080398,0.0761958,0.181052,0.43214,134.341747,245848.59596,funk,,,9,1
5,Farruko,329e4yvIujISKGKz1BZZbO,0.695963,0.745696,-5.090474,0.100282,0.214355,0.0005877183,0.217191,0.5978,123.408267,223879.296296,latin,,,11,0
6,Red Hot Chili Peppers,0L8ExT028jH3ddEcZwqJJ5,0.530749,0.787411,-5.798831,0.067306,0.068541,0.05678086,0.170656,0.500953,120.363904,259293.369863,funk,rock,,0,1
7,Kali Uchis,1U1el3k54VvEUzo3ybLPlM,0.636603,0.555097,-7.84109,0.067583,0.286546,0.02188057,0.191645,0.473921,117.950526,180635.435897,pop,,,6,0
8,Swedish House Mafia,1h6Cn3P4NGzXbaXidqURXs,0.58544,0.65728,-7.71132,0.09074,0.082635,0.390617,0.215492,0.201592,124.436,269980.92,edm,,,2,1
9,Dean Martin,49e4v89VmlDcFCMyDv9wQ9,0.495437,0.428639,-10.407168,0.039445,0.689061,0.003668141,0.200671,0.532756,117.147185,162275.12605,jazz,,,3,1


In [39]:
# min-max scale every numeric-dtype column into the range [0, 1];
# columns with a non-numeric dtype (strings, categoricals) are not selected and stay as they are
# NOTE(review): the minima and maxima are taken over all rows, and the train/test
# split happens in a later cell, so test rows influence the scaling - confirm this is intended
numerical_columns = artists.select_dtypes(include=[np.number]).columns
numeric_values = artists[numerical_columns]
column_minimum = numeric_values.min()
column_range = numeric_values.max() - column_minimum
artists[numerical_columns] = (numeric_values - column_minimum) / column_range

# what does the data look like now?
artists.head(20)

Unnamed: 0,artist_name,artist_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,genre1,genre2,genre3,key,major
0,Green Day,7oPftvlwr6VrsViSDV7fJY,0.376262,0.890147,0.922345,0.127416,0.014041,0.030719,0.407467,0.75797,0.986405,0.141162,rock,,,2,1
1,Dusty Springfield,5zaXYwewAXedKNCff45U5l,0.446053,0.382602,0.535421,0.044035,0.570726,0.019349,0.44493,0.616964,0.703131,0.159115,jazz,,,2,1
2,Hozier,2FXC3k01G6Gw61bmprjgqS,0.394088,0.405712,0.716026,0.053533,0.417907,0.061825,0.187164,0.27397,0.710692,0.29399,rock,,,7,1
3,Ms. Lauryn Hill,2Mu5NfyYm8n5iTomuKAEHl,0.678651,0.376595,0.39457,0.792875,0.288891,0.000529,0.552755,0.685753,0.537929,0.409841,soul,,,1,1
4,Incubus,3YcBF2ttyueytpXtEzn1Za,0.478182,0.750542,0.802723,0.087023,0.072409,0.096694,0.32107,0.414534,0.918682,0.306257,funk,,,9,1
5,Farruko,329e4yvIujISKGKz1BZZbO,0.739331,0.69836,0.876454,0.208992,0.224683,0.000746,0.428556,0.665953,0.788779,0.242365,latin,,,11,0
6,Red Hot Chili Peppers,0L8ExT028jH3ddEcZwqJJ5,0.509281,0.751608,0.829051,0.11398,0.05893,0.072056,0.29015,0.51897,0.752608,0.345358,funk,rock,,0,1
7,Kali Uchis,1U1el3k54VvEUzo3ybLPlM,0.656676,0.455061,0.692383,0.114778,0.306747,0.027767,0.352576,0.477943,0.723934,0.1166,pop,,,6,0
8,Swedish House Mafia,1h6Cn3P4NGzXbaXidqURXs,0.585435,0.585497,0.701067,0.181498,0.074951,0.495699,0.423503,0.064634,0.800989,0.37644,edm,,,2,1
9,Dean Martin,49e4v89VmlDcFCMyDv9wQ9,0.460112,0.293637,0.520662,0.033703,0.764303,0.004655,0.379423,0.567237,0.714389,0.063203,jazz,,,3,1


In [40]:
# hold out 20% of the rows for testing and keep 80% for training;
# the fixed random_state makes the split reproducible
from sklearn.model_selection import train_test_split

train, test = train_test_split(artists, test_size=0.2, random_state=42)

# the target is the three genre columns; the features are everything except
# the artist identifiers and those target columns
X_train, y_train = (
    train.drop(columns=['artist_name', 'artist_id', 'genre1', 'genre2', 'genre3']),
    train[['genre1', 'genre2', 'genre3']],
)
X_test, y_test = (
    test.drop(columns=['artist_name', 'artist_id', 'genre1', 'genre2', 'genre3']),
    test[['genre1', 'genre2', 'genre3']],
)

In [41]:
# (rows, columns) of each piece of the split, in the order X_train, y_train, X_test, y_test
tuple(split_part.shape for split_part in (X_train, y_train, X_test, y_test))

((317, 12), (317, 3), (80, 12), (80, 3))

In [42]:
# stack the three genre columns of the training targets into one Series,
# then count how many times each genre label occurs in the training set
y_train_genres = pd.concat(
    [y_train[genre_column] for genre_column in ('genre1', 'genre2', 'genre3')]
)
y_train_genres.value_counts()

latin      42
country    41
rock       41
pop        40
soul       39
edm        38
funk       38
jazz       38
hip hop    38
rap        33
Name: count, dtype: int64

In [43]:
# stack the three genre columns of the test targets into one Series,
# then count how many times each genre label occurs in the test set
y_test_genres = pd.concat(
    [y_test[genre_column] for genre_column in ('genre1', 'genre2', 'genre3')]
)
y_test_genres.value_counts()

hip hop    12
rock       12
pop        12
jazz       10
soul       10
edm         9
country     9
funk        8
rap         7
latin       6
Name: count, dtype: int64